From f452f572878aebaf197585e5da1b45bb937ba0c1 Mon Sep 17 00:00:00 2001 From: Alex J Date: Thu, 9 Apr 2026 09:24:30 +0100 Subject: [PATCH 01/12] chore: making new branch (#1478) merging personal branch to a new branch. Add conditional cronjob and increase backoff limit Refactor time-stamper.sh script inclusion in ConfigMap Time stamper cron (#1479) Change chart version from 1.13.0 to 0.1.0 removing not fit for purpose test Update time-stamper.sh removing the logic for if the pvc wasn't mounted. now only annotates mounted pvcs. Base logic for pvc auto deletion Sorting out the weekly cronjob for pvc auto deletion, also adding someting to value yaml to turn it off Added Del perm del s changing name to be more readable fix: if there isn't a last_used check if not null Fix for time-stamper.sh as it was annotating all PVCs pvc deletion test fix removing test yaml added affinity and tolerations to cronjobs added an affinity to the cronjob. --- helm/blueapi/README.md | 8 +- helm/blueapi/files/scripts/pvc-deletion.sh | 29 +++ helm/blueapi/files/scripts/time-stamper.sh | 10 ++ helm/blueapi/templates/configmap.yaml | 4 +- .../blueapi/templates/cronjob-configmaps.yaml | 22 +++ helm/blueapi/templates/cronjob.yaml | 169 ++++++++++++++++++ helm/blueapi/values.schema.json | 18 +- helm/blueapi/values.yaml | 27 +-- 8 files changed, 270 insertions(+), 17 deletions(-) create mode 100644 helm/blueapi/files/scripts/pvc-deletion.sh create mode 100644 helm/blueapi/files/scripts/time-stamper.sh create mode 100644 helm/blueapi/templates/cronjob-configmaps.yaml create mode 100644 helm/blueapi/templates/cronjob.yaml diff --git a/helm/blueapi/README.md b/helm/blueapi/README.md index 3862290fb8..17e5066071 100644 --- a/helm/blueapi/README.md +++ b/helm/blueapi/README.md @@ -32,8 +32,12 @@ A Helm chart deploying a worker pod that runs Bluesky plans | podAnnotations | object | `{}` | | | podLabels | object | `{}` | | | podSecurityContext | object | `{}` | | +| pvcAutoDeletion.enabled | bool | 
`true` | | | readinessProbe | object | `{"failureThreshold":2,"httpGet":{"path":"/healthz","port":"http"},"periodSeconds":10}` | Readiness probe, if configured kubernetes will not route traffic to this pod if failed consecutively. This could allow the service time to recover if it is being overwhelmed by traffic, but without the to ability to load balance or scale up/outwards, upstream services will need to know to back off. This is automatically disabled when in debug mode. | -| resources | object | `{"limits":{"cpu":"2000m","memory":"4000Mi"},"requests":{"cpu":"200m","memory":"400Mi"}}` | Sets the compute resources available to the pod. These defaults are appropriate when using debug mode or an internal PVC and therefore running VS Code server in the pod. In the Diamond cluster, requests must be >= 0.1*limits When not using either of the above, the limits may be lowered. When idle but connected, blueapi consumes ~400MB of memory and 1% cpu and may struggle when allocated less. | +| resources.limits.cpu | string | `"2000m"` | | +| resources.limits.memory | string | `"4000Mi"` | | +| resources.requests.cpu | string | `"200m"` | | +| resources.requests.memory | string | `"400Mi"` | | | restartOnConfigChange | bool | `true` | If enabled the blueapi pod will restart on changes to `worker` | | securityContext.runAsNonRoot | bool | `true` | | | securityContext.runAsUser | int | `1000` | | @@ -44,6 +48,7 @@ A Helm chart deploying a worker pod that runs Bluesky plans | serviceAccount.create | bool | `false` | | | serviceAccount.name | string | `""` | | | startupProbe | object | `{"failureThreshold":5,"httpGet":{"path":"/healthz","port":"http"},"periodSeconds":10}` | A more lenient livenessProbe to allow the service to start fully. This is automatically disabled when in debug mode. | +| timeStampCron.enabled | bool | `true` | | | tolerations | list | `[]` | May be required to run on specific nodes (e.g. 
the control machine) | | tracing | object | `{"fastapi":{"excludedURLs":"/healthz"},"otlp":{"enabled":false,"protocol":"http/protobuf","server":{"host":"http://opentelemetry-collector.tracing","port":4318}}}` | Exclude health probe requests from tracing by default to prevent spamming | | volumeMounts | list | `[{"mountPath":"/config","name":"worker-config","readOnly":true}]` | Additional volumeMounts on the output StatefulSet definition. Define how volumes are mounted to the container referenced by using the same name. | @@ -51,6 +56,5 @@ A Helm chart deploying a worker pod that runs Bluesky plans | worker | object | `{"api":{"url":"http://0.0.0.0:8000/"},"env":{"sources":[{"kind":"planFunctions","module":"dodal.plans"},{"kind":"planFunctions","module":"dodal.plan_stubs.wrapped"}]},"logging":{"graylog":{"enabled":false,"url":"tcp://graylog-log-target.diamond.ac.uk:12231/"},"level":"INFO"},"scratch":{"repositories":[],"root":"/workspace"},"stomp":{"auth":{"password":"guest","username":"guest"},"enabled":false,"url":"tcp://rabbitmq:61613/"}}` | Config for the worker goes here, will be mounted into a config file | | worker.api.url | string | `"http://0.0.0.0:8000/"` | 0.0.0.0 required to allow non-loopback traffic If using hostNetwork, the port must be free on the host | | worker.env.sources | list | `[{"kind":"planFunctions","module":"dodal.plans"},{"kind":"planFunctions","module":"dodal.plan_stubs.wrapped"}]` | modules (must be installed in the venv) to fetch devices/plans from | -| worker.logging | object | `{"graylog":{"enabled":false,"url":"tcp://graylog-log-target.diamond.ac.uk:12231/"},"level":"INFO"}` | Configures logging. 
Port 12231 is the `dodal` input on graylog which will be renamed `blueapi` | | worker.scratch | object | `{"repositories":[],"root":"/workspace"}` | If initContainer is enabled the default branch of python projects in this section are installed into the venv *without their dependencies* | | worker.stomp | object | `{"auth":{"password":"guest","username":"guest"},"enabled":false,"url":"tcp://rabbitmq:61613/"}` | Message bus configuration for returning status to GDA/forwarding documents downstream Password may be in the form ${ENV_VAR} to be fetched from an environment variable e.g. mounted from a SealedSecret | diff --git a/helm/blueapi/files/scripts/pvc-deletion.sh b/helm/blueapi/files/scripts/pvc-deletion.sh new file mode 100644 index 0000000000..d912a1542d --- /dev/null +++ b/helm/blueapi/files/scripts/pvc-deletion.sh @@ -0,0 +1,29 @@ +#!/bin/sh +# Get all PVCs by running pods +ALL_PVCS=$(kubectl get pvc -n $RELEASE_NAMESPACE -o=jsonpath='{.items[*].metadata.name}' | tr ' ' '\n' | sort -u) +BLUEAPI_PVCS=$( echo $ALL_PVCS | tr ' ' '\n' | grep blueapi-scratch) +NOW=$(date +%s) +#loop through all pvcs. 
+for pvc in $BLUEAPI_PVCS; do + #check if pvc has last-used annotation + if kubectl get pvc $pvc -n $RELEASE_NAMESPACE -o=jsonpath='{.metadata.annotations.last-used}' + then + #get last used annotation + LAST_USED=$(kubectl get pvc $pvc -n $RELEASE_NAMESPACE -o=jsonpath='{.metadata.annotations.last-used}') + #checking if its not null + if [ -n "$LAST_USED" ]; then + #check if last_used is older than 3 months + if [ $(($NOW - LAST_USED)) -gt 7884000 ]; then + #checking if the pvc is protected, if it is protected skip deletion + if [ "$(kubectl get pvc $pvc -n $RELEASE_NAMESPACE -o=jsonpath='{.metadata.annotations.protected}')" = "true" ]; then + echo "PVC $pvc is protected, skipping deletion" + continue + fi + #PVC has not been used for more than three months, delete it + kubectl delete pvc "$pvc" -n $RELEASE_NAMESPACE + fi + fi + else + echo "PVC $pvc does not have last-used annotation, skipping deletion" + fi +done diff --git a/helm/blueapi/files/scripts/time-stamper.sh b/helm/blueapi/files/scripts/time-stamper.sh new file mode 100644 index 0000000000..40de8d006f --- /dev/null +++ b/helm/blueapi/files/scripts/time-stamper.sh @@ -0,0 +1,10 @@ +#!/bin/sh +# Get all PVCs currently mounted by running pods +MOUNTED_PVCS=$(kubectl get pods -n $RELEASE_NAMESPACE \ + -o=jsonpath='{.items[*].spec.volumes[*].persistentVolumeClaim.claimName}' | tr ' ' '\n' | sort -u) +BLUEAPI_PVCS=$( echo $MOUNTED_PVCS | tr ' ' '\n' | grep blueapi-scratch) +#loop through all the pvcs annotating ones thare are mounted +NOW=$(date +%s) +for pvc in $BLUEAPI_PVCS; do + kubectl annotate --overwrite pvc "$pvc" -n $RELEASE_NAMESPACE last-used="$NOW" +done diff --git a/helm/blueapi/templates/configmap.yaml b/helm/blueapi/templates/configmap.yaml index aa813e6485..93ba1447ea 100644 --- a/helm/blueapi/templates/configmap.yaml +++ b/helm/blueapi/templates/configmap.yaml @@ -31,6 +31,6 @@ data: init_config.yaml: |- scratch: {{- toYaml .Values.worker.scratch | nindent 6 }} -{{- end }} ---- +--- +{{- end 
}} diff --git a/helm/blueapi/templates/cronjob-configmaps.yaml b/helm/blueapi/templates/cronjob-configmaps.yaml new file mode 100644 index 0000000000..188bb1a5f7 --- /dev/null +++ b/helm/blueapi/templates/cronjob-configmaps.yaml @@ -0,0 +1,22 @@ +{{- if .Values.timeStampCron.enabled }} +apiVersion: v1 +kind: ConfigMap +metadata: + name : {{include "blueapi.fullname" . }}-pvc-stamper-script +data: + {{- $files := .Files }} + time-stamper.sh: |- +{{ $files.Get "files/scripts/time-stamper.sh" | indent 4 }} +--- +{{- end }} + +{{- if .Values.pvcAutoDeletion.enabled }} +apiVersion: v1 +kind: ConfigMap +metadata: + name : {{include "blueapi.fullname" . }}-pvc-auto-deletion-script +data: + {{- $files := .Files }} + pvc-deletion.sh: |- +{{ $files.Get "files/scripts/pvc-deletion.sh" | indent 4 }} +{{- end }} diff --git a/helm/blueapi/templates/cronjob.yaml b/helm/blueapi/templates/cronjob.yaml new file mode 100644 index 0000000000..3dea62180b --- /dev/null +++ b/helm/blueapi/templates/cronjob.yaml @@ -0,0 +1,169 @@ +{{- if .Values.timeStampCron.enabled }} +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ include "blueapi.fullname" . }}-last-used-stamper + namespace: {{ .Release.Namespace }} +automountServiceAccountToken: true +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: {{ include "blueapi.fullname" . }}-last-used-stamper + namespace: {{ .Release.Namespace }} +rules: +- apiGroups: [""] + resources: ["pods", "persistentvolumeclaims"] + verbs: ["get", "list", "patch"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: {{ include "blueapi.fullname" . }}-last-used-stamper + namespace: {{ .Release.Namespace }} +subjects: +- kind: ServiceAccount + name: {{ include "blueapi.fullname" . }}-last-used-stamper + namespace: {{ .Release.Namespace }} +roleRef: + kind: Role + name: {{ include "blueapi.fullname" . 
}}-last-used-stamper + apiGroup: rbac.authorization.k8s.io +--- +apiVersion: batch/v1 +kind: CronJob +metadata: + name: {{ include "blueapi.fullname" . }}-last-used-stamper + namespace: {{ .Release.Namespace }} +spec: + concurrencyPolicy: Forbid + successfulJobsHistoryLimit: 3 + failedJobsHistoryLimit: 1 + schedule: "*/5 * * * *" + + jobTemplate: + spec: + # amount of attempts of labeling a pvc + backoffLimit: 3 + # job stops after 180 seconds + activeDeadlineSeconds: 180 + template: + spec: + serviceAccountName: {{ include "blueapi.fullname" . }}-last-used-stamper + {{- with .Values.tolerations }} + tolerations: + {{- toYaml . | nindent 12 }} + {{- end }} + {{- with .Values.nodeSelector }} + nodeSelector: + {{- toYaml . | nindent 12 }} + {{- end }} + {{- with .Values.affinity }} + affinity: + {{- toYaml . | nindent 12 }} + {{- end }} + volumes: + - name: {{include "blueapi.fullname" . }}-pvc-stamper-script + configMap: + name: {{include "blueapi.fullname" . }}-pvc-stamper-script + defaultMode: 0555 + containers: + - name: last-used-stamper + env: + - name: RELEASE_NAME + value: {{ .Release.Name }} + - name: RELEASE_NAMESPACE + value: {{ .Release.Namespace }} + volumeMounts: + - name: {{include "blueapi.fullname" . }}-pvc-stamper-script + mountPath: /scripts + image: bitnami/kubectl:latest + imagePullPolicy: IfNotPresent + command: ["/scripts/time-stamper.sh"] + restartPolicy: OnFailure +{{- end }} +{{- if .Values.pvcAutoDeletion.enabled }} +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ include "blueapi.fullname" . }}-pvc-auto-deletion + namespace: {{ .Release.Namespace }} +automountServiceAccountToken: true +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: {{ include "blueapi.fullname" . 
}}-pvc-auto-deletion + namespace: {{ .Release.Namespace }} +rules: +- apiGroups: [""] + resources: ["pods", "persistentvolumeclaims"] + verbs: ["get", "list", "patch","delete"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: {{ include "blueapi.fullname" . }}-pvc-auto-deletion + namespace: {{ .Release.Namespace }} +subjects: +- kind: ServiceAccount + name: {{ include "blueapi.fullname" . }}-pvc-auto-deletion + namespace: {{ .Release.Namespace }} +roleRef: + kind: Role + name: {{ include "blueapi.fullname" . }}-pvc-auto-deletion + apiGroup: rbac.authorization.k8s.io +--- +apiVersion: batch/v1 +kind: CronJob +metadata: + name: {{ include "blueapi.fullname" . }}-pvc-auto-deletion + namespace: {{ .Release.Namespace }} +spec: + concurrencyPolicy: Forbid + successfulJobsHistoryLimit: 3 + failedJobsHistoryLimit: 1 + schedule: "@weekly" + + jobTemplate: + spec: + # amount of attempts of labeling a pvc + backoffLimit: 3 + # job stops after 300 seconds + activeDeadlineSeconds: 300 + template: + spec: + serviceAccountName: {{ include "blueapi.fullname" . }}-pvc-auto-deletion + {{- with .Values.tolerations }} + tolerations: + {{- toYaml . | nindent 12 }} + {{- end }} + {{- with .Values.nodeSelector }} + nodeSelector: + {{- toYaml . | nindent 12 }} + {{- end }} + {{- with .Values.affinity }} + affinity: + {{- toYaml . | nindent 12 }} + {{- end }} + volumes: + - name: {{include "blueapi.fullname" . }}-pvc-auto-deletion-script + configMap: + name: {{include "blueapi.fullname" . }}-pvc-auto-deletion-script + defaultMode: 0555 + containers: + - name: pvc-auto-deletion + env: + - name: RELEASE_NAME + value: {{ .Release.Name }} + - name: RELEASE_NAMESPACE + value: {{ .Release.Namespace }} + volumeMounts: + - name: {{include "blueapi.fullname" . 
}}-pvc-auto-deletion-script + mountPath: /scripts + image: bitnami/kubectl:latest + imagePullPolicy: IfNotPresent + command: ["/scripts/pvc-deletion.sh"] + restartPolicy: OnFailure +{{- end }} diff --git a/helm/blueapi/values.schema.json b/helm/blueapi/values.schema.json index 3159f6713e..654e1178d4 100644 --- a/helm/blueapi/values.schema.json +++ b/helm/blueapi/values.schema.json @@ -174,6 +174,14 @@ "podSecurityContext": { "type": "object" }, + "pvcAutoDeletion": { + "type": "object", + "properties": { + "enabled": { + "type": "boolean" + } + } + }, "readinessProbe": { "description": "Readiness probe, if configured kubernetes will not route traffic to this pod if failed consecutively. This could allow the service time to recover if it is being overwhelmed by traffic, but without the to ability to load balance or scale up/outwards, upstream services will need to know to back off. This is automatically disabled when in debug mode.", "type": "object", @@ -198,7 +206,6 @@ } }, "resources": { - "description": "Sets the compute resources available to the pod. These defaults are appropriate when using debug mode or an internal PVC and therefore running VS Code server in the pod. In the Diamond cluster, requests must be \u003e= 0.1*limits When not using either of the above, the limits may be lowered. When idle but connected, blueapi consumes ~400MB of memory and 1% cpu and may struggle when allocated less.", "type": "object", "properties": { "limits": { @@ -292,6 +299,14 @@ } } }, + "timeStampCron": { + "type": "object", + "properties": { + "enabled": { + "type": "boolean" + } + } + }, "tolerations": { "description": "May be required to run on specific nodes (e.g. the control machine)", "type": "array" @@ -389,7 +404,6 @@ } }, "logging": { - "description": "Configures logging. 
Port 12231 is the `dodal` input on graylog which will be renamed `blueapi`", "type": "object", "properties": { "graylog": { diff --git a/helm/blueapi/values.yaml b/helm/blueapi/values.yaml index 876b37a989..c7e6e2fa1e 100644 --- a/helm/blueapi/values.yaml +++ b/helm/blueapi/values.yaml @@ -36,8 +36,7 @@ podAnnotations: {} # For more information checkout: https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/ podLabels: {} -podSecurityContext: {} - # fsGroup: 2000 +podSecurityContext: {} # fsGroup: 2000 securityContext: # https://github.com/DiamondLightSource/blueapi/issues/1096 @@ -48,7 +47,7 @@ securityContext: # drop: # - ALL -# This is for setting up a service more information can be found here: https://kubernetes.io/docs/concepts/services-networking/service/ + # This is for setting up a service more information can be found here: https://kubernetes.io/docs/concepts/services-networking/service/ service: # This sets the service type more information can be found here: https://kubernetes.io/docs/concepts/services-networking/service/#publishing-services-service-types # -- To make blueapi available on an IP outside of the cluster prior to an Ingress being created, change this to LoadBalancer @@ -76,13 +75,13 @@ ingress: # hosts: # - chart-example.local -# -- Sets the compute resources available to the pod. -# These defaults are appropriate when using debug mode or an internal PVC and therefore -# running VS Code server in the pod. -# In the Diamond cluster, requests must be >= 0.1*limits -# When not using either of the above, the limits may be lowered. -# When idle but connected, blueapi consumes ~400MB of memory and 1% cpu -# and may struggle when allocated less. + # -- Sets the compute resources available to the pod. + # These defaults are appropriate when using debug mode or an internal PVC and therefore + # running VS Code server in the pod. 
+ # In the Diamond cluster, requests must be >= 0.1*limits + # When not using either of the above, the limits may be lowered. + # When idle but connected, blueapi consumes ~400MB of memory and 1% cpu + # and may struggle when allocated less. resources: # We usually recommend not to specify default resources and to leave this as a conscious # choice for the user. This also increases chances charts run on environments with little @@ -205,7 +204,7 @@ worker: repositories: [] # - name: "dodal" # remote_url: https://github.com/DiamondLightSource/dodal.git - # -- Configures logging. Port 12231 is the `dodal` input on graylog which will be renamed `blueapi` + # -- Configures logging. Port 12231 is the `dodal` input on graylog which will be renamed `blueapi` logging: level: "INFO" graylog: @@ -224,6 +223,12 @@ initContainer: # -- Size of persistent volume size: "1Gi" +timeStampCron: + enabled: true + +pvcAutoDeletion: + enabled: true + debug: # -- If enabled, runs debugpy, allowing port-forwarding to expose port 5678 or attached vscode instance enabled: false From db3dc7a76f8ca9e0af70e4321b049094e55d9a0f Mon Sep 17 00:00:00 2001 From: alexj9837 <52531949+Alexj9837@users.noreply.github.com> Date: Wed, 29 Apr 2026 16:03:43 +0000 Subject: [PATCH 02/12] reduced the history of successful jobs, updated deletion script --- helm/blueapi/files/scripts/pvc-deletion.sh | 13 +++++-------- helm/blueapi/templates/cronjob.yaml | 6 +++--- 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/helm/blueapi/files/scripts/pvc-deletion.sh b/helm/blueapi/files/scripts/pvc-deletion.sh index d912a1542d..dfcfb4d22a 100644 --- a/helm/blueapi/files/scripts/pvc-deletion.sh +++ b/helm/blueapi/files/scripts/pvc-deletion.sh @@ -6,24 +6,21 @@ NOW=$(date +%s) #loop through all pvcs. 
for pvc in $BLUEAPI_PVCS; do #check if pvc has last-used annotation - if kubectl get pvc $pvc -n $RELEASE_NAMESPACE -o=jsonpath='{.metadata.annotations.last-used}' - then #get last used annotation - LAST_USED=$(kubectl get pvc $pvc -n $RELEASE_NAMESPACE -o=jsonpath='{.metadata.annotations.last-used}') + LAST_USED=$(kubectl get pvc "$pvc" -n $RELEASE_NAMESPACE -o=jsonpath='{.metadata.annotations.last-used}') #checking if its not null if [ -n "$LAST_USED" ]; then #check if last_used is older than 3 months if [ $(($NOW - LAST_USED)) -gt 7884000 ]; then #checking if the pvc is protected, if it is protected skip deletion - if [ "$(kubectl get pvc $pvc -n $RELEASE_NAMESPACE -o=jsonpath='{.metadata.annotations.protected}')" = "true" ]; then - echo "PVC $pvc is protected, skipping deletion" + if [ "$(kubectl get pvc "$pvc" -n $RELEASE_NAMESPACE -o=jsonpath='{.metadata.annotations.protected}')" = "true" ]; then + echo " PVC $pvc is protected, skipping deletion" continue fi #PVC has not been used for more than three months, delete it kubectl delete pvc "$pvc" -n $RELEASE_NAMESPACE fi - fi - else - echo "PVC $pvc does not have last-used annotation, skipping deletion" + else + echo " $pvc has no last-used annotation" fi done diff --git a/helm/blueapi/templates/cronjob.yaml b/helm/blueapi/templates/cronjob.yaml index 3dea62180b..630b897e31 100644 --- a/helm/blueapi/templates/cronjob.yaml +++ b/helm/blueapi/templates/cronjob.yaml @@ -37,7 +37,7 @@ metadata: namespace: {{ .Release.Namespace }} spec: concurrencyPolicy: Forbid - successfulJobsHistoryLimit: 3 + successfulJobsHistoryLimit: 1 failedJobsHistoryLimit: 1 schedule: "*/5 * * * *" @@ -122,13 +122,13 @@ metadata: namespace: {{ .Release.Namespace }} spec: concurrencyPolicy: Forbid - successfulJobsHistoryLimit: 3 + successfulJobsHistoryLimit: 1 failedJobsHistoryLimit: 1 schedule: "@weekly" jobTemplate: spec: - # amount of attempts of labeling a pvc + # amount of attempts for pvc deletion backoffLimit: 3 # job stops after 300 
seconds activeDeadlineSeconds: 300 From 12991dad55afd7c990c8e1bb32432c5d3882fbdd Mon Sep 17 00:00:00 2001 From: Alex J <52531949+Alexj9837@users.noreply.github.com> Date: Wed, 6 May 2026 13:46:53 +0100 Subject: [PATCH 03/12] Update helm/blueapi/templates/cronjob.yaml Co-authored-by: Zoheb Shaikh --- helm/blueapi/templates/cronjob.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/helm/blueapi/templates/cronjob.yaml b/helm/blueapi/templates/cronjob.yaml index 630b897e31..16d4fa5013 100644 --- a/helm/blueapi/templates/cronjob.yaml +++ b/helm/blueapi/templates/cronjob.yaml @@ -162,7 +162,7 @@ spec: volumeMounts: - name: {{include "blueapi.fullname" . }}-pvc-auto-deletion-script mountPath: /scripts - image: bitnami/kubectl:latest + image: rancher/kubectl@sha256:05d2b313e2f397e0ade252136aed47abd72d56ead11d1b027ac70f66362c8495 # v1.36.0 imagePullPolicy: IfNotPresent command: ["/scripts/pvc-deletion.sh"] restartPolicy: OnFailure From 767b7a9ace68e951fdd1e09c93f80590c0d74cb2 Mon Sep 17 00:00:00 2001 From: Alex J <52531949+Alexj9837@users.noreply.github.com> Date: Wed, 6 May 2026 14:15:45 +0100 Subject: [PATCH 04/12] Update comment formatting in values.yaml --- helm/blueapi/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/helm/blueapi/values.yaml b/helm/blueapi/values.yaml index c7e6e2fa1e..6f22e03611 100644 --- a/helm/blueapi/values.yaml +++ b/helm/blueapi/values.yaml @@ -47,7 +47,7 @@ securityContext: # drop: # - ALL - # This is for setting up a service more information can be found here: https://kubernetes.io/docs/concepts/services-networking/service/ +# This is for setting up a service more information can be found here: https://kubernetes.io/docs/concepts/services-networking/service/ service: # This sets the service type more information can be found here: https://kubernetes.io/docs/concepts/services-networking/service/#publishing-services-service-types # -- To make blueapi available on an IP outside of the cluster prior 
to an Ingress being created, change this to LoadBalancer From 329d1b8e851f0407525bedf73747cec2231c05c1 Mon Sep 17 00:00:00 2001 From: Alex J <52531949+Alexj9837@users.noreply.github.com> Date: Wed, 6 May 2026 14:16:22 +0100 Subject: [PATCH 05/12] Update podSecurityContext in values.yaml Commented out the fsGroup setting in podSecurityContext. --- helm/blueapi/values.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/helm/blueapi/values.yaml b/helm/blueapi/values.yaml index 6f22e03611..5b19b48767 100644 --- a/helm/blueapi/values.yaml +++ b/helm/blueapi/values.yaml @@ -36,7 +36,8 @@ podAnnotations: {} # For more information checkout: https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/ podLabels: {} -podSecurityContext: {} # fsGroup: 2000 +podSecurityContext: {} +# fsGroup: 2000 securityContext: # https://github.com/DiamondLightSource/blueapi/issues/1096 From d48694f346aa62a61969c9ca9ebdf535fd716d95 Mon Sep 17 00:00:00 2001 From: Alex J <52531949+Alexj9837@users.noreply.github.com> Date: Wed, 6 May 2026 14:23:09 +0100 Subject: [PATCH 06/12] Update values.yaml for pod security context and comments --- helm/blueapi/values.yaml | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/helm/blueapi/values.yaml b/helm/blueapi/values.yaml index 5b19b48767..6909d429c6 100644 --- a/helm/blueapi/values.yaml +++ b/helm/blueapi/values.yaml @@ -36,7 +36,7 @@ podAnnotations: {} # For more information checkout: https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/ podLabels: {} -podSecurityContext: {} +podSecurityContext: {} # fsGroup: 2000 securityContext: @@ -76,13 +76,13 @@ ingress: # hosts: # - chart-example.local - # -- Sets the compute resources available to the pod. - # These defaults are appropriate when using debug mode or an internal PVC and therefore - # running VS Code server in the pod. 
- # In the Diamond cluster, requests must be >= 0.1*limits - # When not using either of the above, the limits may be lowered. - # When idle but connected, blueapi consumes ~400MB of memory and 1% cpu - # and may struggle when allocated less. +# -- Sets the compute resources available to the pod. +# These defaults are appropriate when using debug mode or an internal PVC and therefore +# running VS Code server in the pod. +# In the Diamond cluster, requests must be >= 0.1*limits +# When not using either of the above, the limits may be lowered. +# When idle but connected, blueapi consumes ~400MB of memory and 1% cpu +# and may struggle when allocated less. resources: # We usually recommend not to specify default resources and to leave this as a conscious # choice for the user. This also increases chances charts run on environments with little @@ -205,7 +205,7 @@ worker: repositories: [] # - name: "dodal" # remote_url: https://github.com/DiamondLightSource/dodal.git - # -- Configures logging. Port 12231 is the `dodal` input on graylog which will be renamed `blueapi` + # -- Configures logging. 
Port 12231 is the `dodal` input on graylog which will be renamed `blueapi` logging: level: "INFO" graylog: From 77b81a401f2f3de671e888194ba138a5eecdd27d Mon Sep 17 00:00:00 2001 From: alexj9837 <52531949+Alexj9837@users.noreply.github.com> Date: Thu, 7 May 2026 07:01:16 +0000 Subject: [PATCH 07/12] changes made to feedback given --- helm/blueapi/files/scripts/pvc-deletion.sh | 7 ++++--- helm/blueapi/files/scripts/time-stamper.sh | 5 +++-- helm/blueapi/templates/configmap.yaml | 1 - helm/blueapi/templates/cronjob.yaml | 10 +++++++--- helm/blueapi/values.schema.json | 2 ++ 5 files changed, 16 insertions(+), 9 deletions(-) diff --git a/helm/blueapi/files/scripts/pvc-deletion.sh b/helm/blueapi/files/scripts/pvc-deletion.sh index dfcfb4d22a..7e4a6cb0a6 100644 --- a/helm/blueapi/files/scripts/pvc-deletion.sh +++ b/helm/blueapi/files/scripts/pvc-deletion.sh @@ -1,7 +1,8 @@ -#!/bin/sh +#!/bin/bash +set -eou pipefail # Get all PVCs by running pods ALL_PVCS=$(kubectl get pvc -n $RELEASE_NAMESPACE -o=jsonpath='{.items[*].metadata.name}' | tr ' ' '\n' | sort -u) -BLUEAPI_PVCS=$( echo $ALL_PVCS | tr ' ' '\n' | grep blueapi-scratch) +BLUEAPI_PVCS=$( echo $ALL_PVCS | tr ' ' '\n' | grep "^$RELEASE_FULLNAME-scratch-" || true) NOW=$(date +%s) #loop through all pvcs. 
for pvc in $BLUEAPI_PVCS; do @@ -18,7 +19,7 @@ for pvc in $BLUEAPI_PVCS; do continue fi #PVC has not been used for more than three months, delete it - kubectl delete pvc "$pvc" -n $RELEASE_NAMESPACE + kubectl delete pvc "$pvc" -n $RELEASE_NAMESPACE fi else echo " $pvc has no last-used annotation" diff --git a/helm/blueapi/files/scripts/time-stamper.sh b/helm/blueapi/files/scripts/time-stamper.sh index 40de8d006f..2f5d0a4b47 100644 --- a/helm/blueapi/files/scripts/time-stamper.sh +++ b/helm/blueapi/files/scripts/time-stamper.sh @@ -1,8 +1,9 @@ -#!/bin/sh +#!/bin/bash +set -eou pipefail # Get all PVCs currently mounted by running pods MOUNTED_PVCS=$(kubectl get pods -n $RELEASE_NAMESPACE \ -o=jsonpath='{.items[*].spec.volumes[*].persistentVolumeClaim.claimName}' | tr ' ' '\n' | sort -u) -BLUEAPI_PVCS=$( echo $MOUNTED_PVCS | tr ' ' '\n' | grep blueapi-scratch) +BLUEAPI_PVCS=$( echo $MOUNTED_PVCS | tr ' ' '\n' | grep "^$RELEASE_FULLNAME-scratch-"|| true) #loop through all the pvcs annotating ones thare are mounted NOW=$(date +%s) for pvc in $BLUEAPI_PVCS; do diff --git a/helm/blueapi/templates/configmap.yaml b/helm/blueapi/templates/configmap.yaml index 93ba1447ea..584130bf60 100644 --- a/helm/blueapi/templates/configmap.yaml +++ b/helm/blueapi/templates/configmap.yaml @@ -32,5 +32,4 @@ data: scratch: {{- toYaml .Values.worker.scratch | nindent 6 }} ---- {{- end }} diff --git a/helm/blueapi/templates/cronjob.yaml b/helm/blueapi/templates/cronjob.yaml index 16d4fa5013..17b5aedfc7 100644 --- a/helm/blueapi/templates/cronjob.yaml +++ b/helm/blueapi/templates/cronjob.yaml @@ -39,7 +39,7 @@ spec: concurrencyPolicy: Forbid successfulJobsHistoryLimit: 1 failedJobsHistoryLimit: 1 - schedule: "*/5 * * * *" + schedule: "@daily" jobTemplate: spec: @@ -74,10 +74,12 @@ spec: value: {{ .Release.Name }} - name: RELEASE_NAMESPACE value: {{ .Release.Namespace }} + - name: RELEASE_FULLNAME + value: {{include "blueapi.fullname" . }} volumeMounts: - name: {{include "blueapi.fullname" . 
}}-pvc-stamper-script mountPath: /scripts - image: bitnami/kubectl:latest + image: rancher/kubectl@sha256:05d2b313e2f397e0ade252136aed47abd72d56ead11d1b027ac70f66362c8495 # v1.36.0 imagePullPolicy: IfNotPresent command: ["/scripts/time-stamper.sh"] restartPolicy: OnFailure @@ -98,7 +100,7 @@ metadata: namespace: {{ .Release.Namespace }} rules: - apiGroups: [""] - resources: ["pods", "persistentvolumeclaims"] + resources: ["persistentvolumeclaims"] verbs: ["get", "list", "patch","delete"] --- apiVersion: rbac.authorization.k8s.io/v1 @@ -159,6 +161,8 @@ spec: value: {{ .Release.Name }} - name: RELEASE_NAMESPACE value: {{ .Release.Namespace }} + - name: RELEASE_FULLNAME + value: {{include "blueapi.fullname" . }} volumeMounts: - name: {{include "blueapi.fullname" . }}-pvc-auto-deletion-script mountPath: /scripts diff --git a/helm/blueapi/values.schema.json b/helm/blueapi/values.schema.json index 654e1178d4..5457b34e92 100644 --- a/helm/blueapi/values.schema.json +++ b/helm/blueapi/values.schema.json @@ -206,6 +206,7 @@ } }, "resources": { + "description": "Sets the compute resources available to the pod. These defaults are appropriate when using debug mode or an internal PVC and therefore running VS Code server in the pod. In the Diamond cluster, requests must be \u003e= 0.1*limits When not using either of the above, the limits may be lowered. When idle but connected, blueapi consumes ~400MB of memory and 1% cpu and may struggle when allocated less.", "type": "object", "properties": { "limits": { @@ -404,6 +405,7 @@ } }, "logging": { + "description": "Configures logging. 
Port 12231 is the `dodal` input on graylog which will be renamed `blueapi`", "type": "object", "properties": { "graylog": { From 94e4e27da2f8ff6ec27478d5ea18a8902e9797b5 Mon Sep 17 00:00:00 2001 From: alexj9837 <52531949+Alexj9837@users.noreply.github.com> Date: Thu, 7 May 2026 07:03:28 +0000 Subject: [PATCH 08/12] added docs for protected and timestamp --- helm/blueapi/README.md | 10 ++++------ helm/blueapi/values.schema.json | 2 ++ helm/blueapi/values.yaml | 3 +++ 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/helm/blueapi/README.md b/helm/blueapi/README.md index 17e5066071..7bce521f4b 100644 --- a/helm/blueapi/README.md +++ b/helm/blueapi/README.md @@ -32,12 +32,9 @@ A Helm chart deploying a worker pod that runs Bluesky plans | podAnnotations | object | `{}` | | | podLabels | object | `{}` | | | podSecurityContext | object | `{}` | | -| pvcAutoDeletion.enabled | bool | `true` | | +| pvcAutoDeletion | object | `{"enabled":true}` | If enabled, runs a weekly CronJob that deletes blueapi scratch PVCs unused for more than 3 months. To protect a PVC from deletion, set the annotation ""protected" to "true" on it. | | readinessProbe | object | `{"failureThreshold":2,"httpGet":{"path":"/healthz","port":"http"},"periodSeconds":10}` | Readiness probe, if configured kubernetes will not route traffic to this pod if failed consecutively. This could allow the service time to recover if it is being overwhelmed by traffic, but without the to ability to load balance or scale up/outwards, upstream services will need to know to back off. This is automatically disabled when in debug mode. | -| resources.limits.cpu | string | `"2000m"` | | -| resources.limits.memory | string | `"4000Mi"` | | -| resources.requests.cpu | string | `"200m"` | | -| resources.requests.memory | string | `"400Mi"` | | +| resources | object | `{"limits":{"cpu":"2000m","memory":"4000Mi"},"requests":{"cpu":"200m","memory":"400Mi"}}` | Sets the compute resources available to the pod. 
These defaults are appropriate when using debug mode or an internal PVC and therefore running VS Code server in the pod. In the Diamond cluster, requests must be >= 0.1*limits When not using either of the above, the limits may be lowered. When idle but connected, blueapi consumes ~400MB of memory and 1% cpu and may struggle when allocated less. | | restartOnConfigChange | bool | `true` | If enabled the blueapi pod will restart on changes to `worker` | | securityContext.runAsNonRoot | bool | `true` | | | securityContext.runAsUser | int | `1000` | | @@ -48,7 +45,7 @@ A Helm chart deploying a worker pod that runs Bluesky plans | serviceAccount.create | bool | `false` | | | serviceAccount.name | string | `""` | | | startupProbe | object | `{"failureThreshold":5,"httpGet":{"path":"/healthz","port":"http"},"periodSeconds":10}` | A more lenient livenessProbe to allow the service to start fully. This is automatically disabled when in debug mode. | -| timeStampCron.enabled | bool | `true` | | +| timeStampCron | object | `{"enabled":true}` | If enabled, runs a daily CronJob that stamps blueapi scratch PVCs with a last-used annotation when mounted by a running pod | | tolerations | list | `[]` | May be required to run on specific nodes (e.g. the control machine) | | tracing | object | `{"fastapi":{"excludedURLs":"/healthz"},"otlp":{"enabled":false,"protocol":"http/protobuf","server":{"host":"http://opentelemetry-collector.tracing","port":4318}}}` | Exclude health probe requests from tracing by default to prevent spamming | | volumeMounts | list | `[{"mountPath":"/config","name":"worker-config","readOnly":true}]` | Additional volumeMounts on the output StatefulSet definition. Define how volumes are mounted to the container referenced by using the same name. 
| @@ -56,5 +53,6 @@ A Helm chart deploying a worker pod that runs Bluesky plans | worker | object | `{"api":{"url":"http://0.0.0.0:8000/"},"env":{"sources":[{"kind":"planFunctions","module":"dodal.plans"},{"kind":"planFunctions","module":"dodal.plan_stubs.wrapped"}]},"logging":{"graylog":{"enabled":false,"url":"tcp://graylog-log-target.diamond.ac.uk:12231/"},"level":"INFO"},"scratch":{"repositories":[],"root":"/workspace"},"stomp":{"auth":{"password":"guest","username":"guest"},"enabled":false,"url":"tcp://rabbitmq:61613/"}}` | Config for the worker goes here, will be mounted into a config file | | worker.api.url | string | `"http://0.0.0.0:8000/"` | 0.0.0.0 required to allow non-loopback traffic If using hostNetwork, the port must be free on the host | | worker.env.sources | list | `[{"kind":"planFunctions","module":"dodal.plans"},{"kind":"planFunctions","module":"dodal.plan_stubs.wrapped"}]` | modules (must be installed in the venv) to fetch devices/plans from | +| worker.logging | object | `{"graylog":{"enabled":false,"url":"tcp://graylog-log-target.diamond.ac.uk:12231/"},"level":"INFO"}` | Configures logging. Port 12231 is the `dodal` input on graylog which will be renamed `blueapi` | | worker.scratch | object | `{"repositories":[],"root":"/workspace"}` | If initContainer is enabled the default branch of python projects in this section are installed into the venv *without their dependencies* | | worker.stomp | object | `{"auth":{"password":"guest","username":"guest"},"enabled":false,"url":"tcp://rabbitmq:61613/"}` | Message bus configuration for returning status to GDA/forwarding documents downstream Password may be in the form ${ENV_VAR} to be fetched from an environment variable e.g. 
mounted from a SealedSecret | diff --git a/helm/blueapi/values.schema.json b/helm/blueapi/values.schema.json index 5457b34e92..d3bec08569 100644 --- a/helm/blueapi/values.schema.json +++ b/helm/blueapi/values.schema.json @@ -175,6 +175,7 @@ "type": "object" }, "pvcAutoDeletion": { + "description": "If enabled, runs a weekly CronJob that deletes blueapi scratch PVCs unused for more than 3 months. To protect a PVC from deletion, set the annotation \"protected\" to \"true\" on it.", "type": "object", "properties": { "enabled": { @@ -301,6 +302,7 @@ } }, "timeStampCron": { + "description": "If enabled, runs a daily CronJob that stamps blueapi scratch PVCs with a last-used annotation when mounted by a running pod", "type": "object", "properties": { "enabled": { diff --git a/helm/blueapi/values.yaml b/helm/blueapi/values.yaml index 6909d429c6..16775d44a0 100644 --- a/helm/blueapi/values.yaml +++ b/helm/blueapi/values.yaml @@ -224,9 +224,12 @@ initContainer: # -- Size of persistent volume size: "1Gi" +# -- If enabled, runs a daily CronJob that stamps blueapi scratch PVCs with a last-used annotation when mounted by a running pod timeStampCron: enabled: true +# -- If enabled, runs a weekly CronJob that deletes blueapi scratch PVCs unused for more than 3 months. +# To protect a PVC from deletion, set the annotation "protected" to "true" on it. 
pvcAutoDeletion: enabled: true From 6a1bc446dddcb057b3861d57a95b46ad68b0b38a Mon Sep 17 00:00:00 2001 From: Alex J <52531949+Alexj9837@users.noreply.github.com> Date: Thu, 7 May 2026 08:40:55 +0100 Subject: [PATCH 09/12] Remove unnecessary newline in configmap.yaml --- helm/blueapi/templates/configmap.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/helm/blueapi/templates/configmap.yaml b/helm/blueapi/templates/configmap.yaml index 584130bf60..4b02addffc 100644 --- a/helm/blueapi/templates/configmap.yaml +++ b/helm/blueapi/templates/configmap.yaml @@ -31,5 +31,4 @@ data: init_config.yaml: |- scratch: {{- toYaml .Values.worker.scratch | nindent 6 }} - {{- end }} From 5f79ad6fb95e3f68f56dd2127b27d74db8f2ef26 Mon Sep 17 00:00:00 2001 From: alexj9837 <52531949+Alexj9837@users.noreply.github.com> Date: Thu, 7 May 2026 07:46:16 +0000 Subject: [PATCH 10/12] revert configmap changes --- helm/blueapi/templates/configmap.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/helm/blueapi/templates/configmap.yaml b/helm/blueapi/templates/configmap.yaml index 4b02addffc..aa813e6485 100644 --- a/helm/blueapi/templates/configmap.yaml +++ b/helm/blueapi/templates/configmap.yaml @@ -32,3 +32,5 @@ data: scratch: {{- toYaml .Values.worker.scratch | nindent 6 }} {{- end }} + +--- From 54e6d998de62b34cb1143941aefd6577a68f1680 Mon Sep 17 00:00:00 2001 From: alexj9837 <52531949+Alexj9837@users.noreply.github.com> Date: Thu, 7 May 2026 07:54:57 +0000 Subject: [PATCH 11/12] adding wait to the deletion logic --- helm/blueapi/files/scripts/pvc-deletion.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/helm/blueapi/files/scripts/pvc-deletion.sh b/helm/blueapi/files/scripts/pvc-deletion.sh index 7e4a6cb0a6..44a04adc52 100644 --- a/helm/blueapi/files/scripts/pvc-deletion.sh +++ b/helm/blueapi/files/scripts/pvc-deletion.sh @@ -19,7 +19,7 @@ for pvc in $BLUEAPI_PVCS; do continue fi #PVC has not been used for more than three months, delete it - kubectl delete pvc "$pvc" 
-n $RELEASE_NAMESPACE + kubectl delete pvc "$pvc" -n $RELEASE_NAMESPACE --wait=true fi else echo " $pvc has no last-used annotation" From b06cd29b6cc6c808929c28e5a1f3c8f9f4defc53 Mon Sep 17 00:00:00 2001 From: Alex J <52531949+Alexj9837@users.noreply.github.com> Date: Thu, 7 May 2026 08:58:02 +0100 Subject: [PATCH 12/12] Modify podSecurityContext with fsGroup comment Updated podSecurityContext to include fsGroup comment. --- helm/blueapi/values.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/helm/blueapi/values.yaml b/helm/blueapi/values.yaml index 159c101ed5..4892751f4a 100644 --- a/helm/blueapi/values.yaml +++ b/helm/blueapi/values.yaml @@ -36,8 +36,7 @@ podAnnotations: {} # For more information checkout: https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/ podLabels: {} -podSecurityContext: {} -# fsGroup: 2000 +podSecurityContext: {} # fsGroup: 2000 securityContext: # https://github.com/DiamondLightSource/blueapi/issues/1096