From d119bc51e48d70ee73e5c4e1744658bd169ac02f Mon Sep 17 00:00:00 2001 From: Tuomas Katila Date: Wed, 24 Jun 2026 15:01:48 +0300 Subject: [PATCH 1/6] clusterpolicy: drop deviceTaint variable from the DRA section The deviceTaint option was added due to confusion. It does not enable/disable device taints. The underlying cmdline argument was meant for testing only. Signed-off-by: Tuomas Katila --- README.md | 1 - api/v1alpha1/clusterpolicy_types.go | 4 ---- charts/gpu-base-operator-policy/README.md | 1 - charts/gpu-base-operator-policy/templates/clusterpolicy.yaml | 1 - charts/gpu-base-operator-policy/values.yaml | 1 - charts/gpu-base-operator/crds/clusterpolicies.yaml | 5 ----- config/crd/bases/intel.com_clusterpolicies.yaml | 5 ----- config/samples/dra/clusterpolicy.yaml | 1 - internal/controller/dra_controller.go | 4 ---- internal/controller/dra_controller_test.go | 4 +--- 10 files changed, 1 insertion(+), 26 deletions(-) diff --git a/README.md b/README.md index 1263769..0631ede 100644 --- a/README.md +++ b/README.md @@ -179,7 +179,6 @@ Used when `spec.resourceRegistration: dra`. |Field|Description|Default| |---|---|---| -|`spec.dra.deviceTaints`|Apply taints to GPU devices reported as unhealthy by health monitoring|`false`| |`spec.dra.podHealthCheck`|Enable health check for DRA Pod|true| #### Health monitoring (`spec.health`) diff --git a/api/v1alpha1/clusterpolicy_types.go b/api/v1alpha1/clusterpolicy_types.go index a6c490b..a58a3a1 100644 --- a/api/v1alpha1/clusterpolicy_types.go +++ b/api/v1alpha1/clusterpolicy_types.go @@ -77,10 +77,6 @@ type DynamicResourceAllocationSpec struct { LogLevel int32 `json:"logLevel,omitempty"` - // DeviceTaints controls whether DRA applies taints to the GPU devices if - // the devices are indicated as unhealthy by the health monitoring. - DeviceTaints bool `json:"deviceTaints,omitempty"` - // Enable DRA Pod's health check. 
// +kubebuilder:default=true PodHealthCheck bool `json:"podHealthCheck,omitempty"` diff --git a/charts/gpu-base-operator-policy/README.md b/charts/gpu-base-operator-policy/README.md index 99ab1bc..6187fd8 100644 --- a/charts/gpu-base-operator-policy/README.md +++ b/charts/gpu-base-operator-policy/README.md @@ -41,7 +41,6 @@ See [Customizing the Chart Before Installing](https://helm.sh/docs/intro/using_h | dp.denyIDs | [] | Denied PCI Device IDs | | dra.image | ghcr.io/intel/intel-resource-drivers-for-kubernetes/intel-gpu-resource-driver:v0.10.0 | DRA driver image. | | dra.logLevel | 2 | DRA log level. | -| dra.deviceTaints | false | Enable device taints. | | dra.podHealthCheck | true | Health check for DRA Pod. | | xpu.image | ghcr.io/intel/xpumanager/xpumd:v2.0.0 | XPU manager image. | | xpu.logLevel | 2 | XPU manager log level. | diff --git a/charts/gpu-base-operator-policy/templates/clusterpolicy.yaml b/charts/gpu-base-operator-policy/templates/clusterpolicy.yaml index 2703ddf..737355d 100644 --- a/charts/gpu-base-operator-policy/templates/clusterpolicy.yaml +++ b/charts/gpu-base-operator-policy/templates/clusterpolicy.yaml @@ -46,7 +46,6 @@ spec: dra: image: {{ .Values.dra.image }} logLevel: {{ .Values.dra.logLevel }} - deviceTaints: {{ .Values.dra.deviceTaints | default false }} podHealthCheck: {{ .Values.dra.podHealthCheck | default true }} manageBinding: {{ .Values.dra.manageBinding | default false }} {{- else }} diff --git a/charts/gpu-base-operator-policy/values.yaml b/charts/gpu-base-operator-policy/values.yaml index c0bf14e..fc465f2 100644 --- a/charts/gpu-base-operator-policy/values.yaml +++ b/charts/gpu-base-operator-policy/values.yaml @@ -19,7 +19,6 @@ dp: dra: image: ghcr.io/intel/intel-resource-drivers-for-kubernetes/intel-gpu-resource-driver:v0.11.0@sha256:49f38fcbee4f98d748b537b1a728a12f39e56fd208c86dc3fa32ab2162c21197 logLevel: 2 - deviceTaints: false podHealthCheck: true manageBinding: false diff --git 
a/charts/gpu-base-operator/crds/clusterpolicies.yaml b/charts/gpu-base-operator/crds/clusterpolicies.yaml index 27e7ed5..870f9c5 100644 --- a/charts/gpu-base-operator/crds/clusterpolicies.yaml +++ b/charts/gpu-base-operator/crds/clusterpolicies.yaml @@ -90,11 +90,6 @@ spec: description: DynamicResourceAllocationSpec defines the desired state of DynamicResourceAllocation. properties: - deviceTaints: - description: |- - DeviceTaints controls whether DRA applies taints to the GPU devices if - the devices are indicated as unhealthy by the health monitoring. - type: boolean image: type: string logLevel: diff --git a/config/crd/bases/intel.com_clusterpolicies.yaml b/config/crd/bases/intel.com_clusterpolicies.yaml index 27e7ed5..870f9c5 100644 --- a/config/crd/bases/intel.com_clusterpolicies.yaml +++ b/config/crd/bases/intel.com_clusterpolicies.yaml @@ -90,11 +90,6 @@ spec: description: DynamicResourceAllocationSpec defines the desired state of DynamicResourceAllocation. properties: - deviceTaints: - description: |- - DeviceTaints controls whether DRA applies taints to the GPU devices if - the devices are indicated as unhealthy by the health monitoring. 
- type: boolean image: type: string logLevel: diff --git a/config/samples/dra/clusterpolicy.yaml b/config/samples/dra/clusterpolicy.yaml index 6fa9a30..b28ea04 100644 --- a/config/samples/dra/clusterpolicy.yaml +++ b/config/samples/dra/clusterpolicy.yaml @@ -14,5 +14,4 @@ spec: dra: image: ghcr.io/intel/intel-resource-drivers-for-kubernetes/intel-gpu-resource-driver:v0.11.0@sha256:49f38fcbee4f98d748b537b1a728a12f39e56fd208c86dc3fa32ab2162c21197 logLevel: 2 - deviceTaints: false podHealthCheck: true diff --git a/internal/controller/dra_controller.go b/internal/controller/dra_controller.go index 38389aa..1f3fee3 100644 --- a/internal/controller/dra_controller.go +++ b/internal/controller/dra_controller.go @@ -487,10 +487,6 @@ func (r *DRAReconciler) generateArgs(spec *v1alpha.ClusterPolicy) []string { if spec.Spec.HealthinessSpec != nil { args = append(args, "--health-monitoring=true") - - if spec.Spec.DynamicResourceAllocationSpec.DeviceTaints { - args = append(args, "--ignore-health-warning=false") - } } if spec.Spec.DynamicResourceAllocationSpec.PodHealthCheck { diff --git a/internal/controller/dra_controller_test.go b/internal/controller/dra_controller_test.go index 98b86e2..d34a237 100644 --- a/internal/controller/dra_controller_test.go +++ b/internal/controller/dra_controller_test.go @@ -491,7 +491,6 @@ var _ = Describe("ClusterPolicy Controller for DRA", func() { DynamicResourceAllocationSpec: v1alpha.DynamicResourceAllocationSpec{ LogLevel: 3, PodHealthCheck: true, - DeviceTaints: true, }, HealthinessSpec: &v1alpha.HealthinessSpec{ CheckIntervalSeconds: 67, @@ -504,11 +503,10 @@ var _ = Describe("ClusterPolicy Controller for DRA", func() { args := controller.generateArgs(cp) - Expect(args).To(HaveLen(5)) + Expect(args).To(HaveLen(4)) Expect(args).To(ContainElement("-v=3")) Expect(args).To(ContainElement("--health-monitoring=true")) Expect(args).To(ContainElement("--healthcheck-port=51516")) - Expect(args).To(ContainElement("--ignore-health-warning=false")) 
Expect(args).To(ContainElement("--manage-binding=false")) }) From 06b5e1700b0ab599979a20a838d6025eae3779fc Mon Sep 17 00:00:00 2001 From: Tuomas Katila Date: Wed, 24 Jun 2026 15:02:31 +0300 Subject: [PATCH 2/6] e2e: check helm uninstall Previous namespace creation caused a helm uninstall failure which was not detected with e2e. Also replace "--wait" with a namespace removal wait. Signed-off-by: Tuomas Katila --- test/e2e/e2e_test.go | 14 ++++++++++---- test/e2e/e2e_utils.go | 11 +++++++++++ 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/test/e2e/e2e_test.go b/test/e2e/e2e_test.go index adb0ac2..6d835e7 100644 --- a/test/e2e/e2e_test.go +++ b/test/e2e/e2e_test.go @@ -498,8 +498,9 @@ var _ = Describe("Helm", Ordered, Label("helm"), func() { AfterEach(func() { By("remove clusterpolicy") - cmd := exec.Command("helm", "uninstall", "-n", namespace, helmPolicyName, "--wait") - utils.Run(cmd) + cmd := exec.Command("helm", "uninstall", "-n", namespace, helmPolicyName, "--wait", "--ignore-not-found") + _, err := utils.Run(cmd) + Expect(err).NotTo(HaveOccurred(), "Failed to uninstall clusterpolicy") // TODO: Find a better way to ensure that the xpumanager pods are gone By("wait for the xpumanager pods to vanish") @@ -518,8 +519,13 @@ var _ = Describe("Helm", Ordered, Label("helm"), func() { Eventually(waitForKueueObjectsToClear, time.Second*60).Should(Succeed()) By("remove operator") - cmd = exec.Command("helm", "uninstall", "-n", namespace, helmOperatorName, "--wait") - utils.Run(cmd) + cmd = exec.Command("helm", "uninstall", "-n", namespace, helmOperatorName) + _, err = utils.Run(cmd) + Expect(err).NotTo(HaveOccurred(), "Failed to uninstall operator") + + Eventually(func(g Gomega) { + waitUntilNamespaceGone(g, namespace) + }, 1*time.Minute, 3*time.Second).Should(Succeed()) By("remove gpu resource slices after each test") cmd = exec.Command("kubectl", "delete", "resourceslices", "--all") diff --git a/test/e2e/e2e_utils.go b/test/e2e/e2e_utils.go index 
87ecb0d..f3a8710 100644 --- a/test/e2e/e2e_utils.go +++ b/test/e2e/e2e_utils.go @@ -224,6 +224,17 @@ func waitUntilResourceSlicesAreGone(g Gomega) { g.Expect(lines).To(ContainElement("No resources found"), "Expected no ResourceSlices to be present") } +// waitUntilNamespaceGone asserts that the given namespace no longer exists. +// It is meant to be polled via Eventually: a failed kubectl query fails the +// assertion (and is retried) instead of being mistaken for a removed namespace. +func waitUntilNamespaceGone(g Gomega, namespace string) { + cmd := exec.Command("kubectl", "get", "namespace", namespace, "--ignore-not-found", "-o", "name") + output, err := utils.Run(cmd) + + g.Expect(err).NotTo(HaveOccurred(), "Failed to query namespace") + g.Expect(strings.TrimSpace(output)).To(BeEmpty(), "expected namespace "+namespace+" to be gone") +} + // removeNFDLabels removes all node-feature-discovery labels from every node. // It is called as cleanup after tests that deploy NFD, because NFD labels // persist on nodes even after the NFD workloads are deleted. From 118e1cf3663f6a048d58447020a8c6cf123bf3da Mon Sep 17 00:00:00 2001 From: Tuomas Katila Date: Wed, 24 Jun 2026 15:04:13 +0300 Subject: [PATCH 3/6] helm: nfd: don't run post delete cleanup As the chart is now handling the namespace creation, NFD's post delete cleanup caused the helm uninstall to return non zero.
Signed-off-by: Tuomas Katila --- charts/gpu-base-operator/Chart.yaml | 1 + charts/gpu-base-operator/values.yaml | 1 + 2 files changed, 2 insertions(+) diff --git a/charts/gpu-base-operator/Chart.yaml b/charts/gpu-base-operator/Chart.yaml index 295c794..5d97519 100644 --- a/charts/gpu-base-operator/Chart.yaml +++ b/charts/gpu-base-operator/Chart.yaml @@ -7,6 +7,7 @@ appVersion: "0.0.1" dependencies: - name: node-feature-discovery + alias: nfd condition: nfd.install tags: - nfd diff --git a/charts/gpu-base-operator/values.yaml b/charts/gpu-base-operator/values.yaml index 472e077..8f7a648 100644 --- a/charts/gpu-base-operator/values.yaml +++ b/charts/gpu-base-operator/values.yaml @@ -4,6 +4,7 @@ createNamespace: true nfd: install: false + postDeleteCleanup: false operator: image: From d42406a7b9682e34854bbc40e4a461046ebe1485 Mon Sep 17 00:00:00 2001 From: Tuomas Katila Date: Wed, 24 Jun 2026 15:06:46 +0300 Subject: [PATCH 4/6] xpum: change temperature handling and ignore ecc states memory "ecc_disabled" state was converted into a warning use "critical" limit instead of "warning" for temps Signed-off-by: Tuomas Katila --- config/deployments/xpum/otel-config.yaml | 38 +++++++------------- internal/controller/xpumanager_controller.go | 10 +++--- 2 files changed, 17 insertions(+), 31 deletions(-) diff --git a/config/deployments/xpum/otel-config.yaml b/config/deployments/xpum/otel-config.yaml index 02e8774..9defe15 100644 --- a/config/deployments/xpum/otel-config.yaml +++ b/config/deployments/xpum/otel-config.yaml @@ -37,9 +37,9 @@ processors: states: # Default state - state_name: "ok" - - state_name: "warning" + - state_name: "critical" conditions: - - value: 105.0 + - value: 100.0 - name: "Intel GPU memory temperature health" source_metric: "hw.temperature" parent_metric: "hw.gpu.info" @@ -58,31 +58,9 @@ processors: states: # Default state - state_name: "ok" - - state_name: "warning" + - state_name: "critical" conditions: - - value: 85.0 - - - name: "Intel GPU power 
limit" - source_metric: "hw.power" - parent_metric: "hw.gpu.info" - parent_ref_attribute: "hw.id" - parent_filters: - - key: "pci.vendor_id" - values: ["8086"] - component_filters: - - key: "hw.sensor_location" - values: ["card"] - copy_attributes: ["hw.id", "hw.name", "pci.bdf", "hw.sensor_location"] - add_attributes: - hw.type: "power" - # Ordered list of state rules. The rules should be ordered by increasing severity. - # All rules are evaluated, the last matching one will be active. - states: - # Default state - - state_name: "ok" - - state_name: "warning" - conditions: - - value: 150.0 + - value: 100.0 exporters: intelxpuinfo: @@ -106,6 +84,14 @@ exporters: "*": severity: warning message: "Unexpected memory health state, mapped to warning" + # Filter out (by using empty state_mapping) ECC states that are not indicative of device health + - health_domain: "gpu" + filters: + - key: hw.type + values: [gpu] + - key: hw.state + values: [ecc_disabled, ecc_enabled, ecc_available, ecc_unavailable] + state_mapping: {} - health_domain: "{{ .hw_type }}" state_mapping: unknown: diff --git a/internal/controller/xpumanager_controller.go b/internal/controller/xpumanager_controller.go index 794e88e..7adbff4 100644 --- a/internal/controller/xpumanager_controller.go +++ b/internal/controller/xpumanager_controller.go @@ -250,9 +250,9 @@ func (r *XpuManagerReconciler) buildOTelConfigData(cp *v1alpha.ClusterPolicy) (s for _, loc := range filter.Values { switch loc { case "gpu": - setWarningThreshold(rule, float64(health.CoreTemperatureThreshold)) + setCriticalThreshold(rule, float64(health.CoreTemperatureThreshold)) case "memory": - setWarningThreshold(rule, float64(health.MemoryTemperatureThreshold)) + setCriticalThreshold(rule, float64(health.MemoryTemperatureThreshold)) } } } @@ -268,11 +268,11 @@ func (r *XpuManagerReconciler) buildOTelConfigData(cp *v1alpha.ClusterPolicy) (s return string(out), nil } -// setWarningThreshold updates the condition values on the "warning" state 
of a rule. +// setCriticalThreshold updates the condition values on the "critical" state of a rule. // All conditions are set to the same threshold, overriding any device-specific defaults. -func setWarningThreshold(rule *deployments.StatusRule, threshold float64) { +func setCriticalThreshold(rule *deployments.StatusRule, threshold float64) { for i := range rule.States { - if rule.States[i].StateName == "warning" { + if rule.States[i].StateName == "critical" { for j := range rule.States[i].Conditions { rule.States[i].Conditions[j].Value = threshold } From a3969ab5e4251120460513e4dab3109f47ffe764 Mon Sep 17 00:00:00 2001 From: Tuomas Katila Date: Wed, 24 Jun 2026 14:55:21 +0300 Subject: [PATCH 5/6] workflow: validate also release-* PRs Signed-off-by: Tuomas Katila --- .github/workflows/validate-public.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/validate-public.yaml b/.github/workflows/validate-public.yaml index 4c74e02..5aac5fe 100644 --- a/.github/workflows/validate-public.yaml +++ b/.github/workflows/validate-public.yaml @@ -7,6 +7,7 @@ on: pull_request: branches: - 'main' + - 'release-*' workflow_dispatch: permissions: {} From 081354ff9716d37b2e60f8b97ebd792a0a85190b Mon Sep 17 00:00:00 2001 From: Tuomas Katila Date: Wed, 24 Jun 2026 14:56:51 +0300 Subject: [PATCH 6/6] doc: update readmes after the component updates Signed-off-by: Tuomas Katila --- README.md | 3 ++- charts/gpu-base-operator-policy/README.md | 5 +++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 0631ede..73981ba 100644 --- a/README.md +++ b/README.md @@ -180,6 +180,7 @@ Used when `spec.resourceRegistration: dra`. |Field|Description|Default| |---|---|---| |`spec.dra.podHealthCheck`|Enable health check for DRA Pod|true| +|`spec.dra.manageBinding`|Allow DRA plugin to manage device binding between xe/i915 and vfio drivers. 
Needed for dynamic switching between normal and KubeVirt workloads|`false`| #### Health monitoring (`spec.health`) @@ -195,7 +196,7 @@ Applies to both DP and DRA unless noted. Thresholds that are exceeded mark the G |Field|Description|Default| |---|---|---| -|`spec.xpu.monitoringResource`|Set XPUMD resource for Device Plugin use.|`xe_monitoring`| +|`spec.xpu.monitoringResource`|Set XPUMD resource for Device Plugin use.|`monitoring`| |`spec.xpu.configMapOverride`|Name of a ConfigMap in the operator namespace containing a custom OpenTelemetry Collector `config.yaml`|—| #### Kueue (`spec.kueue`) diff --git a/charts/gpu-base-operator-policy/README.md b/charts/gpu-base-operator-policy/README.md index 6187fd8..98c6d52 100644 --- a/charts/gpu-base-operator-policy/README.md +++ b/charts/gpu-base-operator-policy/README.md @@ -39,12 +39,13 @@ See [Customizing the Chart Before Installing](https://helm.sh/docs/intro/using_h | dp.byPathMode | single | DP by-path mounting mode | | dp.allowIDs | [] | Allowed PCI Device IDs | | dp.denyIDs | [] | Denied PCI Device IDs | -| dra.image | ghcr.io/intel/intel-resource-drivers-for-kubernetes/intel-gpu-resource-driver:v0.10.0 | DRA driver image. | +| dra.image | ghcr.io/intel/intel-resource-drivers-for-kubernetes/intel-gpu-resource-driver:v0.11.0 | DRA driver image. | | dra.logLevel | 2 | DRA log level. | | dra.podHealthCheck | true | Health check for DRA Pod. | +| dra.manageBinding | false | Allow DRA plugin to manage device binding between xe/i915 and vfio drivers. Needed for dynamic switching between normal and KubeVirt workloads. | | xpu.image | ghcr.io/intel/xpumanager/xpumd:v2.0.0 | XPU manager image. | | xpu.logLevel | 2 | XPU manager log level. | -| xpu.monitoringResource | xe_monitoring | Monitoring resource for XPUMD with device plugin. | +| xpu.monitoringResource | monitoring | Monitoring resource for XPUMD with device plugin. | | xpu.configMapOverride | "" | Override the default XPUM configuration ConfigMap name. 
| | kueue.equalResources | [] | List of ClusterQueue configurations. | | pullSecret | null | Image pull secret. |