From d119bc51e48d70ee73e5c4e1744658bd169ac02f Mon Sep 17 00:00:00 2001
From: Tuomas Katila <tuomas.katila@intel.com>
Date: Wed, 24 Jun 2026 15:01:48 +0300
Subject: [PATCH 1/6] clusterpolicy: drop deviceTaint variable from the DRA
 section

The deviceTaints option was added due to confusion. It does not
enable/disable device taints. The underlying cmdline argument
(--ignore-health-warning) was meant for testing only.

Signed-off-by: Tuomas Katila <tuomas.katila@intel.com>
---
 README.md                                                    | 1 -
 api/v1alpha1/clusterpolicy_types.go                          | 4 ----
 charts/gpu-base-operator-policy/README.md                    | 1 -
 charts/gpu-base-operator-policy/templates/clusterpolicy.yaml | 1 -
 charts/gpu-base-operator-policy/values.yaml                  | 1 -
 charts/gpu-base-operator/crds/clusterpolicies.yaml           | 5 -----
 config/crd/bases/intel.com_clusterpolicies.yaml              | 5 -----
 config/samples/dra/clusterpolicy.yaml                        | 1 -
 internal/controller/dra_controller.go                        | 4 ----
 internal/controller/dra_controller_test.go                   | 4 +---
 10 files changed, 1 insertion(+), 26 deletions(-)

diff --git a/README.md b/README.md
index 1263769..0631ede 100644
--- a/README.md
+++ b/README.md
@@ -179,7 +179,6 @@ Used when `spec.resourceRegistration: dra`.
 
 |Field|Description|Default|
 |---|---|---|
-|`spec.dra.deviceTaints`|Apply taints to GPU devices reported as unhealthy by health monitoring|`false`|
 |`spec.dra.podHealthCheck`|Enable health check for DRA Pod|true|
 
 #### Health monitoring (`spec.health`)
diff --git a/api/v1alpha1/clusterpolicy_types.go b/api/v1alpha1/clusterpolicy_types.go
index a6c490b..a58a3a1 100644
--- a/api/v1alpha1/clusterpolicy_types.go
+++ b/api/v1alpha1/clusterpolicy_types.go
@@ -77,10 +77,6 @@ type DynamicResourceAllocationSpec struct {
 
 	LogLevel int32 `json:"logLevel,omitempty"`
 
-	// DeviceTaints controls whether DRA applies taints to the GPU devices if
-	// the devices are indicated as unhealthy by the health monitoring.
-	DeviceTaints bool `json:"deviceTaints,omitempty"`
-
 	// Enable DRA Pod's health check.
 	// +kubebuilder:default=true
 	PodHealthCheck bool `json:"podHealthCheck,omitempty"`
diff --git a/charts/gpu-base-operator-policy/README.md b/charts/gpu-base-operator-policy/README.md
index 99ab1bc..6187fd8 100644
--- a/charts/gpu-base-operator-policy/README.md
+++ b/charts/gpu-base-operator-policy/README.md
@@ -41,7 +41,6 @@ See [Customizing the Chart Before Installing](https://helm.sh/docs/intro/using_h
 | dp.denyIDs | [] | Denied PCI Device IDs |
 | dra.image | ghcr.io/intel/intel-resource-drivers-for-kubernetes/intel-gpu-resource-driver:v0.10.0 | DRA driver image. |
 | dra.logLevel | 2 | DRA log level. |
-| dra.deviceTaints | false | Enable device taints. |
 | dra.podHealthCheck | true | Health check for DRA Pod. |
 | xpu.image | ghcr.io/intel/xpumanager/xpumd:v2.0.0 | XPU manager image. |
 | xpu.logLevel | 2 | XPU manager log level. |
diff --git a/charts/gpu-base-operator-policy/templates/clusterpolicy.yaml b/charts/gpu-base-operator-policy/templates/clusterpolicy.yaml
index 2703ddf..737355d 100644
--- a/charts/gpu-base-operator-policy/templates/clusterpolicy.yaml
+++ b/charts/gpu-base-operator-policy/templates/clusterpolicy.yaml
@@ -46,7 +46,6 @@ spec:
   dra:
     image: {{ .Values.dra.image }}
     logLevel: {{ .Values.dra.logLevel }}
-    deviceTaints: {{ .Values.dra.deviceTaints | default false }}
     podHealthCheck: {{ .Values.dra.podHealthCheck | default true }}
     manageBinding: {{ .Values.dra.manageBinding | default false }}
 {{- else }}
diff --git a/charts/gpu-base-operator-policy/values.yaml b/charts/gpu-base-operator-policy/values.yaml
index c0bf14e..fc465f2 100644
--- a/charts/gpu-base-operator-policy/values.yaml
+++ b/charts/gpu-base-operator-policy/values.yaml
@@ -19,7 +19,6 @@ dp:
 dra:
   image: ghcr.io/intel/intel-resource-drivers-for-kubernetes/intel-gpu-resource-driver:v0.11.0@sha256:49f38fcbee4f98d748b537b1a728a12f39e56fd208c86dc3fa32ab2162c21197
   logLevel: 2
-  deviceTaints: false
   podHealthCheck: true
   manageBinding: false
 
diff --git a/charts/gpu-base-operator/crds/clusterpolicies.yaml b/charts/gpu-base-operator/crds/clusterpolicies.yaml
index 27e7ed5..870f9c5 100644
--- a/charts/gpu-base-operator/crds/clusterpolicies.yaml
+++ b/charts/gpu-base-operator/crds/clusterpolicies.yaml
@@ -90,11 +90,6 @@ spec:
                 description: DynamicResourceAllocationSpec defines the desired state
                   of DynamicResourceAllocation.
                 properties:
-                  deviceTaints:
-                    description: |-
-                      DeviceTaints controls whether DRA applies taints to the GPU devices if
-                      the devices are indicated as unhealthy by the health monitoring.
-                    type: boolean
                   image:
                     type: string
                   logLevel:
diff --git a/config/crd/bases/intel.com_clusterpolicies.yaml b/config/crd/bases/intel.com_clusterpolicies.yaml
index 27e7ed5..870f9c5 100644
--- a/config/crd/bases/intel.com_clusterpolicies.yaml
+++ b/config/crd/bases/intel.com_clusterpolicies.yaml
@@ -90,11 +90,6 @@ spec:
                 description: DynamicResourceAllocationSpec defines the desired state
                   of DynamicResourceAllocation.
                 properties:
-                  deviceTaints:
-                    description: |-
-                      DeviceTaints controls whether DRA applies taints to the GPU devices if
-                      the devices are indicated as unhealthy by the health monitoring.
-                    type: boolean
                   image:
                     type: string
                   logLevel:
diff --git a/config/samples/dra/clusterpolicy.yaml b/config/samples/dra/clusterpolicy.yaml
index 6fa9a30..b28ea04 100644
--- a/config/samples/dra/clusterpolicy.yaml
+++ b/config/samples/dra/clusterpolicy.yaml
@@ -14,5 +14,4 @@ spec:
   dra:
     image: ghcr.io/intel/intel-resource-drivers-for-kubernetes/intel-gpu-resource-driver:v0.11.0@sha256:49f38fcbee4f98d748b537b1a728a12f39e56fd208c86dc3fa32ab2162c21197
     logLevel: 2
-    deviceTaints: false
     podHealthCheck: true
diff --git a/internal/controller/dra_controller.go b/internal/controller/dra_controller.go
index 38389aa..1f3fee3 100644
--- a/internal/controller/dra_controller.go
+++ b/internal/controller/dra_controller.go
@@ -487,10 +487,6 @@ func (r *DRAReconciler) generateArgs(spec *v1alpha.ClusterPolicy) []string {
 
 	if spec.Spec.HealthinessSpec != nil {
 		args = append(args, "--health-monitoring=true")
-
-		if spec.Spec.DynamicResourceAllocationSpec.DeviceTaints {
-			args = append(args, "--ignore-health-warning=false")
-		}
 	}
 
 	if spec.Spec.DynamicResourceAllocationSpec.PodHealthCheck {
diff --git a/internal/controller/dra_controller_test.go b/internal/controller/dra_controller_test.go
index 98b86e2..d34a237 100644
--- a/internal/controller/dra_controller_test.go
+++ b/internal/controller/dra_controller_test.go
@@ -491,7 +491,6 @@ var _ = Describe("ClusterPolicy Controller for DRA", func() {
 					DynamicResourceAllocationSpec: v1alpha.DynamicResourceAllocationSpec{
 						LogLevel:       3,
 						PodHealthCheck: true,
-						DeviceTaints:   true,
 					},
 					HealthinessSpec: &v1alpha.HealthinessSpec{
 						CheckIntervalSeconds:       67,
@@ -504,11 +503,10 @@ var _ = Describe("ClusterPolicy Controller for DRA", func() {
 
 			args := controller.generateArgs(cp)
 
-			Expect(args).To(HaveLen(5))
+			Expect(args).To(HaveLen(4))
 			Expect(args).To(ContainElement("-v=3"))
 			Expect(args).To(ContainElement("--health-monitoring=true"))
 			Expect(args).To(ContainElement("--healthcheck-port=51516"))
-			Expect(args).To(ContainElement("--ignore-health-warning=false"))
 			Expect(args).To(ContainElement("--manage-binding=false"))
 		})
 

From 06b5e1700b0ab599979a20a838d6025eae3779fc Mon Sep 17 00:00:00 2001
From: Tuomas Katila <tuomas.katila@intel.com>
Date: Wed, 24 Jun 2026 15:02:31 +0300
Subject: [PATCH 2/6] e2e: check helm uninstall

The previously added namespace creation caused a helm uninstall
failure which was not detected by the e2e tests.
Also replace "--wait" on the operator uninstall with a wait for the
namespace removal.

Signed-off-by: Tuomas Katila <tuomas.katila@intel.com>
---
 test/e2e/e2e_test.go  | 14 ++++++++++----
 test/e2e/e2e_utils.go | 11 +++++++++++
 2 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/test/e2e/e2e_test.go b/test/e2e/e2e_test.go
index adb0ac2..6d835e7 100644
--- a/test/e2e/e2e_test.go
+++ b/test/e2e/e2e_test.go
@@ -498,8 +498,9 @@ var _ = Describe("Helm", Ordered, Label("helm"), func() {
 
 		AfterEach(func() {
 			By("remove clusterpolicy")
-			cmd := exec.Command("helm", "uninstall", "-n", namespace, helmPolicyName, "--wait")
-			utils.Run(cmd)
+			cmd := exec.Command("helm", "uninstall", "-n", namespace, helmPolicyName, "--wait", "--ignore-not-found")
+			_, err := utils.Run(cmd)
+			Expect(err).NotTo(HaveOccurred(), "Failed to uninstall clusterpolicy")
 
 			// TODO: Find a better way to ensure that the xpumanager pods are gone
 			By("wait for the xpumanager pods to vanish")
@@ -518,8 +519,13 @@ var _ = Describe("Helm", Ordered, Label("helm"), func() {
 			Eventually(waitForKueueObjectsToClear, time.Second*60).Should(Succeed())
 
 			By("remove operator")
-			cmd = exec.Command("helm", "uninstall", "-n", namespace, helmOperatorName, "--wait")
-			utils.Run(cmd)
+			cmd = exec.Command("helm", "uninstall", "-n", namespace, helmOperatorName)
+			_, err = utils.Run(cmd)
+			Expect(err).NotTo(HaveOccurred(), "Failed to uninstall operator")
+
+			Eventually(func(g Gomega) {
+				waitUntilNamespaceGone(g, namespace)
+			}, 1*time.Minute, 3*time.Second).Should(Succeed())
 
 			By("remove gpu resource slices after each test")
 			cmd = exec.Command("kubectl", "delete", "resourceslices", "--all")
diff --git a/test/e2e/e2e_utils.go b/test/e2e/e2e_utils.go
index 87ecb0d..f3a8710 100644
--- a/test/e2e/e2e_utils.go
+++ b/test/e2e/e2e_utils.go
@@ -224,6 +224,17 @@ func waitUntilResourceSlicesAreGone(g Gomega) {
 	g.Expect(lines).To(ContainElement("No resources found"), "Expected no ResourceSlices to be present")
 }
 
+// waitUntilNamespaceGone asserts that the given namespace no longer exists.
+// A failing kubectl query fails the assertion, so a polling caller retries
+// instead of mistaking the error for a successful namespace removal.
+func waitUntilNamespaceGone(g Gomega, namespace string) {
+	cmd := exec.Command("kubectl", "get", "namespace", namespace, "--ignore-not-found", "-o", "name")
+	output, err := utils.Run(cmd)
+
+	g.Expect(err).NotTo(HaveOccurred(), "Failed to query namespace")
+	g.Expect(strings.TrimSpace(output)).To(BeEmpty(), "expected namespace "+namespace+" to be gone")
+}
+
 // removeNFDLabels removes all node-feature-discovery labels from every node.
 // It is called as cleanup after tests that deploy NFD, because NFD labels
 // persist on nodes even after the NFD workloads are deleted.

From 118e1cf3663f6a048d58447020a8c6cf123bf3da Mon Sep 17 00:00:00 2001
From: Tuomas Katila <tuomas.katila@intel.com>
Date: Wed, 24 Jun 2026 15:04:13 +0300
Subject: [PATCH 3/6] helm: nfd: don't run post delete cleanup

As the chart is now handling the namespace creation, NFD's
post delete cleanup caused the helm uninstall to return non zero.

Signed-off-by: Tuomas Katila <tuomas.katila@intel.com>
---
 charts/gpu-base-operator/Chart.yaml  | 1 +
 charts/gpu-base-operator/values.yaml | 1 +
 2 files changed, 2 insertions(+)

diff --git a/charts/gpu-base-operator/Chart.yaml b/charts/gpu-base-operator/Chart.yaml
index 295c794..5d97519 100644
--- a/charts/gpu-base-operator/Chart.yaml
+++ b/charts/gpu-base-operator/Chart.yaml
@@ -7,6 +7,7 @@ appVersion: "0.0.1"
 
 dependencies:
   - name: node-feature-discovery
+    alias: nfd
     condition: nfd.install
     tags:
       - nfd
diff --git a/charts/gpu-base-operator/values.yaml b/charts/gpu-base-operator/values.yaml
index 472e077..8f7a648 100644
--- a/charts/gpu-base-operator/values.yaml
+++ b/charts/gpu-base-operator/values.yaml
@@ -4,6 +4,7 @@ createNamespace: true
 
 nfd:
   install: false
+  postDeleteCleanup: false
 
 operator:
   image:

From d42406a7b9682e34854bbc40e4a461046ebe1485 Mon Sep 17 00:00:00 2001
From: Tuomas Katila <tuomas.katila@intel.com>
Date: Wed, 24 Jun 2026 15:06:46 +0300
Subject: [PATCH 4/6] xpum: change temperature handling and ignore ecc states

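The memory "ecc_disabled" state was converted into a warning. Filter
out the ECC states, as they are not indicative of device health.
Use the "critical" limit instead of "warning" for temperatures, and
drop the GPU power limit rule.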

Signed-off-by: Tuomas Katila <tuomas.katila@intel.com>
---
 config/deployments/xpum/otel-config.yaml     | 38 +++++++-------------
 internal/controller/xpumanager_controller.go | 10 +++---
 2 files changed, 17 insertions(+), 31 deletions(-)

diff --git a/config/deployments/xpum/otel-config.yaml b/config/deployments/xpum/otel-config.yaml
index 02e8774..9defe15 100644
--- a/config/deployments/xpum/otel-config.yaml
+++ b/config/deployments/xpum/otel-config.yaml
@@ -37,9 +37,9 @@ processors:
         states:
           # Default state
           - state_name: "ok"
-          - state_name: "warning"
+          - state_name: "critical"
             conditions:
-              - value: 105.0
+              - value: 100.0
       - name: "Intel GPU memory temperature health"
         source_metric: "hw.temperature"
         parent_metric: "hw.gpu.info"
@@ -58,31 +58,9 @@ processors:
         states:
           # Default state
           - state_name: "ok"
-          - state_name: "warning"
+          - state_name: "critical"
             conditions:
-              - value: 85.0
-
-      - name: "Intel GPU power limit"
-        source_metric: "hw.power"
-        parent_metric: "hw.gpu.info"
-        parent_ref_attribute: "hw.id"
-        parent_filters:
-          - key: "pci.vendor_id"
-            values: ["8086"]
-        component_filters:
-          - key: "hw.sensor_location"
-            values: ["card"]
-        copy_attributes: ["hw.id", "hw.name", "pci.bdf", "hw.sensor_location"]
-        add_attributes:
-          hw.type: "power"
-        # Ordered list of state rules. The rules should be ordered by increasing severity.
-        # All rules are evaluated, the last matching one will be active.
-        states:
-          # Default state
-          - state_name: "ok"
-          - state_name: "warning"
-            conditions:
-              - value: 150.0
+              - value: 100.0
 
 exporters:
   intelxpuinfo:
@@ -106,6 +84,14 @@ exporters:
           "*":
             severity: warning
             message: "Unexpected memory health state, mapped to warning"
+      # Filter out (by using empty state_mapping) ECC states that are not indicative of device health
+      - health_domain: "gpu"
+        filters:
+          - key: hw.type
+            values: [gpu]
+          - key: hw.state
+            values: [ecc_disabled, ecc_enabled, ecc_available, ecc_unavailable]
+        state_mapping: {}
       - health_domain: "{{ .hw_type }}"
         state_mapping:
           unknown:
diff --git a/internal/controller/xpumanager_controller.go b/internal/controller/xpumanager_controller.go
index 794e88e..7adbff4 100644
--- a/internal/controller/xpumanager_controller.go
+++ b/internal/controller/xpumanager_controller.go
@@ -250,9 +250,9 @@ func (r *XpuManagerReconciler) buildOTelConfigData(cp *v1alpha.ClusterPolicy) (s
 					for _, loc := range filter.Values {
 						switch loc {
 						case "gpu":
-							setWarningThreshold(rule, float64(health.CoreTemperatureThreshold))
+							setCriticalThreshold(rule, float64(health.CoreTemperatureThreshold))
 						case "memory":
-							setWarningThreshold(rule, float64(health.MemoryTemperatureThreshold))
+							setCriticalThreshold(rule, float64(health.MemoryTemperatureThreshold))
 						}
 					}
 				}
@@ -268,11 +268,11 @@ func (r *XpuManagerReconciler) buildOTelConfigData(cp *v1alpha.ClusterPolicy) (s
 	return string(out), nil
 }
 
-// setWarningThreshold updates the condition values on the "warning" state of a rule.
+// setCriticalThreshold updates the condition values on the "critical" state of a rule.
 // All conditions are set to the same threshold, overriding any device-specific defaults.
-func setWarningThreshold(rule *deployments.StatusRule, threshold float64) {
+func setCriticalThreshold(rule *deployments.StatusRule, threshold float64) {
 	for i := range rule.States {
-		if rule.States[i].StateName == "warning" {
+		if rule.States[i].StateName == "critical" {
 			for j := range rule.States[i].Conditions {
 				rule.States[i].Conditions[j].Value = threshold
 			}

From a3969ab5e4251120460513e4dab3109f47ffe764 Mon Sep 17 00:00:00 2001
From: Tuomas Katila <tuomas.katila@intel.com>
Date: Wed, 24 Jun 2026 14:55:21 +0300
Subject: [PATCH 5/6] workflow: validate also release-* PRs

Signed-off-by: Tuomas Katila <tuomas.katila@intel.com>
---
 .github/workflows/validate-public.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/validate-public.yaml b/.github/workflows/validate-public.yaml
index 4c74e02..5aac5fe 100644
--- a/.github/workflows/validate-public.yaml
+++ b/.github/workflows/validate-public.yaml
@@ -7,6 +7,7 @@ on:
   pull_request:
     branches:
       - 'main'
+      - 'release-*'
   workflow_dispatch:
 
 permissions: {}

From 081354ff9716d37b2e60f8b97ebd792a0a85190b Mon Sep 17 00:00:00 2001
From: Tuomas Katila <tuomas.katila@intel.com>
Date: Wed, 24 Jun 2026 14:56:51 +0300
Subject: [PATCH 6/6] doc: update readmes after the component updates

Signed-off-by: Tuomas Katila <tuomas.katila@intel.com>
---
 README.md                                 | 3 ++-
 charts/gpu-base-operator-policy/README.md | 5 +++--
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 0631ede..73981ba 100644
--- a/README.md
+++ b/README.md
@@ -180,6 +180,7 @@ Used when `spec.resourceRegistration: dra`.
 |Field|Description|Default|
 |---|---|---|
 |`spec.dra.podHealthCheck`|Enable health check for DRA Pod|true|
+|`spec.dra.manageBinding`|Allow DRA plugin to manage device binding between xe/i915 and vfio drivers. Needed for dynamic switching between normal and KubeVirt workloads|`false`|
 
 #### Health monitoring (`spec.health`)
 
@@ -195,7 +196,7 @@ Applies to both DP and DRA unless noted. Thresholds that are exceeded mark the G
 
 |Field|Description|Default|
 |---|---|---|
-|`spec.xpu.monitoringResource`|Set XPUMD resource for Device Plugin use.|`xe_monitoring`|
+|`spec.xpu.monitoringResource`|Set XPUMD resource for Device Plugin use.|`monitoring`|
 |`spec.xpu.configMapOverride`|Name of a ConfigMap in the operator namespace containing a custom OpenTelemetry Collector `config.yaml`|—|
 
 #### Kueue (`spec.kueue`)
diff --git a/charts/gpu-base-operator-policy/README.md b/charts/gpu-base-operator-policy/README.md
index 6187fd8..98c6d52 100644
--- a/charts/gpu-base-operator-policy/README.md
+++ b/charts/gpu-base-operator-policy/README.md
@@ -39,12 +39,13 @@ See [Customizing the Chart Before Installing](https://helm.sh/docs/intro/using_h
 | dp.byPathMode | single | DP by-path mounting mode |
 | dp.allowIDs | [] | Allowed PCI Device IDs |
 | dp.denyIDs | [] | Denied PCI Device IDs |
-| dra.image | ghcr.io/intel/intel-resource-drivers-for-kubernetes/intel-gpu-resource-driver:v0.10.0 | DRA driver image. |
+| dra.image | ghcr.io/intel/intel-resource-drivers-for-kubernetes/intel-gpu-resource-driver:v0.11.0 | DRA driver image. |
 | dra.logLevel | 2 | DRA log level. |
 | dra.podHealthCheck | true | Health check for DRA Pod. |
+| dra.manageBinding | false | Allow DRA plugin to manage device binding between xe/i915 and vfio drivers. Needed for dynamic switching between normal and KubeVirt workloads. |
 | xpu.image | ghcr.io/intel/xpumanager/xpumd:v2.0.0 | XPU manager image. |
 | xpu.logLevel | 2 | XPU manager log level. |
-| xpu.monitoringResource | xe_monitoring | Monitoring resource for XPUMD with device plugin. |
+| xpu.monitoringResource | monitoring | Monitoring resource for XPUMD with device plugin. |
 | xpu.configMapOverride | "" | Override the default XPUM configuration ConfigMap name. |
 | kueue.equalResources | [] | List of ClusterQueue configurations. |
 | pullSecret | null | Image pull secret. |