intel · pfl · Jun 25, 2026 · Jun 24, 2026 · Jun 24, 2026 · Jun 24, 2026
@@ -7,6 +7,7 @@ on:
   pull_request:
     branches:
       - 'main'
+      - 'release-*'
   workflow_dispatch:
 
 permissions: {}

@@ -179,8 +179,8 @@ Used when `spec.resourceRegistration: dra`.
 
 |Field|Description|Default|
 |---|---|---|
-|`spec.dra.deviceTaints`|Apply taints to GPU devices reported as unhealthy by health monitoring|`false`|
 |`spec.dra.podHealthCheck`|Enable health check for DRA Pod|true|
+|`spec.dra.manageBinding`|Allow DRA plugin to manage device binding between xe/i915 and vfio drivers. Needed for dynamic switching between normal and KubeVirt workloads|`false`|
 
 #### Health monitoring (`spec.health`)
 
@@ -196,7 +196,7 @@ Applies to both DP and DRA unless noted. Thresholds that are exceeded mark the G
 
 |Field|Description|Default|
 |---|---|---|
-|`spec.xpu.monitoringResource`|Set XPUMD resource for Device Plugin use.|`xe_monitoring`|
+|`spec.xpu.monitoringResource`|Set XPUMD resource for Device Plugin use.|`monitoring`|
 |`spec.xpu.configMapOverride`|Name of a ConfigMap in the operator namespace containing a custom OpenTelemetry Collector `config.yaml`|—|
 
 #### Kueue (`spec.kueue`)

@@ -77,10 +77,6 @@ type DynamicResourceAllocationSpec struct {
 
 	LogLevel int32 `json:"logLevel,omitempty"`
 
-	// DeviceTaints controls whether DRA applies taints to the GPU devices if
-	// the devices are indicated as unhealthy by the health monitoring.
-	DeviceTaints bool `json:"deviceTaints,omitempty"`
-
 	// Enable DRA Pod's health check.
 	// +kubebuilder:default=true
 	PodHealthCheck bool `json:"podHealthCheck,omitempty"`

@@ -39,13 +39,13 @@ See [Customizing the Chart Before Installing](https://helm.sh/docs/intro/using_h
 | dp.byPathMode | single | DP by-path mounting mode |
 | dp.allowIDs | [] | Allowed PCI Device IDs |
 | dp.denyIDs | [] | Denied PCI Device IDs |
-| dra.image | ghcr.io/intel/intel-resource-drivers-for-kubernetes/intel-gpu-resource-driver:v0.10.0 | DRA driver image. |
+| dra.image | ghcr.io/intel/intel-resource-drivers-for-kubernetes/intel-gpu-resource-driver:v0.11.0 | DRA driver image. |
 | dra.logLevel | 2 | DRA log level. |
-| dra.deviceTaints | false | Enable device taints. |
 | dra.podHealthCheck | true | Health check for DRA Pod. |
+| dra.manageBinding | false | Allow DRA plugin to manage device binding between xe/i915 and vfio drivers. Needed for dynamic switching between normal and KubeVirt workloads. |
 | xpu.image | ghcr.io/intel/xpumanager/xpumd:v2.0.0 | XPU manager image. |
 | xpu.logLevel | 2 | XPU manager log level. |
-| xpu.monitoringResource | xe_monitoring | Monitoring resource for XPUMD with device plugin. |
+| xpu.monitoringResource | monitoring | Monitoring resource for XPUMD with device plugin. |
 | xpu.configMapOverride | "" | Override the default XPUM configuration ConfigMap name. |
 | kueue.equalResources | [] | List of ClusterQueue configurations. |
 | pullSecret | null | Image pull secret. |

@@ -46,7 +46,6 @@ spec:
   dra:
     image: {{ .Values.dra.image }}
     logLevel: {{ .Values.dra.logLevel }}
-    deviceTaints: {{ .Values.dra.deviceTaints | default false }}
     podHealthCheck: {{ .Values.dra.podHealthCheck | default true }}
     manageBinding: {{ .Values.dra.manageBinding | default false }}
 {{- else }}

@@ -19,7 +19,6 @@ dp:
 dra:
   image: ghcr.io/intel/intel-resource-drivers-for-kubernetes/intel-gpu-resource-driver:v0.11.0@sha256:49f38fcbee4f98d748b537b1a728a12f39e56fd208c86dc3fa32ab2162c21197
   logLevel: 2
-  deviceTaints: false
   podHealthCheck: true
   manageBinding: false
 

@@ -7,6 +7,7 @@ appVersion: "0.0.1"
 
 dependencies:
   - name: node-feature-discovery
+    alias: nfd
     condition: nfd.install
     tags:
       - nfd

@@ -90,11 +90,6 @@ spec:
                 description: DynamicResourceAllocationSpec defines the desired state
                   of DynamicResourceAllocation.
                 properties:
-                  deviceTaints:
-                    description: |-
-                      DeviceTaints controls whether DRA applies taints to the GPU devices if
-                      the devices are indicated as unhealthy by the health monitoring.
-                    type: boolean
                   image:
                     type: string
                   logLevel:

@@ -4,6 +4,7 @@ createNamespace: true
 
 nfd:
   install: false
+  postDeleteCleanup: false
 
 operator:
   image:

@@ -90,11 +90,6 @@ spec:
                 description: DynamicResourceAllocationSpec defines the desired state
                   of DynamicResourceAllocation.
                 properties:
-                  deviceTaints:
-                    description: |-
-                      DeviceTaints controls whether DRA applies taints to the GPU devices if
-                      the devices are indicated as unhealthy by the health monitoring.
-                    type: boolean
                   image:
                     type: string
                   logLevel:

@@ -37,9 +37,9 @@ processors:
         states:
           # Default state
           - state_name: "ok"
-          - state_name: "warning"
+          - state_name: "critical"
             conditions:
-              - value: 105.0
+              - value: 100.0
       - name: "Intel GPU memory temperature health"
         source_metric: "hw.temperature"
         parent_metric: "hw.gpu.info"
@@ -58,31 +58,9 @@ processors:
         states:
           # Default state
           - state_name: "ok"
-          - state_name: "warning"
+          - state_name: "critical"
             conditions:
-              - value: 85.0
-
-      - name: "Intel GPU power limit"
-        source_metric: "hw.power"
-        parent_metric: "hw.gpu.info"
-        parent_ref_attribute: "hw.id"
-        parent_filters:
-          - key: "pci.vendor_id"
-            values: ["8086"]
-        component_filters:
-          - key: "hw.sensor_location"
-            values: ["card"]
-        copy_attributes: ["hw.id", "hw.name", "pci.bdf", "hw.sensor_location"]
-        add_attributes:
-          hw.type: "power"
-        # Ordered list of state rules. The rules should be ordered by increasing severity.
-        # All rules are evaluated, the last matching one will be active.
-        states:
-          # Default state
-          - state_name: "ok"
-          - state_name: "warning"
-            conditions:
-              - value: 150.0
+              - value: 100.0
 
 exporters:
   intelxpuinfo:
@@ -106,6 +84,14 @@ exporters:
           "*":
             severity: warning
             message: "Unexpected memory health state, mapped to warning"
+      # Filter out (by using empty state_mapping) ECC states that are not indicative of device health
+      - health_domain: "gpu"
+        filters:
+          - key: hw.type
+            values: [gpu]
+          - key: hw.state
+            values: [ecc_disabled, ecc_enabled, ecc_available, ecc_unavailable]
+        state_mapping: {}
       - health_domain: "{{ .hw_type }}"
         state_mapping:
           unknown:

@@ -14,5 +14,4 @@ spec:
   dra:
     image: ghcr.io/intel/intel-resource-drivers-for-kubernetes/intel-gpu-resource-driver:v0.11.0@sha256:49f38fcbee4f98d748b537b1a728a12f39e56fd208c86dc3fa32ab2162c21197
     logLevel: 2
-    deviceTaints: false
     podHealthCheck: true
@@ -487,10 +487,6 @@ func (r *DRAReconciler) generateArgs(spec *v1alpha.ClusterPolicy) []string {
 
 	if spec.Spec.HealthinessSpec != nil {
 		args = append(args, "--health-monitoring=true")
-
-		if spec.Spec.DynamicResourceAllocationSpec.DeviceTaints {
-			args = append(args, "--ignore-health-warning=false")
-		}
 	}
 
 	if spec.Spec.DynamicResourceAllocationSpec.PodHealthCheck {

@@ -491,7 +491,6 @@ var _ = Describe("ClusterPolicy Controller for DRA", func() {
 					DynamicResourceAllocationSpec: v1alpha.DynamicResourceAllocationSpec{
 						LogLevel:       3,
 						PodHealthCheck: true,
-						DeviceTaints:   true,
 					},
 					HealthinessSpec: &v1alpha.HealthinessSpec{
 						CheckIntervalSeconds:       67,
@@ -504,11 +503,10 @@ var _ = Describe("ClusterPolicy Controller for DRA", func() {
 
 			args := controller.generateArgs(cp)
 
-			Expect(args).To(HaveLen(5))
+			Expect(args).To(HaveLen(4))
 			Expect(args).To(ContainElement("-v=3"))
 			Expect(args).To(ContainElement("--health-monitoring=true"))
 			Expect(args).To(ContainElement("--healthcheck-port=51516"))
-			Expect(args).To(ContainElement("--ignore-health-warning=false"))
 			Expect(args).To(ContainElement("--manage-binding=false"))
 		})
 

@@ -250,9 +250,9 @@ func (r *XpuManagerReconciler) buildOTelConfigData(cp *v1alpha.ClusterPolicy) (s
 					for _, loc := range filter.Values {
 						switch loc {
 						case "gpu":
-							setWarningThreshold(rule, float64(health.CoreTemperatureThreshold))
+							setCriticalThreshold(rule, float64(health.CoreTemperatureThreshold))
 						case "memory":
-							setWarningThreshold(rule, float64(health.MemoryTemperatureThreshold))
+							setCriticalThreshold(rule, float64(health.MemoryTemperatureThreshold))
 						}
 					}
 				}
@@ -268,11 +268,11 @@ func (r *XpuManagerReconciler) buildOTelConfigData(cp *v1alpha.ClusterPolicy) (s
 	return string(out), nil
 }
 
-// setWarningThreshold updates the condition values on the "warning" state of a rule.
+// setCriticalThreshold updates the condition values on the "critical" state of a rule.
 // All conditions are set to the same threshold, overriding any device-specific defaults.
-func setWarningThreshold(rule *deployments.StatusRule, threshold float64) {
+func setCriticalThreshold(rule *deployments.StatusRule, threshold float64) {
 	for i := range rule.States {
-		if rule.States[i].StateName == "warning" {
+		if rule.States[i].StateName == "critical" {
 			for j := range rule.States[i].Conditions {
 				rule.States[i].Conditions[j].Value = threshold
 			}

@@ -498,8 +498,9 @@ var _ = Describe("Helm", Ordered, Label("helm"), func() {
 
 		AfterEach(func() {
 			By("remove clusterpolicy")
-			cmd := exec.Command("helm", "uninstall", "-n", namespace, helmPolicyName, "--wait")
-			utils.Run(cmd)
+			cmd := exec.Command("helm", "uninstall", "-n", namespace, helmPolicyName, "--wait", "--ignore-not-found")
+			_, err := utils.Run(cmd)
+			Expect(err).NotTo(HaveOccurred(), "Failed to uninstall clusterpolicy")
 
 			// TODO: Find a better way to ensure that the xpumanager pods are gone
 			By("wait for the xpumanager pods to vanish")
@@ -518,8 +519,13 @@ var _ = Describe("Helm", Ordered, Label("helm"), func() {
 			Eventually(waitForKueueObjectsToClear, time.Second*60).Should(Succeed())
 
 			By("remove operator")
-			cmd = exec.Command("helm", "uninstall", "-n", namespace, helmOperatorName, "--wait")
-			utils.Run(cmd)
+			cmd = exec.Command("helm", "uninstall", "-n", namespace, helmOperatorName)
+			_, err = utils.Run(cmd)
+			Expect(err).NotTo(HaveOccurred(), "Failed to uninstall operator")
+
+			Eventually(func(g Gomega) {
+				waitUntilNamespaceGone(g, namespace)
+			}, 1*time.Minute, 3*time.Second).Should(Succeed())
 
 			By("remove gpu resource slices after each test")
 			cmd = exec.Command("kubectl", "delete", "resourceslices", "--all")

@@ -224,6 +224,17 @@ func waitUntilResourceSlicesAreGone(g Gomega) {
 	g.Expect(lines).To(ContainElement("No resources found"), "Expected no ResourceSlices to be present")
 }
 
+// waitUntilNamespaceGone asserts that the given namespace no longer exists.
+// It is meant to be polled via Eventually: a failed kubectl query or a
+// non-empty result fails the assertion on g so that the caller retries
+// instead of treating the namespace as gone.
+func waitUntilNamespaceGone(g Gomega, namespace string) {
+	cmd := exec.Command("kubectl", "get", "namespace", namespace, "--ignore-not-found", "-o", "name")
+	output, err := utils.Run(cmd)
+	g.Expect(err).NotTo(HaveOccurred(), "Failed to query namespace")
+	g.Expect(strings.TrimSpace(output)).To(BeEmpty(), "expected namespace "+namespace+" to be gone")
+}
+
 // removeNFDLabels removes all node-feature-discovery labels from every node.
 // It is called as cleanup after tests that deploy NFD, because NFD labels
 // persist on nodes even after the NFD workloads are deleted.
-Original file line number
+Diff line change
@@ Expand Up / @@ -4,6 +4,7 @@ createNamespace: true @@
     nfd:
       install: false
+      postDeleteCleanup: false
     operator:
       image:
@@ Expand Down @@