diff --git a/.github/workflows/validate-public.yaml b/.github/workflows/validate-public.yaml index 4c74e02..5aac5fe 100644 --- a/.github/workflows/validate-public.yaml +++ b/.github/workflows/validate-public.yaml @@ -7,6 +7,7 @@ on: pull_request: branches: - 'main' + - 'release-*' workflow_dispatch: permissions: {} diff --git a/README.md b/README.md index 1263769..73981ba 100644 --- a/README.md +++ b/README.md @@ -179,8 +179,8 @@ Used when `spec.resourceRegistration: dra`. |Field|Description|Default| |---|---|---| -|`spec.dra.deviceTaints`|Apply taints to GPU devices reported as unhealthy by health monitoring|`false`| |`spec.dra.podHealthCheck`|Enable health check for DRA Pod|true| +|`spec.dra.manageBinding`|Allow DRA plugin to manage device binding between xe/i915 and vfio drivers. Needed for dynamic switching between normal and KubeVirt workloads|`false`| #### Health monitoring (`spec.health`) @@ -196,7 +196,7 @@ Applies to both DP and DRA unless noted. Thresholds that are exceeded mark the G |Field|Description|Default| |---|---|---| -|`spec.xpu.monitoringResource`|Set XPUMD resource for Device Plugin use.|`xe_monitoring`| +|`spec.xpu.monitoringResource`|Set XPUMD resource for Device Plugin use.|`monitoring`| |`spec.xpu.configMapOverride`|Name of a ConfigMap in the operator namespace containing a custom OpenTelemetry Collector `config.yaml`|—| #### Kueue (`spec.kueue`) diff --git a/api/v1alpha1/clusterpolicy_types.go b/api/v1alpha1/clusterpolicy_types.go index a6c490b..a58a3a1 100644 --- a/api/v1alpha1/clusterpolicy_types.go +++ b/api/v1alpha1/clusterpolicy_types.go @@ -77,10 +77,6 @@ type DynamicResourceAllocationSpec struct { LogLevel int32 `json:"logLevel,omitempty"` - // DeviceTaints controls whether DRA applies taints to the GPU devices if - // the devices are indicated as unhealthy by the health monitoring. - DeviceTaints bool `json:"deviceTaints,omitempty"` - // Enable DRA Pod's health check. 
// +kubebuilder:default=true PodHealthCheck bool `json:"podHealthCheck,omitempty"` diff --git a/charts/gpu-base-operator-policy/README.md b/charts/gpu-base-operator-policy/README.md index 99ab1bc..98c6d52 100644 --- a/charts/gpu-base-operator-policy/README.md +++ b/charts/gpu-base-operator-policy/README.md @@ -39,13 +39,13 @@ See [Customizing the Chart Before Installing](https://helm.sh/docs/intro/using_h | dp.byPathMode | single | DP by-path mounting mode | | dp.allowIDs | [] | Allowed PCI Device IDs | | dp.denyIDs | [] | Denied PCI Device IDs | -| dra.image | ghcr.io/intel/intel-resource-drivers-for-kubernetes/intel-gpu-resource-driver:v0.10.0 | DRA driver image. | +| dra.image | ghcr.io/intel/intel-resource-drivers-for-kubernetes/intel-gpu-resource-driver:v0.11.0 | DRA driver image. | | dra.logLevel | 2 | DRA log level. | -| dra.deviceTaints | false | Enable device taints. | | dra.podHealthCheck | true | Health check for DRA Pod. | +| dra.manageBinding | false | Allow DRA plugin to manage device binding between xe/i915 and vfio drivers. Needed for dynamic switching between normal and KubeVirt workloads. | | xpu.image | ghcr.io/intel/xpumanager/xpumd:v2.0.0 | XPU manager image. | | xpu.logLevel | 2 | XPU manager log level. | -| xpu.monitoringResource | xe_monitoring | Monitoring resource for XPUMD with device plugin. | +| xpu.monitoringResource | monitoring | Monitoring resource for XPUMD with device plugin. | | xpu.configMapOverride | "" | Override the default XPUM configuration ConfigMap name. | | kueue.equalResources | [] | List of ClusterQueue configurations. | | pullSecret | null | Image pull secret. 
| diff --git a/charts/gpu-base-operator-policy/templates/clusterpolicy.yaml b/charts/gpu-base-operator-policy/templates/clusterpolicy.yaml index 2703ddf..737355d 100644 --- a/charts/gpu-base-operator-policy/templates/clusterpolicy.yaml +++ b/charts/gpu-base-operator-policy/templates/clusterpolicy.yaml @@ -46,7 +46,6 @@ spec: dra: image: {{ .Values.dra.image }} logLevel: {{ .Values.dra.logLevel }} - deviceTaints: {{ .Values.dra.deviceTaints | default false }} podHealthCheck: {{ .Values.dra.podHealthCheck | default true }} manageBinding: {{ .Values.dra.manageBinding | default false }} {{- else }} diff --git a/charts/gpu-base-operator-policy/values.yaml b/charts/gpu-base-operator-policy/values.yaml index c0bf14e..fc465f2 100644 --- a/charts/gpu-base-operator-policy/values.yaml +++ b/charts/gpu-base-operator-policy/values.yaml @@ -19,7 +19,6 @@ dp: dra: image: ghcr.io/intel/intel-resource-drivers-for-kubernetes/intel-gpu-resource-driver:v0.11.0@sha256:49f38fcbee4f98d748b537b1a728a12f39e56fd208c86dc3fa32ab2162c21197 logLevel: 2 - deviceTaints: false podHealthCheck: true manageBinding: false diff --git a/charts/gpu-base-operator/Chart.yaml b/charts/gpu-base-operator/Chart.yaml index 295c794..5d97519 100644 --- a/charts/gpu-base-operator/Chart.yaml +++ b/charts/gpu-base-operator/Chart.yaml @@ -7,6 +7,7 @@ appVersion: "0.0.1" dependencies: - name: node-feature-discovery + alias: nfd condition: nfd.install tags: - nfd diff --git a/charts/gpu-base-operator/crds/clusterpolicies.yaml b/charts/gpu-base-operator/crds/clusterpolicies.yaml index 27e7ed5..870f9c5 100644 --- a/charts/gpu-base-operator/crds/clusterpolicies.yaml +++ b/charts/gpu-base-operator/crds/clusterpolicies.yaml @@ -90,11 +90,6 @@ spec: description: DynamicResourceAllocationSpec defines the desired state of DynamicResourceAllocation. 
properties: - deviceTaints: - description: |- - DeviceTaints controls whether DRA applies taints to the GPU devices if - the devices are indicated as unhealthy by the health monitoring. - type: boolean image: type: string logLevel: diff --git a/charts/gpu-base-operator/values.yaml b/charts/gpu-base-operator/values.yaml index 472e077..8f7a648 100644 --- a/charts/gpu-base-operator/values.yaml +++ b/charts/gpu-base-operator/values.yaml @@ -4,6 +4,7 @@ createNamespace: true nfd: install: false + postDeleteCleanup: false operator: image: diff --git a/config/crd/bases/intel.com_clusterpolicies.yaml b/config/crd/bases/intel.com_clusterpolicies.yaml index 27e7ed5..870f9c5 100644 --- a/config/crd/bases/intel.com_clusterpolicies.yaml +++ b/config/crd/bases/intel.com_clusterpolicies.yaml @@ -90,11 +90,6 @@ spec: description: DynamicResourceAllocationSpec defines the desired state of DynamicResourceAllocation. properties: - deviceTaints: - description: |- - DeviceTaints controls whether DRA applies taints to the GPU devices if - the devices are indicated as unhealthy by the health monitoring. 
- type: boolean image: type: string logLevel: diff --git a/config/deployments/xpum/otel-config.yaml b/config/deployments/xpum/otel-config.yaml index 02e8774..9defe15 100644 --- a/config/deployments/xpum/otel-config.yaml +++ b/config/deployments/xpum/otel-config.yaml @@ -37,9 +37,9 @@ processors: states: # Default state - state_name: "ok" - - state_name: "warning" + - state_name: "critical" conditions: - - value: 105.0 + - value: 100.0 - name: "Intel GPU memory temperature health" source_metric: "hw.temperature" parent_metric: "hw.gpu.info" @@ -58,31 +58,9 @@ processors: states: # Default state - state_name: "ok" - - state_name: "warning" + - state_name: "critical" conditions: - - value: 85.0 - - - name: "Intel GPU power limit" - source_metric: "hw.power" - parent_metric: "hw.gpu.info" - parent_ref_attribute: "hw.id" - parent_filters: - - key: "pci.vendor_id" - values: ["8086"] - component_filters: - - key: "hw.sensor_location" - values: ["card"] - copy_attributes: ["hw.id", "hw.name", "pci.bdf", "hw.sensor_location"] - add_attributes: - hw.type: "power" - # Ordered list of state rules. The rules should be ordered by increasing severity. - # All rules are evaluated, the last matching one will be active. 
- states: - # Default state - - state_name: "ok" - - state_name: "warning" - conditions: - - value: 150.0 + - value: 100.0 exporters: intelxpuinfo: @@ -106,6 +84,14 @@ exporters: "*": severity: warning message: "Unexpected memory health state, mapped to warning" + # Filter out (by using empty state_mapping) ECC states that are not indicative of device health + - health_domain: "gpu" + filters: + - key: hw.type + values: [gpu] + - key: hw.state + values: [ecc_disabled, ecc_enabled, ecc_available, ecc_unavailable] + state_mapping: {} - health_domain: "{{ .hw_type }}" state_mapping: unknown: diff --git a/config/samples/dra/clusterpolicy.yaml b/config/samples/dra/clusterpolicy.yaml index 6fa9a30..b28ea04 100644 --- a/config/samples/dra/clusterpolicy.yaml +++ b/config/samples/dra/clusterpolicy.yaml @@ -14,5 +14,4 @@ spec: dra: image: ghcr.io/intel/intel-resource-drivers-for-kubernetes/intel-gpu-resource-driver:v0.11.0@sha256:49f38fcbee4f98d748b537b1a728a12f39e56fd208c86dc3fa32ab2162c21197 logLevel: 2 - deviceTaints: false podHealthCheck: true diff --git a/internal/controller/dra_controller.go b/internal/controller/dra_controller.go index 38389aa..1f3fee3 100644 --- a/internal/controller/dra_controller.go +++ b/internal/controller/dra_controller.go @@ -487,10 +487,6 @@ func (r *DRAReconciler) generateArgs(spec *v1alpha.ClusterPolicy) []string { if spec.Spec.HealthinessSpec != nil { args = append(args, "--health-monitoring=true") - - if spec.Spec.DynamicResourceAllocationSpec.DeviceTaints { - args = append(args, "--ignore-health-warning=false") - } } if spec.Spec.DynamicResourceAllocationSpec.PodHealthCheck { diff --git a/internal/controller/dra_controller_test.go b/internal/controller/dra_controller_test.go index 98b86e2..d34a237 100644 --- a/internal/controller/dra_controller_test.go +++ b/internal/controller/dra_controller_test.go @@ -491,7 +491,6 @@ var _ = Describe("ClusterPolicy Controller for DRA", func() { DynamicResourceAllocationSpec: 
v1alpha.DynamicResourceAllocationSpec{ LogLevel: 3, PodHealthCheck: true, - DeviceTaints: true, }, HealthinessSpec: &v1alpha.HealthinessSpec{ CheckIntervalSeconds: 67, @@ -504,11 +503,10 @@ var _ = Describe("ClusterPolicy Controller for DRA", func() { args := controller.generateArgs(cp) - Expect(args).To(HaveLen(5)) + Expect(args).To(HaveLen(4)) Expect(args).To(ContainElement("-v=3")) Expect(args).To(ContainElement("--health-monitoring=true")) Expect(args).To(ContainElement("--healthcheck-port=51516")) - Expect(args).To(ContainElement("--ignore-health-warning=false")) Expect(args).To(ContainElement("--manage-binding=false")) }) diff --git a/internal/controller/xpumanager_controller.go b/internal/controller/xpumanager_controller.go index 794e88e..7adbff4 100644 --- a/internal/controller/xpumanager_controller.go +++ b/internal/controller/xpumanager_controller.go @@ -250,9 +250,9 @@ func (r *XpuManagerReconciler) buildOTelConfigData(cp *v1alpha.ClusterPolicy) (s for _, loc := range filter.Values { switch loc { case "gpu": - setWarningThreshold(rule, float64(health.CoreTemperatureThreshold)) + setCriticalThreshold(rule, float64(health.CoreTemperatureThreshold)) case "memory": - setWarningThreshold(rule, float64(health.MemoryTemperatureThreshold)) + setCriticalThreshold(rule, float64(health.MemoryTemperatureThreshold)) } } } @@ -268,11 +268,11 @@ func (r *XpuManagerReconciler) buildOTelConfigData(cp *v1alpha.ClusterPolicy) (s return string(out), nil } -// setWarningThreshold updates the condition values on the "warning" state of a rule. +// setCriticalThreshold updates the condition values on the "critical" state of a rule. // All conditions are set to the same threshold, overriding any device-specific defaults. 
-func setWarningThreshold(rule *deployments.StatusRule, threshold float64) { +func setCriticalThreshold(rule *deployments.StatusRule, threshold float64) { for i := range rule.States { - if rule.States[i].StateName == "warning" { + if rule.States[i].StateName == "critical" { for j := range rule.States[i].Conditions { rule.States[i].Conditions[j].Value = threshold } diff --git a/test/e2e/e2e_test.go b/test/e2e/e2e_test.go index adb0ac2..6d835e7 100644 --- a/test/e2e/e2e_test.go +++ b/test/e2e/e2e_test.go @@ -498,8 +498,9 @@ var _ = Describe("Helm", Ordered, Label("helm"), func() { AfterEach(func() { By("remove clusterpolicy") - cmd := exec.Command("helm", "uninstall", "-n", namespace, helmPolicyName, "--wait") - utils.Run(cmd) + cmd := exec.Command("helm", "uninstall", "-n", namespace, helmPolicyName, "--wait", "--ignore-not-found") + _, err := utils.Run(cmd) + Expect(err).NotTo(HaveOccurred(), "Failed to uninstall clusterpolicy") // TODO: Find a better way to ensure that the xpumanager pods are gone By("wait for the xpumanager pods to vanish") @@ -518,8 +519,13 @@ var _ = Describe("Helm", Ordered, Label("helm"), func() { Eventually(waitForKueueObjectsToClear, time.Second*60).Should(Succeed()) By("remove operator") - cmd = exec.Command("helm", "uninstall", "-n", namespace, helmOperatorName, "--wait") - utils.Run(cmd) + cmd = exec.Command("helm", "uninstall", "-n", namespace, helmOperatorName) + _, err = utils.Run(cmd) + Expect(err).NotTo(HaveOccurred(), "Failed to uninstall operator") + + Eventually(func(g Gomega) { + waitUntilNamespaceGone(g, namespace) + }, 1*time.Minute, 3*time.Second).Should(Succeed()) By("remove gpu resource slices after each test") cmd = exec.Command("kubectl", "delete", "resourceslices", "--all") diff --git a/test/e2e/e2e_utils.go b/test/e2e/e2e_utils.go index 87ecb0d..f3a8710 100644 --- a/test/e2e/e2e_utils.go +++ b/test/e2e/e2e_utils.go @@ -224,6 +224,17 @@ func waitUntilResourceSlicesAreGone(g Gomega) { g.Expect(lines).To(ContainElement("No 
resources found"), "Expected no ResourceSlices to be present")
 }
 
+// waitUntilNamespaceGone asserts that kubectl no longer lists the namespace.
+// A failed query fails the assertion instead of being treated as "gone", so
+// a polling caller retries rather than passing on a kubectl error.
+func waitUntilNamespaceGone(g Gomega, namespace string) {
+	cmd := exec.Command("kubectl", "get", "namespace", namespace, "--ignore-not-found", "-o", "name")
+	output, err := utils.Run(cmd)
+
+	g.Expect(err).NotTo(HaveOccurred(), "Failed to query namespace")
+	g.Expect(strings.TrimSpace(output)).To(BeEmpty(), "expected namespace "+namespace+" to be gone")
+}
+
 // removeNFDLabels removes all node-feature-discovery labels from every node.
 // It is called as cleanup after tests that deploy NFD, because NFD labels
 // persist on nodes even after the NFD workloads are deleted.