Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/validate-public.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ on:
pull_request:
branches:
- 'main'
- 'release-*'
workflow_dispatch:

permissions: {}
Expand Down
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -179,8 +179,8 @@ Used when `spec.resourceRegistration: dra`.

|Field|Description|Default|
|---|---|---|
|`spec.dra.deviceTaints`|Apply taints to GPU devices reported as unhealthy by health monitoring|`false`|
|`spec.dra.podHealthCheck`|Enable health check for DRA Pod|`true`|
|`spec.dra.manageBinding`|Allow DRA plugin to manage device binding between xe/i915 and vfio drivers. Needed for dynamic switching between normal and KubeVirt workloads|`false`|

#### Health monitoring (`spec.health`)

Expand All @@ -196,7 +196,7 @@ Applies to both DP and DRA unless noted. Thresholds that are exceeded mark the G

|Field|Description|Default|
|---|---|---|
|`spec.xpu.monitoringResource`|Set XPUMD resource for Device Plugin use.|`xe_monitoring`|
|`spec.xpu.monitoringResource`|Set XPUMD resource for Device Plugin use.|`monitoring`|
|`spec.xpu.configMapOverride`|Name of a ConfigMap in the operator namespace containing a custom OpenTelemetry Collector `config.yaml`|—|

#### Kueue (`spec.kueue`)
Expand Down
4 changes: 0 additions & 4 deletions api/v1alpha1/clusterpolicy_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -77,10 +77,6 @@ type DynamicResourceAllocationSpec struct {

LogLevel int32 `json:"logLevel,omitempty"`

// DeviceTaints controls whether DRA applies taints to the GPU devices if
// the devices are indicated as unhealthy by the health monitoring.
DeviceTaints bool `json:"deviceTaints,omitempty"`

// Enable DRA Pod's health check.
// +kubebuilder:default=true
PodHealthCheck bool `json:"podHealthCheck,omitempty"`
Expand Down
6 changes: 3 additions & 3 deletions charts/gpu-base-operator-policy/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,13 +39,13 @@ See [Customizing the Chart Before Installing](https://helm.sh/docs/intro/using_h
| dp.byPathMode | single | DP by-path mounting mode |
| dp.allowIDs | [] | Allowed PCI Device IDs |
| dp.denyIDs | [] | Denied PCI Device IDs |
| dra.image | ghcr.io/intel/intel-resource-drivers-for-kubernetes/intel-gpu-resource-driver:v0.10.0 | DRA driver image. |
| dra.image | ghcr.io/intel/intel-resource-drivers-for-kubernetes/intel-gpu-resource-driver:v0.11.0 | DRA driver image. |
| dra.logLevel | 2 | DRA log level. |
| dra.deviceTaints | false | Enable device taints. |
| dra.podHealthCheck | true | Health check for DRA Pod. |
| dra.manageBinding | false | Allow DRA plugin to manage device binding between xe/i915 and vfio drivers. Needed for dynamic switching between normal and KubeVirt workloads. |
| xpu.image | ghcr.io/intel/xpumanager/xpumd:v2.0.0 | XPU manager image. |
| xpu.logLevel | 2 | XPU manager log level. |
| xpu.monitoringResource | xe_monitoring | Monitoring resource for XPUMD with device plugin. |
| xpu.monitoringResource | monitoring | Monitoring resource for XPUMD with device plugin. |
| xpu.configMapOverride | "" | Override the default XPUM configuration ConfigMap name. |
| kueue.equalResources | [] | List of ClusterQueue configurations. |
| pullSecret | null | Image pull secret. |
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,6 @@ spec:
dra:
image: {{ .Values.dra.image }}
logLevel: {{ .Values.dra.logLevel }}
deviceTaints: {{ .Values.dra.deviceTaints | default false }}
podHealthCheck: {{ .Values.dra.podHealthCheck | default true }}
manageBinding: {{ .Values.dra.manageBinding | default false }}
{{- else }}
Expand Down
1 change: 0 additions & 1 deletion charts/gpu-base-operator-policy/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ dp:
dra:
image: ghcr.io/intel/intel-resource-drivers-for-kubernetes/intel-gpu-resource-driver:v0.11.0@sha256:49f38fcbee4f98d748b537b1a728a12f39e56fd208c86dc3fa32ab2162c21197
logLevel: 2
deviceTaints: false
podHealthCheck: true
manageBinding: false

Expand Down
1 change: 1 addition & 0 deletions charts/gpu-base-operator/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ appVersion: "0.0.1"

dependencies:
- name: node-feature-discovery
alias: nfd
condition: nfd.install
tags:
- nfd
Expand Down
5 changes: 0 additions & 5 deletions charts/gpu-base-operator/crds/clusterpolicies.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -90,11 +90,6 @@ spec:
description: DynamicResourceAllocationSpec defines the desired state
of DynamicResourceAllocation.
properties:
deviceTaints:
description: |-
DeviceTaints controls whether DRA applies taints to the GPU devices if
the devices are indicated as unhealthy by the health monitoring.
type: boolean
image:
type: string
logLevel:
Expand Down
1 change: 1 addition & 0 deletions charts/gpu-base-operator/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ createNamespace: true

nfd:
install: false
postDeleteCleanup: false

operator:
image:
Expand Down
5 changes: 0 additions & 5 deletions config/crd/bases/intel.com_clusterpolicies.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -90,11 +90,6 @@ spec:
description: DynamicResourceAllocationSpec defines the desired state
of DynamicResourceAllocation.
properties:
deviceTaints:
description: |-
DeviceTaints controls whether DRA applies taints to the GPU devices if
the devices are indicated as unhealthy by the health monitoring.
type: boolean
image:
type: string
logLevel:
Expand Down
38 changes: 12 additions & 26 deletions config/deployments/xpum/otel-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,9 @@ processors:
states:
# Default state
- state_name: "ok"
- state_name: "warning"
- state_name: "critical"
conditions:
- value: 105.0
- value: 100.0
- name: "Intel GPU memory temperature health"
source_metric: "hw.temperature"
parent_metric: "hw.gpu.info"
Expand All @@ -58,31 +58,9 @@ processors:
states:
# Default state
- state_name: "ok"
- state_name: "warning"
- state_name: "critical"
conditions:
- value: 85.0

- name: "Intel GPU power limit"
source_metric: "hw.power"
parent_metric: "hw.gpu.info"
parent_ref_attribute: "hw.id"
parent_filters:
- key: "pci.vendor_id"
values: ["8086"]
component_filters:
- key: "hw.sensor_location"
values: ["card"]
copy_attributes: ["hw.id", "hw.name", "pci.bdf", "hw.sensor_location"]
add_attributes:
hw.type: "power"
# Ordered list of state rules. The rules should be ordered by increasing severity.
# All rules are evaluated, the last matching one will be active.
states:
# Default state
- state_name: "ok"
- state_name: "warning"
conditions:
- value: 150.0
- value: 100.0

exporters:
intelxpuinfo:
Expand All @@ -106,6 +84,14 @@ exporters:
"*":
severity: warning
message: "Unexpected memory health state, mapped to warning"
# Filter out (by using empty state_mapping) ECC states that are not indicative of device health
- health_domain: "gpu"
filters:
- key: hw.type
values: [gpu]
- key: hw.state
values: [ecc_disabled, ecc_enabled, ecc_available, ecc_unavailable]
state_mapping: {}
- health_domain: "{{ .hw_type }}"
state_mapping:
unknown:
Expand Down
1 change: 0 additions & 1 deletion config/samples/dra/clusterpolicy.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,5 +14,4 @@ spec:
dra:
image: ghcr.io/intel/intel-resource-drivers-for-kubernetes/intel-gpu-resource-driver:v0.11.0@sha256:49f38fcbee4f98d748b537b1a728a12f39e56fd208c86dc3fa32ab2162c21197
logLevel: 2
deviceTaints: false
podHealthCheck: true
4 changes: 0 additions & 4 deletions internal/controller/dra_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -487,10 +487,6 @@ func (r *DRAReconciler) generateArgs(spec *v1alpha.ClusterPolicy) []string {

if spec.Spec.HealthinessSpec != nil {
args = append(args, "--health-monitoring=true")

if spec.Spec.DynamicResourceAllocationSpec.DeviceTaints {
args = append(args, "--ignore-health-warning=false")
}
}

if spec.Spec.DynamicResourceAllocationSpec.PodHealthCheck {
Expand Down
4 changes: 1 addition & 3 deletions internal/controller/dra_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -491,7 +491,6 @@ var _ = Describe("ClusterPolicy Controller for DRA", func() {
DynamicResourceAllocationSpec: v1alpha.DynamicResourceAllocationSpec{
LogLevel: 3,
PodHealthCheck: true,
DeviceTaints: true,
},
HealthinessSpec: &v1alpha.HealthinessSpec{
CheckIntervalSeconds: 67,
Expand All @@ -504,11 +503,10 @@ var _ = Describe("ClusterPolicy Controller for DRA", func() {

args := controller.generateArgs(cp)

Expect(args).To(HaveLen(5))
Expect(args).To(HaveLen(4))
Expect(args).To(ContainElement("-v=3"))
Expect(args).To(ContainElement("--health-monitoring=true"))
Expect(args).To(ContainElement("--healthcheck-port=51516"))
Expect(args).To(ContainElement("--ignore-health-warning=false"))
Expect(args).To(ContainElement("--manage-binding=false"))
})

Expand Down
10 changes: 5 additions & 5 deletions internal/controller/xpumanager_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -250,9 +250,9 @@ func (r *XpuManagerReconciler) buildOTelConfigData(cp *v1alpha.ClusterPolicy) (s
for _, loc := range filter.Values {
switch loc {
case "gpu":
setWarningThreshold(rule, float64(health.CoreTemperatureThreshold))
setCriticalThreshold(rule, float64(health.CoreTemperatureThreshold))
case "memory":
setWarningThreshold(rule, float64(health.MemoryTemperatureThreshold))
setCriticalThreshold(rule, float64(health.MemoryTemperatureThreshold))
}
}
}
Expand All @@ -268,11 +268,11 @@ func (r *XpuManagerReconciler) buildOTelConfigData(cp *v1alpha.ClusterPolicy) (s
return string(out), nil
}

// setWarningThreshold updates the condition values on the "warning" state of a rule.
// setCriticalThreshold updates the condition values on the "critical" state of a rule.
// All conditions are set to the same threshold, overriding any device-specific defaults.
func setWarningThreshold(rule *deployments.StatusRule, threshold float64) {
func setCriticalThreshold(rule *deployments.StatusRule, threshold float64) {
for i := range rule.States {
if rule.States[i].StateName == "warning" {
if rule.States[i].StateName == "critical" {
for j := range rule.States[i].Conditions {
rule.States[i].Conditions[j].Value = threshold
}
Expand Down
14 changes: 10 additions & 4 deletions test/e2e/e2e_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -498,8 +498,9 @@ var _ = Describe("Helm", Ordered, Label("helm"), func() {

AfterEach(func() {
By("remove clusterpolicy")
cmd := exec.Command("helm", "uninstall", "-n", namespace, helmPolicyName, "--wait")
utils.Run(cmd)
cmd := exec.Command("helm", "uninstall", "-n", namespace, helmPolicyName, "--wait", "--ignore-not-found")
_, err := utils.Run(cmd)
Expect(err).NotTo(HaveOccurred(), "Failed to uninstall clusterpolicy")

// TODO: Find a better way to ensure that the xpumanager pods are gone
By("wait for the xpumanager pods to vanish")
Expand All @@ -518,8 +519,13 @@ var _ = Describe("Helm", Ordered, Label("helm"), func() {
Eventually(waitForKueueObjectsToClear, time.Second*60).Should(Succeed())

By("remove operator")
cmd = exec.Command("helm", "uninstall", "-n", namespace, helmOperatorName, "--wait")
utils.Run(cmd)
cmd = exec.Command("helm", "uninstall", "-n", namespace, helmOperatorName)
_, err = utils.Run(cmd)
Expect(err).NotTo(HaveOccurred(), "Failed to uninstall operator")

Eventually(func(g Gomega) {
waitUntilNamespaceGone(g, namespace)
}, 1*time.Minute, 3*time.Second).Should(Succeed())

By("remove gpu resource slices after each test")
cmd = exec.Command("kubectl", "delete", "resourceslices", "--all")
Expand Down
11 changes: 11 additions & 0 deletions test/e2e/e2e_utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,17 @@ func waitUntilResourceSlicesAreGone(g Gomega) {
g.Expect(lines).To(ContainElement("No resources found"), "Expected no ResourceSlices to be present")
}

// waitUntilNamespaceGone asserts, via the supplied Gomega instance, that the
// given namespace is no longer returned by kubectl. It is intended to be
// polled (for example from Eventually) until both assertions hold.
//
// kubectl is invoked with --ignore-not-found, so a missing namespace is
// expected to yield empty output rather than an error. An error therefore
// means the query itself failed; it is reported as an assertion failure
// instead of being silently treated as "namespace is gone".
func waitUntilNamespaceGone(g Gomega, namespace string) {
	cmd := exec.Command("kubectl", "get", "namespace", namespace, "--ignore-not-found", "-o", "name")
	output, err := utils.Run(cmd)

	// Fail this polling attempt on a query error so the caller retries rather
	// than concluding the namespace has been removed.
	g.Expect(err).NotTo(HaveOccurred(), "Failed to query namespace")
	g.Expect(strings.TrimSpace(output)).To(BeEmpty(), "expected namespace "+namespace+" to be gone")
}

// removeNFDLabels removes all node-feature-discovery labels from every node.
// It is called as cleanup after tests that deploy NFD, because NFD labels
// persist on nodes even after the NFD workloads are deleted.
Expand Down