From 94fa74e630c264a50a5f30d4bb73a0da2fbaa390 Mon Sep 17 00:00:00 2001 From: Tuomas Katila Date: Thu, 18 Jun 2026 10:46:17 +0300 Subject: [PATCH] kueue: update to 0.18 Adapt FeatureFlags Add a cel selector to the test Pod to verify functionality Signed-off-by: Tuomas Katila --- charts/gpu-base-operator/Chart.yaml | 2 +- charts/gpu-base-operator/values.yaml | 108 ++++++++------------------- config/test/dra-kueue/claim.yaml | 3 + 3 files changed, 37 insertions(+), 76 deletions(-) diff --git a/charts/gpu-base-operator/Chart.yaml b/charts/gpu-base-operator/Chart.yaml index a0e45a7..295c794 100644 --- a/charts/gpu-base-operator/Chart.yaml +++ b/charts/gpu-base-operator/Chart.yaml @@ -17,5 +17,5 @@ dependencies: condition: kueue.install tags: - kueue - version: ~0.17 + version: ~0.18 repository: oci://registry.k8s.io/kueue/charts diff --git a/charts/gpu-base-operator/values.yaml b/charts/gpu-base-operator/values.yaml index bb33b19..a480dca 100644 --- a/charts/gpu-base-operator/values.yaml +++ b/charts/gpu-base-operator/values.yaml @@ -27,7 +27,7 @@ kueue: install: false controllerManager: featureGates: - - name: DynamicResourceAllocation + - name: KueueDRAIntegrationExtendedResource enabled: true managerConfig: # -- controller_manager_config.yaml. @@ -36,89 +36,47 @@ kueue: controllerManagerConfigYaml: |- apiVersion: config.kueue.x-k8s.io/v1beta2 kind: Configuration - health: - healthProbeBindAddress: :8081 - metrics: - bindAddress: :8443 - # enableClusterQueueResources: true - webhook: - port: 9443 - leaderElection: - leaderElect: true - resourceName: c1f6bfd2.kueue.x-k8s.io + clientConnection: + burst: 100 + qps: 50 controller: groupKindConcurrency: + ClusterQueue.kueue.x-k8s.io: 1 Job.batch: 5 - Pod: 5 - Workload.kueue.x-k8s.io: 5 LocalQueue.kueue.x-k8s.io: 1 - ClusterQueue.kueue.x-k8s.io: 1 + Pod: 5 ResourceFlavor.kueue.x-k8s.io: 1 - clientConnection: - qps: 50 - burst: 100 - #pprofBindAddress: :8083 - #waitForPodsReady: - # timeout: 5m - # recoveryTimeout: 3m - # blockAdmission: false - # requeuingStrategy: - # timestamp: Eviction - # backoffLimitCount: null # null indicates infinite requeuing - # backoffBaseSeconds: 60 - # backoffMaxSeconds: 3600 - #manageJobsWithoutQueueName: true - #managedJobsNamespaceSelector: - # matchExpressions: - # - key: kubernetes.io/metadata.name - # operator: NotIn - # values: [ kube-system, kueue-system ] - #internalCertManagement: - # enable: false - # webhookServiceName: "" - # webhookSecretName: "" + Workload.kueue.x-k8s.io: 5 + health: + healthProbeBindAddress: :8081 integrations: frameworks: - - "batch/job" - - "kubeflow.org/mpijob" - - "ray.io/rayjob" - - "ray.io/raycluster" - - "jobset.x-k8s.io/jobset" - - "trainer.kubeflow.org/trainjob" - - "kubeflow.org/paddlejob" - - "kubeflow.org/pytorchjob" - - "kubeflow.org/tfjob" - - "kubeflow.org/xgboostjob" - - "kubeflow.org/jaxjob" - - "workload.codeflare.dev/appwrapper" - - "pod" - - "deployment" - - "statefulset" - - "leaderworkerset.x-k8s.io/leaderworkerset" - # externalFrameworks: - # - "Foo.v1.example.com" - #fairSharing: - # preemptionStrategies: [LessThanOrEqualToFinalShare, LessThanInitialShare] - #admissionFairSharing: - # usageHalfLifeTime: "168h" # 7 days - # usageSamplingInterval: "5m" - # resourceWeights: # optional, defaults to 1 for all resources if not specified - # cpu: 0 # if you want to completely ignore cpu usage - # memory: 0 # ignore completely memory usage - # example.com/gpu: 100 # and you care only about GPUs usage + - batch/job + - kubeflow.org/mpijob + - ray.io/rayjob + - ray.io/rayservice + - ray.io/raycluster + - jobset.x-k8s.io/jobset + - trainer.kubeflow.org/trainjob + - kubeflow.org/paddlejob + - kubeflow.org/pytorchjob + - kubeflow.org/tfjob + - kubeflow.org/xgboostjob + - kubeflow.org/jaxjob + - workload.codeflare.dev/appwrapper + - pod + - deployment + - statefulset + - leaderworkerset.x-k8s.io/leaderworkerset + leaderElection: + leaderElect: true + resourceName: c1f6bfd2.kueue.x-k8s.io + metrics: + bindAddress: :8443 + webhook: + port: 9443 resources: deviceClassMappings: - deviceClassNames: - "gpu.intel.com" name: "dra.gpu.intel.com" - # excludeResourcePrefixes: [] - # transformations: - # - input: nvidia.com/mig-4g.5gb - # strategy: Replace | Retain - # outputs: - # example.com/accelerator-memory: 5Gi - # example.com/accelerator-gpc: 4 - #objectRetentionPolicies: - # workloads: - # afterFinished: null # null indicates infinite retention, 0s means no retention at all - # afterDeactivatedByKueue: null # null indicates infinite retention, 0s means no retention at all diff --git a/config/test/dra-kueue/claim.yaml b/config/test/dra-kueue/claim.yaml index d0c9a13..bdda1a0 100644 --- a/config/test/dra-kueue/claim.yaml +++ b/config/test/dra-kueue/claim.yaml @@ -9,3 +9,6 @@ spec: - name: gpu exactly: deviceClassName: gpu.intel.com + selectors: + - cel: + expression: "device.attributes['gpu.intel.com'].driver == 'xe' || device.attributes['gpu.intel.com'].driver == 'i915'"