From 94fa74e630c264a50a5f30d4bb73a0da2fbaa390 Mon Sep 17 00:00:00 2001
From: Tuomas Katila <tuomas.katila@intel.com>
Date: Thu, 18 Jun 2026 10:46:17 +0300
Subject: [PATCH] kueue: update to 0.18

Adapt featureGates: replace DynamicResourceAllocation with
KueueDRAIntegrationExtendedResource
Add a CEL selector to the test claim (config/test/dra-kueue/claim.yaml)
to verify functionality

Signed-off-by: Tuomas Katila <tuomas.katila@intel.com>
---
 charts/gpu-base-operator/Chart.yaml  |   2 +-
 charts/gpu-base-operator/values.yaml | 108 ++++++++-------------------
 config/test/dra-kueue/claim.yaml     |   3 +
 3 files changed, 37 insertions(+), 76 deletions(-)

diff --git a/charts/gpu-base-operator/Chart.yaml b/charts/gpu-base-operator/Chart.yaml
index a0e45a7..295c794 100644
--- a/charts/gpu-base-operator/Chart.yaml
+++ b/charts/gpu-base-operator/Chart.yaml
@@ -17,5 +17,5 @@ dependencies:
     condition: kueue.install
     tags:
       - kueue
-    version: ~0.17
+    version: ~0.18
     repository: oci://registry.k8s.io/kueue/charts
diff --git a/charts/gpu-base-operator/values.yaml b/charts/gpu-base-operator/values.yaml
index bb33b19..a480dca 100644
--- a/charts/gpu-base-operator/values.yaml
+++ b/charts/gpu-base-operator/values.yaml
@@ -27,7 +27,7 @@ kueue:
   install: false
   controllerManager:
     featureGates:
-    - name: DynamicResourceAllocation
+    - name: KueueDRAIntegrationExtendedResource
       enabled: true
   managerConfig:
     # -- controller_manager_config.yaml.
@@ -36,89 +36,47 @@ kueue:
     controllerManagerConfigYaml: |-
       apiVersion: config.kueue.x-k8s.io/v1beta2
       kind: Configuration
-      health:
-        healthProbeBindAddress: :8081
-      metrics:
-        bindAddress: :8443
-      # enableClusterQueueResources: true
-      webhook:
-        port: 9443
-      leaderElection:
-        leaderElect: true
-        resourceName: c1f6bfd2.kueue.x-k8s.io
+      clientConnection:
+        burst: 100
+        qps: 50
       controller:
         groupKindConcurrency:
+          ClusterQueue.kueue.x-k8s.io: 1
           Job.batch: 5
-          Pod: 5
-          Workload.kueue.x-k8s.io: 5
           LocalQueue.kueue.x-k8s.io: 1
-          ClusterQueue.kueue.x-k8s.io: 1
+          Pod: 5
           ResourceFlavor.kueue.x-k8s.io: 1
-      clientConnection:
-        qps: 50
-        burst: 100
-      #pprofBindAddress: :8083
-      #waitForPodsReady:
-      #  timeout: 5m
-      #  recoveryTimeout: 3m
-      #  blockAdmission: false
-      #  requeuingStrategy:
-      #    timestamp: Eviction
-      #    backoffLimitCount: null # null indicates infinite requeuing
-      #    backoffBaseSeconds: 60
-      #    backoffMaxSeconds: 3600
-      #manageJobsWithoutQueueName: true
-      #managedJobsNamespaceSelector:
-      #  matchExpressions:
-      #    - key: kubernetes.io/metadata.name
-      #      operator: NotIn
-      #      values: [ kube-system, kueue-system ]
-      #internalCertManagement:
-      #  enable: false
-      #  webhookServiceName: ""
-      #  webhookSecretName: ""
+          Workload.kueue.x-k8s.io: 5
+      health:
+        healthProbeBindAddress: :8081
       integrations:
         frameworks:
-        - "batch/job"
-        - "kubeflow.org/mpijob"
-        - "ray.io/rayjob"
-        - "ray.io/raycluster"
-        - "jobset.x-k8s.io/jobset"
-        - "trainer.kubeflow.org/trainjob"
-        - "kubeflow.org/paddlejob"
-        - "kubeflow.org/pytorchjob"
-        - "kubeflow.org/tfjob"
-        - "kubeflow.org/xgboostjob"
-        - "kubeflow.org/jaxjob"
-        - "workload.codeflare.dev/appwrapper"
-        - "pod"
-        - "deployment"
-        - "statefulset"
-        - "leaderworkerset.x-k8s.io/leaderworkerset"
-      #  externalFrameworks:
-      #  - "Foo.v1.example.com"
-      #fairSharing:
-      #  preemptionStrategies: [LessThanOrEqualToFinalShare, LessThanInitialShare]
-      #admissionFairSharing:
-      #  usageHalfLifeTime: "168h" # 7 days
-      #  usageSamplingInterval: "5m"
-      #  resourceWeights: # optional, defaults to 1 for all resources if not specified
-      #    cpu: 0    # if you want to completely ignore cpu usage
-      #    memory: 0 # ignore completely memory usage
-      #    example.com/gpu: 100 # and you care only about GPUs usage
+        - batch/job
+        - kubeflow.org/mpijob
+        - ray.io/rayjob
+        - ray.io/rayservice
+        - ray.io/raycluster
+        - jobset.x-k8s.io/jobset
+        - trainer.kubeflow.org/trainjob
+        - kubeflow.org/paddlejob
+        - kubeflow.org/pytorchjob
+        - kubeflow.org/tfjob
+        - kubeflow.org/xgboostjob
+        - kubeflow.org/jaxjob
+        - workload.codeflare.dev/appwrapper
+        - pod
+        - deployment
+        - statefulset
+        - leaderworkerset.x-k8s.io/leaderworkerset
+      leaderElection:
+        leaderElect: true
+        resourceName: c1f6bfd2.kueue.x-k8s.io
+      metrics:
+        bindAddress: :8443
+      webhook:
+        port: 9443
       resources:
         deviceClassMappings:
         - deviceClassNames:
           - "gpu.intel.com"
           name: "dra.gpu.intel.com"
-      #  excludeResourcePrefixes: []
-      # transformations:
-      # - input: nvidia.com/mig-4g.5gb
-      #   strategy: Replace | Retain
-      #   outputs:
-      #     example.com/accelerator-memory: 5Gi
-      #     example.com/accelerator-gpc: 4
-      #objectRetentionPolicies:
-      #  workloads:
-      #    afterFinished: null # null indicates infinite retention, 0s means no retention at all
-      #    afterDeactivatedByKueue: null # null indicates infinite retention, 0s means no retention at all
diff --git a/config/test/dra-kueue/claim.yaml b/config/test/dra-kueue/claim.yaml
index d0c9a13..bdda1a0 100644
--- a/config/test/dra-kueue/claim.yaml
+++ b/config/test/dra-kueue/claim.yaml
@@ -9,3 +9,6 @@ spec:
       - name: gpu
         exactly:
           deviceClassName: gpu.intel.com
+          selectors:
+          - cel:
+              expression: "device.attributes['gpu.intel.com'].driver == 'xe' || device.attributes['gpu.intel.com'].driver == 'i915'"