Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion charts/gpu-base-operator/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,5 +17,5 @@ dependencies:
condition: kueue.install
tags:
- kueue
version: ~0.17
version: ~0.18
repository: oci://registry.k8s.io/kueue/charts
108 changes: 33 additions & 75 deletions charts/gpu-base-operator/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ kueue:
install: false
controllerManager:
featureGates:
- name: DynamicResourceAllocation
- name: KueueDRAIntegrationExtendedResource
enabled: true
managerConfig:
# -- controller_manager_config.yaml.
Expand All @@ -36,89 +36,47 @@ kueue:
controllerManagerConfigYaml: |-
apiVersion: config.kueue.x-k8s.io/v1beta2
kind: Configuration
health:
healthProbeBindAddress: :8081
metrics:
bindAddress: :8443
# enableClusterQueueResources: true
webhook:
port: 9443
leaderElection:
leaderElect: true
resourceName: c1f6bfd2.kueue.x-k8s.io
clientConnection:
burst: 100
qps: 50
controller:
groupKindConcurrency:
ClusterQueue.kueue.x-k8s.io: 1
Job.batch: 5
Pod: 5
Workload.kueue.x-k8s.io: 5
LocalQueue.kueue.x-k8s.io: 1
ClusterQueue.kueue.x-k8s.io: 1
Pod: 5
ResourceFlavor.kueue.x-k8s.io: 1
clientConnection:
qps: 50
burst: 100
#pprofBindAddress: :8083
#waitForPodsReady:
# timeout: 5m
# recoveryTimeout: 3m
# blockAdmission: false
# requeuingStrategy:
# timestamp: Eviction
# backoffLimitCount: null # null indicates infinite requeuing
# backoffBaseSeconds: 60
# backoffMaxSeconds: 3600
#manageJobsWithoutQueueName: true
#managedJobsNamespaceSelector:
# matchExpressions:
# - key: kubernetes.io/metadata.name
# operator: NotIn
# values: [ kube-system, kueue-system ]
#internalCertManagement:
# enable: false
# webhookServiceName: ""
# webhookSecretName: ""
Workload.kueue.x-k8s.io: 5
health:
healthProbeBindAddress: :8081
integrations:
frameworks:
- "batch/job"
- "kubeflow.org/mpijob"
- "ray.io/rayjob"
- "ray.io/raycluster"
- "jobset.x-k8s.io/jobset"
- "trainer.kubeflow.org/trainjob"
- "kubeflow.org/paddlejob"
- "kubeflow.org/pytorchjob"
- "kubeflow.org/tfjob"
- "kubeflow.org/xgboostjob"
- "kubeflow.org/jaxjob"
- "workload.codeflare.dev/appwrapper"
- "pod"
- "deployment"
- "statefulset"
- "leaderworkerset.x-k8s.io/leaderworkerset"
# externalFrameworks:
# - "Foo.v1.example.com"
#fairSharing:
# preemptionStrategies: [LessThanOrEqualToFinalShare, LessThanInitialShare]
#admissionFairSharing:
# usageHalfLifeTime: "168h" # 7 days
# usageSamplingInterval: "5m"
# resourceWeights: # optional, defaults to 1 for all resources if not specified
# cpu: 0 # if you want to completely ignore cpu usage
# memory: 0 # ignore completely memory usage
# example.com/gpu: 100 # and you care only about GPUs usage
- batch/job
- kubeflow.org/mpijob
- ray.io/rayjob
- ray.io/rayservice
- ray.io/raycluster
- jobset.x-k8s.io/jobset
- trainer.kubeflow.org/trainjob
- kubeflow.org/paddlejob
- kubeflow.org/pytorchjob
- kubeflow.org/tfjob
- kubeflow.org/xgboostjob
- kubeflow.org/jaxjob
- workload.codeflare.dev/appwrapper
- pod
- deployment
- statefulset
- leaderworkerset.x-k8s.io/leaderworkerset
leaderElection:
leaderElect: true
resourceName: c1f6bfd2.kueue.x-k8s.io
metrics:
bindAddress: :8443
webhook:
port: 9443
resources:
deviceClassMappings:
- deviceClassNames:
- "gpu.intel.com"
name: "dra.gpu.intel.com"
# excludeResourcePrefixes: []
# transformations:
# - input: nvidia.com/mig-4g.5gb
# strategy: Replace | Retain
# outputs:
# example.com/accelerator-memory: 5Gi
# example.com/accelerator-gpc: 4
#objectRetentionPolicies:
# workloads:
# afterFinished: null # null indicates infinite retention, 0s means no retention at all
# afterDeactivatedByKueue: null # null indicates infinite retention, 0s means no retention at all
3 changes: 3 additions & 0 deletions config/test/dra-kueue/claim.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,6 @@ spec:
- name: gpu
exactly:
deviceClassName: gpu.intel.com
selectors:
- cel:
expression: "device.attributes['gpu.intel.com'].driver == 'xe' || device.attributes['gpu.intel.com'].driver == 'i915'"