Skip to content

Commit

Permalink
Implement backoff re-queuing mechanism (#1709)
Browse files Browse the repository at this point in the history
* Add requeuingStrategy.backoffLimitCount to config API

Signed-off-by: Yuki Iwai <yuki.iwai.tz@gmail.com>

* Add status.requeueState to the Workload

Signed-off-by: Yuki Iwai <yuki.iwai.tz@gmail.com>

* Implement a backoff re-queuing mechanism for workloads evicted with the PodsReadyTimeout reason

Signed-off-by: Yuki Iwai <yuki.iwai.tz@gmail.com>

* Update KEP-1282

Signed-off-by: Yuki Iwai <yuki.iwai.tz@gmail.com>

* Update Config API comments

Signed-off-by: Yuki Iwai <yuki.iwai.tz@gmail.com>

* Update Workload object comments

Signed-off-by: Yuki Iwai <yuki.iwai.tz@gmail.com>

* The backoffLimitCount must be greater than or equal to 0

Signed-off-by: Yuki Iwai <yuki.iwai.tz@gmail.com>

* Just checking if the workload has an evicted condition

Signed-off-by: Yuki Iwai <yuki.iwai.tz@gmail.com>

* Record events after the workload is updated

Signed-off-by: Yuki Iwai <yuki.iwai.tz@gmail.com>

* Rename canNotQueueWorkloadByBackoff to backoffWaitingTimeExpired

Signed-off-by: Yuki Iwai <yuki.iwai.tz@gmail.com>

* Always insert requeueState

Signed-off-by: Yuki Iwai <yuki.iwai.tz@gmail.com>

* Remove not useful api comments

Signed-off-by: Yuki Iwai <yuki.iwai.tz@gmail.com>

* Avoid flakiness in integration tests caused by checking the pending backoff

Signed-off-by: Yuki Iwai <yuki.iwai.tz@gmail.com>

---------

Signed-off-by: Yuki Iwai <yuki.iwai.tz@gmail.com>
  • Loading branch information
tenzen-y committed Feb 12, 2024
1 parent dc123b4 commit 5a0a714
Show file tree
Hide file tree
Showing 36 changed files with 1,195 additions and 157 deletions.
23 changes: 21 additions & 2 deletions apis/config/v1beta1/configuration_types.go
Expand Up @@ -218,10 +218,29 @@ type MultiKueue struct {
}

// RequeuingStrategy configures how a Workload evicted due to Pod readiness
// is re-queued: which timestamp is used and how many retries are allowed.
type RequeuingStrategy struct {
// Timestamp defines the timestamp used for requeuing a Workload
// that was evicted due to Pod readiness. Defaults to Eviction.
// Timestamp defines the timestamp used for re-queuing a Workload
// that was evicted due to Pod readiness. The possible values are:
//
// - `Eviction` (default) indicates from Workload `Evicted` condition with `PodsReadyTimeout` reason.
// - `Creation` indicates from Workload .metadata.creationTimestamp.
//
// +optional
Timestamp *RequeuingTimestamp `json:"timestamp,omitempty"`

// BackoffLimitCount defines the maximum number of re-queuing retries.
// Once the number is reached, the workload is deactivated (`.spec.activate`=`false`).
// When it is null, the workload is re-queued repeatedly and endlessly.
//
// Each backoff duration is approximately "1.41284738^(n-1)+Rand", where "n" is the "workloadStatus.requeueState.count"
// and "Rand" is the random jitter. During this time, the workload is treated as inadmissible and
// other workloads have a chance to be admitted.
// For example, when the "waitForPodsReady.timeout" is the default, the workload deactivation time is as follows:
// {backoffLimitCount, workloadDeactivationSeconds}
// ~= {1, 601}, {2, 902}, ...,{5, 1811}, ...,{10, 3374}, ...,{20, 8730}, ...,{30, 86400(=24 hours)}, ...
//
// Defaults to null.
// +optional
BackoffLimitCount *int32 `json:"backoffLimitCount,omitempty"`
}

type RequeuingTimestamp string
Expand Down
5 changes: 5 additions & 0 deletions apis/config/v1beta1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

23 changes: 23 additions & 0 deletions apis/kueue/v1beta1/workload_types.go
Expand Up @@ -151,6 +151,12 @@ type WorkloadStatus struct {
// changed once set.
Admission *Admission `json:"admission,omitempty"`

// requeueState holds the re-queue state
// when a workload meets Eviction with PodsReadyTimeout reason.
//
// +optional
RequeueState *RequeueState `json:"requeueState,omitempty"`

// conditions hold the latest available observations of the Workload
// current state.
//
Expand Down Expand Up @@ -184,6 +190,23 @@ type WorkloadStatus struct {
AdmissionChecks []AdmissionCheckState `json:"admissionChecks,omitempty" patchStrategy:"merge" patchMergeKey:"name"`
}

// RequeueState holds the re-queue state of a workload: how many times it
// has been re-queued and when it will be re-queued next.
type RequeueState struct {
// count records the number of times a workload has been re-queued.
// When a deactivated (`.spec.activate`=`false`) workload is reactivated (`.spec.activate`=`true`),
// this count is reset to null.
//
// +optional
// +kubebuilder:validation:Minimum=0
Count *int32 `json:"count,omitempty"`

// requeueAt records the time when a workload will be re-queued.
// When a deactivated (`.spec.activate`=`false`) workload is reactivated (`.spec.activate`=`true`),
// this time is reset to null.
//
// +optional
RequeueAt *metav1.Time `json:"requeueAt,omitempty"`
}

type AdmissionCheckState struct {
// name identifies the admission check.
// +required
Expand Down
29 changes: 29 additions & 0 deletions apis/kueue/v1beta1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

21 changes: 21 additions & 0 deletions charts/kueue/templates/crd/kueue.x-k8s.io_workloads.yaml
Expand Up @@ -8088,6 +8088,27 @@ spec:
x-kubernetes-list-map-keys:
- name
x-kubernetes-list-type: map
requeueState:
description: |-
requeueState holds the re-queue state
when a workload meets Eviction with PodsReadyTimeout reason.
properties:
count:
description: |-
count records the number of times a workload has been re-queued
When a deactivated (`.spec.activate`=`false`) workload is reactivated (`.spec.activate`=`true`),
this count would be reset to null.
format: int32
minimum: 0
type: integer
requeueAt:
description: |-
requeueAt records the time when a workload will be re-queued.
When a deactivated (`.spec.activate`=`false`) workload is reactivated (`.spec.activate`=`true`),
this time would be reset to null.
format: date-time
type: string
type: object
type: object
type: object
served: true
Expand Down
1 change: 1 addition & 0 deletions charts/kueue/templates/webhook/webhook.yaml
Expand Up @@ -290,6 +290,7 @@ webhooks:
- UPDATE
resources:
- workloads
- workloads/status
sideEffects: None
---
apiVersion: admissionregistration.k8s.io/v1
Expand Down
51 changes: 51 additions & 0 deletions client-go/applyconfiguration/kueue/v1beta1/requeuestate.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

9 changes: 9 additions & 0 deletions client-go/applyconfiguration/kueue/v1beta1/workloadstatus.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions client-go/applyconfiguration/utils.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 1 addition & 5 deletions cmd/kueue/main.go
Expand Up @@ -342,11 +342,7 @@ func setupServerVersionFetcher(mgr ctrl.Manager, kubeConfig *rest.Config) *kubev
}

func blockForPodsReady(cfg *configapi.Configuration) bool {
return waitForPodsReady(cfg) && cfg.WaitForPodsReady.BlockAdmission != nil && *cfg.WaitForPodsReady.BlockAdmission
}

func waitForPodsReady(cfg *configapi.Configuration) bool {
return cfg.WaitForPodsReady != nil && cfg.WaitForPodsReady.Enable
return config.WaitForPodsReadyIsEnabled(cfg) && cfg.WaitForPodsReady.BlockAdmission != nil && *cfg.WaitForPodsReady.BlockAdmission
}

func podsReadyRequeuingTimestamp(cfg *configapi.Configuration) configapi.RequeuingTimestamp {
Expand Down
21 changes: 21 additions & 0 deletions config/components/crd/bases/kueue.x-k8s.io_workloads.yaml
Expand Up @@ -8075,6 +8075,27 @@ spec:
x-kubernetes-list-map-keys:
- name
x-kubernetes-list-type: map
requeueState:
description: |-
requeueState holds the re-queue state
when a workload meets Eviction with PodsReadyTimeout reason.
properties:
count:
description: |-
count records the number of times a workload has been re-queued
When a deactivated (`.spec.activate`=`false`) workload is reactivated (`.spec.activate`=`true`),
this count would be reset to null.
format: int32
minimum: 0
type: integer
requeueAt:
description: |-
requeueAt records the time when a workload will be re-queued.
When a deactivated (`.spec.activate`=`false`) workload is reactivated (`.spec.activate`=`true`),
this time would be reset to null.
format: date-time
type: string
type: object
type: object
type: object
served: true
Expand Down
1 change: 1 addition & 0 deletions config/components/webhook/manifests.yaml
Expand Up @@ -270,6 +270,7 @@ webhooks:
- UPDATE
resources:
- workloads
- workloads/status
sideEffects: None
---
apiVersion: admissionregistration.k8s.io/v1
Expand Down

0 comments on commit 5a0a714

Please sign in to comment.