
Fix Pod finalizers for succeeded groups #1905

Merged 1 commit on Mar 25, 2024
7 changes: 2 additions & 5 deletions pkg/controller/jobframework/reconciler.go
@@ -273,11 +273,8 @@ func (r *JobReconciler) ReconcileGenericJob(ctx context.Context, req ctrl.Request
 	}
 
 	if wl != nil && apimeta.IsStatusConditionTrue(wl.Status.Conditions, kueue.WorkloadFinished) {
-		// Finalize the job if it's finished
-		if _, finished := job.Finished(); finished {
-			if err := r.finalizeJob(ctx, job); err != nil {
-				return ctrl.Result{}, err
-			}
+		if err := r.finalizeJob(ctx, job); err != nil {
+			return ctrl.Result{}, err
 		}
Comment on lines +276 to 278
Member
Since this PR, we always finalize the Job once its Workload has the WorkloadFinished condition.
So, is this workload-controller mechanism still needed?

if len(wl.ObjectMeta.OwnerReferences) == 0 && !wl.DeletionTimestamp.IsZero() {
	return ctrl.Result{}, workload.RemoveFinalizer(ctx, r.client, &wl)
}

Contributor Author
It still helps in the case of a rogue user manually deleting finalizers.

But I'm also never sure which cases we are missing when dealing with finalizers. So I prefer to keep it.
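For readers following along, the safety net being discussed is, roughly, a conflict-tolerant removal of the finalizer on the Workload object. The sketch below is a minimal illustration only, not the kueue implementation: the finalizer constant, function name, and wiring are assumptions; it only assumes the controller-runtime and client-go APIs plus the kueue v1beta1 types.

// Minimal sketch, assuming controller-runtime and client-go; not the actual kueue code.
package example

import (
	"context"

	"k8s.io/client-go/util/retry"
	"sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"

	kueue "sigs.k8s.io/kueue/apis/kueue/v1beta1"
)

// hypotheticalFinalizer stands in for the real kueue finalizer constant.
const hypotheticalFinalizer = "kueue.x-k8s.io/resource-in-use"

// removeWorkloadFinalizer strips the finalizer from a Workload, retrying on
// update conflicts so that concurrent reconciles do not fail the cleanup.
func removeWorkloadFinalizer(ctx context.Context, c client.Client, key client.ObjectKey) error {
	return retry.RetryOnConflict(retry.DefaultRetry, func() error {
		var wl kueue.Workload
		if err := c.Get(ctx, key, &wl); err != nil {
			// A missing Workload means there is nothing left to finalize.
			return client.IgnoreNotFound(err)
		}
		if !controllerutil.ContainsFinalizer(&wl, hypotheticalFinalizer) {
			return nil
		}
		controllerutil.RemoveFinalizer(&wl, hypotheticalFinalizer)
		return c.Update(ctx, &wl)
	})
}

The integration test added in this PR exercises the same concern: it deletes several Succeeded Pods of one group at once specifically to provoke update conflicts while finalizers are being removed.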

Member
That makes sense.
Thank you for the clarification!
/lgtm


 		r.record.Eventf(object, corev1.EventTypeNormal, ReasonFinishedWorkload,
3 changes: 2 additions & 1 deletion pkg/controller/jobs/pod/pod_controller_test.go
@@ -978,10 +978,11 @@ func TestReconciler(t *testing.T) {
 			},
 			workloadCmpOpts: defaultWorkloadCmpOpts,
 		},
-		"workload is not deleted if one of the pods in the finished group is absent": {
+		"Pods are finalized even if one of the pods in the finished group is absent": {
 			pods: []corev1.Pod{
 				*basePodWrapper.
 					Clone().
+					KueueFinalizer().
 					Label("kueue.x-k8s.io/managed", "true").
 					Group("test-group").
 					GroupTotalCount("2").
60 changes: 60 additions & 0 deletions test/integration/controller/jobs/pod/pod_controller_test.go
@@ -18,6 +18,7 @@ package pod

 import (
 	"fmt"
+	"strconv"
 	"time"
 
 	"github.com/google/go-cmp/cmp"
@@ -1025,6 +1026,65 @@ var _ = ginkgo.Describe("Pod controller", ginkgo.Ordered, ginkgo.ContinueOnFailure
			})
		})

		ginkgo.It("Should finalize all Succeeded Pods when deleted", func() {
			ginkgo.By("Creating pods with queue name")
			// Use a number of Pods big enough to cause conflicts when removing finalizers >50% of the time.
			const podCount = 7
			pods := make([]*corev1.Pod, podCount)
			for i := range pods {
				pods[i] = testingpod.MakePod(fmt.Sprintf("test-pod-%d", i), ns.Name).
					Group("test-group").
					GroupTotalCount(strconv.Itoa(podCount)).
					Request(corev1.ResourceCPU, "1").
					Queue("test-queue").
					Obj()
				gomega.Expect(k8sClient.Create(ctx, pods[i])).Should(gomega.Succeed())
			}

			ginkgo.By("checking that workload is created for the pod group")
			wlLookupKey := types.NamespacedName{
				Namespace: pods[0].Namespace,
				Name:      "test-group",
			}
			createdWorkload := &kueue.Workload{}
			gomega.Eventually(func() error {
				return k8sClient.Get(ctx, wlLookupKey, createdWorkload)
			}, util.Timeout, util.Interval).Should(gomega.Succeed())

			ginkgo.By("Admitting workload", func() {
				admission := testing.MakeAdmission(clusterQueue.Name).PodSets(
					kueue.PodSetAssignment{
						Name: "4b0469f7",
						Flavors: map[corev1.ResourceName]kueue.ResourceFlavorReference{
							corev1.ResourceCPU: "default",
						},
						Count: ptr.To[int32](podCount),
					},
				).Obj()
				gomega.Expect(util.SetQuotaReservation(ctx, k8sClient, createdWorkload, admission)).Should(gomega.Succeed())
				util.SyncAdmittedConditionForWorkloads(ctx, k8sClient, createdWorkload)

				for i := range pods {
					util.ExpectPodUnsuspendedWithNodeSelectors(ctx, k8sClient, client.ObjectKeyFromObject(pods[i]), map[string]string{"kubernetes.io/arch": "arm64"})
				}
			})

			ginkgo.By("Finishing and deleting Pods", func() {
				util.SetPodsPhase(ctx, k8sClient, corev1.PodSucceeded, pods...)
				for i := range pods {
					gomega.Expect(k8sClient.Delete(ctx, pods[i])).To(gomega.Succeed())
				}

				gomega.Eventually(func(g gomega.Gomega) {
					for i := range pods {
						key := types.NamespacedName{Namespace: ns.Name, Name: fmt.Sprintf("test-pod-%d", i)}
						g.Expect(k8sClient.Get(ctx, key, &corev1.Pod{})).To(testing.BeNotFoundError())
					}
				}, util.Timeout, util.Interval).Should(gomega.Succeed())
			})
		})

		ginkgo.It("Should finalize workload if pods are absent", func() {
			ginkgo.By("Creating pods with queue name")
			pod1 := testingpod.MakePod("test-pod1", ns.Name).