From 41769de2a9b79a1bd2734ee75f895b2492f8fd94 Mon Sep 17 00:00:00 2001
From: Knative Prow Robot
Date: Sat, 3 Feb 2024 02:23:31 +0000
Subject: [PATCH] [release-1.13] Don't drop traffic when upgrading a deployment
 fails (#14864)

* Surface Replica failures over Progressing failures

When transforming the deployment status to the revision, we want to
bubble up the more severe condition to Ready. Since replica failures
include a more actionable error message, this condition is preferred.

* Stop always marking the revision healthy when the PA is Ready

This isn't accurate when the Revision has failed to roll out an update
to its deployment.

* Various updates to the revision reconciler

1. PA Reachability now depends on the status of the Deployment. If we
   have available replicas, we don't mark the revision as unreachable;
   this allows ongoing requests to be handled.

2. Always propagate the K8s Deployment status to the Revision. We don't
   need to gate this on whether the Revision required activation, since
   the only two conditions we propagate from the Deployment are
   Progressing and ReplicaSetFailure=False.

3. Mark the Revision as Deploying if the PA's service name isn't set.

* test deployment failures don't drop traffic on upgrade

* fix boilerplate check

---------

Co-authored-by: dprotaso
---
 pkg/apis/serving/k8s_lifecycle.go             |  51 ++++---
 pkg/apis/serving/k8s_lifecycle_test.go        |  41 +++++-
 pkg/apis/serving/v1/revision_helpers.go       |   7 -
 pkg/apis/serving/v1/revision_helpers_test.go  |  43 ------
 pkg/apis/serving/v1/revision_lifecycle.go     |  23 +--
 .../serving/v1/revision_lifecycle_test.go     |  68 +++++----
 pkg/reconciler/autoscaling/kpa/kpa_test.go    |  16 +--
 pkg/reconciler/autoscaling/kpa/scaler_test.go |   2 +-
 pkg/reconciler/revision/cruds.go              |   8 +-
 .../revision/reconcile_resources.go           |  42 +++---
 pkg/reconciler/revision/resources/pa.go       |  20 ++-
 pkg/reconciler/revision/resources/pa_test.go  |  20 ++-
 pkg/reconciler/revision/table_test.go         |  18 ++-
 test/upgrade/deployment_failure.go            | 134 ++++++++++++++++++
 test/upgrade/postupgrade.go                   |   1 +
 test/upgrade/preupgrade.go                    |   1 +
 16 files changed, 341 insertions(+), 154 deletions(-)
 create mode 100644 test/upgrade/deployment_failure.go

diff --git a/pkg/apis/serving/k8s_lifecycle.go b/pkg/apis/serving/k8s_lifecycle.go
index 9703eb8054d2..f63060c8b521 100644
--- a/pkg/apis/serving/k8s_lifecycle.go
+++ b/pkg/apis/serving/k8s_lifecycle.go
@@ -50,29 +50,42 @@ func TransformDeploymentStatus(ds *appsv1.DeploymentStatus) *duckv1.Status {
 	// The absence of this condition means no failure has occurred. If we find it
 	// below, we'll overwrite this.
 	depCondSet.Manage(s).MarkTrue(DeploymentConditionReplicaSetReady)
+	depCondSet.Manage(s).MarkUnknown(DeploymentConditionProgressing, "Deploying", "")
 
-	for _, cond := range ds.Conditions {
-		// TODO(jonjohnsonjr): Should we care about appsv1.DeploymentAvailable here?
- switch cond.Type { - case appsv1.DeploymentProgressing: - switch cond.Status { - case corev1.ConditionUnknown: - depCondSet.Manage(s).MarkUnknown(DeploymentConditionProgressing, cond.Reason, cond.Message) - case corev1.ConditionTrue: - depCondSet.Manage(s).MarkTrue(DeploymentConditionProgressing) - case corev1.ConditionFalse: - depCondSet.Manage(s).MarkFalse(DeploymentConditionProgressing, cond.Reason, cond.Message) + conds := []appsv1.DeploymentConditionType{ + appsv1.DeploymentProgressing, + appsv1.DeploymentReplicaFailure, + } + + for _, wantType := range conds { + for _, cond := range ds.Conditions { + if wantType != cond.Type { + continue } - case appsv1.DeploymentReplicaFailure: - switch cond.Status { - case corev1.ConditionUnknown: - depCondSet.Manage(s).MarkUnknown(DeploymentConditionReplicaSetReady, cond.Reason, cond.Message) - case corev1.ConditionTrue: - depCondSet.Manage(s).MarkFalse(DeploymentConditionReplicaSetReady, cond.Reason, cond.Message) - case corev1.ConditionFalse: - depCondSet.Manage(s).MarkTrue(DeploymentConditionReplicaSetReady) + + switch cond.Type { + case appsv1.DeploymentProgressing: + switch cond.Status { + case corev1.ConditionUnknown: + depCondSet.Manage(s).MarkUnknown(DeploymentConditionProgressing, cond.Reason, cond.Message) + case corev1.ConditionTrue: + depCondSet.Manage(s).MarkTrue(DeploymentConditionProgressing) + case corev1.ConditionFalse: + depCondSet.Manage(s).MarkFalse(DeploymentConditionProgressing, cond.Reason, cond.Message) + } + case appsv1.DeploymentReplicaFailure: + switch cond.Status { + case corev1.ConditionUnknown: + depCondSet.Manage(s).MarkUnknown(DeploymentConditionReplicaSetReady, cond.Reason, cond.Message) + case corev1.ConditionTrue: + depCondSet.Manage(s).MarkFalse(DeploymentConditionReplicaSetReady, cond.Reason, cond.Message) + case corev1.ConditionFalse: + depCondSet.Manage(s).MarkTrue(DeploymentConditionReplicaSetReady) + } } + } } + return s } diff --git a/pkg/apis/serving/k8s_lifecycle_test.go b/pkg/apis/serving/k8s_lifecycle_test.go index fcb76f327095..344e95d9482a 100644 --- a/pkg/apis/serving/k8s_lifecycle_test.go +++ b/pkg/apis/serving/k8s_lifecycle_test.go @@ -40,12 +40,14 @@ func TestTransformDeploymentStatus(t *testing.T) { Conditions: []apis.Condition{{ Type: DeploymentConditionProgressing, Status: corev1.ConditionUnknown, + Reason: "Deploying", }, { Type: DeploymentConditionReplicaSetReady, Status: corev1.ConditionTrue, }, { Type: DeploymentConditionReady, Status: corev1.ConditionUnknown, + Reason: "Deploying", }}, }, }, { @@ -147,7 +149,7 @@ func TestTransformDeploymentStatus(t *testing.T) { Type: appsv1.DeploymentReplicaFailure, Status: corev1.ConditionTrue, Reason: "ReplicaSetReason", - Message: "Something bag happened", + Message: "Something bad happened", }}, }, want: &duckv1.Status{ @@ -158,12 +160,45 @@ func TestTransformDeploymentStatus(t *testing.T) { Type: DeploymentConditionReplicaSetReady, Status: corev1.ConditionFalse, Reason: "ReplicaSetReason", - Message: "Something bag happened", + Message: "Something bad happened", }, { Type: DeploymentConditionReady, Status: corev1.ConditionFalse, Reason: "ReplicaSetReason", - Message: "Something bag happened", + Message: "Something bad happened", + }}, + }, + }, { + name: "replica failure has priority over progressing", + ds: &appsv1.DeploymentStatus{ + Conditions: []appsv1.DeploymentCondition{{ + Type: appsv1.DeploymentReplicaFailure, + Status: corev1.ConditionTrue, + Reason: "ReplicaSetReason", + Message: "Something really bad happened", + }, { + Type: 
appsv1.DeploymentProgressing, + Status: corev1.ConditionFalse, + Reason: "ProgressingReason", + Message: "Something bad happened", + }}, + }, + want: &duckv1.Status{ + Conditions: []apis.Condition{{ + Type: DeploymentConditionProgressing, + Status: corev1.ConditionFalse, + Reason: "ProgressingReason", + Message: "Something bad happened", + }, { + Type: DeploymentConditionReplicaSetReady, + Status: corev1.ConditionFalse, + Reason: "ReplicaSetReason", + Message: "Something really bad happened", + }, { + Type: DeploymentConditionReady, + Status: corev1.ConditionFalse, + Reason: "ReplicaSetReason", + Message: "Something really bad happened", }}, }, }} diff --git a/pkg/apis/serving/v1/revision_helpers.go b/pkg/apis/serving/v1/revision_helpers.go index 4270f94ac518..e561c7ae6495 100644 --- a/pkg/apis/serving/v1/revision_helpers.go +++ b/pkg/apis/serving/v1/revision_helpers.go @@ -19,7 +19,6 @@ package v1 import ( "time" - appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" net "knative.dev/networking/pkg/apis/networking" "knative.dev/pkg/kmeta" @@ -144,9 +143,3 @@ func (rs *RevisionStatus) IsActivationRequired() bool { c := revisionCondSet.Manage(rs).GetCondition(RevisionConditionActive) return c != nil && c.Status != corev1.ConditionTrue } - -// IsReplicaSetFailure returns true if the deployment replicaset failed to create -func (rs *RevisionStatus) IsReplicaSetFailure(deploymentStatus *appsv1.DeploymentStatus) bool { - ds := serving.TransformDeploymentStatus(deploymentStatus) - return ds != nil && ds.GetCondition(serving.DeploymentConditionReplicaSetReady).IsFalse() -} diff --git a/pkg/apis/serving/v1/revision_helpers_test.go b/pkg/apis/serving/v1/revision_helpers_test.go index dd95c11b4b26..65feea02ed5f 100644 --- a/pkg/apis/serving/v1/revision_helpers_test.go +++ b/pkg/apis/serving/v1/revision_helpers_test.go @@ -20,7 +20,6 @@ import ( "testing" "time" - appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/equality" @@ -268,45 +267,3 @@ func TestSetRoutingState(t *testing.T) { t.Error("Expected default value for unparsable annotationm but got:", got) } } - -func TestIsReplicaSetFailure(t *testing.T) { - revisionStatus := RevisionStatus{} - cases := []struct { - name string - status appsv1.DeploymentStatus - IsReplicaSetFailure bool - }{{ - name: "empty deployment status should not be a failure", - status: appsv1.DeploymentStatus{}, - }, { - name: "Ready deployment status should not be a failure", - status: appsv1.DeploymentStatus{ - Conditions: []appsv1.DeploymentCondition{{ - Type: appsv1.DeploymentAvailable, Status: corev1.ConditionTrue, - }}, - }, - }, { - name: "ReplicasetFailure true should be a failure", - status: appsv1.DeploymentStatus{ - Conditions: []appsv1.DeploymentCondition{{ - Type: appsv1.DeploymentReplicaFailure, Status: corev1.ConditionTrue, - }}, - }, - IsReplicaSetFailure: true, - }, { - name: "ReplicasetFailure false should not be a failure", - status: appsv1.DeploymentStatus{ - Conditions: []appsv1.DeploymentCondition{{ - Type: appsv1.DeploymentReplicaFailure, Status: corev1.ConditionFalse, - }}, - }, - }} - - for _, tc := range cases { - t.Run(tc.name, func(t *testing.T) { - if got, want := revisionStatus.IsReplicaSetFailure(&tc.status), tc.IsReplicaSetFailure; got != want { - t.Errorf("IsReplicaSetFailure = %v, want: %v", got, want) - } - }) - } -} diff --git a/pkg/apis/serving/v1/revision_lifecycle.go b/pkg/apis/serving/v1/revision_lifecycle.go index f95c93675aa3..9c92c01a3cd8 100644 --- a/pkg/apis/serving/v1/revision_lifecycle.go 
+++ b/pkg/apis/serving/v1/revision_lifecycle.go @@ -170,6 +170,8 @@ func (rs *RevisionStatus) PropagateDeploymentStatus(original *appsv1.DeploymentS // PropagateAutoscalerStatus propagates autoscaler's status to the revision's status. func (rs *RevisionStatus) PropagateAutoscalerStatus(ps *autoscalingv1alpha1.PodAutoscalerStatus) { + resUnavailable := rs.GetCondition(RevisionConditionResourcesAvailable).IsFalse() + // Reflect the PA status in our own. cond := ps.GetCondition(autoscalingv1alpha1.PodAutoscalerConditionReady) rs.ActualReplicas = nil @@ -183,13 +185,16 @@ func (rs *RevisionStatus) PropagateAutoscalerStatus(ps *autoscalingv1alpha1.PodA } if cond == nil { - rs.MarkActiveUnknown("Deploying", "") + rs.MarkActiveUnknown(ReasonDeploying, "") + + if !resUnavailable { + rs.MarkResourcesAvailableUnknown(ReasonDeploying, "") + } return } // Don't mark the resources available, if deployment status already determined // it isn't so. - resUnavailable := rs.GetCondition(RevisionConditionResourcesAvailable).IsFalse() if ps.IsScaleTargetInitialized() && !resUnavailable { // Precondition for PA being initialized is SKS being active and // that implies that |service.endpoints| > 0. @@ -197,6 +202,12 @@ func (rs *RevisionStatus) PropagateAutoscalerStatus(ps *autoscalingv1alpha1.PodA rs.MarkContainerHealthyTrue() } + // Mark resource unavailable if we don't have a Service Name and the deployment is ready + // This can happen when we have initial scale set to 0 + if rs.GetCondition(RevisionConditionResourcesAvailable).IsTrue() && ps.ServiceName == "" { + rs.MarkResourcesAvailableUnknown(ReasonDeploying, "") + } + switch cond.Status { case corev1.ConditionUnknown: rs.MarkActiveUnknown(cond.Reason, cond.Message) @@ -222,14 +233,6 @@ func (rs *RevisionStatus) PropagateAutoscalerStatus(ps *autoscalingv1alpha1.PodA rs.MarkActiveFalse(cond.Reason, cond.Message) case corev1.ConditionTrue: rs.MarkActiveTrue() - - // Precondition for PA being active is SKS being active and - // that implies that |service.endpoints| > 0. - // - // Note: This is needed for backwards compatibility as we're adding the new - // ScaleTargetInitialized condition to gate readiness. - rs.MarkResourcesAvailableTrue() - rs.MarkContainerHealthyTrue() } } diff --git a/pkg/apis/serving/v1/revision_lifecycle_test.go b/pkg/apis/serving/v1/revision_lifecycle_test.go index e613cf5a36df..7032c94ade52 100644 --- a/pkg/apis/serving/v1/revision_lifecycle_test.go +++ b/pkg/apis/serving/v1/revision_lifecycle_test.go @@ -17,6 +17,7 @@ limitations under the License. package v1 import ( + "fmt" "sort" "testing" @@ -536,6 +537,7 @@ func TestPropagateAutoscalerStatus(t *testing.T) { // PodAutoscaler becomes ready, making us active. r.PropagateAutoscalerStatus(&autoscalingv1alpha1.PodAutoscalerStatus{ + ServiceName: "some-service", Status: duckv1.Status{ Conditions: duckv1.Conditions{{ Type: autoscalingv1alpha1.PodAutoscalerConditionReady, @@ -551,6 +553,7 @@ func TestPropagateAutoscalerStatus(t *testing.T) { // PodAutoscaler flipping back to Unknown causes Active become ongoing immediately. r.PropagateAutoscalerStatus(&autoscalingv1alpha1.PodAutoscalerStatus{ + ServiceName: "some-service", Status: duckv1.Status{ Conditions: duckv1.Conditions{{ Type: autoscalingv1alpha1.PodAutoscalerConditionReady, @@ -566,6 +569,7 @@ func TestPropagateAutoscalerStatus(t *testing.T) { // PodAutoscaler becoming unready makes Active false, but doesn't affect readiness. 
r.PropagateAutoscalerStatus(&autoscalingv1alpha1.PodAutoscalerStatus{ + ServiceName: "some-service", Status: duckv1.Status{ Conditions: duckv1.Conditions{{ Type: autoscalingv1alpha1.PodAutoscalerConditionReady, @@ -582,32 +586,45 @@ func TestPropagateAutoscalerStatus(t *testing.T) { apistest.CheckConditionSucceeded(r, RevisionConditionResourcesAvailable, t) } -func TestPAResAvailableNoOverride(t *testing.T) { - r := &RevisionStatus{} - r.InitializeConditions() - apistest.CheckConditionOngoing(r, RevisionConditionReady, t) - - // Deployment determined that something's wrong, e.g. the only pod - // has crashed. - r.MarkResourcesAvailableFalse("somehow", "somewhere") +func TestPropagateAutoscalerStatus_NoOverridingResourcesAvailable(t *testing.T) { + // Cases to test Ready condition + // we fix ScaleTargetInitialized to True + cases := []corev1.ConditionStatus{ + corev1.ConditionTrue, + corev1.ConditionFalse, + corev1.ConditionUnknown, + } - // PodAutoscaler achieved initial scale. - r.PropagateAutoscalerStatus(&autoscalingv1alpha1.PodAutoscalerStatus{ - Status: duckv1.Status{ - Conditions: duckv1.Conditions{{ - Type: autoscalingv1alpha1.PodAutoscalerConditionReady, - Status: corev1.ConditionUnknown, - }, { - Type: autoscalingv1alpha1.PodAutoscalerConditionScaleTargetInitialized, - Status: corev1.ConditionTrue, - }}, - }, - }) - // Verify we did not override this. - apistest.CheckConditionFailed(r, RevisionConditionResourcesAvailable, t) - cond := r.GetCondition(RevisionConditionResourcesAvailable) - if got, notWant := cond.Reason, ReasonProgressDeadlineExceeded; got == notWant { - t.Error("PA Status propagation overrode the ResourcesAvailable status") + for _, tc := range cases { + t.Run(fmt.Sprintf("ready is %v", tc), func(t *testing.T) { + + r := &RevisionStatus{} + r.InitializeConditions() + apistest.CheckConditionOngoing(r, RevisionConditionReady, t) + + // Deployment determined that something's wrong, e.g. the only pod + // has crashed. + r.MarkResourcesAvailableFalse("somehow", "somewhere") + r.PropagateAutoscalerStatus(&autoscalingv1alpha1.PodAutoscalerStatus{ + Status: duckv1.Status{ + Conditions: duckv1.Conditions{{ + Type: autoscalingv1alpha1.PodAutoscalerConditionReady, + Status: tc, + }, { + Type: autoscalingv1alpha1.PodAutoscalerConditionScaleTargetInitialized, + Status: corev1.ConditionTrue, + }}, + }, + }) + + // Verify we did not override this. + apistest.CheckConditionFailed(r, RevisionConditionResourcesAvailable, t) + + cond := r.GetCondition(RevisionConditionResourcesAvailable) + if got, notWant := cond.Reason, ReasonProgressDeadlineExceeded; got == notWant { + t.Error("PodAutoscalerStatus propagation overrode the ResourcesAvailable status") + } + }) } } @@ -671,6 +688,7 @@ func TestPropagateAutoscalerStatusRace(t *testing.T) { // The PodAutoscaler might have been ready but it's scaled down already. 
r.PropagateAutoscalerStatus(&autoscalingv1alpha1.PodAutoscalerStatus{ + ServiceName: "some-service", Status: duckv1.Status{ Conditions: duckv1.Conditions{{ Type: autoscalingv1alpha1.PodAutoscalerConditionReady, diff --git a/pkg/reconciler/autoscaling/kpa/kpa_test.go b/pkg/reconciler/autoscaling/kpa/kpa_test.go index c45d96449c6c..f16450cc44f2 100644 --- a/pkg/reconciler/autoscaling/kpa/kpa_test.go +++ b/pkg/reconciler/autoscaling/kpa/kpa_test.go @@ -221,7 +221,7 @@ func markScaleTargetInitialized(pa *autoscalingv1alpha1.PodAutoscaler) { func kpa(ns, n string, opts ...PodAutoscalerOption) *autoscalingv1alpha1.PodAutoscaler { rev := newTestRevision(ns, n) - kpa := revisionresources.MakePA(rev) + kpa := revisionresources.MakePA(rev, nil) kpa.Generation = 1 kpa.Annotations[autoscaling.ClassAnnotationKey] = "kpa.autoscaling.knative.dev" kpa.Annotations[autoscaling.MetricAnnotationKey] = "concurrency" @@ -1303,7 +1303,7 @@ func TestGlobalResyncOnUpdateAutoscalerConfigMap(t *testing.T) { rev := newTestRevision(testNamespace, testRevision) newDeployment(ctx, t, fakedynamicclient.Get(ctx), testRevision+"-deployment", 3) - kpa := revisionresources.MakePA(rev) + kpa := revisionresources.MakePA(rev, nil) sks := aresources.MakeSKS(kpa, nv1a1.SKSOperationModeServe, minActivators) sks.Status.PrivateServiceName = "bogus" sks.Status.InitializeConditions() @@ -1372,7 +1372,7 @@ func TestReconcileDeciderCreatesAndDeletes(t *testing.T) { newDeployment(ctx, t, fakedynamicclient.Get(ctx), testRevision+"-deployment", 3) - kpa := revisionresources.MakePA(rev) + kpa := revisionresources.MakePA(rev, nil) sks := sks(testNamespace, testRevision, WithDeployRef(kpa.Spec.ScaleTargetRef.Name), WithSKSReady) fakenetworkingclient.Get(ctx).NetworkingV1alpha1().ServerlessServices(testNamespace).Create(ctx, sks, metav1.CreateOptions{}) fakeservingclient.Get(ctx).AutoscalingV1alpha1().PodAutoscalers(testNamespace).Create(ctx, kpa, metav1.CreateOptions{}) @@ -1446,7 +1446,7 @@ func TestUpdate(t *testing.T) { fakekubeclient.Get(ctx).CoreV1().Pods(testNamespace).Create(ctx, pod, metav1.CreateOptions{}) fakefilteredpodsinformer.Get(ctx, serving.RevisionUID).Informer().GetIndexer().Add(pod) - kpa := revisionresources.MakePA(rev) + kpa := revisionresources.MakePA(rev, nil) kpa.SetDefaults(context.Background()) fakeservingclient.Get(ctx).AutoscalingV1alpha1().PodAutoscalers(testNamespace).Create(ctx, kpa, metav1.CreateOptions{}) fakepainformer.Get(ctx).Informer().GetIndexer().Add(kpa) @@ -1525,7 +1525,7 @@ func TestControllerCreateError(t *testing.T) { createErr: want, }) - kpa := revisionresources.MakePA(newTestRevision(testNamespace, testRevision)) + kpa := revisionresources.MakePA(newTestRevision(testNamespace, testRevision), nil) fakeservingclient.Get(ctx).AutoscalingV1alpha1().PodAutoscalers(testNamespace).Create(ctx, kpa, metav1.CreateOptions{}) fakepainformer.Get(ctx).Informer().GetIndexer().Add(kpa) @@ -1568,7 +1568,7 @@ func TestControllerUpdateError(t *testing.T) { createErr: want, }) - kpa := revisionresources.MakePA(newTestRevision(testNamespace, testRevision)) + kpa := revisionresources.MakePA(newTestRevision(testNamespace, testRevision), nil) fakeservingclient.Get(ctx).AutoscalingV1alpha1().PodAutoscalers(testNamespace).Create(ctx, kpa, metav1.CreateOptions{}) fakepainformer.Get(ctx).Informer().GetIndexer().Add(kpa) @@ -1610,7 +1610,7 @@ func TestControllerGetError(t *testing.T) { getErr: want, }) - kpa := revisionresources.MakePA(newTestRevision(testNamespace, testRevision)) + kpa := 
revisionresources.MakePA(newTestRevision(testNamespace, testRevision), nil) fakeservingclient.Get(ctx).AutoscalingV1alpha1().PodAutoscalers(testNamespace).Create(ctx, kpa, metav1.CreateOptions{}) fakepainformer.Get(ctx).Informer().GetIndexer().Add(kpa) @@ -1649,7 +1649,7 @@ func TestScaleFailure(t *testing.T) { // Only put the KPA in the lister, which will prompt failures scaling it. rev := newTestRevision(testNamespace, testRevision) - kpa := revisionresources.MakePA(rev) + kpa := revisionresources.MakePA(rev, nil) fakepainformer.Get(ctx).Informer().GetIndexer().Add(kpa) newDeployment(ctx, t, fakedynamicclient.Get(ctx), testRevision+"-deployment", 3) diff --git a/pkg/reconciler/autoscaling/kpa/scaler_test.go b/pkg/reconciler/autoscaling/kpa/scaler_test.go index b9538514c35e..6b9b8e1f917a 100644 --- a/pkg/reconciler/autoscaling/kpa/scaler_test.go +++ b/pkg/reconciler/autoscaling/kpa/scaler_test.go @@ -640,7 +640,7 @@ func TestDisableScaleToZero(t *testing.T) { func newKPA(ctx context.Context, t *testing.T, servingClient clientset.Interface, revision *v1.Revision) *autoscalingv1alpha1.PodAutoscaler { t.Helper() - pa := revisionresources.MakePA(revision) + pa := revisionresources.MakePA(revision, nil) pa.Status.InitializeConditions() _, err := servingClient.AutoscalingV1alpha1().PodAutoscalers(testNamespace).Create(ctx, pa, metav1.CreateOptions{}) if err != nil { diff --git a/pkg/reconciler/revision/cruds.go b/pkg/reconciler/revision/cruds.go index 100591179019..5607e54c9aa4 100644 --- a/pkg/reconciler/revision/cruds.go +++ b/pkg/reconciler/revision/cruds.go @@ -115,7 +115,11 @@ func (c *Reconciler) createImageCache(ctx context.Context, rev *v1.Revision, con return c.cachingclient.CachingV1alpha1().Images(image.Namespace).Create(ctx, image, metav1.CreateOptions{}) } -func (c *Reconciler) createPA(ctx context.Context, rev *v1.Revision) (*autoscalingv1alpha1.PodAutoscaler, error) { - pa := resources.MakePA(rev) +func (c *Reconciler) createPA( + ctx context.Context, + rev *v1.Revision, + deployment *appsv1.Deployment, +) (*autoscalingv1alpha1.PodAutoscaler, error) { + pa := resources.MakePA(rev, deployment) return c.client.AutoscalingV1alpha1().PodAutoscalers(pa.Namespace).Create(ctx, pa, metav1.CreateOptions{}) } diff --git a/pkg/reconciler/revision/reconcile_resources.go b/pkg/reconciler/revision/reconcile_resources.go index 1bef83c102b0..33ce2ba03abe 100644 --- a/pkg/reconciler/revision/reconcile_resources.go +++ b/pkg/reconciler/revision/reconcile_resources.go @@ -49,42 +49,27 @@ func (c *Reconciler) reconcileDeployment(ctx context.Context, rev *v1.Revision) // Deployment does not exist. Create it. rev.Status.MarkResourcesAvailableUnknown(v1.ReasonDeploying, "") rev.Status.MarkContainerHealthyUnknown(v1.ReasonDeploying, "") - deployment, err = c.createDeployment(ctx, rev) - if err != nil { + if _, err = c.createDeployment(ctx, rev); err != nil { return fmt.Errorf("failed to create deployment %q: %w", deploymentName, err) } logger.Infof("Created deployment %q", deploymentName) + return nil } else if err != nil { return fmt.Errorf("failed to get deployment %q: %w", deploymentName, err) } else if !metav1.IsControlledBy(deployment, rev) { // Surface an error in the revision's status, and return an error. rev.Status.MarkResourcesAvailableFalse(v1.ReasonNotOwned, v1.ResourceNotOwnedMessage("Deployment", deploymentName)) return fmt.Errorf("revision: %q does not own Deployment: %q", rev.Name, deploymentName) - } else { - // The deployment exists, but make sure that it has the shape that we expect. 
- deployment, err = c.checkAndUpdateDeployment(ctx, rev, deployment) - if err != nil { - return fmt.Errorf("failed to update deployment %q: %w", deploymentName, err) - } - - // Now that we have a Deployment, determine whether there is any relevant - // status to surface in the Revision. - // - // TODO(jonjohnsonjr): Should we check Generation != ObservedGeneration? - // The autoscaler mutates the deployment pretty often, which would cause us - // to flip back and forth between Ready and Unknown every time we scale up - // or down. - if !rev.Status.IsActivationRequired() { - rev.Status.PropagateDeploymentStatus(&deployment.Status) - } } - // If the replicaset is failing we assume its an error we have to surface - if rev.Status.IsReplicaSetFailure(&deployment.Status) { - rev.Status.PropagateDeploymentStatus(&deployment.Status) - return nil + // The deployment exists, but make sure that it has the shape that we expect. + deployment, err = c.checkAndUpdateDeployment(ctx, rev, deployment) + if err != nil { + return fmt.Errorf("failed to update deployment %q: %w", deploymentName, err) } + rev.Status.PropagateDeploymentStatus(&deployment.Status) + // If a container keeps crashing (no active pods in the deployment although we want some) if *deployment.Spec.Replicas > 0 && deployment.Status.AvailableReplicas == 0 { pods, err := c.kubeclient.CoreV1().Pods(ns).List(ctx, metav1.ListOptions{LabelSelector: metav1.FormatLabelSelector(deployment.Spec.Selector)}) @@ -151,6 +136,13 @@ func (c *Reconciler) reconcileImageCache(ctx context.Context, rev *v1.Revision) func (c *Reconciler) reconcilePA(ctx context.Context, rev *v1.Revision) error { ns := rev.Namespace + + deploymentName := resourcenames.Deployment(rev) + deployment, err := c.deploymentLister.Deployments(ns).Get(deploymentName) + if err != nil { + return err + } + paName := resourcenames.PA(rev) logger := logging.FromContext(ctx) logger.Info("Reconciling PA: ", paName) @@ -158,7 +150,7 @@ func (c *Reconciler) reconcilePA(ctx context.Context, rev *v1.Revision) error { pa, err := c.podAutoscalerLister.PodAutoscalers(ns).Get(paName) if apierrs.IsNotFound(err) { // PA does not exist. Create it. - pa, err = c.createPA(ctx, rev) + pa, err = c.createPA(ctx, rev, deployment) if err != nil { return fmt.Errorf("failed to create PA %q: %w", paName, err) } @@ -173,7 +165,7 @@ func (c *Reconciler) reconcilePA(ctx context.Context, rev *v1.Revision) error { // Perhaps tha PA spec changed underneath ourselves? // We no longer require immutability, so need to reconcile PA each time. - tmpl := resources.MakePA(rev) + tmpl := resources.MakePA(rev, deployment) logger.Debugf("Desired PASpec: %#v", tmpl.Spec) if !equality.Semantic.DeepEqual(tmpl.Spec, pa.Spec) { diff, _ := kmp.SafeDiff(tmpl.Spec, pa.Spec) // Can't realistically fail on PASpec. diff --git a/pkg/reconciler/revision/resources/pa.go b/pkg/reconciler/revision/resources/pa.go index e51f4e8553f7..32a2cff3cbb1 100644 --- a/pkg/reconciler/revision/resources/pa.go +++ b/pkg/reconciler/revision/resources/pa.go @@ -17,6 +17,7 @@ limitations under the License. package resources import ( + appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -28,7 +29,7 @@ import ( ) // MakePA makes a Knative Pod Autoscaler resource from a revision. 
-func MakePA(rev *v1.Revision) *autoscalingv1alpha1.PodAutoscaler { +func MakePA(rev *v1.Revision, deployment *appsv1.Deployment) *autoscalingv1alpha1.PodAutoscaler { return &autoscalingv1alpha1.PodAutoscaler{ ObjectMeta: metav1.ObjectMeta{ Name: names.PA(rev), @@ -45,20 +46,27 @@ func MakePA(rev *v1.Revision) *autoscalingv1alpha1.PodAutoscaler { Name: names.Deployment(rev), }, ProtocolType: rev.GetProtocol(), - Reachability: reachability(rev), + Reachability: reachability(rev, deployment), }, } } -func reachability(rev *v1.Revision) autoscalingv1alpha1.ReachabilityType { +func reachability(rev *v1.Revision, deployment *appsv1.Deployment) autoscalingv1alpha1.ReachabilityType { // check infra failures - conds := []apis.ConditionType{ + infraFailure := false + for _, cond := range []apis.ConditionType{ v1.RevisionConditionResourcesAvailable, v1.RevisionConditionContainerHealthy, + } { + if c := rev.Status.GetCondition(cond); c != nil && c.IsFalse() { + infraFailure = true + break + } } - for _, cond := range conds { - if c := rev.Status.GetCondition(cond); c != nil && c.IsFalse() { + if infraFailure && deployment != nil && deployment.Spec.Replicas != nil { + // If we have an infra failure and no ready replicas - then this revision is unreachable + if *deployment.Spec.Replicas > 0 && deployment.Status.ReadyReplicas == 0 { return autoscalingv1alpha1.ReachabilityUnreachable } } diff --git a/pkg/reconciler/revision/resources/pa_test.go b/pkg/reconciler/revision/resources/pa_test.go index dbc1dc7e2dc5..cb4e101acbb1 100644 --- a/pkg/reconciler/revision/resources/pa_test.go +++ b/pkg/reconciler/revision/resources/pa_test.go @@ -20,6 +20,7 @@ import ( "testing" "github.com/google/go-cmp/cmp" + appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" duckv1 "knative.dev/pkg/apis/duck/v1" @@ -35,6 +36,7 @@ func TestMakePA(t *testing.T) { tests := []struct { name string rev *v1.Revision + dep *appsv1.Deployment want *autoscalingv1alpha1.PodAutoscaler }{{ name: "name is bar (Concurrency=1, Reachable=true)", @@ -243,6 +245,14 @@ func TestMakePA(t *testing.T) { }}, }, { name: "failed deployment", + dep: &appsv1.Deployment{ + Spec: appsv1.DeploymentSpec{ + Replicas: ptr.Int32(1), + }, + Status: appsv1.DeploymentStatus{ + ReadyReplicas: 0, + }, + }, rev: &v1.Revision{ ObjectMeta: metav1.ObjectMeta{ Namespace: "blah", @@ -305,6 +315,14 @@ func TestMakePA(t *testing.T) { }, { // Crashlooping container that never starts name: "failed container", + dep: &appsv1.Deployment{ + Spec: appsv1.DeploymentSpec{ + Replicas: ptr.Int32(1), + }, + Status: appsv1.DeploymentStatus{ + ReadyReplicas: 0, + }, + }, rev: &v1.Revision{ ObjectMeta: metav1.ObjectMeta{ Namespace: "blah", @@ -368,7 +386,7 @@ func TestMakePA(t *testing.T) { for _, test := range tests { t.Run(test.name, func(t *testing.T) { - got := MakePA(test.rev) + got := MakePA(test.rev, test.dep) if !cmp.Equal(got, test.want) { t.Error("MakePA (-want, +got) =", cmp.Diff(test.want, got)) } diff --git a/pkg/reconciler/revision/table_test.go b/pkg/reconciler/revision/table_test.go index 385d65e0a16b..c534b56e7ed8 100644 --- a/pkg/reconciler/revision/table_test.go +++ b/pkg/reconciler/revision/table_test.go @@ -244,7 +244,9 @@ func TestReconcile(t *testing.T) { WithRoutingState(v1.RoutingStateReserve, fc), withDefaultContainerStatuses(), WithRevisionObservedGeneration(1)), pa("foo", "stable-deactivation", - WithNoTraffic("NoTraffic", "This thing is inactive."), WithReachabilityUnreachable, + 
WithPAStatusService("stable-deactivation"), + WithNoTraffic("NoTraffic", "This thing is inactive."), + WithReachabilityUnreachable, WithScaleTargetInitialized), deploy(t, "foo", "stable-deactivation"), image("foo", "stable-deactivation"), @@ -306,6 +308,7 @@ func TestReconcile(t *testing.T) { WithRoutingState(v1.RoutingStateReserve, fc), MarkRevisionReady, WithRevisionObservedGeneration(1)), pa("foo", "pa-inactive", + WithPAStatusService("pa-inactive"), WithNoTraffic("NoTraffic", "This thing is inactive."), WithScaleTargetInitialized, WithReachabilityUnreachable), @@ -350,6 +353,7 @@ func TestReconcile(t *testing.T) { Objects: []runtime.Object{ Revision("foo", "pa-inactive", allUnknownConditions, WithLogURL, + MarkDeploying(v1.ReasonDeploying), WithRevisionObservedGeneration(1)), pa("foo", "pa-inactive", WithNoTraffic("NoTraffic", "This thing is inactive."), WithPAStatusService("")), @@ -361,6 +365,7 @@ func TestReconcile(t *testing.T) { Object: Revision("foo", "pa-inactive", WithLogURL, withDefaultContainerStatuses(), allUnknownConditions, MarkInactive("NoTraffic", "This thing is inactive."), + MarkDeploying(v1.ReasonDeploying), WithRevisionObservedGeneration(1)), }}, Key: "foo/pa-inactive", @@ -398,6 +403,7 @@ func TestReconcile(t *testing.T) { // Protocol type is the only thing that can be changed on PA Objects: []runtime.Object{ Revision("foo", "fix-mutated-pa", + allUnknownConditions, WithLogURL, MarkRevisionReady, WithRoutingState(v1.RoutingStateActive, fc)), pa("foo", "fix-mutated-pa", WithProtocolType(networking.ProtocolH2C), @@ -452,6 +458,7 @@ func TestReconcile(t *testing.T) { // status of the Revision. Objects: []runtime.Object{ Revision("foo", "deploy-timeout", + allUnknownConditions, WithRoutingState(v1.RoutingStateActive, fc), WithLogURL, MarkActive), pa("foo", "deploy-timeout", WithReachabilityReachable), @@ -507,6 +514,7 @@ func TestReconcile(t *testing.T) { // It then verifies that Reconcile propagates this into the status of the Revision. Objects: []runtime.Object{ Revision("foo", "deploy-replica-failure", + allUnknownConditions, WithRoutingState(v1.RoutingStateActive, fc), WithLogURL, MarkActive), pa("foo", "deploy-replica-failure", WithReachabilityReachable), @@ -531,7 +539,8 @@ func TestReconcile(t *testing.T) { // Test the propagation of ImagePullBackoff from user container. Objects: []runtime.Object{ Revision("foo", "pull-backoff", - WithLogURL, MarkActivating("Deploying", ""), + WithLogURL, + allUnknownConditions, WithRoutingState(v1.RoutingStateActive, fc), ), pa("foo", "pull-backoff", WithReachabilityReachable), // pa can't be ready since deployment times out. @@ -557,6 +566,7 @@ func TestReconcile(t *testing.T) { // that Reconcile propagates this into the status of the Revision. Objects: []runtime.Object{ Revision("foo", "pod-error", + allUnknownConditions, WithRoutingState(v1.RoutingStateActive, fc), WithLogURL, allUnknownConditions, MarkActive), pa("foo", "pod-error", WithReachabilityReachable), // PA can't be ready, since no traffic. @@ -835,7 +845,7 @@ func withInitContainerStatuses() RevisionOption { // TODO(mattmoor): Come up with a better name for this. 
func allUnknownConditions(r *v1.Revision) { WithInitRevConditions(r) - MarkDeploying("")(r) + MarkDeploying("Deploying")(r) MarkActivating("Deploying", "")(r) } @@ -893,7 +903,7 @@ func imageInit(namespace, name string, co ...configOption) []runtime.Object { func pa(namespace, name string, ko ...PodAutoscalerOption) *autoscalingv1alpha1.PodAutoscaler { rev := Revision(namespace, name) - k := resources.MakePA(rev) + k := resources.MakePA(rev, nil) for _, opt := range ko { opt(k) diff --git a/test/upgrade/deployment_failure.go b/test/upgrade/deployment_failure.go new file mode 100644 index 000000000000..324c149ca909 --- /dev/null +++ b/test/upgrade/deployment_failure.go @@ -0,0 +1,134 @@ +/* +Copyright 2024 The Knative Authors + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package upgrade + +import ( + "context" + + admissionv1 "k8s.io/api/admissionregistration/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "knative.dev/pkg/ptr" + "knative.dev/pkg/test/helpers" + pkgupgrade "knative.dev/pkg/test/upgrade" + "knative.dev/serving/pkg/apis/autoscaling" + v1 "knative.dev/serving/pkg/apis/serving/v1" + "knative.dev/serving/test" + "knative.dev/serving/test/e2e" + v1test "knative.dev/serving/test/v1" +) + +func DeploymentFailurePostUpgrade() pkgupgrade.Operation { + return pkgupgrade.NewOperation("DeploymentFailurePostUpgrade", func(c pkgupgrade.Context) { + clients := e2e.Setup(c.T) + + names := test.ResourceNames{ + Service: "deployment-upgrade-failure", + Image: test.HelloWorld, + } + + service, err := clients.ServingClient.Services.Get(context.Background(), names.Service, metav1.GetOptions{}) + if err != nil { + c.T.Fatal("Failed to get Service: ", err) + } + + // Deployment failures should surface to the Service Ready Condition + if err := v1test.WaitForServiceState(clients.ServingClient, names.Service, v1test.IsServiceFailed, "ServiceIsNotReady"); err != nil { + c.T.Fatal("Service did not transition to Ready=False", err) + } + + // Traffic should still work since the deployment has an active replicaset + url := service.Status.URL.URL() + assertServiceResourcesUpdated(c.T, clients, names, url, test.HelloWorldText) + }) +} + +func DeploymentFailurePreUpgrade() pkgupgrade.Operation { + return pkgupgrade.NewOperation("DeploymentFailurePreUpgrade", func(c pkgupgrade.Context) { + c.T.Log("Creating Service") + ctx := context.Background() + + clients := e2e.Setup(c.T) + names := &test.ResourceNames{ + Service: "deployment-upgrade-failure", + Image: test.HelloWorld, + } + + resources, err := v1test.CreateServiceReady(c.T, clients, names, func(s *v1.Service) { + s.Spec.Template.Annotations = map[string]string{ + autoscaling.MinScaleAnnotation.Key(): "1", + autoscaling.MaxScaleAnnotation.Key(): "1", + } + }) + + if err != nil { + c.T.Fatal("Failed to create Service:", err) + } + + url := resources.Service.Status.URL.URL() + // This polls until we get a 200 with the right body. 
+ assertServiceResourcesUpdated(c.T, clients, *names, url, test.HelloWorldText) + + // Setup webhook that fails when deployment is updated + // Failing to update the Deployment shouldn't cause a traffic drop + // note: the deployment is only updated if the controllers change the spec + // and this happens when the queue proxy image is changed when upgrading + c.T.Log("Creating Failing Webhook") + noSideEffects := admissionv1.SideEffectClassNone + + selector := &metav1.LabelSelector{ + MatchLabels: map[string]string{ + "serving.knative.dev/service": names.Service, + }, + } + + // Create a broken webhook that breaks scheduling Pods + _, err = clients.KubeClient.AdmissionregistrationV1().MutatingWebhookConfigurations().Create( + ctx, + &admissionv1.MutatingWebhookConfiguration{ + ObjectMeta: metav1.ObjectMeta{ + Name: "broken-webhook", + }, + Webhooks: []admissionv1.MutatingWebhook{{ + AdmissionReviewVersions: []string{"v1"}, + Name: "webhook.non-existing.dev", + ClientConfig: admissionv1.WebhookClientConfig{ + Service: &admissionv1.ServiceReference{ + Name: helpers.AppendRandomString("non-existing"), + Namespace: helpers.AppendRandomString("non-existing"), + }, + }, + ObjectSelector: selector, + TimeoutSeconds: ptr.Int32(5), + SideEffects: &noSideEffects, + Rules: []admissionv1.RuleWithOperations{{ + Operations: []admissionv1.OperationType{"CREATE"}, + Rule: admissionv1.Rule{ + APIGroups: []string{""}, // core + APIVersions: []string{"v1"}, + Resources: []string{"pods"}, + }, + }}, + }}, + }, + metav1.CreateOptions{}, + ) + + if err != nil { + c.T.Fatal("Failed to create bad webhook:", err) + } + }) +} diff --git a/test/upgrade/postupgrade.go b/test/upgrade/postupgrade.go index 299e9a4c2370..61f5b1121600 100644 --- a/test/upgrade/postupgrade.go +++ b/test/upgrade/postupgrade.go @@ -42,6 +42,7 @@ func ServingPostUpgradeTests() []pkgupgrade.Operation { CreateNewServicePostUpgradeTest(), InitialScalePostUpgradeTest(), CRDStoredVersionPostUpgradeTest(), + DeploymentFailurePostUpgrade(), } } diff --git a/test/upgrade/preupgrade.go b/test/upgrade/preupgrade.go index 3edcdde8266e..d00f5ef86e87 100644 --- a/test/upgrade/preupgrade.go +++ b/test/upgrade/preupgrade.go @@ -35,6 +35,7 @@ func ServingPreUpgradeTests() []pkgupgrade.Operation { ServicePreUpgradeAndScaleToZeroTest(), BYORevisionPreUpgradeTest(), InitialScalePreUpgradeTest(), + DeploymentFailurePreUpgrade(), } }
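
The sketches below illustrate the three behavior changes, in the order the
commit message lists them. They are minimal, hedged examples rather than
excerpts from the patch: each assumes a Go module that depends on
knative.dev/serving at a commit containing this change, and the condition
reasons and messages are illustrative values, not output captured from a
cluster.

First, the condition ordering in TransformDeploymentStatus. Because the
ReplicaFailure condition is now processed after Progressing, its more
actionable reason/message wins in the derived Ready condition, matching the
new "replica failure has priority over progressing" test:

package main

import (
	"fmt"

	appsv1 "k8s.io/api/apps/v1"
	corev1 "k8s.io/api/core/v1"

	"knative.dev/serving/pkg/apis/serving"
)

func main() {
	// A deployment that both timed out progressing and hit a replica
	// failure (e.g. pod creation rejected by quota or a broken webhook).
	// The reasons below are typical Kubernetes controller values used
	// here for illustration.
	ds := &appsv1.DeploymentStatus{
		Conditions: []appsv1.DeploymentCondition{{
			Type:    appsv1.DeploymentProgressing,
			Status:  corev1.ConditionFalse,
			Reason:  "ProgressDeadlineExceeded",
			Message: "ReplicaSet has timed out progressing.",
		}, {
			Type:    appsv1.DeploymentReplicaFailure,
			Status:  corev1.ConditionTrue,
			Reason:  "FailedCreate",
			Message: "admission webhook denied the pod",
		}},
	}

	s := serving.TransformDeploymentStatus(ds)
	for _, c := range s.Conditions {
		fmt.Printf("%s=%s (%s: %s)\n", c.Type, c.Status, c.Reason, c.Message)
	}
	// Ready ends up False with the FailedCreate reason/message, because
	// ReplicaFailure is processed after Progressing and overwrites the
	// less actionable ProgressDeadlineExceeded details.
}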
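
Next, PA reachability. The revision literal below is abbreviated and
hypothetical, and it assumes two things the diff does not show: the
routing-state label key that rev.GetRoutingState() reads, and the unchanged
tail of reachability() that falls back to the routing state when no infra
failure forces a verdict:

package main

import (
	"fmt"

	appsv1 "k8s.io/api/apps/v1"
	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	duckv1 "knative.dev/pkg/apis/duck/v1"
	"knative.dev/pkg/ptr"

	v1 "knative.dev/serving/pkg/apis/serving/v1"
	"knative.dev/serving/pkg/reconciler/revision/resources"
)

func main() {
	// A routable revision whose pods are currently failing:
	// ResourcesAvailable=False is one of the two "infra failure"
	// conditions that reachability() checks.
	rev := &v1.Revision{
		ObjectMeta: metav1.ObjectMeta{
			Namespace: "default",
			Name:      "failing-rev",
			// Assumed label key for the revision's routing state.
			Labels: map[string]string{
				"serving.knative.dev/routingState": string(v1.RoutingStateActive),
			},
		},
		Status: v1.RevisionStatus{
			Status: duckv1.Status{
				Conditions: duckv1.Conditions{{
					Type:   v1.RevisionConditionResourcesAvailable,
					Status: corev1.ConditionFalse,
				}},
			},
		},
	}

	noReadyPods := &appsv1.Deployment{
		Spec:   appsv1.DeploymentSpec{Replicas: ptr.Int32(1)},
		Status: appsv1.DeploymentStatus{ReadyReplicas: 0},
	}
	stillServing := &appsv1.Deployment{
		Spec:   appsv1.DeploymentSpec{Replicas: ptr.Int32(1)},
		Status: appsv1.DeploymentStatus{ReadyReplicas: 1},
	}

	// Desired replicas but none ready: the failure makes the revision
	// unreachable, as before this patch.
	fmt.Println(resources.MakePA(rev, noReadyPods).Spec.Reachability) // Unreachable

	// Ready replicas still exist (e.g. the old ReplicaSet keeps serving
	// while a rollout fails): the infra failure no longer forces
	// Unreachable, so reachability falls through to the routing state.
	fmt.Println(resources.MakePA(rev, stillServing).Spec.Reachability) // Reachable
}

Keeping the revision reachable while the old ReplicaSet still has ready pods
is what prevents the traffic drop that the new upgrade test exercises.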
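
Finally, the service-name gate in PropagateAutoscalerStatus. Even when the PA
reports Ready and an initialized scale target, a missing service name (which
can happen with initial scale zero) now holds ResourcesAvailable at
Unknown/"Deploying" instead of forcing it True:

package main

import (
	"fmt"

	corev1 "k8s.io/api/core/v1"
	duckv1 "knative.dev/pkg/apis/duck/v1"

	autoscalingv1alpha1 "knative.dev/serving/pkg/apis/autoscaling/v1alpha1"
	v1 "knative.dev/serving/pkg/apis/serving/v1"
)

func main() {
	rs := &v1.RevisionStatus{}
	rs.InitializeConditions()

	// A PA that is Ready with an initialized scale target but whose
	// service name has not been populated yet.
	rs.PropagateAutoscalerStatus(&autoscalingv1alpha1.PodAutoscalerStatus{
		ServiceName: "",
		Status: duckv1.Status{
			Conditions: duckv1.Conditions{{
				Type:   autoscalingv1alpha1.PodAutoscalerConditionReady,
				Status: corev1.ConditionTrue,
			}, {
				Type:   autoscalingv1alpha1.PodAutoscalerConditionScaleTargetInitialized,
				Status: corev1.ConditionTrue,
			}},
		},
	})

	c := rs.GetCondition(v1.RevisionConditionResourcesAvailable)
	// Prints "Unknown Deploying": the revision is still considered to be
	// deploying until the PA reports a service name.
	fmt.Println(c.Status, c.Reason)
}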