Merge pull request #9950 from jeffrosenberg/error-on-failed-jobs-9285

Throw an error from jobReady() if the job exceeds its BackoffLimit
3 years ago · 99e1dce8c8
parent 8554d518d5 b13f3a13d4
commit 99e1dce8c8
2 changed files with 49 additions and 33 deletions
--- a/pkg/kube/ready.go
+++ b/pkg/kube/ready.go
@ -18,6 +18,7 @@ package kube // import "helm.sh/helm/v3/pkg/kube"

 import (
 	"context"
+	"fmt"

 	appsv1 "k8s.io/api/apps/v1"
 	appsv1beta1 "k8s.io/api/apps/v1beta1"
@ -83,8 +84,8 @@ type ReadyChecker struct {

 // IsReady checks if v is ready. It supports checking readiness for pods,
 // deployments, persistent volume claims, services, daemon sets, custom
-// resource definitions, stateful sets, replication controllers, and replica
-// sets. All other resource kinds are always considered ready.
+// resource definitions, stateful sets, replication controllers, jobs (optional),
+// and replica sets. All other resource kinds are always considered ready.
 //
 // IsReady will fetch the latest state of the object from the server prior to
 // performing readiness checks, and it will return any error encountered.
@ -105,9 +106,11 @@ func (c *ReadyChecker) IsReady(ctx context.Context, v *resource.Info) (bool, err
 	case *batchv1.Job:
 		if c.checkJobs {
 			job, err := c.client.BatchV1().Jobs(v.Namespace).Get(ctx, v.Name, metav1.GetOptions{})
-			if err != nil || !c.jobReady(job) {
+			if err != nil {
 				return false, err
 			}
+			ready, err := c.jobReady(job)
+			return ready, err
 		}
 	case *appsv1.Deployment, *appsv1beta1.Deployment, *appsv1beta2.Deployment, *extensionsv1beta1.Deployment:
 		currentDeployment, err := c.client.AppsV1().Deployments(v.Namespace).Get(ctx, v.Name, metav1.GetOptions{})
@ -222,16 +225,17 @@ func (c *ReadyChecker) isPodReady(pod *corev1.Pod) bool {
 	return false
 }

-func (c *ReadyChecker) jobReady(job *batchv1.Job) bool {
+func (c *ReadyChecker) jobReady(job *batchv1.Job) (bool, error) {
 	if job.Status.Failed > *job.Spec.BackoffLimit {
 		c.log("Job is failed: %s/%s", job.GetNamespace(), job.GetName())
-		return false
+		// If a job is failed, it can't recover, so throw an error
+		return false, fmt.Errorf("job is failed: %s/%s", job.GetNamespace(), job.GetName())
 	}
 	if job.Spec.Completions != nil && job.Status.Succeeded < *job.Spec.Completions {
 		c.log("Job is not completed: %s/%s", job.GetNamespace(), job.GetName())
-		return false
+		return false, nil
 	}
-	return true
+	return true, nil
 }

 func (c *ReadyChecker) serviceReady(s *corev1.Service) bool {
--- a/pkg/kube/ready_test.go
+++ b/pkg/kube/ready_test.go
@ -4,7 +4,6 @@ Copyright The Helm Authors.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
-
    http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
@ -270,44 +269,52 @@ func Test_ReadyChecker_jobReady(t *testing.T) {
 		job *batchv1.Job
 	}
 	tests := []struct {
-		name string
-		args args
-		want bool
+		name    string
+		args    args
+		want    bool
+		wantErr bool
 	}{
 		{
-			name: "job is completed",
-			args: args{job: newJob("foo", 1, intToInt32(1), 1, 0)},
-			want: true,
+			name:    "job is completed",
+			args:    args{job: newJob("foo", 1, intToInt32(1), 1, 0)},
+			want:    true,
+			wantErr: false,
 		},
 		{
-			name: "job is incomplete",
-			args: args{job: newJob("foo", 1, intToInt32(1), 0, 0)},
-			want: false,
+			name:    "job is incomplete",
+			args:    args{job: newJob("foo", 1, intToInt32(1), 0, 0)},
+			want:    false,
+			wantErr: false,
 		},
 		{
-			name: "job is failed",
-			args: args{job: newJob("foo", 1, intToInt32(1), 0, 1)},
-			want: false,
+			name:    "job is failed but within BackoffLimit",
+			args:    args{job: newJob("foo", 1, intToInt32(1), 0, 1)},
+			want:    false,
+			wantErr: false,
 		},
 		{
-			name: "job is completed with retry",
-			args: args{job: newJob("foo", 1, intToInt32(1), 1, 1)},
-			want: true,
+			name:    "job is completed with retry",
+			args:    args{job: newJob("foo", 1, intToInt32(1), 1, 1)},
+			want:    true,
+			wantErr: false,
 		},
 		{
-			name: "job is failed with retry",
-			args: args{job: newJob("foo", 1, intToInt32(1), 0, 2)},
-			want: false,
+			name:    "job is failed and beyond BackoffLimit",
+			args:    args{job: newJob("foo", 1, intToInt32(1), 0, 2)},
+			want:    false,
+			wantErr: true,
 		},
 		{
-			name: "job is completed single run",
-			args: args{job: newJob("foo", 0, intToInt32(1), 1, 0)},
-			want: true,
+			name:    "job is completed single run",
+			args:    args{job: newJob("foo", 0, intToInt32(1), 1, 0)},
+			want:    true,
+			wantErr: false,
 		},
 		{
-			name: "job is failed single run",
-			args: args{job: newJob("foo", 0, intToInt32(1), 0, 1)},
-			want: false,
+			name:    "job is failed single run",
+			args:    args{job: newJob("foo", 0, intToInt32(1), 0, 1)},
+			want:    false,
+			wantErr: true,
 		},
 		{
 			name: "job with null completions",
@ -318,7 +325,12 @@ func Test_ReadyChecker_jobReady(t *testing.T) {
 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
 			c := NewReadyChecker(fake.NewSimpleClientset(), nil)
-			if got := c.jobReady(tt.args.job); got != tt.want {
+			got, err := c.jobReady(tt.args.job)
+			if (err != nil) != tt.wantErr {
+				t.Errorf("jobReady() error = %v, wantErr %v", err, tt.wantErr)
+				return
+			}
+			if got != tt.want {
 				t.Errorf("jobReady() = %v, want %v", got, tt.want)
 			}
 		})