From fc74964f8a039ce209966b70fa7ba0fc7ea36a9e Mon Sep 17 00:00:00 2001 From: MichaelMorris Date: Fri, 3 Feb 2023 14:02:12 +0000 Subject: [PATCH 1/2] Add robustness to wait status checks When a user specifies --wait for an install or upgrade this commit will enable the user to specify a number of retries to attempt if a status check fails Errors including an HTTP status code < 500 will not be retried Signed-off-by: MichaelMorris --- cmd/helm/install.go | 1 + cmd/helm/upgrade.go | 2 ++ pkg/action/install.go | 20 ++++++++++++----- pkg/action/install_test.go | 16 +++++++++++++ pkg/action/upgrade.go | 24 +++++++++++++------- pkg/action/upgrade_test.go | 46 ++++++++++++++++++++++++++++++++++++++ pkg/kube/client.go | 18 +++++++++++++-- pkg/kube/fake/fake.go | 17 ++++++++++++++ pkg/kube/fake/printer.go | 10 +++++++++ pkg/kube/interface.go | 12 ++++++++++ pkg/kube/wait.go | 45 ++++++++++++++++++++++++++++++++----- 11 files changed, 189 insertions(+), 22 deletions(-) diff --git a/cmd/helm/install.go b/cmd/helm/install.go index 6ffc968ce..ae68be121 100644 --- a/cmd/helm/install.go +++ b/cmd/helm/install.go @@ -167,6 +167,7 @@ func addInstallFlags(cmd *cobra.Command, f *pflag.FlagSet, client *action.Instal f.DurationVar(&client.Timeout, "timeout", 300*time.Second, "time to wait for any individual Kubernetes operation (like Jobs for hooks)") f.BoolVar(&client.Wait, "wait", false, "if set, will wait until all Pods, PVCs, Services, and minimum number of Pods of a Deployment, StatefulSet, or ReplicaSet are in a ready state before marking the release as successful. It will wait for as long as --timeout") f.BoolVar(&client.WaitForJobs, "wait-for-jobs", false, "if set and --wait enabled, will wait until all Jobs have been completed before marking the release as successful. 
It will wait for as long as --timeout") + f.IntVar(&client.WaitRetries, "wait-retries", 0, "if set and --wait enabled, will retry any failed check on resource state subject to the specified number of retries") f.BoolVarP(&client.GenerateName, "generate-name", "g", false, "generate the name (and omit the NAME parameter)") f.StringVar(&client.NameTemplate, "name-template", "", "specify template used to name the release") f.StringVar(&client.Description, "description", "", "add a custom description") diff --git a/cmd/helm/upgrade.go b/cmd/helm/upgrade.go index 145d342b7..8b7634619 100644 --- a/cmd/helm/upgrade.go +++ b/cmd/helm/upgrade.go @@ -117,6 +117,7 @@ func newUpgradeCmd(cfg *action.Configuration, out io.Writer) *cobra.Command { instClient.Timeout = client.Timeout instClient.Wait = client.Wait instClient.WaitForJobs = client.WaitForJobs + instClient.WaitRetries = client.WaitRetries instClient.Devel = client.Devel instClient.Namespace = client.Namespace instClient.Atomic = client.Atomic @@ -233,6 +234,7 @@ func newUpgradeCmd(cfg *action.Configuration, out io.Writer) *cobra.Command { f.BoolVar(&client.ReuseValues, "reuse-values", false, "when upgrading, reuse the last release's values and merge in any overrides from the command line via --set and -f. If '--reset-values' is specified, this is ignored") f.BoolVar(&client.Wait, "wait", false, "if set, will wait until all Pods, PVCs, Services, and minimum number of Pods of a Deployment, StatefulSet, or ReplicaSet are in a ready state before marking the release as successful. It will wait for as long as --timeout") f.BoolVar(&client.WaitForJobs, "wait-for-jobs", false, "if set and --wait enabled, will wait until all Jobs have been completed before marking the release as successful. 
It will wait for as long as --timeout") + f.IntVar(&client.WaitRetries, "wait-retries", 0, "if set and --wait enabled, will retry any failed check on resource state, except if HTTP status code < 500 is received, subject to the specified number of retries") f.BoolVar(&client.Atomic, "atomic", false, "if set, upgrade process rolls back changes made in case of failed upgrade. The --wait flag will be set automatically if --atomic is used") f.IntVar(&client.MaxHistory, "history-max", settings.MaxHistory, "limit the maximum number of revisions saved per release. Use 0 for no limit") f.BoolVar(&client.CleanupOnFail, "cleanup-on-fail", false, "allow deletion of new resources created in this upgrade when upgrade fails") diff --git a/pkg/action/install.go b/pkg/action/install.go index 5fb7e6736..63abc7dbd 100644 --- a/pkg/action/install.go +++ b/pkg/action/install.go @@ -77,6 +77,7 @@ type Install struct { Replace bool Wait bool WaitForJobs bool + WaitRetries int Devel bool DependencyUpdate bool Timeout time.Duration @@ -413,17 +414,24 @@ func (i *Install) performInstall(c chan<- resultMessage, rel *release.Release, t } if i.Wait { + var err error if i.WaitForJobs { - if err := i.cfg.KubeClient.WaitWithJobs(resources, i.Timeout); err != nil { - i.reportToRun(c, rel, err) - return + if kubeClient, ok := i.cfg.KubeClient.(kube.InterfaceWithRetry); ok { + err = kubeClient.WaitWithJobsWithRetry(resources, i.Timeout, i.WaitRetries) + } else { + err = i.cfg.KubeClient.WaitWithJobs(resources, i.Timeout) } } else { - if err := i.cfg.KubeClient.Wait(resources, i.Timeout); err != nil { - i.reportToRun(c, rel, err) - return + if kubeClient, ok := i.cfg.KubeClient.(kube.InterfaceWithRetry); ok { + err = kubeClient.WaitWithRetry(resources, i.Timeout, i.WaitRetries) + } else { + err = i.cfg.KubeClient.Wait(resources, i.Timeout) } } + if err != nil { + i.reportToRun(c, rel, err) + return + } } if !i.DisableHooks { diff --git a/pkg/action/install_test.go b/pkg/action/install_test.go index 
dd0cdb54d..438f12439 100644 --- a/pkg/action/install_test.go +++ b/pkg/action/install_test.go @@ -393,6 +393,22 @@ func TestInstallRelease_Wait_Interrupted(t *testing.T) { is.Contains(res.Info.Description, "Release \"interrupted-release\" failed: context canceled") is.Equal(res.Info.Status, release.StatusFailed) } +func TestInstallRelease_Wait_With_Retries(t *testing.T) { + is := assert.New(t) + instAction := installAction(t) + instAction.ReleaseName = "come-fail-away" + failer := instAction.cfg.KubeClient.(*kubefake.FailingKubeClient) + failer.WaitError = fmt.Errorf("I timed out") + instAction.cfg.KubeClient = failer + instAction.Wait = true + instAction.WaitRetries = 2 + vals := map[string]interface{}{} + + res, err := instAction.Run(buildChart(), vals) + is.Error(err) + is.Contains(res.Info.Description, "I timed out") + is.Equal(res.Info.Status, release.StatusFailed) +} func TestInstallRelease_WaitForJobs(t *testing.T) { is := assert.New(t) instAction := installAction(t) diff --git a/pkg/action/upgrade.go b/pkg/action/upgrade.go index 784df4013..a35fe90ff 100644 --- a/pkg/action/upgrade.go +++ b/pkg/action/upgrade.go @@ -68,6 +68,8 @@ type Upgrade struct { Wait bool // WaitForJobs determines whether the wait operation for the Jobs should be performed after the upgrade is requested. WaitForJobs bool + // WaitRetries determines whether any failed resource state checks will be retried during the wait operation. + WaitRetries int // DisableHooks disables hook processing if set to true. DisableHooks bool // DryRun controls whether the operation is prepared, but not executed. 
@@ -402,19 +404,25 @@ func (u *Upgrade) releasingUpgrade(c chan<- resultMessage, upgradedRelease *rele u.cfg.Log( "waiting for release %s resources (created: %d updated: %d deleted: %d)", upgradedRelease.Name, len(results.Created), len(results.Updated), len(results.Deleted)) + var err error if u.WaitForJobs { - if err := u.cfg.KubeClient.WaitWithJobs(target, u.Timeout); err != nil { - u.cfg.recordRelease(originalRelease) - u.reportToPerformUpgrade(c, upgradedRelease, results.Created, err) - return + if kubeClient, ok := u.cfg.KubeClient.(kube.InterfaceWithRetry); ok { + err = kubeClient.WaitWithJobsWithRetry(target, u.Timeout, u.WaitRetries) + } else { + err = u.cfg.KubeClient.WaitWithJobs(target, u.Timeout) } } else { - if err := u.cfg.KubeClient.Wait(target, u.Timeout); err != nil { - u.cfg.recordRelease(originalRelease) - u.reportToPerformUpgrade(c, upgradedRelease, results.Created, err) - return + if kubeClient, ok := u.cfg.KubeClient.(kube.InterfaceWithRetry); ok { + err = kubeClient.WaitWithRetry(target, u.Timeout, u.WaitRetries) + } else { + err = u.cfg.KubeClient.Wait(target, u.Timeout) } } + if err != nil { + u.cfg.recordRelease(originalRelease) + u.reportToPerformUpgrade(c, upgradedRelease, results.Created, err) + return + } } // post-upgrade hooks diff --git a/pkg/action/upgrade_test.go b/pkg/action/upgrade_test.go index 62922b373..62e4fa411 100644 --- a/pkg/action/upgrade_test.go +++ b/pkg/action/upgrade_test.go @@ -88,6 +88,28 @@ func TestUpgradeRelease_Wait(t *testing.T) { is.Contains(res.Info.Description, "I timed out") is.Equal(res.Info.Status, release.StatusFailed) } +func TestUpgradeRelease_Wait_With_Retries(t *testing.T) { + is := assert.New(t) + req := require.New(t) + + upAction := upgradeAction(t) + rel := releaseStub() + rel.Name = "come-fail-away" + rel.Info.Status = release.StatusDeployed + upAction.cfg.Releases.Create(rel) + + failer := upAction.cfg.KubeClient.(*kubefake.FailingKubeClient) + failer.WaitError = fmt.Errorf("I timed out") + 
upAction.cfg.KubeClient = failer + upAction.Wait = true + upAction.WaitRetries = 2 + vals := map[string]interface{}{} + + res, err := upAction.Run(rel.Name, buildChart(), vals) + req.Error(err) + is.Contains(res.Info.Description, "I timed out") + is.Equal(res.Info.Status, release.StatusFailed) +} func TestUpgradeRelease_WaitForJobs(t *testing.T) { is := assert.New(t) @@ -112,6 +134,30 @@ func TestUpgradeRelease_WaitForJobs(t *testing.T) { is.Equal(res.Info.Status, release.StatusFailed) } +func TestUpgradeRelease_WaitForJobs_With_Retries(t *testing.T) { + is := assert.New(t) + req := require.New(t) + + upAction := upgradeAction(t) + rel := releaseStub() + rel.Name = "come-fail-away" + rel.Info.Status = release.StatusDeployed + upAction.cfg.Releases.Create(rel) + + failer := upAction.cfg.KubeClient.(*kubefake.FailingKubeClient) + failer.WaitError = fmt.Errorf("I timed out") + upAction.cfg.KubeClient = failer + upAction.Wait = true + upAction.WaitForJobs = true + upAction.WaitRetries = 2 + vals := map[string]interface{}{} + + res, err := upAction.Run(rel.Name, buildChart(), vals) + req.Error(err) + is.Contains(res.Info.Description, "I timed out") + is.Equal(res.Info.Status, release.StatusFailed) +} + func TestUpgradeRelease_CleanupOnFail(t *testing.T) { is := assert.New(t) req := require.New(t) diff --git a/pkg/kube/client.go b/pkg/kube/client.go index 7b3c803f9..a2c1a7a40 100644 --- a/pkg/kube/client.go +++ b/pkg/kube/client.go @@ -275,6 +275,13 @@ func getResource(info *resource.Info) (runtime.Object, error) { // Wait waits up to the given timeout for the specified resources to be ready. func (c *Client) Wait(resources ResourceList, timeout time.Duration) error { + return c.WaitWithRetry(resources, timeout, 0) +} + +// WaitWithRetry waits up to the given timeout for the specified resources to be ready. 
If an error +// is encountered when checking on the status of a resource then retries will be performed subject +// to the given maximum number of retries +func (c *Client) WaitWithRetry(resources ResourceList, timeout time.Duration, waitRetries int) error { cs, err := c.getKubeClient() if err != nil { return err @@ -285,11 +292,18 @@ func (c *Client) Wait(resources ResourceList, timeout time.Duration) error { log: c.Log, timeout: timeout, } - return w.waitForResources(resources) + return w.waitForResources(resources, waitRetries) } // WaitWithJobs wait up to the given timeout for the specified resources to be ready, including jobs. func (c *Client) WaitWithJobs(resources ResourceList, timeout time.Duration) error { + return c.WaitWithJobsWithRetry(resources, timeout, 0) +} + +// WaitWithJobsWithRetry waits up to the given timeout for the specified resources to be ready, including jobs. +// If an error is encountered when checking on the status of a resource then retries will be performed subject +// to the given maximum number of retries +func (c *Client) WaitWithJobsWithRetry(resources ResourceList, timeout time.Duration, waitRetries int) error { cs, err := c.getKubeClient() if err != nil { return err @@ -300,7 +314,7 @@ func (c *Client) WaitWithJobs(resources ResourceList, timeout time.Duration) err log: c.Log, timeout: timeout, } - return w.waitForResources(resources) + return w.waitForResources(resources, waitRetries) } // WaitForDelete wait up to the given timeout for the specified resources to be deleted. diff --git a/pkg/kube/fake/fake.go b/pkg/kube/fake/fake.go index 267020d57..4b351fa70 100644 --- a/pkg/kube/fake/fake.go +++ b/pkg/kube/fake/fake.go @@ -74,6 +74,15 @@ func (f *FailingKubeClient) Wait(resources kube.ResourceList, d time.Duration) e return f.PrintingKubeClient.Wait(resources, d) } +// Waits the amount of time defined on f.WaitDuration, then returns the configured error if set or prints. 
+func (f *FailingKubeClient) WaitWithRetry(resources kube.ResourceList, d time.Duration, waitRetries int) error { + time.Sleep(f.WaitDuration) + if f.WaitError != nil { + return f.WaitError + } + return f.PrintingKubeClient.WaitWithRetry(resources, d, waitRetries) +} + // WaitWithJobs returns the configured error if set or prints func (f *FailingKubeClient) WaitWithJobs(resources kube.ResourceList, d time.Duration) error { if f.WaitError != nil { @@ -82,6 +91,14 @@ func (f *FailingKubeClient) WaitWithJobs(resources kube.ResourceList, d time.Dur return f.PrintingKubeClient.WaitWithJobs(resources, d) } +// WaitWithJobs returns the configured error if set or prints +func (f *FailingKubeClient) WaitWithJobsWithRetry(resources kube.ResourceList, d time.Duration, waitRetries int) error { + if f.WaitError != nil { + return f.WaitError + } + return f.PrintingKubeClient.WaitWithJobsWithRetry(resources, d, waitRetries) +} + // WaitForDelete returns the configured error if set or prints func (f *FailingKubeClient) WaitForDelete(resources kube.ResourceList, d time.Duration) error { if f.WaitError != nil { diff --git a/pkg/kube/fake/printer.go b/pkg/kube/fake/printer.go index e6c4b6207..23c746686 100644 --- a/pkg/kube/fake/printer.go +++ b/pkg/kube/fake/printer.go @@ -62,11 +62,21 @@ func (p *PrintingKubeClient) Wait(resources kube.ResourceList, _ time.Duration) return err } +func (p *PrintingKubeClient) WaitWithRetry(resources kube.ResourceList, _ time.Duration, _ int) error { + _, err := io.Copy(p.Out, bufferize(resources)) + return err +} + func (p *PrintingKubeClient) WaitWithJobs(resources kube.ResourceList, _ time.Duration) error { _, err := io.Copy(p.Out, bufferize(resources)) return err } +func (p *PrintingKubeClient) WaitWithJobsWithRetry(resources kube.ResourceList, _ time.Duration, _ int) error { + _, err := io.Copy(p.Out, bufferize(resources)) + return err +} + func (p *PrintingKubeClient) WaitForDelete(resources kube.ResourceList, _ time.Duration) error { _, err := 
io.Copy(p.Out, bufferize(resources)) return err diff --git a/pkg/kube/interface.go b/pkg/kube/interface.go index ce42ed950..8c799985f 100644 --- a/pkg/kube/interface.go +++ b/pkg/kube/interface.go @@ -110,7 +110,19 @@ type InterfaceResources interface { BuildTable(reader io.Reader, validate bool) (ResourceList, error) } +// InterfaceWithRetry is introduced to avoid breaking backwards compatibility for Interface implementers. +// +// TODO Helm 4: Remove InterfaceWithRetry and integrate its method(s) into the Interface. +type InterfaceWithRetry interface { + // Wait waits up to the given timeout for the specified resources to be ready. + WaitWithRetry(resources ResourceList, timeout time.Duration, waitRetries int) error + + // WaitWithJobs wait up to the given timeout for the specified resources to be ready, including jobs. + WaitWithJobsWithRetry(resources ResourceList, timeout time.Duration, waitRetries int) error +} + var _ Interface = (*Client)(nil) var _ InterfaceExt = (*Client)(nil) var _ InterfaceDeletionPropagation = (*Client)(nil) var _ InterfaceResources = (*Client)(nil) +var _ InterfaceWithRetry = (*Client)(nil) diff --git a/pkg/kube/wait.go b/pkg/kube/wait.go index ecdd38940..a392c99c9 100644 --- a/pkg/kube/wait.go +++ b/pkg/kube/wait.go @@ -32,6 +32,7 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/labels" "k8s.io/apimachinery/pkg/runtime" + "k8s.io/cli-runtime/pkg/resource" "k8s.io/apimachinery/pkg/util/wait" ) @@ -42,18 +43,35 @@ type waiter struct { log func(string, ...interface{}) } -// waitForResources polls to get the current status of all pods, PVCs, Services and -// Jobs(optional) until all are ready or a timeout is reached -func (w *waiter) waitForResources(created ResourceList) error { - w.log("beginning wait for %d resources with timeout of %v", len(created), w.timeout) +// Jobs(optional) until all are ready or a timeout is reached. 
+// If an error is encountered when checking on the status of a resource then retries +// will be performed subject to the given maximum number of retries +func (w *waiter) waitForResources(created ResourceList, waitRetries int) error { + w.log("beginning wait for %d resources with timeout of %v, maximum retries %d", len(created), w.timeout, waitRetries) ctx, cancel := context.WithTimeout(context.Background(), w.timeout) defer cancel() + numberOfErrors := make([]int, len(created)) + for i := range numberOfErrors { + numberOfErrors[i] = 0 + } + return wait.PollUntilContextCancel(ctx, 2*time.Second, true, func(ctx context.Context) (bool, error) { - for _, v := range created { + for i, v := range created { ready, err := w.c.IsReady(ctx, v) - if !ready || err != nil { + + if waitRetries > 0 && w.isRetryableError(err, v) { + numberOfErrors[i]++ + if numberOfErrors[i] > waitRetries { + w.log("Max number of retries reached") + return false, err + } + w.log("Retrying as current number of retries %d less than max number of retries %d", numberOfErrors[i]-1, waitRetries) + return false, nil + } + numberOfErrors[i] = 0 + if !ready { return false, err } } @@ -61,6 +79,21 @@ func (w *waiter) waitForResources(created ResourceList) error { }) } +func (w *waiter) isRetryableError(err error, resource *resource.Info) bool { + if err == nil { + return false + } + w.log("Error received when checking status of resource %s. Error: '%s', Resource details: '%s'", resource.Name, err, resource) + if ev, ok := err.(*apierrors.StatusError); ok { + statusCode := ev.Status().Code + retryable := statusCode >= 500 + w.log("Status code received: %d. Retryable error? %t", statusCode, retryable) + return retryable + } + w.log("Retryable error? 
%t", true) + return true +} + // waitForDeletedResources polls to check if all the resources are deleted or a timeout is reached func (w *waiter) waitForDeletedResources(deleted ResourceList) error { w.log("beginning wait for %d resources to be deleted with timeout of %v", len(deleted), w.timeout) From d2cf8c66f1775783edbc150d1a509f58e769e75e Mon Sep 17 00:00:00 2001 From: MichaelMorris Date: Thu, 13 Jul 2023 17:46:03 +0100 Subject: [PATCH 2/2] Updated for review comments Signed-off-by: MichaelMorris --- cmd/helm/install.go | 1 - cmd/helm/upgrade.go | 2 -- pkg/action/install.go | 20 +++++------------ pkg/action/install_test.go | 16 ------------- pkg/action/upgrade.go | 24 +++++++------------- pkg/action/upgrade_test.go | 46 -------------------------------------- pkg/kube/client.go | 18 ++------------- pkg/kube/fake/fake.go | 17 -------------- pkg/kube/fake/printer.go | 10 --------- pkg/kube/interface.go | 12 ---------- pkg/kube/wait.go | 17 +++++++++----- 11 files changed, 27 insertions(+), 156 deletions(-) diff --git a/cmd/helm/install.go b/cmd/helm/install.go index ae68be121..6ffc968ce 100644 --- a/cmd/helm/install.go +++ b/cmd/helm/install.go @@ -167,7 +167,6 @@ func addInstallFlags(cmd *cobra.Command, f *pflag.FlagSet, client *action.Instal f.DurationVar(&client.Timeout, "timeout", 300*time.Second, "time to wait for any individual Kubernetes operation (like Jobs for hooks)") f.BoolVar(&client.Wait, "wait", false, "if set, will wait until all Pods, PVCs, Services, and minimum number of Pods of a Deployment, StatefulSet, or ReplicaSet are in a ready state before marking the release as successful. It will wait for as long as --timeout") f.BoolVar(&client.WaitForJobs, "wait-for-jobs", false, "if set and --wait enabled, will wait until all Jobs have been completed before marking the release as successful. 
It will wait for as long as --timeout") - f.IntVar(&client.WaitRetries, "wait-retries", 0, "if set and --wait enabled, will retry any failed check on resource state subject to the specified number of retries") f.BoolVarP(&client.GenerateName, "generate-name", "g", false, "generate the name (and omit the NAME parameter)") f.StringVar(&client.NameTemplate, "name-template", "", "specify template used to name the release") f.StringVar(&client.Description, "description", "", "add a custom description") diff --git a/cmd/helm/upgrade.go b/cmd/helm/upgrade.go index 8b7634619..145d342b7 100644 --- a/cmd/helm/upgrade.go +++ b/cmd/helm/upgrade.go @@ -117,7 +117,6 @@ func newUpgradeCmd(cfg *action.Configuration, out io.Writer) *cobra.Command { instClient.Timeout = client.Timeout instClient.Wait = client.Wait instClient.WaitForJobs = client.WaitForJobs - instClient.WaitRetries = client.WaitRetries instClient.Devel = client.Devel instClient.Namespace = client.Namespace instClient.Atomic = client.Atomic @@ -234,7 +233,6 @@ func newUpgradeCmd(cfg *action.Configuration, out io.Writer) *cobra.Command { f.BoolVar(&client.ReuseValues, "reuse-values", false, "when upgrading, reuse the last release's values and merge in any overrides from the command line via --set and -f. If '--reset-values' is specified, this is ignored") f.BoolVar(&client.Wait, "wait", false, "if set, will wait until all Pods, PVCs, Services, and minimum number of Pods of a Deployment, StatefulSet, or ReplicaSet are in a ready state before marking the release as successful. It will wait for as long as --timeout") f.BoolVar(&client.WaitForJobs, "wait-for-jobs", false, "if set and --wait enabled, will wait until all Jobs have been completed before marking the release as successful. 
It will wait for as long as --timeout") - f.IntVar(&client.WaitRetries, "wait-retries", 0, "if set and --wait enabled, will retry any failed check on resource state, except if HTTP status code < 500 is received, subject to the specified number of retries") f.BoolVar(&client.Atomic, "atomic", false, "if set, upgrade process rolls back changes made in case of failed upgrade. The --wait flag will be set automatically if --atomic is used") f.IntVar(&client.MaxHistory, "history-max", settings.MaxHistory, "limit the maximum number of revisions saved per release. Use 0 for no limit") f.BoolVar(&client.CleanupOnFail, "cleanup-on-fail", false, "allow deletion of new resources created in this upgrade when upgrade fails") diff --git a/pkg/action/install.go b/pkg/action/install.go index 63abc7dbd..5fb7e6736 100644 --- a/pkg/action/install.go +++ b/pkg/action/install.go @@ -77,7 +77,6 @@ type Install struct { Replace bool Wait bool WaitForJobs bool - WaitRetries int Devel bool DependencyUpdate bool Timeout time.Duration @@ -414,24 +413,17 @@ func (i *Install) performInstall(c chan<- resultMessage, rel *release.Release, t } if i.Wait { - var err error if i.WaitForJobs { - if kubeClient, ok := i.cfg.KubeClient.(kube.InterfaceWithRetry); ok { - err = kubeClient.WaitWithJobsWithRetry(resources, i.Timeout, i.WaitRetries) - } else { - err = i.cfg.KubeClient.WaitWithJobs(resources, i.Timeout) + if err := i.cfg.KubeClient.WaitWithJobs(resources, i.Timeout); err != nil { + i.reportToRun(c, rel, err) + return } } else { - if kubeClient, ok := i.cfg.KubeClient.(kube.InterfaceWithRetry); ok { - err = kubeClient.WaitWithRetry(resources, i.Timeout, i.WaitRetries) - } else { - err = i.cfg.KubeClient.Wait(resources, i.Timeout) + if err := i.cfg.KubeClient.Wait(resources, i.Timeout); err != nil { + i.reportToRun(c, rel, err) + return } } - if err != nil { - i.reportToRun(c, rel, err) - return - } } if !i.DisableHooks { diff --git a/pkg/action/install_test.go b/pkg/action/install_test.go index 
438f12439..dd0cdb54d 100644 --- a/pkg/action/install_test.go +++ b/pkg/action/install_test.go @@ -393,22 +393,6 @@ func TestInstallRelease_Wait_Interrupted(t *testing.T) { is.Contains(res.Info.Description, "Release \"interrupted-release\" failed: context canceled") is.Equal(res.Info.Status, release.StatusFailed) } -func TestInstallRelease_Wait_With_Retries(t *testing.T) { - is := assert.New(t) - instAction := installAction(t) - instAction.ReleaseName = "come-fail-away" - failer := instAction.cfg.KubeClient.(*kubefake.FailingKubeClient) - failer.WaitError = fmt.Errorf("I timed out") - instAction.cfg.KubeClient = failer - instAction.Wait = true - instAction.WaitRetries = 2 - vals := map[string]interface{}{} - - res, err := instAction.Run(buildChart(), vals) - is.Error(err) - is.Contains(res.Info.Description, "I timed out") - is.Equal(res.Info.Status, release.StatusFailed) -} func TestInstallRelease_WaitForJobs(t *testing.T) { is := assert.New(t) instAction := installAction(t) diff --git a/pkg/action/upgrade.go b/pkg/action/upgrade.go index a35fe90ff..784df4013 100644 --- a/pkg/action/upgrade.go +++ b/pkg/action/upgrade.go @@ -68,8 +68,6 @@ type Upgrade struct { Wait bool // WaitForJobs determines whether the wait operation for the Jobs should be performed after the upgrade is requested. WaitForJobs bool - // WaitRetries determines whether any failed resource state checks will be retried during the wait operation. - WaitRetries int // DisableHooks disables hook processing if set to true. DisableHooks bool // DryRun controls whether the operation is prepared, but not executed. 
@@ -404,25 +402,19 @@ func (u *Upgrade) releasingUpgrade(c chan<- resultMessage, upgradedRelease *rele u.cfg.Log( "waiting for release %s resources (created: %d updated: %d deleted: %d)", upgradedRelease.Name, len(results.Created), len(results.Updated), len(results.Deleted)) - var err error if u.WaitForJobs { - if kubeClient, ok := u.cfg.KubeClient.(kube.InterfaceWithRetry); ok { - err = kubeClient.WaitWithJobsWithRetry(target, u.Timeout, u.WaitRetries) - } else { - err = u.cfg.KubeClient.WaitWithJobs(target, u.Timeout) + if err := u.cfg.KubeClient.WaitWithJobs(target, u.Timeout); err != nil { + u.cfg.recordRelease(originalRelease) + u.reportToPerformUpgrade(c, upgradedRelease, results.Created, err) + return } } else { - if kubeClient, ok := u.cfg.KubeClient.(kube.InterfaceWithRetry); ok { - err = kubeClient.WaitWithRetry(target, u.Timeout, u.WaitRetries) - } else { - err = u.cfg.KubeClient.Wait(target, u.Timeout) + if err := u.cfg.KubeClient.Wait(target, u.Timeout); err != nil { + u.cfg.recordRelease(originalRelease) + u.reportToPerformUpgrade(c, upgradedRelease, results.Created, err) + return } } - if err != nil { - u.cfg.recordRelease(originalRelease) - u.reportToPerformUpgrade(c, upgradedRelease, results.Created, err) - return - } } // post-upgrade hooks diff --git a/pkg/action/upgrade_test.go b/pkg/action/upgrade_test.go index 62e4fa411..62922b373 100644 --- a/pkg/action/upgrade_test.go +++ b/pkg/action/upgrade_test.go @@ -88,28 +88,6 @@ func TestUpgradeRelease_Wait(t *testing.T) { is.Contains(res.Info.Description, "I timed out") is.Equal(res.Info.Status, release.StatusFailed) } -func TestUpgradeRelease_Wait_With_Retries(t *testing.T) { - is := assert.New(t) - req := require.New(t) - - upAction := upgradeAction(t) - rel := releaseStub() - rel.Name = "come-fail-away" - rel.Info.Status = release.StatusDeployed - upAction.cfg.Releases.Create(rel) - - failer := upAction.cfg.KubeClient.(*kubefake.FailingKubeClient) - failer.WaitError = fmt.Errorf("I timed out") - 
upAction.cfg.KubeClient = failer - upAction.Wait = true - upAction.WaitRetries = 2 - vals := map[string]interface{}{} - - res, err := upAction.Run(rel.Name, buildChart(), vals) - req.Error(err) - is.Contains(res.Info.Description, "I timed out") - is.Equal(res.Info.Status, release.StatusFailed) -} func TestUpgradeRelease_WaitForJobs(t *testing.T) { is := assert.New(t) @@ -134,30 +112,6 @@ func TestUpgradeRelease_WaitForJobs(t *testing.T) { is.Equal(res.Info.Status, release.StatusFailed) } -func TestUpgradeRelease_WaitForJobs_With_Retries(t *testing.T) { - is := assert.New(t) - req := require.New(t) - - upAction := upgradeAction(t) - rel := releaseStub() - rel.Name = "come-fail-away" - rel.Info.Status = release.StatusDeployed - upAction.cfg.Releases.Create(rel) - - failer := upAction.cfg.KubeClient.(*kubefake.FailingKubeClient) - failer.WaitError = fmt.Errorf("I timed out") - upAction.cfg.KubeClient = failer - upAction.Wait = true - upAction.WaitForJobs = true - upAction.WaitRetries = 2 - vals := map[string]interface{}{} - - res, err := upAction.Run(rel.Name, buildChart(), vals) - req.Error(err) - is.Contains(res.Info.Description, "I timed out") - is.Equal(res.Info.Status, release.StatusFailed) -} - func TestUpgradeRelease_CleanupOnFail(t *testing.T) { is := assert.New(t) req := require.New(t) diff --git a/pkg/kube/client.go b/pkg/kube/client.go index a2c1a7a40..7b3c803f9 100644 --- a/pkg/kube/client.go +++ b/pkg/kube/client.go @@ -275,13 +275,6 @@ func getResource(info *resource.Info) (runtime.Object, error) { // Wait waits up to the given timeout for the specified resources to be ready. func (c *Client) Wait(resources ResourceList, timeout time.Duration) error { - return c.WaitWithRetry(resources, timeout, 0) -} - -// WaitWithRetry waits up to the given timeout for the specified resources to be ready. 
If an error -// is encountered when checking on the status of a resource then retries will be performed subject -// to the given maximum number of retries -func (c *Client) WaitWithRetry(resources ResourceList, timeout time.Duration, waitRetries int) error { cs, err := c.getKubeClient() if err != nil { return err @@ -292,18 +285,11 @@ func (c *Client) WaitWithRetry(resources ResourceList, timeout time.Duration, wa log: c.Log, timeout: timeout, } - return w.waitForResources(resources, waitRetries) + return w.waitForResources(resources) } // WaitWithJobs wait up to the given timeout for the specified resources to be ready, including jobs. func (c *Client) WaitWithJobs(resources ResourceList, timeout time.Duration) error { - return c.WaitWithJobsWithRetry(resources, timeout, 0) -} - -// WaitWithJobsWithRetry waits up to the given timeout for the specified resources to be ready, including jobs. -// If an error is encountered when checking on the status of a resource then retries will be performed subject -// to the given maximum number of retries -func (c *Client) WaitWithJobsWithRetry(resources ResourceList, timeout time.Duration, waitRetries int) error { cs, err := c.getKubeClient() if err != nil { return err @@ -314,7 +300,7 @@ func (c *Client) WaitWithJobsWithRetry(resources ResourceList, timeout time.Dura log: c.Log, timeout: timeout, } - return w.waitForResources(resources, waitRetries) + return w.waitForResources(resources) } // WaitForDelete wait up to the given timeout for the specified resources to be deleted. diff --git a/pkg/kube/fake/fake.go b/pkg/kube/fake/fake.go index 4b351fa70..267020d57 100644 --- a/pkg/kube/fake/fake.go +++ b/pkg/kube/fake/fake.go @@ -74,15 +74,6 @@ func (f *FailingKubeClient) Wait(resources kube.ResourceList, d time.Duration) e return f.PrintingKubeClient.Wait(resources, d) } -// Waits the amount of time defined on f.WaitDuration, then returns the configured error if set or prints. 
-func (f *FailingKubeClient) WaitWithRetry(resources kube.ResourceList, d time.Duration, waitRetries int) error { - time.Sleep(f.WaitDuration) - if f.WaitError != nil { - return f.WaitError - } - return f.PrintingKubeClient.WaitWithRetry(resources, d, waitRetries) -} - // WaitWithJobs returns the configured error if set or prints func (f *FailingKubeClient) WaitWithJobs(resources kube.ResourceList, d time.Duration) error { if f.WaitError != nil { @@ -91,14 +82,6 @@ func (f *FailingKubeClient) WaitWithJobs(resources kube.ResourceList, d time.Dur return f.PrintingKubeClient.WaitWithJobs(resources, d) } -// WaitWithJobs returns the configured error if set or prints -func (f *FailingKubeClient) WaitWithJobsWithRetry(resources kube.ResourceList, d time.Duration, waitRetries int) error { - if f.WaitError != nil { - return f.WaitError - } - return f.PrintingKubeClient.WaitWithJobsWithRetry(resources, d, waitRetries) -} - // WaitForDelete returns the configured error if set or prints func (f *FailingKubeClient) WaitForDelete(resources kube.ResourceList, d time.Duration) error { if f.WaitError != nil { diff --git a/pkg/kube/fake/printer.go b/pkg/kube/fake/printer.go index 23c746686..e6c4b6207 100644 --- a/pkg/kube/fake/printer.go +++ b/pkg/kube/fake/printer.go @@ -62,21 +62,11 @@ func (p *PrintingKubeClient) Wait(resources kube.ResourceList, _ time.Duration) return err } -func (p *PrintingKubeClient) WaitWithRetry(resources kube.ResourceList, _ time.Duration, _ int) error { - _, err := io.Copy(p.Out, bufferize(resources)) - return err -} - func (p *PrintingKubeClient) WaitWithJobs(resources kube.ResourceList, _ time.Duration) error { _, err := io.Copy(p.Out, bufferize(resources)) return err } -func (p *PrintingKubeClient) WaitWithJobsWithRetry(resources kube.ResourceList, _ time.Duration, _ int) error { - _, err := io.Copy(p.Out, bufferize(resources)) - return err -} - func (p *PrintingKubeClient) WaitForDelete(resources kube.ResourceList, _ time.Duration) error { _, err := 
io.Copy(p.Out, bufferize(resources)) return err diff --git a/pkg/kube/interface.go b/pkg/kube/interface.go index 8c799985f..ce42ed950 100644 --- a/pkg/kube/interface.go +++ b/pkg/kube/interface.go @@ -110,19 +110,7 @@ type InterfaceResources interface { BuildTable(reader io.Reader, validate bool) (ResourceList, error) } -// InterfaceWithRetry is introduced to avoid breaking backwards compatibility for Interface implementers. -// -// TODO Helm 4: Remove InterfaceWithRetry and integrate its method(s) into the Interface. -type InterfaceWithRetry interface { - // Wait waits up to the given timeout for the specified resources to be ready. - WaitWithRetry(resources ResourceList, timeout time.Duration, waitRetries int) error - - // WaitWithJobs wait up to the given timeout for the specified resources to be ready, including jobs. - WaitWithJobsWithRetry(resources ResourceList, timeout time.Duration, waitRetries int) error -} - var _ Interface = (*Client)(nil) var _ InterfaceExt = (*Client)(nil) var _ InterfaceDeletionPropagation = (*Client)(nil) var _ InterfaceResources = (*Client)(nil) -var _ InterfaceWithRetry = (*Client)(nil) diff --git a/pkg/kube/wait.go b/pkg/kube/wait.go index a392c99c9..6b747d551 100644 --- a/pkg/kube/wait.go +++ b/pkg/kube/wait.go @@ -19,6 +19,7 @@ package kube // import "helm.sh/helm/v3/pkg/kube" import ( "context" "fmt" + "net/http" "time" "github.com/pkg/errors" @@ -43,11 +44,10 @@ type waiter struct { log func(string, ...interface{}) } -// Jobs(optional) until all are ready or a timeout is reached. 
-// If an error is encountered when checking on the status of a resource then retries -// will be performed subject to the given maximum number of retries -func (w *waiter) waitForResources(created ResourceList, waitRetries int) error { - w.log("beginning wait for %d resources with timeout of %v, maximum retries %d", len(created), w.timeout, waitRetries) +// waitForResources polls to get the current status of all pods, PVCs, Services and +// Jobs(optional) until all are ready or a timeout is reached +func (w *waiter) waitForResources(created ResourceList) error { + w.log("beginning wait for %d resources with timeout of %v", len(created), w.timeout) ctx, cancel := context.WithTimeout(context.Background(), w.timeout) defer cancel() @@ -58,6 +58,7 @@ func (w *waiter) waitForResources(created ResourceList, waitRetries int) error { } return wait.PollUntilContextCancel(ctx, 2*time.Second, true, func(ctx context.Context) (bool, error) { + waitRetries := 30 for i, v := range created { ready, err := w.c.IsReady(ctx, v) @@ -86,7 +87,7 @@ func (w *waiter) isRetryableError(err error, resource *resource.Info) bool { w.log("Error received when checking status of resource %s. Error: '%s', Resource details: '%s'", resource.Name, err, resource) if ev, ok := err.(*apierrors.StatusError); ok { statusCode := ev.Status().Code - retryable := statusCode >= 500 + retryable := w.isRetryableHTTPStatusCode(statusCode) w.log("Status code received: %d. Retryable error? 
%t", statusCode, retryable) return retryable } @@ -94,6 +95,10 @@ func (w *waiter) isRetryableError(err error, resource *resource.Info) bool { return true } +func (w *waiter) isRetryableHTTPStatusCode(httpStatusCode int32) bool { + return httpStatusCode == 0 || httpStatusCode == http.StatusTooManyRequests || (httpStatusCode >= 500 && httpStatusCode != http.StatusNotImplemented) +} + // waitForDeletedResources polls to check if all the resources are deleted or a timeout is reached func (w *waiter) waitForDeletedResources(deleted ResourceList) error { w.log("beginning wait for %d resources to be deleted with timeout of %v", len(deleted), w.timeout)