From 054eabddd7c480e4dee6dce5db837c41ee19b9eb Mon Sep 17 00:00:00 2001
From: Benoit Tigeot
Date: Tue, 14 Oct 2025 19:07:54 +0200
Subject: [PATCH] Return errors on upgrade when deletion fails
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This is a rebase of https://github.com/helm/helm/pull/12299, as the original
pull request had been tagged for Helm v4.

Closes: https://github.com/helm/helm/issues/11375
Related: https://github.com/helm/helm/pull/7929

It was a pain to reproduce; here is a script:

```
set -u

NS=default
RELEASE=test-release
CHART=./test-chart
SA=limited-helm-sa
HELM=${HELM:-./bin/helm}

echo "Helm: $($HELM version)"

echo "Cleaning…"
$HELM uninstall "$RELEASE" -n "$NS" >/dev/null 2>&1 || true
kubectl -n "$NS" delete sa "$SA" role "${SA}-role" rolebinding "${SA}-rb" >/dev/null 2>&1 || true
kubectl -n "$NS" delete cronjob "$RELEASE-test-chart-cronjob" >/dev/null 2>&1 || true
rm -rf "$CHART" /tmp/limited-helm-kubeconfig

echo "Create minimal chart with only a CronJob"
$HELM create "$CHART" >/dev/null
rm -f "$CHART"/templates/{deployment.yaml,service.yaml,hpa.yaml,tests/test-connection.yaml,serviceaccount.yaml}

cat > "$CHART/templates/cronjob.yaml" <<'YAML'
apiVersion: batch/v1
kind: CronJob
metadata:
  name: {{ include "test-chart.fullname" . }}-cronjob
spec:
  schedule: "*/5 * * * *"
  jobTemplate:
    spec:
      template:
        spec:
          restartPolicy: OnFailure
          containers:
            - name: hello
              image: busybox
              command: ["/bin/sh","-c","date; echo Hello from CronJob"]
YAML

echo "RBAC: allow Helm storage, basic reads/creates, but NO delete on cronjobs"
kubectl -n "$NS" apply -f - >/dev/null </dev/null < "$KCFG" < "$CHART/templates/configmap.yaml" <<'YAML'
apiVersion: v1
kind: ConfigMap
metadata:
  name: {{ include "test-chart.fullname" . }}-config
data:
  hello: world
YAML

echo "Upgrade without CronJob (as limited SA)"
KUBECONFIG="$KCFG" $HELM upgrade --install "$RELEASE" "$CHART" -n "$NS"
RC=$?

echo "Post-upgrade verification"
if kubectl -n "$NS" get cronjob "$RELEASE-test-chart-cronjob" >/dev/null 2>&1; then
  echo "OK: Stale CronJob still present: $RELEASE-test-chart-cronjob"
else
  echo "NO_OK: CronJob deleted"
fi

echo "Helm exit code: $RC"
exit 0
```

With the current build:

```sh
./reproduce-helm-issue.sh
Helm: version.BuildInfo{Version:"v4.0+unreleased", GitCommit:"f19bb9cd4c99943f7a4980d6670de44affe3e472", GitTreeState:"dirty", GoVersion:"go1.24.0"}
Cleaning…
Create minimal chart with CronJob + ConfigMap (we will remove both in v2)
RBAC: allow Helm storage + delete for configmaps, but NO delete on cronjobs
Create kubeconfig for that SA
Install v1 (as limited SA)
Release "test-release" does not exist. Installing it now.
NAME: test-release
LAST DEPLOYED: Tue Oct 14 18:55:57 2025
NAMESPACE: default
STATUS: deployed
REVISION: 1
DESCRIPTION: Install complete
TEST SUITE: None
NOTES:
1. Get the application URL by running these commands:
  export POD_NAME=$(kubectl get pods --namespace default -l "app.kubernetes.io/name=test-chart,app.kubernetes.io/instance=test-release" -o jsonpath="{.items[0].metadata.name}")
  export CONTAINER_PORT=$(kubectl get pod --namespace default $POD_NAME -o jsonpath="{.spec.containers[0].ports[0].containerPort}")
  echo "Visit http://127.0.0.1:8080 to use your application"
  kubectl --namespace default port-forward $POD_NAME 8080:$CONTAINER_PORT
Verify v1 objects exist
NAME                              SCHEDULE      TIMEZONE   SUSPEND   ACTIVE   LAST SCHEDULE   AGE
test-release-test-chart-cronjob   */5 * * * *              False     0                        0s
NAME                             DATA   AGE
test-release-test-chart-config   1      0s
Prepare v2: remove BOTH CronJob and ConfigMap from the chart
Upgrade to v2 (as limited SA) — expecting CronJob delete first, then ConfigMap
- CronJob delete should FAIL (no delete permission)
- ConfigMap delete should SUCCEED (delete allowed) — proves 'continue on error' and inverted order
level=DEBUG msg="getting history for release" release=test-release
level=DEBUG msg="getting release history" name=test-release
level=DEBUG msg="preparing upgrade" name=test-release
level=DEBUG msg="getting last revision" name=test-release
level=DEBUG msg="getting release history" name=test-release
level=DEBUG msg="number of dependencies in the chart" dependencies=0
level=DEBUG msg="determined release apply method" server_side_apply=true previous_release_apply_method=ssa
level=DEBUG msg="performing update" name=test-release
level=DEBUG msg="creating upgraded release" name=test-release
level=DEBUG msg="creating release" key=sh.helm.release.v1.test-release.v2
level=DEBUG msg="getting release history" name=test-release
level=DEBUG msg="using server-side apply for resource update" forceConflicts=false dryRun=false fieldValidationDirective=Strict upgradeClientSideFieldManager=false
level=DEBUG msg="checking resources for changes" resources=0
level=DEBUG msg="deleting resource" namespace=default name=test-release-test-chart-config kind=ConfigMap
level=DEBUG msg="deleting resource" namespace=default name=test-release-test-chart-cronjob kind=CronJob
level=DEBUG msg="failed to delete resource" namespace=default name=test-release-test-chart-cronjob kind=CronJob error="cronjobs.batch \"test-release-test-chart-cronjob\" is forbidden: User \"system:serviceaccount:default:limited-helm-sa\" cannot delete resource \"cronjobs\" in API group \"batch\" in the namespace \"default\""
level=INFO msg="update completed" created=0 updated=0 deleted=1
level=WARN msg="update completed with errors" errors=1
level=DEBUG msg="updating release" key=sh.helm.release.v1.test-release.v1
level=WARN msg="upgrade failed" name=test-release error="failed to delete resource test-release-test-chart-cronjob: cronjobs.batch \"test-release-test-chart-cronjob\" is forbidden: User \"system:serviceaccount:default:limited-helm-sa\" cannot delete resource \"cronjobs\" in API group \"batch\" in the namespace \"default\""
level=DEBUG msg="updating release" key=sh.helm.release.v1.test-release.v2
Error: UPGRADE FAILED: failed to delete resource test-release-test-chart-cronjob: cronjobs.batch "test-release-test-chart-cronjob" is forbidden: User "system:serviceaccount:default:limited-helm-sa" cannot delete resource "cronjobs" in API group "batch" in the namespace "default"
Post-upgrade verification
Stale CronJob still present: test-release-test-chart-cronjob (expected if delete is forbidden)
ConfigMap deleted as expected: test-release-test-chart-config (and after CronJob attempt)
Helm exit code: 1
```
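The behaviour above comes down to a small pattern in the upgrade path: delete failures are now collected unless the resource is already gone (NotFound), and whatever remains is joined and returned, which is what produces the UPGRADE FAILED message and the non-zero exit code. Below is only an illustrative, self-contained sketch of that pattern; the real change is the pkg/kube/client.go diff further down, and Helm's existing joinErrors helper may be implemented differently than assumed here.

```go
// Illustrative sketch (not Helm source): collect delete errors, skip NotFound,
// and join whatever remains into a single error for the caller.
package main

import (
	"errors"
	"fmt"
	"strings"

	apierrors "k8s.io/apimachinery/pkg/api/errors"
	"k8s.io/apimachinery/pkg/runtime/schema"
)

// joinErrors mimics the helper referenced by the patch: concatenate all
// collected error messages with a separator (assumed behaviour).
func joinErrors(errs []error, sep string) error {
	if len(errs) == 0 {
		return nil
	}
	msgs := make([]string, len(errs))
	for i, err := range errs {
		msgs[i] = err.Error()
	}
	return errors.New(strings.Join(msgs, sep))
}

func main() {
	cronjobs := schema.GroupResource{Group: "batch", Resource: "cronjobs"}

	// Two simulated delete outcomes: one forbidden by RBAC, one already gone.
	deleteErrs := []error{
		apierrors.NewForbidden(cronjobs, "test-release-test-chart-cronjob", errors.New("RBAC denies delete")),
		apierrors.NewNotFound(cronjobs, "already-gone"),
	}

	var updateErrors []error
	for _, err := range deleteErrs {
		// NotFound means the resource no longer exists, so it is not a failure;
		// anything else is reported back to the caller.
		if apierrors.IsNotFound(err) {
			continue
		}
		updateErrors = append(updateErrors, fmt.Errorf("failed to delete resource: %w", err))
	}

	if err := joinErrors(updateErrors, " && "); err != nil {
		fmt.Println("UPGRADE FAILED:", err)
	}
}
```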
With the latest release, v3.19, the upgrade reports success and the stale CronJob is silently left behind:

```
HELM=/usr/local/bin/helm ./reproduce-helm-issue.sh
Helm: version.BuildInfo{Version:"v3.19.0", GitCommit:"3d8990f0836691f0229297773f3524598f46bda6", GitTreeState:"clean", GoVersion:"go1.24.7"}
Cleaning…
Create minimal chart with only a CronJob
RBAC: allow Helm storage, basic reads/creates, but NO delete on cronjobs
Create kubeconfig for that SA
Install (as limited SA)
Release "test-release" does not exist. Installing it now.
NAME: test-release
LAST DEPLOYED: Tue Oct 14 19:07:01 2025
NAMESPACE: default
STATUS: deployed
REVISION: 1
TEST SUITE: None
NOTES:
1. Get the application URL by running these commands:
  export POD_NAME=$(kubectl get pods --namespace default -l "app.kubernetes.io/name=test-chart,app.kubernetes.io/instance=test-release" -o jsonpath="{.items[0].metadata.name}")
  export CONTAINER_PORT=$(kubectl get pod --namespace default $POD_NAME -o jsonpath="{.spec.containers[0].ports[0].containerPort}")
  echo "Visit http://127.0.0.1:8080 to use your application"
  kubectl --namespace default port-forward $POD_NAME 8080:$CONTAINER_PORT
CronJob after install:
NAME                              SCHEDULE      TIMEZONE   SUSPEND   ACTIVE   LAST SCHEDULE   AGE
test-release-test-chart-cronjob   */5 * * * *              False     0                        0s
Remove CronJob from chart and add a small ConfigMap to force an upgrade
Upgrade without CronJob (as limited SA)
Release "test-release" has been upgraded. Happy Helming!
NAME: test-release
LAST DEPLOYED: Tue Oct 14 19:07:01 2025
NAMESPACE: default
STATUS: deployed
REVISION: 2
TEST SUITE: None
NOTES:
1. Get the application URL by running these commands:
  export POD_NAME=$(kubectl get pods --namespace default -l "app.kubernetes.io/name=test-chart,app.kubernetes.io/instance=test-release" -o jsonpath="{.items[0].metadata.name}")
  export CONTAINER_PORT=$(kubectl get pod --namespace default $POD_NAME -o jsonpath="{.spec.containers[0].ports[0].containerPort}")
  echo "Visit http://127.0.0.1:8080 to use your application"
  kubectl --namespace default port-forward $POD_NAME 8080:$CONTAINER_PORT
Post-upgrade verification
OK: Stale CronJob still present: test-release-test-chart-cronjob
Helm exit code: 0
```

Co-authored-by: dayeguilaiye <979014041@qq.com>
Signed-off-by: Benoit Tigeot
---
 pkg/kube/client.go      |  7 +++++++
 pkg/kube/client_test.go | 16 +++++++++++++---
 2 files changed, 20 insertions(+), 3 deletions(-)

diff --git a/pkg/kube/client.go b/pkg/kube/client.go
index 30c014ad5..bc9ee3cc3 100644
--- a/pkg/kube/client.go
+++ b/pkg/kube/client.go
@@ -569,10 +569,17 @@ func (c *Client) update(originals, targets ResourceList, updateApplyFunc UpdateA
 		}
 		if err := deleteResource(info, metav1.DeletePropagationBackground); err != nil {
 			slog.Debug("failed to delete resource", "namespace", info.Namespace, "name", info.Name, "kind", info.Mapping.GroupVersionKind.Kind, slog.Any("error", err))
+			if !apierrors.IsNotFound(err) {
+				updateErrors = append(updateErrors, fmt.Errorf("failed to delete resource %s: %w", info.Name, err))
+			}
 			continue
 		}
 		res.Deleted = append(res.Deleted, info)
 	}
+
+	if len(updateErrors) != 0 {
+		return res, joinErrors(updateErrors, " && ")
+	}
 	return res, nil
 }
 
diff --git a/pkg/kube/client_test.go b/pkg/kube/client_test.go
index d8c0fba5f..c6845fc52 100644
--- a/pkg/kube/client_test.go
+++ b/pkg/kube/client_test.go
@@ -336,6 +336,8 @@ func TestUpdate(t *testing.T) {
 		"/namespaces/default/pods:POST", // retry due to 409
 		"/namespaces/default/pods/squid:GET",
 		"/namespaces/default/pods/squid:DELETE",
+		"/namespaces/default/pods/notfound:GET",
+		"/namespaces/default/pods/notfound:DELETE",
 	}
 
 	expectedActionsServerSideApply := []string{
@@ -351,11 +353,13 @@ func TestUpdate(t *testing.T) {
 		"/namespaces/default/pods:POST", // retry due to 409
 		"/namespaces/default/pods/squid:GET",
 		"/namespaces/default/pods/squid:DELETE",
+		"/namespaces/default/pods/notfound:GET",
+		"/namespaces/default/pods/notfound:DELETE",
 	}
 
 	testCases := map[string]testCase{
 		"client-side apply": {
-			OriginalPods: newPodList("starfish", "otter", "squid"),
+			OriginalPods: newPodList("starfish", "otter", "squid", "notfound"),
 			TargetPods: func() v1.PodList {
 				listTarget := newPodList("starfish", "otter", "dolphin")
 				listTarget.Items[0].Spec.Containers[0].Ports = []v1.ContainerPort{{Name: "https", ContainerPort: 443}}
@@ -367,7 +371,7 @@ func TestUpdate(t *testing.T) {
 			ExpectedActions: expectedActionsClientSideApply,
 		},
 		"client-side apply (three-way merge for unstructured)": {
-			OriginalPods: newPodList("starfish", "otter", "squid"),
+			OriginalPods: newPodList("starfish", "otter", "squid", "notfound"),
 			TargetPods: func() v1.PodList {
 				listTarget := newPodList("starfish", "otter", "dolphin")
 				listTarget.Items[0].Spec.Containers[0].Ports = []v1.ContainerPort{{Name: "https", ContainerPort: 443}}
@@ -379,7 +383,7 @@ func TestUpdate(t *testing.T) {
 			ExpectedActions: expectedActionsClientSideApply,
 		},
 		"serverSideApply": {
-			OriginalPods: newPodList("starfish", "otter", "squid"),
+			OriginalPods: newPodList("starfish", "otter", "squid", "notfound"),
 			TargetPods: func() v1.PodList {
 				listTarget := newPodList("starfish", "otter", "dolphin")
 				listTarget.Items[0].Spec.Containers[0].Ports = []v1.ContainerPort{{Name: "https", ContainerPort: 443}}
@@ -444,6 +448,12 @@ func TestUpdate(t *testing.T) {
 				return newResponse(http.StatusOK, &listTarget.Items[1])
 			case p == "/namespaces/default/pods/squid" && m == http.MethodGet:
 				return newResponse(http.StatusOK, &listTarget.Items[2])
+			case p == "/namespaces/default/pods/notfound" && m == http.MethodGet:
+				// Resource exists in original but will simulate not found on delete
+				return newResponse(http.StatusOK, &listOriginal.Items[3])
+			case p == "/namespaces/default/pods/notfound" && m == http.MethodDelete:
+				// Simulate a not found during deletion; should not cause update to fail
+				return newResponse(http.StatusNotFound, notFoundBody())
 			default:
 			}
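
One note on the new test case: the fake server answers the DELETE for the "notfound" pod with a 404 Status body via the pre-existing notFoundBody() helper (not shown in this diff), and client-go turns that into a StatusError that apierrors.IsNotFound recognises, so the update skips it rather than failing. A small stand-alone illustration of that classification, using an assumed minimal Status shape:

```go
// Stand-alone illustration (not part of this patch): a 404 Status body, as the
// fake test server returns for the "notfound" pod, is recognised by
// apierrors.IsNotFound once wrapped in a StatusError, so it is skipped rather
// than reported as an upgrade error. The Status fields below are an assumed
// minimal shape, not the exact output of notFoundBody().
package main

import (
	"fmt"
	"net/http"

	apierrors "k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

func main() {
	status := metav1.Status{
		Status:  metav1.StatusFailure,
		Code:    http.StatusNotFound,
		Reason:  metav1.StatusReasonNotFound,
		Message: `pods "notfound" not found`,
	}

	err := &apierrors.StatusError{ErrStatus: status}
	fmt.Println("IsNotFound:", apierrors.IsNotFound(err)) // prints: IsNotFound: true
}
```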