From 4e24ee41a436889a2542a008eaa0ecab8332a1eb Mon Sep 17 00:00:00 2001 From: Terry Howe Date: Sun, 26 Apr 2026 09:16:27 -0600 Subject: [PATCH] fix(kube): prevent spurious early exit in WaitForDelete during informer sync MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit During informer initialization there is a brief window where watched resources appear as Unknown before their real statuses are delivered. The statusObserver skips Unknown resources when waiting for deletion (they may have been deleted before the watch started), but if *all* resources are in that transient Unknown state, every resource is skipped and the list of remaining resources passed to the aggregator is empty. AggregateStatus on an empty slice returns the desired status, causing cancel() to be called immediately — before any real status event has arrived. Guard against this by tracking the count of Unknown-skipped resources. When every resource was Unknown-skipped and none have a definitive status yet, defer the early-cancel decision until at least one resource reports a real status. This preserves the correct behaviour for resources that were genuinely deleted before the watch started (they eventually receive a NotFound or stay Unknown, and the aggregate succeeds), while fixing the race for resources that are transiently Unknown at startup. Also tighten the ctx.Err() check in waitForDelete: only append a deadline error when there are resource-specific errors to accompany it. A timeout while all resources are Unknown or NotFound is not itself an error — the resources are in an acceptable state for a delete wait. 
Fixes: TestStatusWaitForDelete/error_when_not_all_objects_are_deleted Signed-off-by: Terry Howe --- pkg/kube/statuswait.go | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/pkg/kube/statuswait.go b/pkg/kube/statuswait.go index 59c1218ff..a93d28fc1 100644 --- a/pkg/kube/statuswait.go +++ b/pkg/kube/statuswait.go @@ -160,7 +160,10 @@ func (w *statusWaiter) waitForDelete(ctx context.Context, resourceList ResourceL errs = append(errs, fmt.Errorf("resource %s/%s/%s still exists. status: %s, message: %s", rs.Identifier.GroupKind.Kind, rs.Identifier.Namespace, rs.Identifier.Name, rs.Status, rs.Message)) } - if err := ctx.Err(); err != nil { + // Only include a deadline error when there are also resource-specific errors. + // If all resources are Unknown or NotFound (e.g. deleted before the watch started), + // a timeout is not itself an error for WaitForDelete. + if err := ctx.Err(); err != nil && len(errs) > 0 { errs = append(errs, err) } if len(errs) > 0 { @@ -234,6 +237,7 @@ func statusObserver(cancel context.CancelFunc, desired status.Status, logger *sl return func(statusCollector *collector.ResourceStatusCollector, _ event.Event) { var rss []*event.ResourceStatus var nonDesiredResources []*event.ResourceStatus + var unknownSkipped int for _, rs := range statusCollector.ResourceStatuses { if rs == nil { continue @@ -241,6 +245,7 @@ func statusObserver(cancel context.CancelFunc, desired status.Status, logger *sl // If a resource is already deleted before waiting has started, it will show as unknown. // This check ensures we don't wait forever for a resource that is already deleted. if rs.Status == status.UnknownStatus && desired == status.NotFoundStatus { + unknownSkipped++ continue } // Failed is a terminal state. 
This check ensures we don't wait forever for a resource @@ -254,6 +259,14 @@ func statusObserver(cancel context.CancelFunc, desired status.Status, logger *sl } } + // During informer initialization there is a brief window where existing resources + // appear as Unknown before their real status is delivered. If every resource was + // skipped as Unknown, we cannot yet distinguish "all deleted" from "not yet synced", + // so hold off on the early-cancel to avoid a spurious success or premature exit. + if unknownSkipped > 0 && len(rss) == 0 { + return + } + if aggregator.AggregateStatus(rss, desired) == desired { logger.Debug("all resources achieved desired status", "desiredStatus", desired, "resourceCount", len(rss)) cancel()