fix(kube): prevent spurious early exit in WaitForDelete during informer sync

During informer initialization there is a brief window where watched resources appear as Unknown before their real statuses are delivered. The statusObserver skips Unknown resources when waiting for deletion (they may have been deleted before the watch started), but if *all* resources are in that transient Unknown state the skipped-resource list is empty. AggregateStatus on an empty slice returns the desired status, causing cancel() to be called immediately — before any real status event has arrived. Guard against this by tracking the count of Unknown-skipped resources. When every resource was Unknown-skipped and none have a definitive status yet, defer the early-cancel decision until at least one resource reports a real status. This preserves the correct behaviour for resources that were genuinely deleted before the watch started (they eventually receive a NotFound or stay Unknown, and the aggregate succeeds), while fixing the race for resources that are transiently Unknown at startup. Also tighten the ctx.Err() check in waitForDelete: only append a deadline error when there are resource-specific errors to accompany it. A timeout while all resources are Unknown or NotFound is not itself an error — the resources are in an acceptable state for a delete wait. Fixes: TestStatusWaitForDelete/error_when_not_all_objects_are_deleted Signed-off-by: Terry Howe <terrylhowe@gmail.com> (cherry picked from commit 4e24ee41a4)
3 months ago · 360d4835df
parent 7651edf21e
commit 360d4835df
1 changed files with 14 additions and 1 deletions
--- a/pkg/kube/statuswait.go
+++ b/pkg/kube/statuswait.go
@ -160,7 +160,10 @@ func (w *statusWaiter) waitForDelete(ctx context.Context, resourceList ResourceL
 		errs = append(errs, fmt.Errorf("resource %s/%s/%s still exists. status: %s, message: %s",
 			rs.Identifier.GroupKind.Kind, rs.Identifier.Namespace, rs.Identifier.Name, rs.Status, rs.Message))
 	}
-	if err := ctx.Err(); err != nil {
+	// Only include a deadline error when there are also resource-specific errors.
+	// If all resources are Unknown or NotFound (e.g. deleted before the watch started),
+	// a timeout is not itself an error for WaitForDelete.
+	if err := ctx.Err(); err != nil && len(errs) > 0 {
 		errs = append(errs, err)
 	}
 	if len(errs) > 0 {
@ -234,6 +237,7 @@ func statusObserver(cancel context.CancelFunc, desired status.Status, logger *sl
 	return func(statusCollector *collector.ResourceStatusCollector, _ event.Event) {
 		var rss []*event.ResourceStatus
 		var nonDesiredResources []*event.ResourceStatus
+		var unknownSkipped int
 		for _, rs := range statusCollector.ResourceStatuses {
 			if rs == nil {
 				continue
@ -241,6 +245,7 @@ func statusObserver(cancel context.CancelFunc, desired status.Status, logger *sl
 			// If a resource is already deleted before waiting has started, it will show as unknown.
 			// This check ensures we don't wait forever for a resource that is already deleted.
 			if rs.Status == status.UnknownStatus && desired == status.NotFoundStatus {
+				unknownSkipped++
 				continue
 			}
 			// Failed is a terminal state. This check ensures we don't wait forever for a resource
@ -254,6 +259,14 @@ func statusObserver(cancel context.CancelFunc, desired status.Status, logger *sl
 			}
 		}

+		// During informer initialization there is a brief window where existing resources
+		// appear as Unknown before their real status is delivered. If every resource was
+		// skipped as Unknown, we cannot yet distinguish "all deleted" from "not yet synced",
+		// so hold off on the early-cancel to avoid a spurious success or premature exit.
+		if unknownSkipped > 0 && len(rss) == 0 {
+			return
+		}
+
 		if aggregator.AggregateStatus(rss, desired) == desired {
 			logger.Debug("all resources achieved desired status", "desiredStatus", desired, "resourceCount", len(rss))
 			cancel()