From 4e24ee41a436889a2542a008eaa0ecab8332a1eb Mon Sep 17 00:00:00 2001
From: Terry Howe <terrylhowe@gmail.com>
Date: Sun, 26 Apr 2026 09:16:27 -0600
Subject: [PATCH] fix(kube): prevent spurious early exit in WaitForDelete
 during informer sync
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

During informer initialization there is a brief window where watched
resources appear as Unknown before their real statuses are delivered.
The statusObserver skips Unknown resources when waiting for deletion
(they may have been deleted before the watch started), but if *all*
resources are in that transient Unknown state the skipped-resource list
is empty. AggregateStatus on an empty slice returns the desired status,
causing cancel() to be called immediately — before any real status event
has arrived.

Guard against this by tracking the count of Unknown-skipped resources.
When every resource was Unknown-skipped and none have a definitive status
yet, defer the early-cancel decision until at least one resource reports
a real status. This preserves the correct behaviour for resources that
were genuinely deleted before the watch started (they eventually receive
a NotFound or stay Unknown, and the aggregate succeeds), while fixing
the race for resources that are transiently Unknown at startup.

Also tighten the ctx.Err() check in waitForDelete: only append a
deadline error when there are resource-specific errors to accompany it.
A timeout while all resources are Unknown or NotFound is not itself an
error — the resources are in an acceptable state for a delete wait.

Fixes: TestStatusWaitForDelete/error_when_not_all_objects_are_deleted
Signed-off-by: Terry Howe <terrylhowe@gmail.com>
---
 pkg/kube/statuswait.go | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/pkg/kube/statuswait.go b/pkg/kube/statuswait.go
index 59c1218ff..a93d28fc1 100644
--- a/pkg/kube/statuswait.go
+++ b/pkg/kube/statuswait.go
@@ -160,7 +160,10 @@ func (w *statusWaiter) waitForDelete(ctx context.Context, resourceList ResourceL
 		errs = append(errs, fmt.Errorf("resource %s/%s/%s still exists. status: %s, message: %s",
 			rs.Identifier.GroupKind.Kind, rs.Identifier.Namespace, rs.Identifier.Name, rs.Status, rs.Message))
 	}
-	if err := ctx.Err(); err != nil {
+	// Only include a deadline error when there are also resource-specific errors.
+	// If all resources are Unknown or NotFound (e.g. deleted before the watch started),
+	// a timeout is not itself an error for WaitForDelete.
+	if err := ctx.Err(); err != nil && len(errs) > 0 {
 		errs = append(errs, err)
 	}
 	if len(errs) > 0 {
@@ -234,6 +237,7 @@ func statusObserver(cancel context.CancelFunc, desired status.Status, logger *sl
 	return func(statusCollector *collector.ResourceStatusCollector, _ event.Event) {
 		var rss []*event.ResourceStatus
 		var nonDesiredResources []*event.ResourceStatus
+		var unknownSkipped int
 		for _, rs := range statusCollector.ResourceStatuses {
 			if rs == nil {
 				continue
@@ -241,6 +245,7 @@ func statusObserver(cancel context.CancelFunc, desired status.Status, logger *sl
 			// If a resource is already deleted before waiting has started, it will show as unknown.
 			// This check ensures we don't wait forever for a resource that is already deleted.
 			if rs.Status == status.UnknownStatus && desired == status.NotFoundStatus {
+				unknownSkipped++
 				continue
 			}
 			// Failed is a terminal state. This check ensures we don't wait forever for a resource
@@ -254,6 +259,14 @@ func statusObserver(cancel context.CancelFunc, desired status.Status, logger *sl
 			}
 		}
 
+		// During informer initialization there is a brief window where existing resources
+		// appear as Unknown before their real status is delivered. If every resource was
+		// skipped as Unknown, we cannot yet distinguish "all deleted" from "not yet synced",
+		// so hold off on the early-cancel to avoid a spurious success or premature exit.
+		if unknownSkipped > 0 && len(rss) == 0 {
+			return
+		}
+
 		if aggregator.AggregateStatus(rss, desired) == desired {
 			logger.Debug("all resources achieved desired status", "desiredStatus", desired, "resourceCount", len(rss))
 			cancel()