fix(kube): prevent spurious early exit in WaitForDelete during informer sync

During informer initialization there is a brief window where watched
resources appear as Unknown before their real statuses are delivered.
The statusObserver skips Unknown resources when waiting for deletion
(they may have been deleted before the watch started), but if *all*
resources are in that transient Unknown state the skipped-resource list
is empty. AggregateStatus on an empty slice returns the desired status,
causing cancel() to be called immediately — before any real status event
has arrived.

Guard against this by tracking the count of Unknown-skipped resources.
When every resource was Unknown-skipped and none have a definitive status
yet, defer the early-cancel decision until at least one resource reports
a real status. This preserves the correct behaviour for resources that
were genuinely deleted before the watch started (they eventually receive
a NotFound or stay Unknown, and the aggregate succeeds), while fixing
the race for resources that are transiently Unknown at startup.

Also tighten the ctx.Err() check in waitForDelete: only append a
deadline error when there are resource-specific errors to accompany it.
A timeout while all resources are Unknown or NotFound is not itself an
error — the resources are in an acceptable state for a delete wait.

Fixes: TestStatusWaitForDelete/error_when_not_all_objects_are_deleted
Signed-off-by: Terry Howe <terrylhowe@gmail.com>
pull/32081/head
Terry Howe 3 weeks ago
parent fbc2791886
commit 4e24ee41a4
No known key found for this signature in database

@ -160,7 +160,10 @@ func (w *statusWaiter) waitForDelete(ctx context.Context, resourceList ResourceL
errs = append(errs, fmt.Errorf("resource %s/%s/%s still exists. status: %s, message: %s",
rs.Identifier.GroupKind.Kind, rs.Identifier.Namespace, rs.Identifier.Name, rs.Status, rs.Message))
}
if err := ctx.Err(); err != nil {
// Only include a deadline error when there are also resource-specific errors.
// If all resources are Unknown or NotFound (e.g. deleted before the watch started),
// a timeout is not itself an error for WaitForDelete.
if err := ctx.Err(); err != nil && len(errs) > 0 {
errs = append(errs, err)
}
if len(errs) > 0 {
@ -234,6 +237,7 @@ func statusObserver(cancel context.CancelFunc, desired status.Status, logger *sl
return func(statusCollector *collector.ResourceStatusCollector, _ event.Event) {
var rss []*event.ResourceStatus
var nonDesiredResources []*event.ResourceStatus
var unknownSkipped int
for _, rs := range statusCollector.ResourceStatuses {
if rs == nil {
continue
@ -241,6 +245,7 @@ func statusObserver(cancel context.CancelFunc, desired status.Status, logger *sl
// If a resource is already deleted before waiting has started, it will show as unknown.
// This check ensures we don't wait forever for a resource that is already deleted.
if rs.Status == status.UnknownStatus && desired == status.NotFoundStatus {
unknownSkipped++
continue
}
// Failed is a terminal state. This check ensures we don't wait forever for a resource
@ -254,6 +259,14 @@ func statusObserver(cancel context.CancelFunc, desired status.Status, logger *sl
}
}
// During informer initialization there is a brief window where existing resources
// appear as Unknown before their real status is delivered. If every resource was
// skipped as Unknown, we cannot yet distinguish "all deleted" from "not yet synced",
// so hold off on the early-cancel to avoid a spurious success or premature exit.
if unknownSkipped > 0 && len(rss) == 0 {
return
}
if aggregator.AggregateStatus(rss, desired) == desired {
logger.Debug("all resources achieved desired status", "desiredStatus", desired, "resourceCount", len(rss))
cancel()

Loading…
Cancel
Save