From a7f84439aacd3864b40055b60a3c3e54292d1646 Mon Sep 17 00:00:00 2001 From: Terry Howe Date: Tue, 7 Apr 2026 08:23:39 -0600 Subject: [PATCH] test(kube): fix flaky WaitForDelete test by avoiding informer sync race MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous fix (increasing timeout / reducing deletion delay) did not work because the flakiness is not a timing problem at all. Root cause: fluxcd/cli-utils HasSynced() returns true after the initial list item is *popped* from DeltaFIFO, which is before AddFunc delivers the ResourceUpdateEvent to the collector. This creates a race where the SyncEvent can arrive at the statusObserver *before* the pod's Current status is recorded. When that happens: - statusObserver sees pod as Unknown - Unknown is skipped for WaitForDelete (by design, to handle resources that were already deleted before watching started) - AggregateStatus([], NotFoundStatus) == NotFoundStatus → cancel() - The watch context is cancelled before DeleteFunc can fire - Final check: pod still Current → error The test intent is to verify that waitForDeleteCtx (not the cancelled generalCtx) is selected. A non-existent resource satisfies this: - With waitForDeleteCtx=Background(): informer syncs with empty list → Unknown → cancel → success ✓ - With generalCtx (cancelled, wrong): context immediately done → ctx.Err() appended → error returned ✓ Remove the goroutine-based deletion and the pod creation to eliminate the race while preserving the context-selection assertion. Also restore timeUntilPodDelete in TestStatusWaitForDelete to 500ms, reverting the earlier delay reduction that did not address the flake. 
Signed-off-by: Terry Howe --- pkg/kube/statuswait_test.go | 29 +++++++---------------------- 1 file changed, 7 insertions(+), 22 deletions(-) diff --git a/pkg/kube/statuswait_test.go b/pkg/kube/statuswait_test.go index 3ce6b0fd6..73a424720 100644 --- a/pkg/kube/statuswait_test.go +++ b/pkg/kube/statuswait_test.go @@ -317,7 +317,7 @@ func TestStatusWaitForDelete(t *testing.T) { t.Parallel() c := newTestClient(t) timeout := time.Second - timeUntilPodDelete := time.Millisecond * 100 + timeUntilPodDelete := time.Millisecond * 500 fakeClient := dynamicfake.NewSimpleDynamicClient(scheme.Scheme) fakeMapper := testutil.NewFakeRESTMapper( v1.SchemeGroupVersion.WithKind("Pod"), @@ -1680,8 +1680,6 @@ func TestMethodContextOverridesGeneralContext(t *testing.T) { t.Run("method-specific context overrides general context for WaitForDelete", func(t *testing.T) { t.Parallel() c := newTestClient(t) - timeout := 5 * time.Second - timeUntilPodDelete := time.Millisecond * 100 fakeClient := dynamicfake.NewSimpleDynamicClient(scheme.Scheme) fakeMapper := testutil.NewFakeRESTMapper( v1.SchemeGroupVersion.WithKind("Pod"), @@ -1698,27 +1696,14 @@ func TestMethodContextOverridesGeneralContext(t *testing.T) { waitForDeleteCtx: context.Background(), // Not cancelled - should be used } + // Use a non-existent resource: WaitForDelete should return immediately since + // the pod is already in the desired "deleted" state. + // This also validates context selection: if generalCtx (cancelled) were + // incorrectly used instead of waitForDeleteCtx, the watch context would be + // immediately cancelled and the call would return a context error. 
objs := getRuntimeObjFromManifests(t, []string{podCurrentManifest}) - for _, obj := range objs { - u := obj.(*unstructured.Unstructured) - gvr := getGVR(t, fakeMapper, u) - err := fakeClient.Tracker().Create(gvr, u, u.GetNamespace()) - require.NoError(t, err) - } - - // Schedule deletion - for _, obj := range objs { - u := obj.(*unstructured.Unstructured) - gvr := getGVR(t, fakeMapper, u) - go func(gvr schema.GroupVersionResource, u *unstructured.Unstructured) { - time.Sleep(timeUntilPodDelete) - err := fakeClient.Tracker().Delete(gvr, u.GetNamespace(), u.GetName()) - assert.NoError(t, err) - }(gvr, u) - } - resourceList := getResourceListFromRuntimeObjs(t, c, objs) - err := sw.WaitForDelete(resourceList, timeout) + err := sw.WaitForDelete(resourceList, time.Second) // Should succeed because method context is used and it's not cancelled assert.NoError(t, err) })