test(kube): fix flaky WaitForDelete test by avoiding informer sync race

The previous fix (increasing timeout / reducing deletion delay) did not
work because the flakiness is not a timing problem at all.

Root cause: fluxcd/cli-utils HasSynced() returns true after the initial
list item is *popped* from DeltaFIFO, which is before AddFunc delivers
the ResourceUpdateEvent to the collector. This creates a race where the
SyncEvent can arrive at the statusObserver *before* the pod's Current
status is recorded. When that happens:
  - statusObserver sees pod as Unknown
  - Unknown is skipped for WaitForDelete (by design, to handle resources
    that were already deleted before watching started)
  - AggregateStatus([], NotFoundStatus) == NotFoundStatus → cancel()
  - The watch context is cancelled before DeleteFunc can fire
  - Final check: pod still Current → error

The test intent is to verify that waitForDeleteCtx (not the cancelled
generalCtx) is selected. A non-existent resource satisfies this:
  - With waitForDeleteCtx=Background(): informer syncs with empty list
    → Unknown → cancel → success ✓
  - With generalCtx (cancelled, wrong): context immediately done
    → ctx.Err() appended → error returned ✓

Remove the goroutine-based deletion and the pod creation to eliminate
the race while preserving the context-selection assertion.

Signed-off-by: Terry Howe <terrylhowe@gmail.com>
pull/32016/head
Terry Howe 4 days ago
parent 4c0d21f53f
commit a7f84439aa
No known key found for this signature in database

@@ -317,7 +317,7 @@ func TestStatusWaitForDelete(t *testing.T) {
t.Parallel()
c := newTestClient(t)
timeout := time.Second
timeUntilPodDelete := time.Millisecond * 100
timeUntilPodDelete := time.Millisecond * 500
fakeClient := dynamicfake.NewSimpleDynamicClient(scheme.Scheme)
fakeMapper := testutil.NewFakeRESTMapper(
v1.SchemeGroupVersion.WithKind("Pod"),
@@ -1680,8 +1680,6 @@ func TestMethodContextOverridesGeneralContext(t *testing.T) {
t.Run("method-specific context overrides general context for WaitForDelete", func(t *testing.T) {
t.Parallel()
c := newTestClient(t)
timeout := 5 * time.Second
timeUntilPodDelete := time.Millisecond * 100
fakeClient := dynamicfake.NewSimpleDynamicClient(scheme.Scheme)
fakeMapper := testutil.NewFakeRESTMapper(
v1.SchemeGroupVersion.WithKind("Pod"),
@@ -1698,27 +1696,14 @@ func TestMethodContextOverridesGeneralContext(t *testing.T) {
waitForDeleteCtx: context.Background(), // Not cancelled - should be used
}
// Use a non-existent resource: WaitForDelete should return immediately since
// the pod is already in the desired "deleted" state.
// This also validates context selection: if generalCtx (cancelled) were
// incorrectly used instead of waitForDeleteCtx, the watch context would be
// immediately cancelled and the call would return a context error.
objs := getRuntimeObjFromManifests(t, []string{podCurrentManifest})
for _, obj := range objs {
u := obj.(*unstructured.Unstructured)
gvr := getGVR(t, fakeMapper, u)
err := fakeClient.Tracker().Create(gvr, u, u.GetNamespace())
require.NoError(t, err)
}
// Schedule deletion
for _, obj := range objs {
u := obj.(*unstructured.Unstructured)
gvr := getGVR(t, fakeMapper, u)
go func(gvr schema.GroupVersionResource, u *unstructured.Unstructured) {
time.Sleep(timeUntilPodDelete)
err := fakeClient.Tracker().Delete(gvr, u.GetNamespace(), u.GetName())
assert.NoError(t, err)
}(gvr, u)
}
resourceList := getResourceListFromRuntimeObjs(t, c, objs)
err := sw.WaitForDelete(resourceList, timeout)
err := sw.WaitForDelete(resourceList, time.Second)
// Should succeed because method context is used and it's not cancelled
assert.NoError(t, err)
})

Loading…
Cancel
Save