From a7f84439aacd3864b40055b60a3c3e54292d1646 Mon Sep 17 00:00:00 2001 From: Terry Howe Date: Tue, 7 Apr 2026 08:23:39 -0600 Subject: [PATCH] test(kube): fix flaky WaitForDelete test by avoiding informer sync race MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous fix (increasing timeout / reducing deletion delay) did not work because the flakiness is not a timing problem at all. Root cause: fluxcd/cli-utils HasSynced() returns true after the initial list item is *popped* from DeltaFIFO, which is before AddFunc delivers the ResourceUpdateEvent to the collector. This creates a race where the SyncEvent can arrive at the statusObserver *before* the pod's Current status is recorded. When that happens: - statusObserver sees pod as Unknown - Unknown is skipped for WaitForDelete (by design, to handle resources that were already deleted before watching started) - AggregateStatus([], NotFoundStatus) == NotFoundStatus → cancel() - The watch context is cancelled before DeleteFunc can fire - Final check: pod still Current → error The test intent is to verify that waitForDeleteCtx (not the cancelled generalCtx) is selected. A non-existent resource satisfies this: - With waitForDeleteCtx=Background(): informer syncs with empty list → Unknown → cancel → success ✓ - With generalCtx (cancelled, wrong): context immediately done → ctx.Err() appended → error returned ✓ Remove the goroutine-based deletion and the pod creation to eliminate the race while preserving the context-selection assertion. Also restore timeUntilPodDelete in TestStatusWaitForDelete to 500ms, reverting the earlier delay reduction that did not address the flake. 
Signed-off-by: Terry Howe --- pkg/kube/statuswait_test.go | 29 +++++++---------------------- 1 file changed, 7 insertions(+), 22 deletions(-) diff --git a/pkg/kube/statuswait_test.go b/pkg/kube/statuswait_test.go index 3ce6b0fd6..73a424720 100644 --- a/pkg/kube/statuswait_test.go +++ b/pkg/kube/statuswait_test.go @@ -317,7 +317,7 @@ func TestStatusWaitForDelete(t *testing.T) { t.Parallel() c := newTestClient(t) timeout := time.Second - timeUntilPodDelete := time.Millisecond * 100 + timeUntilPodDelete := time.Millisecond * 500 fakeClient := dynamicfake.NewSimpleDynamicClient(scheme.Scheme) fakeMapper := testutil.NewFakeRESTMapper( v1.SchemeGroupVersion.WithKind("Pod"), @@ -1680,8 +1680,6 @@ func TestMethodContextOverridesGeneralContext(t *testing.T) { t.Run("method-specific context overrides general context for WaitForDelete", func(t *testing.T) { t.Parallel() c := newTestClient(t) - timeout := 5 * time.Second - timeUntilPodDelete := time.Millisecond * 100 fakeClient := dynamicfake.NewSimpleDynamicClient(scheme.Scheme) fakeMapper := testutil.NewFakeRESTMapper( v1.SchemeGroupVersion.WithKind("Pod"), @@ -1698,27 +1696,14 @@ func TestMethodContextOverridesGeneralContext(t *testing.T) { waitForDeleteCtx: context.Background(), // Not cancelled - should be used } + // Use a non-existent resource: WaitForDelete should return immediately since + // the pod is already in the desired "deleted" state. + // This also validates context selection: if generalCtx (cancelled) were + // incorrectly used instead of waitForDeleteCtx, the watch context would be + // immediately cancelled and the call would return a context error. 
objs := getRuntimeObjFromManifests(t, []string{podCurrentManifest}) - for _, obj := range objs { - u := obj.(*unstructured.Unstructured) - gvr := getGVR(t, fakeMapper, u) - err := fakeClient.Tracker().Create(gvr, u, u.GetNamespace()) - require.NoError(t, err) - } - - // Schedule deletion - for _, obj := range objs { - u := obj.(*unstructured.Unstructured) - gvr := getGVR(t, fakeMapper, u) - go func(gvr schema.GroupVersionResource, u *unstructured.Unstructured) { - time.Sleep(timeUntilPodDelete) - err := fakeClient.Tracker().Delete(gvr, u.GetNamespace(), u.GetName()) - assert.NoError(t, err) - }(gvr, u) - } - resourceList := getResourceListFromRuntimeObjs(t, c, objs) - err := sw.WaitForDelete(resourceList, timeout) + err := sw.WaitForDelete(resourceList, time.Second) // Should succeed because method context is used and it's not cancelled assert.NoError(t, err) })