From 4c0d21f53f2ca78b525e31dbbf9cc9cfb818a2e3 Mon Sep 17 00:00:00 2001 From: Terry Howe Date: Tue, 7 Apr 2026 04:26:12 -0600 Subject: [PATCH 1/2] test(kube): fix flaky WaitForDelete timing in status wait tests TestMethodContextOverridesGeneralContext/WaitForDelete used a 1s timeout with a 500ms deletion delay, leaving only ~500ms for the fake watcher to propagate the delete event. On loaded CI runners this window is too tight and causes intermittent failures. Increase the timeout to 5s and reduce the deletion delay to 100ms so there is ample headroom. Apply the same deletion-delay reduction to TestStatusWaitForDelete which shares the same pattern. Signed-off-by: Terry Howe --- pkg/kube/statuswait_test.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pkg/kube/statuswait_test.go b/pkg/kube/statuswait_test.go index 0639e07fc..3ce6b0fd6 100644 --- a/pkg/kube/statuswait_test.go +++ b/pkg/kube/statuswait_test.go @@ -317,7 +317,7 @@ func TestStatusWaitForDelete(t *testing.T) { t.Parallel() c := newTestClient(t) timeout := time.Second - timeUntilPodDelete := time.Millisecond * 500 + timeUntilPodDelete := time.Millisecond * 100 fakeClient := dynamicfake.NewSimpleDynamicClient(scheme.Scheme) fakeMapper := testutil.NewFakeRESTMapper( v1.SchemeGroupVersion.WithKind("Pod"), @@ -1680,8 +1680,8 @@ func TestMethodContextOverridesGeneralContext(t *testing.T) { t.Run("method-specific context overrides general context for WaitForDelete", func(t *testing.T) { t.Parallel() c := newTestClient(t) - timeout := time.Second - timeUntilPodDelete := time.Millisecond * 500 + timeout := 5 * time.Second + timeUntilPodDelete := time.Millisecond * 100 fakeClient := dynamicfake.NewSimpleDynamicClient(scheme.Scheme) fakeMapper := testutil.NewFakeRESTMapper( v1.SchemeGroupVersion.WithKind("Pod"), From a7f84439aacd3864b40055b60a3c3e54292d1646 Mon Sep 17 00:00:00 2001 From: Terry Howe Date: Tue, 7 Apr 2026 08:23:39 -0600 Subject: [PATCH 2/2] test(kube): fix flaky WaitForDelete test by avoiding informer sync race MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous fix (increasing timeout / reducing deletion delay) did not work because the flakiness is not a timing problem at all. Root cause: fluxcd/cli-utils HasSynced() returns true after the initial list item is *popped* from DeltaFIFO, which is before AddFunc delivers the ResourceUpdateEvent to the collector. This creates a race where the SyncEvent can arrive at the statusObserver *before* the pod's Current status is recorded. When that happens: - statusObserver sees pod as Unknown - Unknown is skipped for WaitForDelete (by design, to handle resources that were already deleted before watching started) - AggregateStatus([], NotFoundStatus) == NotFoundStatus → cancel() - The watch context is cancelled before DeleteFunc can fire - Final check: pod still Current → error The test intent is to verify that waitForDeleteCtx (not the cancelled generalCtx) is selected. A non-existent resource satisfies this: - With waitForDeleteCtx=Background(): informer syncs with empty list → Unknown → cancel → success ✓ - With generalCtx (cancelled, wrong): context immediately done → ctx.Err() appended → error returned ✓ Remove the goroutine-based deletion and the pod creation to eliminate the race while preserving the context-selection assertion. Signed-off-by: Terry Howe --- pkg/kube/statuswait_test.go | 29 +++++++---------------------- 1 file changed, 7 insertions(+), 22 deletions(-) diff --git a/pkg/kube/statuswait_test.go b/pkg/kube/statuswait_test.go index 3ce6b0fd6..73a424720 100644 --- a/pkg/kube/statuswait_test.go +++ b/pkg/kube/statuswait_test.go @@ -317,7 +317,7 @@ func TestStatusWaitForDelete(t *testing.T) { t.Parallel() c := newTestClient(t) timeout := time.Second - timeUntilPodDelete := time.Millisecond * 100 + timeUntilPodDelete := time.Millisecond * 500 fakeClient := dynamicfake.NewSimpleDynamicClient(scheme.Scheme) fakeMapper := testutil.NewFakeRESTMapper( v1.SchemeGroupVersion.WithKind("Pod"), @@ -1680,8 +1680,6 @@ func TestMethodContextOverridesGeneralContext(t *testing.T) { t.Run("method-specific context overrides general context for WaitForDelete", func(t *testing.T) { t.Parallel() c := newTestClient(t) - timeout := 5 * time.Second - timeUntilPodDelete := time.Millisecond * 100 fakeClient := dynamicfake.NewSimpleDynamicClient(scheme.Scheme) fakeMapper := testutil.NewFakeRESTMapper( v1.SchemeGroupVersion.WithKind("Pod"), @@ -1698,27 +1696,14 @@ func TestMethodContextOverridesGeneralContext(t *testing.T) { waitForDeleteCtx: context.Background(), // Not cancelled - should be used } + // Use a non-existent resource: WaitForDelete should return immediately since + // the pod is already in the desired "deleted" state. + // This also validates context selection: if generalCtx (cancelled) were + // incorrectly used instead of waitForDeleteCtx, the watch context would be + // immediately cancelled and the call would return a context error. objs := getRuntimeObjFromManifests(t, []string{podCurrentManifest}) - for _, obj := range objs { - u := obj.(*unstructured.Unstructured) - gvr := getGVR(t, fakeMapper, u) - err := fakeClient.Tracker().Create(gvr, u, u.GetNamespace()) - require.NoError(t, err) - } - - // Schedule deletion - for _, obj := range objs { - u := obj.(*unstructured.Unstructured) - gvr := getGVR(t, fakeMapper, u) - go func(gvr schema.GroupVersionResource, u *unstructured.Unstructured) { - time.Sleep(timeUntilPodDelete) - err := fakeClient.Tracker().Delete(gvr, u.GetNamespace(), u.GetName()) - assert.NoError(t, err) - }(gvr, u) - } - resourceList := getResourceListFromRuntimeObjs(t, c, objs) - err := sw.WaitForDelete(resourceList, timeout) + err := sw.WaitForDelete(resourceList, time.Second) // Should succeed because method context is used and it's not cancelled assert.NoError(t, err) })