test(kube): fix flaky WaitForDelete test by avoiding informer sync race

The previous fix (increasing timeout / reducing deletion delay) did not
work because the flakiness is not a timing problem at all.

Root cause: fluxcd/cli-utils HasSynced() returns true after the initial
list item is *popped* from DeltaFIFO, which is before AddFunc delivers
the ResourceUpdateEvent to the collector. This creates a race where the
SyncEvent can arrive at the statusObserver *before* the pod's Current
status is recorded. When that happens:
  - statusObserver sees pod as Unknown
  - Unknown is skipped for WaitForDelete (by design, to handle resources
    that were already deleted before watching started)
  - AggregateStatus([], NotFoundStatus) == NotFoundStatus → cancel()
  - The watch context is cancelled before DeleteFunc can fire
  - Final check: pod still Current → error

The test intent is to verify that waitForDeleteCtx (not the cancelled
generalCtx) is selected. A non-existent resource satisfies this:
  - With waitForDeleteCtx=Background(): informer syncs with empty list
    → Unknown → cancel → success ✓
  - With generalCtx (cancelled, wrong): context immediately done
    → ctx.Err() appended → error returned ✓

Remove the goroutine-based deletion and the pod creation to eliminate
the race while preserving the context-selection assertion.

Signed-off-by: Terry Howe <terrylhowe@gmail.com>
pull/32016/head
Terry Howe 4 days ago
parent 4c0d21f53f
commit a7f84439aa
No known key found for this signature in database

@@ -317,7 +317,7 @@ func TestStatusWaitForDelete(t *testing.T) {
t.Parallel()
c := newTestClient(t)
timeout := time.Second
timeUntilPodDelete := time.Millisecond * 100
timeUntilPodDelete := time.Millisecond * 500
fakeClient := dynamicfake.NewSimpleDynamicClient(scheme.Scheme)
fakeMapper := testutil.NewFakeRESTMapper(
v1.SchemeGroupVersion.WithKind("Pod"),
@@ -1680,8 +1680,6 @@ func TestMethodContextOverridesGeneralContext(t *testing.T) {
t.Run("method-specific context overrides general context for WaitForDelete", func(t *testing.T) {
t.Parallel()
c := newTestClient(t)
timeout := 5 * time.Second
timeUntilPodDelete := time.Millisecond * 100
fakeClient := dynamicfake.NewSimpleDynamicClient(scheme.Scheme)
fakeMapper := testutil.NewFakeRESTMapper(
v1.SchemeGroupVersion.WithKind("Pod"),
@@ -1698,27 +1696,14 @@ func TestMethodContextOverridesGeneralContext(t *testing.T) {
waitForDeleteCtx: context.Background(), // Not cancelled - should be used
}
// Use a non-existent resource: WaitForDelete should return immediately since
// the pod is already in the desired "deleted" state.
// This also validates context selection: if generalCtx (cancelled) were
// incorrectly used instead of waitForDeleteCtx, the watch context would be
// immediately cancelled and the call would return a context error.
objs := getRuntimeObjFromManifests(t, []string{podCurrentManifest})
for _, obj := range objs {
u := obj.(*unstructured.Unstructured)
gvr := getGVR(t, fakeMapper, u)
err := fakeClient.Tracker().Create(gvr, u, u.GetNamespace())
require.NoError(t, err)
}
// Schedule deletion
for _, obj := range objs {
u := obj.(*unstructured.Unstructured)
gvr := getGVR(t, fakeMapper, u)
go func(gvr schema.GroupVersionResource, u *unstructured.Unstructured) {
time.Sleep(timeUntilPodDelete)
err := fakeClient.Tracker().Delete(gvr, u.GetNamespace(), u.GetName())
assert.NoError(t, err)
}(gvr, u)
}
resourceList := getResourceListFromRuntimeObjs(t, c, objs)
err := sw.WaitForDelete(resourceList, timeout)
err := sw.WaitForDelete(resourceList, time.Second)
// Should succeed because method context is used and it's not cancelled
assert.NoError(t, err)
})

Loading…
Cancel
Save