Skip to content

Commit fc67134

Browse files
authored
fix flaky TestCoordinatorReportsInvalidPolicy (#4402)
* increase state change timeout, fail fast and better error messages * print coordinator logs on failure
1 parent bde9684 commit fc67134

File tree

1 file changed

+23
-12
lines changed

1 file changed

+23
-12
lines changed

internal/pkg/agent/application/coordinator/coordinator_unit_test.go

+23-12
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ import (
3333
"github.com/elastic/elastic-agent/pkg/component"
3434
"github.com/elastic/elastic-agent/pkg/component/runtime"
3535
agentclient "github.com/elastic/elastic-agent/pkg/control/v2/client"
36+
"github.com/elastic/elastic-agent/pkg/core/logger"
3637
"github.com/elastic/elastic-agent/pkg/utils/broadcaster"
3738
)
3839

@@ -331,22 +332,31 @@ func TestCoordinatorReportsInvalidPolicy(t *testing.T) {
331332
// does let's report a failure instead of timing out the test runner.
332333
ctx, cancel := context.WithTimeout(context.Background(), time.Second)
333334
defer cancel()
334-
logger := logp.NewLogger("testing")
335+
336+
log, obs := logger.NewTesting("")
337+
defer func() {
338+
if t.Failed() {
339+
t.Log("test failed, coordinator logs below:")
340+
for _, l := range obs.TakeAll() {
341+
t.Log(l)
342+
}
343+
}
344+
}()
335345

336346
upgradeMgr, err := upgrade.NewUpgrader(
337-
logger,
347+
log,
338348
&artifact.Config{},
339349
&info.AgentInfo{},
340350
)
341-
require.NoError(t, err)
351+
require.NoError(t, err, "errored when creating a new upgrader")
342352

343353
// Channels have buffer length 1, so we don't have to run on multiple
344354
// goroutines.
345355
stateChan := make(chan State, 1)
346356
configChan := make(chan ConfigChange, 1)
347357
varsChan := make(chan []*transpiler.Vars, 1)
348358
coord := &Coordinator{
349-
logger: logger,
359+
logger: log,
350360
state: State{
351361
CoordinatorState: agentclient.Healthy,
352362
CoordinatorMessage: "Running",
@@ -390,19 +400,18 @@ agent.download.sourceURI:
390400
"failed to reload upgrade manager configuration",
391401
"configErr should match policy failure, got %v", coord.configErr)
392402

403+
stateChangeTimeout := 2 * time.Second
393404
select {
394405
case state := <-stateChan:
395406
assert.Equal(t, agentclient.Failed, state.State, "Failed policy change should cause Failed coordinator state")
396407
assert.Contains(t, state.Message, cfgChange.err.Error(), "Coordinator state should report failed policy change")
397408

398409
// The component model update happens on a goroutine, thus the new state
399410
// might not have been sent yet. Therefore, a timeout is required.
400-
case <-time.After(time.Second):
401-
assert.Fail(t, "Coordinator's state didn't change")
411+
case <-time.After(stateChangeTimeout):
412+
t.Fatalf("timedout after %s waiting Coordinator's state to change", stateChangeTimeout)
402413
}
403414

404-
// isn't this another test?
405-
406415
// Send an empty vars update. This should regenerate the component model
407416
// based on the last good (empty) policy, producing a "successful" update,
408417
// but the overall reported state should still be Failed because the last
@@ -420,8 +429,9 @@ agent.download.sourceURI:
420429

421430
// The component model update happens on a goroutine, thus the new state
422431
// might not have been sent yet. Therefore, a timeout is required.
423-
case <-time.After(time.Second):
424-
assert.Fail(t, "Vars change should cause state update")
432+
case <-time.After(stateChangeTimeout):
433+
t.Fatalf("timedout after %s waiting Vars change to cause a state update",
434+
stateChangeTimeout)
425435
}
426436

427437
// Finally, send an empty (valid) policy update and confirm that it
@@ -439,8 +449,9 @@ agent.download.sourceURI:
439449

440450
// The component model update happens on a goroutine, thus the new state
441451
// might not have been sent yet. Therefore, a timeout is required.
442-
case <-time.After(time.Second):
443-
assert.Fail(t, "Policy change should cause state update")
452+
case <-time.After(stateChangeTimeout):
453+
t.Fatalf("timedout after %s waiting Policy change to cause a state update",
454+
stateChangeTimeout)
444455
}
445456
}
446457

0 commit comments

Comments
 (0)