Skip to content

Commit dac3ffe

Browse files
authored
[8.15](backport #5375 #5409 #5361) Fix flaky tests (#5410)
TestLogIngestionFleetManaged was failing because the namespace generated by the integration tests framework was not unique among different tests and test runs, so collisions would sometimes occur, causing some tests to be flaky. TestDebLogIngestFleetManaged was failing because Beats also logs connection errors before receiving the configuration from Elastic-Agent; this message is now also in the allow list. When testing .deb, the AGENT_KEEP_INSTALLED environment variable is respected. When an integration test fails, the work directory created by the framework is now kept and its path is printed. createTempDir registers a test cleanup function to remove the folder it created; however, on Windows, this folder sometimes fails to be removed because there are still open file handles for the files within the folder. We fix this problem by retrying to remove the folder with a maximum overall wait time of 3 seconds. This is a very similar approach to what Go's t.TempDir does. Fix the flakiness of the TestUpgradeHandler* tests by reworking the mockUpgradeManager: it now accepts a function for its Upgrade method, and its implementation is goroutine safe. TestEnvWithDefault: TestEnvWithDefault now unsets all environment variables it sets, allowing it to be run multiple times using -count. TestContainerCMDEventToStderr: TestContainerCMDEventToStderr did not call agentFixture.Prepare early enough, leading to an empty STATE_PATH env var, so all state information was in /usr/share/elastic-agent, which could cause subsequent tests to fail because they could read /usr/share/elastic-agent/state/container-paths.yml and use a state path different from the one set in the test.
1 parent 83800f0 commit dac3ffe

File tree

11 files changed

+177
-47
lines changed

11 files changed

+177
-47
lines changed

internal/pkg/agent/application/actions/handlers/handler_action_upgrade_test.go

+121-25
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@ package handlers
66

77
import (
88
"context"
9+
"errors"
10+
"sync/atomic"
911
"testing"
1012
"time"
1113

@@ -26,7 +28,15 @@ import (
2628
)
2729

2830
type mockUpgradeManager struct {
29-
msgChan chan string
31+
UpgradeFn func(
32+
ctx context.Context,
33+
version string,
34+
sourceURI string,
35+
action *fleetapi.ActionUpgrade,
36+
details *details.Details,
37+
skipVerifyOverride bool,
38+
skipDefaultPgp bool,
39+
pgpBytes ...string) (reexec.ShutdownCallbackFn, error)
3040
}
3141

3242
func (u *mockUpgradeManager) Upgradeable() bool {
@@ -37,15 +47,25 @@ func (u *mockUpgradeManager) Reload(rawConfig *config.Config) error {
3747
return nil
3848
}
3949

40-
func (u *mockUpgradeManager) Upgrade(ctx context.Context, version string, sourceURI string, action *fleetapi.ActionUpgrade, details *details.Details, skipVerifyOverride bool, skipDefaultPgp bool, pgpBytes ...string) (_ reexec.ShutdownCallbackFn, err error) {
41-
select {
42-
case <-time.After(2 * time.Second):
43-
u.msgChan <- "completed " + version
44-
return nil, nil
45-
case <-ctx.Done():
46-
u.msgChan <- "canceled " + version
47-
return nil, ctx.Err()
48-
}
50+
func (u *mockUpgradeManager) Upgrade(
51+
ctx context.Context,
52+
version string,
53+
sourceURI string,
54+
action *fleetapi.ActionUpgrade,
55+
details *details.Details,
56+
skipVerifyOverride bool,
57+
skipDefaultPgp bool,
58+
pgpBytes ...string) (reexec.ShutdownCallbackFn, error) {
59+
60+
return u.UpgradeFn(
61+
ctx,
62+
version,
63+
sourceURI,
64+
action,
65+
details,
66+
skipVerifyOverride,
67+
skipDefaultPgp,
68+
pgpBytes...)
4969
}
5070

5171
func (u *mockUpgradeManager) Ack(ctx context.Context, acker acker.Acker) error {
@@ -65,7 +85,7 @@ func TestUpgradeHandler(t *testing.T) {
6585
log, _ := logger.New("", false)
6686

6787
agentInfo := &info.AgentInfo{}
68-
msgChan := make(chan string)
88+
upgradeCalledChan := make(chan struct{})
6989

7090
// Create and start the coordinator
7191
c := coordinator.New(
@@ -75,7 +95,21 @@ func TestUpgradeHandler(t *testing.T) {
7595
agentInfo,
7696
component.RuntimeSpecs{},
7797
nil,
78-
&mockUpgradeManager{msgChan: msgChan},
98+
&mockUpgradeManager{
99+
UpgradeFn: func(
100+
ctx context.Context,
101+
version string,
102+
sourceURI string,
103+
action *fleetapi.ActionUpgrade,
104+
details *details.Details,
105+
skipVerifyOverride bool,
106+
skipDefaultPgp bool,
107+
pgpBytes ...string) (reexec.ShutdownCallbackFn, error) {
108+
109+
upgradeCalledChan <- struct{}{}
110+
return nil, nil
111+
},
112+
},
79113
nil, nil, nil, nil, nil, false)
80114
//nolint:errcheck // We don't need the termination state of the Coordinator
81115
go c.Run(ctx)
@@ -86,8 +120,13 @@ func TestUpgradeHandler(t *testing.T) {
86120
ack := noopacker.New()
87121
err := u.Handle(ctx, &a, ack)
88122
require.NoError(t, err)
89-
msg := <-msgChan
90-
require.Equal(t, "completed 8.3.0", msg)
123+
124+
// Make sure this test does not deadlock or wait for too long
125+
select {
126+
case <-time.Tick(50 * time.Millisecond):
127+
t.Fatal("mockUpgradeManager.Upgrade was not called")
128+
case <-upgradeCalledChan:
129+
}
91130
}
92131

93132
func TestUpgradeHandlerSameVersion(t *testing.T) {
@@ -99,17 +138,37 @@ func TestUpgradeHandlerSameVersion(t *testing.T) {
99138
log, _ := logger.New("", false)
100139

101140
agentInfo := &info.AgentInfo{}
102-
msgChan := make(chan string)
141+
upgradeCalledChan := make(chan struct{})
103142

104143
// Create and start the Coordinator
144+
upgradeCalled := atomic.Bool{}
105145
c := coordinator.New(
106146
log,
107147
configuration.DefaultConfiguration(),
108148
logger.DefaultLogLevel,
109149
agentInfo,
110150
component.RuntimeSpecs{},
111151
nil,
112-
&mockUpgradeManager{msgChan: msgChan},
152+
&mockUpgradeManager{
153+
UpgradeFn: func(
154+
ctx context.Context,
155+
version string,
156+
sourceURI string,
157+
action *fleetapi.ActionUpgrade,
158+
details *details.Details,
159+
skipVerifyOverride bool,
160+
skipDefaultPgp bool,
161+
pgpBytes ...string) (reexec.ShutdownCallbackFn, error) {
162+
163+
if upgradeCalled.CompareAndSwap(false, true) {
164+
upgradeCalledChan <- struct{}{}
165+
return nil, nil
166+
}
167+
err := errors.New("mockUpgradeManager.Upgrade called more than once")
168+
t.Error(err.Error())
169+
return nil, err
170+
},
171+
},
113172
nil, nil, nil, nil, nil, false)
114173
//nolint:errcheck // We don't need the termination state of the Coordinator
115174
go c.Run(ctx)
@@ -122,8 +181,13 @@ func TestUpgradeHandlerSameVersion(t *testing.T) {
122181
err2 := u.Handle(ctx, &a, ack)
123182
require.NoError(t, err1)
124183
require.NoError(t, err2)
125-
msg := <-msgChan
126-
require.Equal(t, "completed 8.3.0", msg)
184+
185+
// Make sure this test does not deadlock or wait for too long
186+
select {
187+
case <-time.Tick(50 * time.Millisecond):
188+
t.Fatal("mockUpgradeManager.Upgrade was not called")
189+
case <-upgradeCalledChan:
190+
}
127191
}
128192

129193
func TestUpgradeHandlerNewVersion(t *testing.T) {
@@ -133,9 +197,9 @@ func TestUpgradeHandlerNewVersion(t *testing.T) {
133197
defer cancel()
134198

135199
log, _ := logger.New("", false)
200+
upgradeCalledChan := make(chan string)
136201

137202
agentInfo := &info.AgentInfo{}
138-
msgChan := make(chan string)
139203

140204
// Create and start the Coordinator
141205
c := coordinator.New(
@@ -145,7 +209,27 @@ func TestUpgradeHandlerNewVersion(t *testing.T) {
145209
agentInfo,
146210
component.RuntimeSpecs{},
147211
nil,
148-
&mockUpgradeManager{msgChan: msgChan},
212+
&mockUpgradeManager{
213+
UpgradeFn: func(
214+
ctx context.Context,
215+
version string,
216+
sourceURI string,
217+
action *fleetapi.ActionUpgrade,
218+
details *details.Details,
219+
skipVerifyOverride bool,
220+
skipDefaultPgp bool,
221+
pgpBytes ...string) (reexec.ShutdownCallbackFn, error) {
222+
223+
defer func() {
224+
upgradeCalledChan <- version
225+
}()
226+
if version == "8.2.0" {
227+
return nil, errors.New("upgrade to 8.2.0 will always fail")
228+
}
229+
230+
return nil, nil
231+
},
232+
},
149233
nil, nil, nil, nil, nil, false)
150234
//nolint:errcheck // We don't need the termination state of the Coordinator
151235
go c.Run(ctx)
@@ -156,13 +240,25 @@ func TestUpgradeHandlerNewVersion(t *testing.T) {
156240
a2 := fleetapi.ActionUpgrade{Data: fleetapi.ActionUpgradeData{
157241
Version: "8.5.0", SourceURI: "http://localhost"}}
158242
ack := noopacker.New()
243+
244+
checkMsg := func(c <-chan string, expected, errMsg string) {
245+
t.Helper()
246+
// Make sure this test does not deadlock or wait for too long
247+
// For some reason < 1s sometimes makes the test fail.
248+
select {
249+
case <-time.Tick(1300 * time.Millisecond):
250+
t.Fatal("timed out waiting for Upgrade to return")
251+
case msg := <-c:
252+
require.Equal(t, expected, msg, errMsg)
253+
}
254+
}
255+
256+
// Send both upgrade actions, a1 will error before a2 succeeds
159257
err1 := u.Handle(ctx, &a1, ack)
160258
require.NoError(t, err1)
161-
time.Sleep(1 * time.Second)
259+
checkMsg(upgradeCalledChan, "8.2.0", "first call must be with version 8.2.0")
260+
162261
err2 := u.Handle(ctx, &a2, ack)
163262
require.NoError(t, err2)
164-
msg1 := <-msgChan
165-
require.Equal(t, "canceled 8.2.0", msg1)
166-
msg2 := <-msgChan
167-
require.Equal(t, "completed 8.5.0", msg2)
263+
checkMsg(upgradeCalledChan, "8.5.0", "second call to Upgrade must be with version 8.5.0")
168264
}

internal/pkg/agent/application/upgrade/upgrade.go

+2-2
Original file line numberDiff line numberDiff line change
@@ -353,14 +353,14 @@ func waitForWatcherWithTimeoutCreationFunc(ctx context.Context, log *logger.Logg
353353
return fmt.Errorf("error starting update marker watcher: %w", err)
354354
}
355355

356-
log.Info("waiting up to %s for upgrade watcher to set %s state in upgrade marker", waitTime, details.StateWatching)
356+
log.Infof("waiting up to %s for upgrade watcher to set %s state in upgrade marker", waitTime, details.StateWatching)
357357

358358
for {
359359
select {
360360
case updMarker := <-markerWatcher.Watch():
361361
if updMarker.Details != nil && updMarker.Details.State == details.StateWatching {
362362
// watcher started and it is watching, all good
363-
log.Info("upgrade watcher set %s state in upgrade marker: exiting wait loop", details.StateWatching)
363+
log.Infof("upgrade watcher set %s state in upgrade marker: exiting wait loop", details.StateWatching)
364364
return nil
365365
}
366366

internal/pkg/agent/cmd/container_test.go

+2
Original file line numberDiff line numberDiff line change
@@ -24,11 +24,13 @@ func TestEnvWithDefault(t *testing.T) {
2424
require.Equal(t, def, res)
2525

2626
err := os.Setenv(key1, "key1")
27+
defer os.Unsetenv(key1)
2728
if err != nil {
2829
t.Skipf("could not export env var: %s", err)
2930
}
3031

3132
err = os.Setenv(key2, "key2")
33+
defer os.Unsetenv(key2)
3234
if err != nil {
3335
t.Skipf("could not export env var: %s", err)
3436
}

pkg/testing/define/define.go

+7-12
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@ import (
1717
"sync"
1818
"testing"
1919

20+
"github.com/gofrs/uuid/v5"
21+
2022
"github.com/elastic/elastic-agent-libs/kibana"
2123
"github.com/elastic/go-elasticsearch/v8"
2224
"github.com/elastic/go-sysinfo"
@@ -199,28 +201,21 @@ func getOSInfo() (*types.OSInfo, error) {
199201
// getNamespace is a general namespace that the test can use that will ensure that it
200202
// is unique and won't collide with other tests (even the same test from a different batch).
201203
//
202-
// this function uses a sha256 of the prefix, package and test name, to ensure that the
204+
// This function uses a sha256 of a UUIDv4 to ensure that the
203205
// length of the namespace is not over the 100 byte limit from Fleet
204206
// see: https://www.elastic.co/guide/en/fleet/current/data-streams.html#data-streams-naming-scheme
205207
func getNamespace(t *testing.T, local bool) (string, error) {
206-
prefix := os.Getenv("TEST_DEFINE_PREFIX")
207-
if prefix == "" {
208-
if local {
209-
prefix = "local"
210-
}
211-
if prefix == "" {
212-
return "", errors.New("TEST_DEFINE_PREFIX must be defined by the test runner")
213-
}
208+
nsUUID, err := uuid.NewV4()
209+
if err != nil {
210+
return "", fmt.Errorf("cannot generate UUID V4: %w", err)
214211
}
215-
name := fmt.Sprintf("%s-%s", prefix, t.Name())
216212
hasher := sha256.New()
217-
hasher.Write([]byte(name))
213+
hasher.Write([]byte(nsUUID.String()))
218214

219215
// Fleet API requires the namespace to be lowercased and not contain
220216
// special characters.
221217
namespace := strings.ToLower(base64.URLEncoding.EncodeToString(hasher.Sum(nil)))
222218
namespace = noSpecialCharsRegexp.ReplaceAllString(namespace, "")
223-
224219
return namespace, nil
225220
}
226221

pkg/testing/fixture.go

+27-1
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ import (
2626

2727
"github.com/elastic/elastic-agent/internal/pkg/agent/application/paths"
2828
"github.com/elastic/elastic-agent/internal/pkg/agent/application/upgrade/details"
29+
"github.com/elastic/elastic-agent/internal/pkg/agent/install"
2930
"github.com/elastic/elastic-agent/pkg/component"
3031
"github.com/elastic/elastic-agent/pkg/control"
3132
"github.com/elastic/elastic-agent/pkg/control/v2/client"
@@ -206,7 +207,7 @@ func (f *Fixture) Prepare(ctx context.Context, components ...UsableComponent) er
206207
if err != nil {
207208
return err
208209
}
209-
workDir := f.t.TempDir()
210+
workDir := createTempDir(f.t)
210211
finalDir := filepath.Join(workDir, name)
211212
err = ExtractArtifact(f.t, src, workDir)
212213
if err != nil {
@@ -1196,6 +1197,31 @@ func performConfigure(ctx context.Context, c client.Client, cfg string, timeout
11961197
return nil
11971198
}
11981199

1200+
// createTempDir creates a temporary directory that will be
1201+
// removed after the test passes. If the test fails, the
1202+
// directory is kept for further investigation.
1203+
//
1204+
// If the test is run with -v and fails, the temporary directory is logged.
1205+
func createTempDir(t *testing.T) string {
1206+
tempDir, err := os.MkdirTemp("", strings.ReplaceAll(t.Name(), "/", "-"))
1207+
if err != nil {
1208+
t.Fatalf("failed to make temp directory: %s", err)
1209+
}
1210+
1211+
cleanup := func() {
1212+
if !t.Failed() {
1213+
if err := install.RemovePath(tempDir); err != nil {
1214+
t.Errorf("could not remove temp dir '%s': %s", tempDir, err)
1215+
}
1216+
} else {
1217+
t.Logf("Temporary directory %q preserved for investigation/debugging", tempDir)
1218+
}
1219+
}
1220+
t.Cleanup(cleanup)
1221+
1222+
return tempDir
1223+
}
1224+
11991225
type AgentStatusOutput struct {
12001226
Info struct {
12011227
ID string `json:"id"`

pkg/testing/fixture_install.go

+7
Original file line numberDiff line numberDiff line change
@@ -416,6 +416,7 @@ func (f *Fixture) installDeb(ctx context.Context, installOpts *InstallOpts, opts
416416

417417
f.t.Cleanup(func() {
418418
f.t.Logf("[test %s] Inside fixture installDeb cleanup function", f.t.Name())
419+
419420
uninstallCtx, uninstallCancel := context.WithTimeout(context.Background(), 5*time.Minute)
420421
defer uninstallCancel()
421422
// stop elastic-agent, non fatal if error, might have been stopped before this.
@@ -424,6 +425,12 @@ func (f *Fixture) installDeb(ctx context.Context, installOpts *InstallOpts, opts
424425
if err != nil {
425426
f.t.Logf("error systemctl stop elastic-agent: %s, output: %s", err, string(out))
426427
}
428+
429+
if keepInstalledFlag() {
430+
f.t.Logf("skipping uninstall; test failed and AGENT_KEEP_INSTALLED=true")
431+
return
432+
}
433+
427434
// apt-get purge elastic-agent
428435
f.t.Logf("running 'sudo apt-get -y -q purge elastic-agent'")
429436
out, err = exec.CommandContext(uninstallCtx, "sudo", "apt-get", "-y", "-q", "purge", "elastic-agent").CombinedOutput()

pkg/testing/tools/estools/elasticsearch.go

+1
Original file line numberDiff line numberDiff line change
@@ -593,6 +593,7 @@ func PerformQueryForRawQuery(ctx context.Context, queryRaw map[string]interface{
593593
es.Search.WithContext(ctx),
594594
es.Search.WithSize(300),
595595
)
596+
596597
if err != nil {
597598
return Documents{}, fmt.Errorf("error performing ES search: %w", err)
598599
}

0 commit comments

Comments
 (0)