Skip to content

Commit 179ffdd

Browse files
[8.15](backport #5420) [Flaky Test] TestComponentBuildHashInDiagnostics improve agent state check (#5435)
* [Flaky Test] TestComponentBuildHashInDiagnostics improve agent state check (#5420) ensure the agent status has components, all components are healthy and the version info is up-to-date (cherry picked from commit 116e73f) * manually backport de3dec4 --------- Co-authored-by: Anderson Queiroz <anderson.queiroz@elastic.co>
1 parent fc28030 commit 179ffdd

File tree

2 files changed

+102
-14
lines changed

2 files changed

+102
-14
lines changed

pkg/testing/fixture_install.go

+17-5
Original file line numberDiff line numberDiff line change
@@ -402,7 +402,7 @@ func getProcesses(t *gotesting.T, regex string) []runningProcess {
402402
// - an error if any.
403403
func (f *Fixture) installDeb(ctx context.Context, installOpts *InstallOpts, opts []process.CmdOption) ([]byte, error) {
404404
f.t.Logf("[test %s] Inside fixture installDeb function", f.t.Name())
405-
//Prepare so that the f.srcPackage string is populated
405+
// Prepare so that the f.srcPackage string is populated
406406
err := f.EnsurePrepared(ctx)
407407
if err != nil {
408408
return nil, fmt.Errorf("failed to prepare: %w", err)
@@ -483,7 +483,7 @@ func (f *Fixture) installDeb(ctx context.Context, installOpts *InstallOpts, opts
483483
// - an error if any.
484484
func (f *Fixture) installRpm(ctx context.Context, installOpts *InstallOpts, opts []process.CmdOption) ([]byte, error) {
485485
f.t.Logf("[test %s] Inside fixture installRpm function", f.t.Name())
486-
//Prepare so that the f.srcPackage string is populated
486+
// Prepare so that the f.srcPackage string is populated
487487
err := f.EnsurePrepared(ctx)
488488
if err != nil {
489489
return nil, fmt.Errorf("failed to prepare: %w", err)
@@ -649,12 +649,12 @@ func (f *Fixture) collectDiagnostics() {
649649
ctx, cancel := context.WithTimeout(context.Background(), 1*time.Minute)
650650
defer cancel()
651651

652-
dir, err := findProjectRoot(f.caller)
652+
diagPath, err := f.DiagDir()
653653
if err != nil {
654-
f.t.Logf("failed to collect diagnostics; failed to find project root: %s", err)
654+
f.t.Logf("failed to collect diagnostics: %v", err)
655655
return
656656
}
657-
diagPath := filepath.Join(dir, "build", "diagnostics")
657+
658658
err = os.MkdirAll(diagPath, 0755)
659659
if err != nil {
660660
f.t.Logf("failed to collect diagnostics; failed to create %s: %s", diagPath, err)
@@ -699,6 +699,18 @@ func (f *Fixture) collectDiagnostics() {
699699
}
700700
}
701701

702+
// DiagDir returns the {projectRoot}/build/diagnostics path. Files on this path
703+
// are saved if any test fails. Use it to save files for further investigation.
704+
func (f *Fixture) DiagDir() (string, error) {
705+
dir, err := findProjectRoot(f.caller)
706+
if err != nil {
707+
return "", fmt.Errorf("failed to find project root: %w", err)
708+
}
709+
710+
diagPath := filepath.Join(dir, "build", "diagnostics")
711+
return diagPath, nil
712+
}
713+
702714
func (f *Fixture) archiveInstallDirectory(installPath string, outputPath string) error {
703715
file, err := os.Create(outputPath)
704716
if err != nil {

testing/integration/package_version_test.go

+85-9
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ import (
1111
"context"
1212
"encoding/json"
1313
"fmt"
14+
"io"
1415
"os"
1516
"os/exec"
1617
"path/filepath"
@@ -91,27 +92,60 @@ func TestComponentBuildHashInDiagnostics(t *testing.T) {
9192
"failed to install start agent [output: %s]", string(output))
9293

9394
stateBuff := bytes.Buffer{}
95+
var status atesting.AgentStatusOutput
9496
allHealthy := func() bool {
9597
stateBuff.Reset()
9698

97-
status, err := f.ExecStatus(ctx)
99+
status, err = f.ExecStatus(ctx)
98100
if err != nil {
99101
stateBuff.WriteString(fmt.Sprintf("failed to get agent status: %v",
100102
err))
101103
return false
102104
}
103105

106+
if client.State(status.State) != client.Healthy {
107+
stateBuff.WriteString(fmt.Sprintf(
108+
"agent isn't healthy: %s-%s",
109+
client.State(status.State), status.Message))
110+
return false
111+
}
112+
113+
if len(status.Components) == 0 {
114+
stateBuff.WriteString(fmt.Sprintf(
115+
"healthy but without components: agent status: %s-%s",
116+
client.State(status.State), status.Message))
117+
return false
118+
}
119+
120+
// the agent might be healthy but waiting for its first configuration,
121+
// in that case, there would be no components yet. Therefore, ensure
122+
// the agent received the policy with components before proceeding with
123+
// the test.
104124
for _, c := range status.Components {
125+
bs, err := json.MarshalIndent(status, "", " ")
126+
if err != nil {
127+
stateBuff.WriteString(fmt.Sprintf(
128+
"%s not healthy, could not marshal status outptu: %v",
129+
c.Name, err))
130+
return false
131+
}
132+
105133
state := client.State(c.State)
106134
if state != client.Healthy {
107-
bs, err := json.MarshalIndent(status, "", " ")
108-
if err != nil {
109-
stateBuff.WriteString(fmt.Sprintf("%s not health, could not marshal status outptu: %v",
110-
c.Name, err))
111-
return false
112-
}
113-
114-
stateBuff.WriteString(fmt.Sprintf("%s not health, agent status output: %s",
135+
stateBuff.WriteString(fmt.Sprintf(
136+
"%s not health, agent status output: %s",
137+
c.Name, bs))
138+
return false
139+
}
140+
141+
// there is a rare race condition, unlikely to happen in a
142+
// production scenario where the component is healthy but the
143+
// version info is slow to update. As the Status command and the
144+
// diagnostics fetch this information in the same way, it guarantees
145+
// the version info is up-to-date before proceeding with the test.
146+
if c.VersionInfo.Meta.Commit == "" {
147+
stateBuff.WriteString(fmt.Sprintf(
148+
"%s health, but no versionInfo. agent status output: %s",
115149
c.Name, bs))
116150
return false
117151
}
@@ -123,6 +157,13 @@ func TestComponentBuildHashInDiagnostics(t *testing.T) {
123157
allHealthy,
124158
5*time.Minute, 10*time.Second,
125159
"agent never became healthy. Last status: %v", &stateBuff)
160+
defer func() {
161+
if !t.Failed() {
162+
return
163+
}
164+
165+
t.Logf("test failed: last status output: %#v", status)
166+
}()
126167

127168
agentbeat := "agentbeat"
128169
if runtime.GOOS == "windows" {
@@ -159,6 +200,28 @@ func TestComponentBuildHashInDiagnostics(t *testing.T) {
159200

160201
diag := t.TempDir()
161202
extractZipArchive(t, diagZip, diag)
203+
// if the test fails, the diagnostics used are useful for debugging.
204+
defer func() {
205+
if !t.Failed() {
206+
return
207+
}
208+
209+
t.Logf("the test failed: trying to save the diagnostics used on the test")
210+
diagDir, err := f.DiagDir()
211+
if err != nil {
212+
t.Logf("could not get diagnostics directory to save the diagnostics used on the test")
213+
return
214+
}
215+
216+
err = os.Rename(diagZip, filepath.Join(diagDir,
217+
fmt.Sprintf("TestComponentBuildHashInDiagnostics-used-diag-%d.zip",
218+
time.Now().Unix())))
219+
if err != nil {
220+
t.Logf("could not move diagnostics used in the test to %s: %v",
221+
diagDir, err)
222+
return
223+
}
224+
}()
162225

163226
stateFilePath := filepath.Join(diag, "state.yaml")
164227
stateYAML, err := os.Open(stateFilePath)
@@ -192,6 +255,19 @@ func TestComponentBuildHashInDiagnostics(t *testing.T) {
192255
assert.Equalf(t, wantBuildHash, c.State.VersionInfo.Meta.Commit,
193256
"component %s: VersionInfo.Meta.Commit mismatch", c.ID)
194257
}
258+
259+
if t.Failed() {
260+
_, seek := stateYAML.Seek(0, 0)
261+
if seek != nil {
262+
t.Logf("could not reset state.yaml offset to print it")
263+
return
264+
}
265+
data, err := io.ReadAll(stateYAML)
266+
if err != nil {
267+
t.Logf("could not read state.yaml: %v", err)
268+
}
269+
t.Logf("test failed: state.yaml contents: %q", string(data))
270+
}
195271
}
196272

197273
func testVersionWithRunningAgent(runCtx context.Context, f *atesting.Fixture) func(*testing.T) {

0 commit comments

Comments
 (0)