Skip to content

Commit 8aa3477

Browse files
authoredJul 2, 2024
Fix indefinite memory and CPU consumption when waiting for Fleet Server to be ready (#5034)
* exit if timeout is reached while waiting for fleet server to start * clarify exponential backoff behaviour * add test * add changelog * fix changelog
1 parent f858169 commit 8aa3477

File tree

4 files changed

+67
-2
lines changed

4 files changed

+67
-2
lines changed
 
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
# Kind can be one of:
2+
# - breaking-change: a change to previously-documented behavior
3+
# - deprecation: functionality that is being removed in a later release
4+
# - bug-fix: fixes a problem in a previous version
5+
# - enhancement: extends functionality but does not break or fix existing behavior
6+
# - feature: new functionality
7+
# - known-issue: problems that we are aware of in a given version
8+
# - security: impacts on the security of a product or a user’s deployment.
9+
# - upgrade: important information for someone upgrading from a prior version
10+
# - other: does not fit into any of the other categories
11+
kind: bug-fix
12+
13+
# Change summary; an 80ish-character-long description of the change.
14+
summary: Fix indefinite memory and CPU consumption when waiting for Fleet Server to be ready
15+
16+
# Long description; in case the summary is not enough to describe the change
17+
# this field accommodates a description without length limits.
18+
# NOTE: This field will be rendered only for breaking-change and known-issue kinds at the moment.
19+
#description:
20+
21+
# Affected component; a word indicating the component this changeset affects.
22+
component:
23+
24+
# PR URL; optional; the PR number that added the changeset.
25+
# If not present, it is automatically filled by the tooling finding the PR where this changelog fragment has been added.
26+
# NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number.
27+
# Please provide it if you are adding a fragment for a different PR.
28+
pr: https://github.com/elastic/elastic-agent/pull/5034
29+
30+
# Issue URL; optional; the GitHub issue related to this changeset (either closes or is part of).
31+
# If not present, it is automatically filled by the tooling with the issue linked to the PR number.
32+
issue: https://github.com/elastic/elastic-agent/issues/5033

‎internal/pkg/agent/cmd/enroll_cmd.go

+9-1
Original file line numberDiff line numberDiff line change
@@ -751,8 +751,16 @@ func waitForFleetServer(ctx context.Context, agentSubproc <-chan *os.ProcessStat
751751
msg := ""
752752
msgCount := 0
753753
backExp := expBackoffWithContext(innerCtx, 1*time.Second, maxBackoff)
754+
754755
for {
755-
backExp.Wait()
756+
// if the timeout is reached, no response was sent on `resChan`, therefore
757+
// send an error
758+
if !backExp.Wait() {
759+
resChan <- waitResult{err: fmt.Errorf(
760+
"timed out waiting for Fleet Server to start after %s",
761+
timeout)}
762+
}
763+
756764
state, err := getDaemonState(innerCtx)
757765
if errors.Is(err, context.Canceled) {
758766
resChan <- waitResult{err: err}

‎internal/pkg/agent/cmd/enroll_cmd_test.go

+22
Original file line numberDiff line numberDiff line change
@@ -577,6 +577,28 @@ func TestDaemonReloadWithBackoff(t *testing.T) {
577577
}
578578
}
579579

580+
func TestWaitForFleetServer_timeout(t *testing.T) {
581+
log, _ := logger.NewTesting("TestWaitForFleetServer_timeout")
582+
timeout := 5 * time.Second
583+
testTimeout := 2 * timeout
584+
585+
ctx, cancel := context.WithTimeout(context.Background(), timeout)
586+
defer cancel()
587+
var got string
588+
var err error
589+
require.Eventuallyf(t,
590+
func() bool {
591+
got, err = waitForFleetServer(ctx, make(chan *os.ProcessState, 1), log, timeout)
592+
return true
593+
},
594+
testTimeout,
595+
500*time.Millisecond,
596+
"waitForFleetServer never returned")
597+
598+
assert.Empty(t, got, "waitForFleetServer should have returned an empty enrollmentToken")
599+
assert.Error(t, err, "waitForFleetServer should have returned an error")
600+
}
601+
580602
func withServer(
581603
m func(t *testing.T) *http.ServeMux,
582604
test func(t *testing.T, host string),

‎internal/pkg/core/backoff/exponential.go

+4-1
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,10 @@ func (b *ExpBackoff) NextWait() time.Duration {
4545
return nextWait
4646
}
4747

48-
// Wait block until either the timer is completed or channel is done.
48+
// Wait blocks until either the exponential backoff timer is completed or the
49+
// done channel is closed.
50+
// Wait returns true until done is closed. When done is closed, Wait returns
51+
// immediately, therefore callers should always check the return value.
4952
func (b *ExpBackoff) Wait() bool {
5053
b.duration = b.NextWait()
5154

0 commit comments

Comments
 (0)
Please sign in to comment.