elastic · pierrehilbert · Jul 2, 2024 · Jul 2, 2024 · Jul 2, 2024 · Jul 2, 2024
@@ -0,0 +1,32 @@
+# Kind can be one of:
+# - breaking-change: a change to previously-documented behavior
+# - deprecation: functionality that is being removed in a later release
+# - bug-fix: fixes a problem in a previous version
+# - enhancement: extends functionality but does not break or fix existing behavior
+# - feature: new functionality
+# - known-issue: problems that we are aware of in a given version
+# - security: impacts on the security of a product or a user’s deployment.
+# - upgrade: important information for someone upgrading from a prior version
+# - other: does not fit into any of the other categories
+kind: bug-fix
+
+# Change summary; a 80ish characters long description of the change.
+summary: Fix indefinite memory and CPU consumption when waiting fleet to be ready
+
+# Long description; in case the summary is not enough to describe the change
+# this field accommodate a description without length limits.
+# NOTE: This field will be rendered only for breaking-change and known-issue kinds at the moment.
+#description:
+
+# Affected component; a word indicating the component this changeset affects.
+component:
+
+# PR URL; optional; the PR number that added the changeset.
+# If not present is automatically filled by the tooling finding the PR where this changelog fragment has been added.
+# NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number.
+# Please provide it if you are adding a fragment for a different PR.
+pr: https://github.com/elastic/elastic-agent/pull/5034
+
+# Issue URL; optional; the GitHub issue related to this changeset (either closes or is part of).
+# If not present is automatically filled by the tooling with the issue linked to the PR number.
+issue: https://github.com/elastic/elastic-agent/issues/5033
@@ -751,8 +751,16 @@ func waitForFleetServer(ctx context.Context, agentSubproc <-chan *os.ProcessStat
 		msg := ""
 		msgCount := 0
 		backExp := expBackoffWithContext(innerCtx, 1*time.Second, maxBackoff)
+
 		for {
-			backExp.Wait()
+			// if the timeout is reached, no response was sent on `res`, therefore
+			// send an error
+			if !backExp.Wait() {
+				resChan <- waitResult{err: fmt.Errorf(
+					"timed out waiting for Fleet Server to start after %s",
+					timeout)}
+			}
+
 			state, err := getDaemonState(innerCtx)
 			if errors.Is(err, context.Canceled) {
 				resChan <- waitResult{err: err}

@@ -577,6 +577,28 @@ func TestDaemonReloadWithBackoff(t *testing.T) {
 	}
 }
 
+func TestWaitForFleetServer_timeout(t *testing.T) {
+	log, _ := logger.NewTesting("TestWaitForFleetServer_timeout")
+	timeout := 5 * time.Second
+	testTimeout := 2 * timeout
+
+	ctx, cancel := context.WithTimeout(context.Background(), timeout)
+	defer cancel()
+	var got string
+	var err error
+	require.Eventuallyf(t,
+		func() bool {
+			got, err = waitForFleetServer(ctx, make(chan *os.ProcessState, 1), log, timeout)
+			return true
+		},
+		testTimeout,
+		500*time.Millisecond,
+		"waitForFleetServer never returned")
+
+	assert.Empty(t, got, "waitForFleetServer should have returned and empty enrollmentToken")
+	assert.Error(t, err, "waitForFleetServer should have returned an error")
+}
+
 func withServer(
 	m func(t *testing.T) *http.ServeMux,
 	test func(t *testing.T, host string),

@@ -45,7 +45,10 @@ func (b *ExpBackoff) NextWait() time.Duration {
 	return nextWait
 }
 
-// Wait block until either the timer is completed or channel is done.
+// Wait blocks until either the exponential backoff timer is completed or the
+// done channel is closed.
+// Wait returns true until done is closed. When done is closed, wait returns
+// immediately, therefore callers should always check the return value.
 func (b *ExpBackoff) Wait() bool {
 	b.duration = b.NextWait()