
Commit fc28030

improve watcher logs and TestWatcher_AgentErrorQuick logs (#5345) (#5357)
(cherry picked from commit a9de876)

Co-authored-by: Anderson Queiroz <anderson.queiroz@elastic.co>
1 parent 1fa4c2d commit fc28030

3 files changed: +34 -16 lines changed


internal/pkg/agent/application/actions/handlers/handler_action_upgrade_test.go

+2 -2
@@ -182,9 +182,9 @@ func TestUpgradeHandlerSameVersion(t *testing.T) {
     require.NoError(t, err1)
     require.NoError(t, err2)
 
-    // Make sure this test does not dead lock or wait for too long
+    // Make sure this test does not deadlock or wait for too long
     select {
-    case <-time.Tick(50 * time.Millisecond):
+    case <-time.Tick(200 * time.Millisecond):
         t.Fatal("mockUpgradeManager.Upgrade was not called")
     case <-upgradeCalledChan:
     }
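The only functional change here is the deadline in the select guard: 50 ms left little headroom before the test declared a deadlock, so the window is widened to 200 ms. Below is a minimal, self-contained sketch of the same guard pattern; the names (TestCallHappensInTime, calledChan) are hypothetical, and it uses time.After rather than time.Tick since only a single deadline is needed.

package example_test

import (
    "testing"
    "time"
)

// Sketch: wait for a signal on a channel, but fail fast with a clear message
// if it does not arrive within a bounded window instead of hanging forever.
func TestCallHappensInTime(t *testing.T) {
    calledChan := make(chan struct{})
    go func() {
        // stand-in for the code under test signalling that it was invoked
        close(calledChan)
    }()

    select {
    case <-time.After(200 * time.Millisecond):
        t.Fatal("expected call did not happen in time")
    case <-calledChan:
        // success: the call happened before the deadline
    }
}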

internal/pkg/agent/application/upgrade/watcher.go

+16 -9
@@ -94,14 +94,14 @@ func (ch *AgentWatcher) Run(ctx context.Context) {
             if failedErr == nil {
                 flipFlopCount++
                 failedTimer.Reset(ch.checkInterval)
-                ch.log.Error("Agent reported failure (starting failed timer): %s", err)
+                ch.log.Errorf("Agent reported failure (starting failed timer): %s", err)
             } else {
-                ch.log.Error("Agent reported failure (failed timer already started): %s", err)
+                ch.log.Errorf("Agent reported failure (failed timer already started): %s", err)
             }
         } else {
             if failedErr != nil {
                 failedTimer.Stop()
-                ch.log.Error("Agent reported healthy (failed timer stopped): %s", err)
+                ch.log.Info("Agent reported healthy (failed timer stopped)")
             }
         }
         failedErr = err
@@ -116,7 +116,8 @@ func (ch *AgentWatcher) Run(ctx context.Context) {
                 continue
             }
             // error lasted longer than the checkInterval, notify!
-            ch.notifyChan <- failedErr
+            ch.notifyChan <- fmt.Errorf("last error was not cleared before checkInterval (%s) elapsed: %w",
+                ch.checkInterval, failedErr)
         }
     }
 }()
@@ -134,11 +135,12 @@ LOOP:
         // block on connection, don't retry connection, and fail on temp dial errors
         // always a local connection it should connect quickly so the timeout is only 1 second
         connectCtx, connectCancel := context.WithTimeout(ctx, 1*time.Second)
+        //nolint:staticcheck // requires changing client signature
         err := ch.agentClient.Connect(connectCtx, grpc.WithBlock(), grpc.WithDisableRetry(), grpc.FailOnNonTempDialError(true))
         connectCancel()
         if err != nil {
             ch.connectCounter++
-            ch.log.Error("Failed connecting to running daemon: ", err)
+            ch.log.Errorf("Failed connecting to running daemon: %s", err)
             if ch.checkFailures() {
                 return
             }
@@ -152,7 +154,7 @@ LOOP:
             // considered a connect error
             stateCancel()
             ch.agentClient.Disconnect()
-            ch.log.Error("Failed to start state watch: ", err)
+            ch.log.Errorf("Failed to start state watch: %s", err)
             ch.connectCounter++
             if ch.checkFailures() {
                 return
@@ -178,25 +180,30 @@ LOOP:
         for {
             state, err := watch.Recv()
             if err != nil {
+                ch.log.Debugf("received state: error: %s",
+                    err)
+
                 // agent has crashed or exited
                 stateCancel()
                 ch.agentClient.Disconnect()
-                ch.log.Error("Lost connection: failed reading next state: ", err)
+                ch.log.Errorf("Lost connection: failed reading next state: %s", err)
                 ch.lostCounter++
                 if ch.checkFailures() {
                     return
                 }
                 continue LOOP
             }
+            ch.log.Debugf("received state: %s:%s",
+                state.State, state.Message)
 
             // gRPC is good at hiding the fact that connection was lost
             // to ensure that we don't miss a restart a changed PID means
             // we are now talking to a different spawned Elastic Agent
             if ch.lastPid == -1 {
                 ch.lastPid = state.Info.PID
-                ch.log.Info(fmt.Sprintf("Communicating with PID %d", ch.lastPid))
+                ch.log.Infof("Communicating with PID %d", ch.lastPid)
             } else if ch.lastPid != state.Info.PID {
-                ch.log.Error(fmt.Sprintf("Communication with PID %d lost, now communicating with PID %d", ch.lastPid, state.Info.PID))
+                ch.log.Errorf("Communication with PID %d lost, now communicating with PID %d", ch.lastPid, state.Info.PID)
                 ch.lastPid = state.Info.PID
                 // count the PID change as a lost connection, but allow
                 // the communication to continue unless has become a failure
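Two patterns recur in the watcher changes above. First, the Error to Errorf swaps: with a zap-style sugared logger such as the one the agent wraps, Error concatenates its arguments fmt.Sprint-style and would print the literal %s verb, whereas Errorf applies printf formatting. Second, the error pushed onto notifyChan is now wrapped with %w, so the receiver can still inspect the underlying failure. A standalone sketch of the wrapping behaviour, using only the standard library (the 30-second interval and error text here are illustrative):

package main

import (
    "errors"
    "fmt"
    "time"
)

func main() {
    checkInterval := 30 * time.Second
    failedErr := errors.New("agent reported failure: component unhealthy")

    // Wrap the underlying error with %w before handing it off, as the
    // watcher now does for notifyChan, so callers keep access to the cause.
    notified := fmt.Errorf("last error was not cleared before checkInterval (%s) elapsed: %w",
        checkInterval, failedErr)

    fmt.Println(notified)
    fmt.Println(errors.Is(notified, failedErr))       // true: original error preserved
    fmt.Println(errors.Unwrap(notified) == failedErr) // true
}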

internal/pkg/agent/application/upgrade/watcher_test.go

+16 -5
@@ -270,14 +270,25 @@ func TestWatcher_AgentError(t *testing.T) {
 }
 
 func TestWatcher_AgentErrorQuick(t *testing.T) {
-    // test tests for success, which only happens when no error comes in
-    // during this time period
+    // Success only happens when no error comes in during this time period
     ctx, cancel := context.WithTimeout(context.Background(), 1*time.Second)
     defer cancel()
 
     errCh := make(chan error)
-    logger, _ := logger.NewTesting("watcher")
-    w := NewAgentWatcher(errCh, logger, 100*time.Millisecond)
+    log, obs := logger.NewTesting("watcher")
+    defer func() {
+        if t.Failed() {
+            rawLogs := obs.All()
+            for _, rawLog := range rawLogs {
+                msg := fmt.Sprintf("[%s] %s", rawLog.Level, rawLog.Message)
+                for k, v := range rawLog.ContextMap() {
+                    msg += fmt.Sprintf("%s=%v", k, v)
+                }
+                t.Log(msg)
+            }
+        }
+    }()
+    w := NewAgentWatcher(errCh, log, 100*time.Millisecond)
 
     // reports an error state, followed by a healthy state (should not error)
     mockHandler := func(srv cproto.ElasticAgentControl_StateWatchServer) error {
@@ -302,7 +313,7 @@ func TestWatcher_AgentErrorQuick(t *testing.T) {
         return nil
     }
     mock := &mockDaemon{watch: mockHandler}
-    require.NoError(t, mock.Start())
+    require.NoError(t, mock.Start(), "could not start mock agent daemon")
     defer mock.Stop()
 
     // set client to mock; before running
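The test now captures its logs through an observer and replays them only when the test fails, so passing runs stay quiet while failures carry full context. Below is a standalone sketch of that idea using zap's zaptest/observer package directly rather than the agent's logger.NewTesting helper; the package and test names are illustrative.

package watcher_test

import (
    "fmt"
    "testing"

    "go.uber.org/zap"
    "go.uber.org/zap/zaptest/observer"
)

func TestDumpLogsOnFailure(t *testing.T) {
    // Collect all log entries in memory instead of writing them out.
    core, obs := observer.New(zap.DebugLevel)
    log := zap.New(core).Sugar()

    // Replay the captured entries only if the test ends up failing.
    defer func() {
        if t.Failed() {
            for _, entry := range obs.All() {
                msg := fmt.Sprintf("[%s] %s", entry.Level, entry.Message)
                for k, v := range entry.ContextMap() {
                    msg += fmt.Sprintf(" %s=%v", k, v)
                }
                t.Log(msg)
            }
        }
    }()

    log.Debugw("starting watcher", "interval", "100ms")
    // ... exercise the code under test here ...
}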
