
Commit a9de876

improve watcher logs and TestWatcher_AgentErrorQuick logs (#5345)
Parent commit: eb572da

File tree: 2 files changed (+31, -14 lines)

internal/pkg/agent/application/upgrade/watcher.go (+15, -9)

@@ -93,14 +93,14 @@ func (ch *AgentWatcher) Run(ctx context.Context) {
 				if failedErr == nil {
 					flipFlopCount++
 					failedTimer.Reset(ch.checkInterval)
-					ch.log.Error("Agent reported failure (starting failed timer): %s", err)
+					ch.log.Errorf("Agent reported failure (starting failed timer): %s", err)
 				} else {
-					ch.log.Error("Agent reported failure (failed timer already started): %s", err)
+					ch.log.Errorf("Agent reported failure (failed timer already started): %s", err)
 				}
 			} else {
 				if failedErr != nil {
 					failedTimer.Stop()
-					ch.log.Error("Agent reported healthy (failed timer stopped): %s", err)
+					ch.log.Info("Agent reported healthy (failed timer stopped)")
 				}
 			}
 			failedErr = err
@@ -115,7 +115,8 @@ func (ch *AgentWatcher) Run(ctx context.Context) {
 					continue
 				}
 				// error lasted longer than the checkInterval, notify!
-				ch.notifyChan <- failedErr
+				ch.notifyChan <- fmt.Errorf("last error was not cleared before checkInterval (%s) elapsed: %w",
+					ch.checkInterval, failedErr)
 			}
 		}
 	}()
@@ -138,7 +139,7 @@ LOOP:
 		connectCancel()
 		if err != nil {
 			ch.connectCounter++
-			ch.log.Error("Failed connecting to running daemon: ", err)
+			ch.log.Errorf("Failed connecting to running daemon: %s", err)
 			if ch.checkFailures() {
 				return
 			}
@@ -152,7 +153,7 @@ LOOP:
 			// considered a connect error
 			stateCancel()
 			ch.agentClient.Disconnect()
-			ch.log.Error("Failed to start state watch: ", err)
+			ch.log.Errorf("Failed to start state watch: %s", err)
 			ch.connectCounter++
 			if ch.checkFailures() {
 				return
@@ -178,25 +179,30 @@ LOOP:
 		for {
 			state, err := watch.Recv()
 			if err != nil {
+				ch.log.Debugf("received state: error: %s",
+					err)
+
 				// agent has crashed or exited
 				stateCancel()
 				ch.agentClient.Disconnect()
-				ch.log.Error("Lost connection: failed reading next state: ", err)
+				ch.log.Errorf("Lost connection: failed reading next state: %s", err)
 				ch.lostCounter++
 				if ch.checkFailures() {
 					return
 				}
 				continue LOOP
 			}
+			ch.log.Debugf("received state: %s:%s",
+				state.State, state.Message)

 			// gRPC is good at hiding the fact that connection was lost
 			// to ensure that we don't miss a restart a changed PID means
 			// we are now talking to a different spawned Elastic Agent
 			if ch.lastPid == -1 {
 				ch.lastPid = state.Info.PID
-				ch.log.Info(fmt.Sprintf("Communicating with PID %d", ch.lastPid))
+				ch.log.Infof("Communicating with PID %d", ch.lastPid)
 			} else if ch.lastPid != state.Info.PID {
-				ch.log.Error(fmt.Sprintf("Communication with PID %d lost, now communicating with PID %d", ch.lastPid, state.Info.PID))
+				ch.log.Errorf("Communication with PID %d lost, now communicating with PID %d", ch.lastPid, state.Info.PID)
 				ch.lastPid = state.Info.PID
 				// count the PID change as a lost connection, but allow
 				// the communication to continue unless has become a failure
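
Most of this file's changes swap ch.log.Error(...) for ch.log.Errorf(...) wherever a format verb is present. The elastic-agent logger is zap-based: a sugared logger's Error simply concatenates its arguments, so a %s verb in the first argument is printed literally rather than interpolated, while Errorf applies fmt.Sprintf formatting. A minimal sketch of the difference, using go.uber.org/zap directly as a stand-in for the repository's logger package (the stand-in is an assumption; the Error/Errorf semantics are zap's):

package main

import (
	"errors"

	"go.uber.org/zap"
)

func main() {
	// Plain zap SugaredLogger used as a stand-in for ch.log (assumption:
	// the elastic-agent logger exposes the same Error/Errorf split).
	log := zap.NewExample().Sugar()
	err := errors.New("agent is unhealthy")

	// Error concatenates its arguments, so the logged message becomes
	// "Agent reported failure: %sagent is unhealthy" with the verb intact.
	log.Error("Agent reported failure: %s", err)

	// Errorf formats with fmt.Sprintf semantics, producing
	// "Agent reported failure: agent is unhealthy".
	log.Errorf("Agent reported failure: %s", err)
}

The remaining edits in this file follow the same logic: Infof/Errorf replace the fmt.Sprintf wrappers around Info/Error, the healthy-path message is downgraded to Info and drops the error argument, and the notification error is wrapped with %w so receivers on notifyChan can still unwrap the original failure.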

internal/pkg/agent/application/upgrade/watcher_test.go (+16, -5)

@@ -270,14 +270,25 @@ func TestWatcher_AgentError(t *testing.T) {
 }

 func TestWatcher_AgentErrorQuick(t *testing.T) {
-	// test tests for success, which only happens when no error comes in
-	// during this time period
+	// Success only happens when no error comes in during this time period
 	ctx, cancel := context.WithTimeout(context.Background(), 1*time.Second)
 	defer cancel()

 	errCh := make(chan error)
-	logger, _ := logger.NewTesting("watcher")
-	w := NewAgentWatcher(errCh, logger, 100*time.Millisecond)
+	log, obs := logger.NewTesting("watcher")
+	defer func() {
+		if t.Failed() {
+			rawLogs := obs.All()
+			for _, rawLog := range rawLogs {
+				msg := fmt.Sprintf("[%s] %s", rawLog.Level, rawLog.Message)
+				for k, v := range rawLog.ContextMap() {
+					msg += fmt.Sprintf("%s=%v", k, v)
+				}
+				t.Log(msg)
+			}
+		}
+	}()
+	w := NewAgentWatcher(errCh, log, 100*time.Millisecond)

 	// reports an error state, followed by a healthy state (should not error)
 	mockHandler := func(srv cproto.ElasticAgentControl_StateWatchServer) error {
@@ -302,7 +313,7 @@ func TestWatcher_AgentErrorQuick(t *testing.T) {
 		return nil
 	}
 	mock := &mockDaemon{watch: mockHandler}
-	require.NoError(t, mock.Start())
+	require.NoError(t, mock.Start(), "could not start mock agent daemon")
 	defer mock.Stop()

 	// set client to mock; before running
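
On the test side, TestWatcher_AgentErrorQuick now keeps the log observer returned by logger.NewTesting and replays every captured entry through t.Log, but only when the test has failed, so passing runs stay quiet while failures carry the watcher's full log history. A rough, self-contained sketch of the same pattern built directly on zap's zaptest/observer package (newObservedLogger below is a hypothetical stand-in for the repository's logger.NewTesting helper):

package example

import (
	"fmt"
	"testing"

	"go.uber.org/zap"
	"go.uber.org/zap/zapcore"
	"go.uber.org/zap/zaptest/observer"
)

// newObservedLogger is a hypothetical stand-in for logger.NewTesting: it
// returns a sugared logger plus an observer recording every entry.
func newObservedLogger() (*zap.SugaredLogger, *observer.ObservedLogs) {
	core, obs := observer.New(zapcore.DebugLevel)
	return zap.New(core).Sugar(), obs
}

func TestQuietUnlessFailed(t *testing.T) {
	log, obs := newObservedLogger()

	// Replay captured entries only on failure, mirroring the deferred dump
	// added to TestWatcher_AgentErrorQuick.
	defer func() {
		if t.Failed() {
			for _, entry := range obs.All() {
				msg := fmt.Sprintf("[%s] %s", entry.Level, entry.Message)
				for k, v := range entry.ContextMap() {
					msg += fmt.Sprintf(" %s=%v", k, v)
				}
				t.Log(msg)
			}
		}
	}()

	// Exercise the code under test; this output stays invisible unless the
	// test fails.
	log.Debugf("received state: %s:%s", "HEALTHY", "Running")
}

The extra message on require.NoError(t, mock.Start(), ...) serves the same goal: a setup failure is named explicitly instead of surfacing as a bare assertion error.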
