@@ -94,14 +94,14 @@ func (ch *AgentWatcher) Run(ctx context.Context) {
 					if failedErr == nil {
 						flipFlopCount++
 						failedTimer.Reset(ch.checkInterval)
-						ch.log.Error("Agent reported failure (starting failed timer): %s", err)
+						ch.log.Errorf("Agent reported failure (starting failed timer): %s", err)
 					} else {
-						ch.log.Error("Agent reported failure (failed timer already started): %s", err)
+						ch.log.Errorf("Agent reported failure (failed timer already started): %s", err)
 					}
 				} else {
 					if failedErr != nil {
 						failedTimer.Stop()
-						ch.log.Error("Agent reported healthy (failed timer stopped): %s", err)
+						ch.log.Info("Agent reported healthy (failed timer stopped)")
 					}
 				}
 				failedErr = err
@@ -116,7 +116,8 @@ func (ch *AgentWatcher) Run(ctx context.Context) {
 					continue
 				}
 				// error lasted longer than the checkInterval, notify!
-				ch.notifyChan <- failedErr
+				ch.notifyChan <- fmt.Errorf("last error was not cleared before checkInterval (%s) elapsed: %w",
+					ch.checkInterval, failedErr)
 			}
 		}
 	}()
@@ -134,11 +135,12 @@ LOOP:
 		// block on connection, don't retry connection, and fail on temp dial errors
 		// always a local connection it should connect quickly so the timeout is only 1 second
 		connectCtx, connectCancel := context.WithTimeout(ctx, 1*time.Second)
+		//nolint:staticcheck // requires changing client signature
 		err := ch.agentClient.Connect(connectCtx, grpc.WithBlock(), grpc.WithDisableRetry(), grpc.FailOnNonTempDialError(true))
 		connectCancel()
 		if err != nil {
 			ch.connectCounter++
-			ch.log.Error("Failed connecting to running daemon: ", err)
+			ch.log.Errorf("Failed connecting to running daemon: %s", err)
 			if ch.checkFailures() {
 				return
 			}
@@ -152,7 +154,7 @@ LOOP:
 			// considered a connect error
 			stateCancel()
 			ch.agentClient.Disconnect()
-			ch.log.Error("Failed to start state watch: ", err)
+			ch.log.Errorf("Failed to start state watch: %s", err)
 			ch.connectCounter++
 			if ch.checkFailures() {
 				return
@@ -178,25 +180,30 @@ LOOP:
 		for {
 			state, err := watch.Recv()
 			if err != nil {
+				ch.log.Debugf("received state: error: %s",
+					err)
+
 				// agent has crashed or exited
 				stateCancel()
 				ch.agentClient.Disconnect()
-				ch.log.Error("Lost connection: failed reading next state: ", err)
+				ch.log.Errorf("Lost connection: failed reading next state: %s", err)
 				ch.lostCounter++
 				if ch.checkFailures() {
 					return
 				}
 				continue LOOP
 			}
+			ch.log.Debugf("received state: %s:%s",
+				state.State, state.Message)
 
 			// gRPC is good at hiding the fact that connection was lost
 			// to ensure that we don't miss a restart a changed PID means
 			// we are now talking to a different spawned Elastic Agent
 			if ch.lastPid == -1 {
 				ch.lastPid = state.Info.PID
-				ch.log.Info(fmt.Sprintf("Communicating with PID %d", ch.lastPid))
+				ch.log.Infof("Communicating with PID %d", ch.lastPid)
 			} else if ch.lastPid != state.Info.PID {
-				ch.log.Error(fmt.Sprintf("Communication with PID %d lost, now communicating with PID %d", ch.lastPid, state.Info.PID))
+				ch.log.Errorf("Communication with PID %d lost, now communicating with PID %d", ch.lastPid, state.Info.PID)
 				ch.lastPid = state.Info.PID
 				// count the PID change as a lost connection, but allow
 				// the communication to continue unless has become a failure