@@ -94,14 +94,14 @@ func (ch *AgentWatcher) Run(ctx context.Context) {
 					if failedErr == nil {
 						flipFlopCount++
 						failedTimer.Reset(ch.checkInterval)
-						ch.log.Error("Agent reported failure (starting failed timer): %s", err)
+						ch.log.Errorf("Agent reported failure (starting failed timer): %s", err)
 					} else {
-						ch.log.Error("Agent reported failure (failed timer already started): %s", err)
+						ch.log.Errorf("Agent reported failure (failed timer already started): %s", err)
 					}
 				} else {
 					if failedErr != nil {
 						failedTimer.Stop()
-						ch.log.Error("Agent reported healthy (failed timer stopped): %s", err)
+						ch.log.Info("Agent reported healthy (failed timer stopped)")
 					}
 				}
 				failedErr = err
@@ -116,7 +116,8 @@ func (ch *AgentWatcher) Run(ctx context.Context) {
 					continue
 				}
 				// error lasted longer than the checkInterval, notify!
-				ch.notifyChan <- failedErr
+				ch.notifyChan <- fmt.Errorf("last error was not cleared before checkInterval (%s) elapsed: %w",
+					ch.checkInterval, failedErr)
 			}
 		}
 	}()
@@ -134,11 +135,12 @@ LOOP:
 		// block on connection, don't retry connection, and fail on temp dial errors
 		// always a local connection it should connect quickly so the timeout is only 1 second
 		connectCtx, connectCancel := context.WithTimeout(ctx, 1*time.Second)
+		//nolint:staticcheck // requires changing client signature
 		err := ch.agentClient.Connect(connectCtx, grpc.WithBlock(), grpc.WithDisableRetry(), grpc.FailOnNonTempDialError(true))
 		connectCancel()
 		if err != nil {
 			ch.connectCounter++
-			ch.log.Error("Failed connecting to running daemon: ", err)
+			ch.log.Errorf("Failed connecting to running daemon: %s", err)
 			if ch.checkFailures() {
 				return
 			}
@@ -152,7 +154,7 @@ LOOP:
 			// considered a connect error
 			stateCancel()
 			ch.agentClient.Disconnect()
-			ch.log.Error("Failed to start state watch: ", err)
+			ch.log.Errorf("Failed to start state watch: %s", err)
 			ch.connectCounter++
 			if ch.checkFailures() {
 				return
@@ -178,25 +180,30 @@ LOOP:
 		for {
 			state, err := watch.Recv()
 			if err != nil {
+				ch.log.Debugf("received state: error: %s",
+					err)
+
 				// agent has crashed or exited
 				stateCancel()
 				ch.agentClient.Disconnect()
-				ch.log.Error("Lost connection: failed reading next state: ", err)
+				ch.log.Errorf("Lost connection: failed reading next state: %s", err)
 				ch.lostCounter++
 				if ch.checkFailures() {
 					return
 				}
 				continue LOOP
 			}
+			ch.log.Debugf("received state: %s:%s",
+				state.State, state.Message)
 
 			// gRPC is good at hiding the fact that connection was lost
 			// to ensure that we don't miss a restart a changed PID means
 			// we are now talking to a different spawned Elastic Agent
 			if ch.lastPid == -1 {
 				ch.lastPid = state.Info.PID
-				ch.log.Info(fmt.Sprintf("Communicating with PID %d", ch.lastPid))
+				ch.log.Infof("Communicating with PID %d", ch.lastPid)
 			} else if ch.lastPid != state.Info.PID {
-				ch.log.Error(fmt.Sprintf("Communication with PID %d lost, now communicating with PID %d", ch.lastPid, state.Info.PID))
+				ch.log.Errorf("Communication with PID %d lost, now communicating with PID %d", ch.lastPid, state.Info.PID)
 				ch.lastPid = state.Info.PID
 				// count the PID change as a lost connection, but allow
 				// the communication to continue unless has become a failure