@@ -93,14 +93,14 @@ func (ch *AgentWatcher) Run(ctx context.Context) {
 					if failedErr == nil {
 						flipFlopCount++
 						failedTimer.Reset(ch.checkInterval)
-						ch.log.Error("Agent reported failure (starting failed timer): %s", err)
+						ch.log.Errorf("Agent reported failure (starting failed timer): %s", err)
 					} else {
-						ch.log.Error("Agent reported failure (failed timer already started): %s", err)
+						ch.log.Errorf("Agent reported failure (failed timer already started): %s", err)
 					}
 				} else {
 					if failedErr != nil {
 						failedTimer.Stop()
-						ch.log.Error("Agent reported healthy (failed timer stopped): %s", err)
+						ch.log.Info("Agent reported healthy (failed timer stopped)")
 					}
 				}
 				failedErr = err
@@ -115,7 +115,8 @@ func (ch *AgentWatcher) Run(ctx context.Context) {
 					continue
 				}
 				// error lasted longer than the checkInterval, notify!
-				ch.notifyChan <- failedErr
+				ch.notifyChan <- fmt.Errorf("last error was not cleared before checkInterval (%s) elapsed: %w",
+					ch.checkInterval, failedErr)
 			}
 		}
 	}()
@@ -138,7 +139,7 @@ LOOP:
 		connectCancel()
 		if err != nil {
 			ch.connectCounter++
-			ch.log.Error("Failed connecting to running daemon: ", err)
+			ch.log.Errorf("Failed connecting to running daemon: %s", err)
 			if ch.checkFailures() {
 				return
 			}
@@ -152,7 +153,7 @@ LOOP:
 			// considered a connect error
 			stateCancel()
 			ch.agentClient.Disconnect()
-			ch.log.Error("Failed to start state watch: ", err)
+			ch.log.Errorf("Failed to start state watch: %s", err)
 			ch.connectCounter++
 			if ch.checkFailures() {
 				return
@@ -178,25 +179,30 @@ LOOP:
 		for {
 			state, err := watch.Recv()
 			if err != nil {
+				ch.log.Debugf("received state: error: %s",
+					err)
+
 				// agent has crashed or exited
 				stateCancel()
 				ch.agentClient.Disconnect()
-				ch.log.Error("Lost connection: failed reading next state: ", err)
+				ch.log.Errorf("Lost connection: failed reading next state: %s", err)
 				ch.lostCounter++
 				if ch.checkFailures() {
 					return
 				}
 				continue LOOP
 			}
+			ch.log.Debugf("received state: %s:%s",
+				state.State, state.Message)
 
 			// gRPC is good at hiding the fact that connection was lost
 			// to ensure that we don't miss a restart a changed PID means
 			// we are now talking to a different spawned Elastic Agent
 			if ch.lastPid == -1 {
 				ch.lastPid = state.Info.PID
-				ch.log.Info(fmt.Sprintf("Communicating with PID %d", ch.lastPid))
+				ch.log.Infof("Communicating with PID %d", ch.lastPid)
 			} else if ch.lastPid != state.Info.PID {
-				ch.log.Error(fmt.Sprintf("Communication with PID %d lost, now communicating with PID %d", ch.lastPid, state.Info.PID))
+				ch.log.Errorf("Communication with PID %d lost, now communicating with PID %d", ch.lastPid, state.Info.PID)
 				ch.lastPid = state.Info.PID
 				// count the PID change as a lost connection, but allow
 				// the communication to continue unless has become a failure
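Note on the `Error` → `Errorf` changes above: the watcher's logger exposes both print-style and printf-style methods (it appears to wrap zap's `SugaredLogger`), so passing a `%s` verb to `Error` leaves the verb unexpanded and the error is just appended as an extra argument. The sketch below is a minimal, standalone illustration of that difference using zap directly; it is not elastic-agent code, and the message text is borrowed from the diff only for comparison.

```go
package main

import (
	"errors"

	"go.uber.org/zap"
)

func main() {
	// Standalone logger for demonstration only; elastic-agent builds its own
	// logger, the Sugared* method semantics are the assumption being shown.
	sugar := zap.NewExample().Sugar()
	defer func() { _ = sugar.Sync() }()

	err := errors.New("connection refused")

	// Print-style: arguments are concatenated, so "%s" appears literally
	// in the output and the error text is tacked on after it.
	sugar.Error("Failed connecting to running daemon: %s", err)

	// Printf-style: the first argument is a format string, so the error
	// is substituted where the %s verb appears.
	sugar.Errorf("Failed connecting to running daemon: %s", err)
}
```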
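The notify path also now wraps the original error with `%w` before sending it on `notifyChan`, so the receiver gets the check-interval context without losing the ability to inspect the underlying error. Below is a minimal sketch of that behavior using only the standard library; `errAgentFailed` and the local channel are hypothetical stand-ins, not the watcher's actual types.

```go
package main

import (
	"errors"
	"fmt"
	"time"
)

// errAgentFailed is a hypothetical sentinel standing in for whatever error
// the watcher last observed before the failed timer fired.
var errAgentFailed = errors.New("agent reported failure")

func main() {
	checkInterval := 30 * time.Second
	notifyChan := make(chan error, 1)

	// Mirrors the diff's wrapping: %w keeps errAgentFailed in the error chain.
	notifyChan <- fmt.Errorf("last error was not cleared before checkInterval (%s) elapsed: %w",
		checkInterval, errAgentFailed)

	got := <-notifyChan
	fmt.Println(got)                            // message includes the interval context
	fmt.Println(errors.Is(got, errAgentFailed)) // true: the original error still matches
}
```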