Skip to content

Commit f84c05b

Browse files
authored
Metricbeat: add configurable failure threshold before reporting streams as degraded (elastic#41570)
* Metricbeat: add configurable failure threshold before reporting streams as degraded. With this change it is possible to configure a threshold for the number of consecutive errors that may happen while fetching metrics for a given stream before the stream gets marked as DEGRADED. To configure such a threshold, add a "failure_threshold": <n> setting to a module configuration block. Depending on the value of <n>, the threshold is configured in different ways: n == 0: status reporting for the stream is disabled, and the stream will never become DEGRADED no matter how many errors are encountered while fetching metrics; n == 1 or failure_threshold not specified: backward-compatible behavior, the stream will become DEGRADED at the first error encountered; n > 1: the stream will become DEGRADED once at least n consecutive errors have been encountered. When a fetch operation completes without errors, the consecutive errors counter is reset and the stream is set to HEALTHY.
1 parent b219763 commit f84c05b

File tree

2 files changed

+636
-44
lines changed

2 files changed

+636
-44
lines changed

metricbeat/mb/module/wrapper.go

+69-44
Original file line numberDiff line numberDiff line change
@@ -36,11 +36,15 @@ import (
3636
"github.com/elastic/elastic-agent-libs/testing"
3737
)
3838

39-
// Expvar metric names.
4039
const (
41-
successesKey = "success"
42-
failuresKey = "failures"
43-
eventsKey = "events"
40+
// Expvar metric names.
41+
successesKey = "success"
42+
failuresKey = "failures"
43+
eventsKey = "events"
44+
consecutiveFailuresKey = "consecutive_failures"
45+
46+
// Failure threshold config key
47+
failureThresholdKey = "failure_threshold"
4448
)
4549

4650
var (
@@ -70,16 +74,18 @@ type metricSetWrapper struct {
7074
module *Wrapper // Parent Module.
7175
stats *stats // stats for this MetricSet.
7276

73-
periodic bool // Set to true if this metricset is a periodic fetcher
77+
periodic bool // Set to true if this metricset is a periodic fetcher
78+
failureThreshold uint // threshold of consecutive errors needed to set the stream as degraded
7479
}
7580

7681
// stats bundles common metricset stats.
7782
type stats struct {
78-
key string // full stats key
79-
ref uint32 // number of modules/metricsets reusing stats instance
80-
success *monitoring.Int // Total success events.
81-
failures *monitoring.Int // Total error events.
82-
events *monitoring.Int // Total events published.
83+
key string // full stats key
84+
ref uint32 // number of modules/metricsets reusing stats instance
85+
success *monitoring.Int // Total success events.
86+
failures *monitoring.Int // Total error events.
87+
events *monitoring.Int // Total events published.
88+
consecutiveFailures *monitoring.Uint // Consecutive failures fetching this metricset
8389
}
8490

8591
// NewWrapper creates a new module and its associated metricsets based on the given configuration.
@@ -106,11 +112,28 @@ func createWrapper(module mb.Module, metricSets []mb.MetricSet, options ...Optio
106112
applyOption(wrapper)
107113
}
108114

115+
failureThreshold := uint(1)
116+
117+
var streamHealthSettings struct {
118+
FailureThreshold *uint `config:"failure_threshold"`
119+
}
120+
121+
err := module.UnpackConfig(&streamHealthSettings)
122+
123+
if err != nil {
124+
return nil, fmt.Errorf("unpacking raw config: %w", err)
125+
}
126+
127+
if streamHealthSettings.FailureThreshold != nil {
128+
failureThreshold = *streamHealthSettings.FailureThreshold
129+
}
130+
109131
for i, metricSet := range metricSets {
110132
wrapper.metricSets[i] = &metricSetWrapper{
111-
MetricSet: metricSet,
112-
module: wrapper,
113-
stats: getMetricSetStats(wrapper.Name(), metricSet.Name()),
133+
MetricSet: metricSet,
134+
module: wrapper,
135+
stats: getMetricSetStats(wrapper.Name(), metricSet.Name()),
136+
failureThreshold: failureThreshold,
114137
}
115138
}
116139
return wrapper, nil
@@ -254,35 +277,11 @@ func (msw *metricSetWrapper) fetch(ctx context.Context, reporter reporter) {
254277
case mb.ReportingMetricSetV2Error:
255278
reporter.StartFetchTimer()
256279
err := fetcher.Fetch(reporter.V2())
257-
if err != nil {
258-
reporter.V2().Error(err)
259-
if errors.As(err, &mb.PartialMetricsError{}) {
260-
// mark module as running if metrics are partially available and display the error message
261-
msw.module.UpdateStatus(status.Running, fmt.Sprintf("Error fetching data for metricset %s.%s: %v", msw.module.Name(), msw.MetricSet.Name(), err))
262-
} else {
263-
// mark it as degraded for any other issue encountered
264-
msw.module.UpdateStatus(status.Degraded, fmt.Sprintf("Error fetching data for metricset %s.%s: %v", msw.module.Name(), msw.MetricSet.Name(), err))
265-
}
266-
logp.Err("Error fetching data for metricset %s.%s: %s", msw.module.Name(), msw.Name(), err)
267-
} else {
268-
msw.module.UpdateStatus(status.Running, "")
269-
}
280+
msw.handleFetchError(err, reporter.V2())
270281
case mb.ReportingMetricSetV2WithContext:
271282
reporter.StartFetchTimer()
272283
err := fetcher.Fetch(ctx, reporter.V2())
273-
if err != nil {
274-
reporter.V2().Error(err)
275-
if errors.As(err, &mb.PartialMetricsError{}) {
276-
// mark module as running if metrics are partially available and display the error message
277-
msw.module.UpdateStatus(status.Running, fmt.Sprintf("Error fetching data for metricset %s.%s: %v", msw.module.Name(), msw.MetricSet.Name(), err))
278-
} else {
279-
// mark it as degraded for any other issue encountered
280-
msw.module.UpdateStatus(status.Degraded, fmt.Sprintf("Error fetching data for metricset %s.%s: %v", msw.module.Name(), msw.MetricSet.Name(), err))
281-
}
282-
logp.Err("Error fetching data for metricset %s.%s: %s", msw.module.Name(), msw.Name(), err)
283-
} else {
284-
msw.module.UpdateStatus(status.Running, "")
285-
}
284+
msw.handleFetchError(err, reporter.V2())
286285
default:
287286
panic(fmt.Sprintf("unexpected fetcher type for %v", msw))
288287
}
@@ -311,6 +310,31 @@ func (msw *metricSetWrapper) Test(d testing.Driver) {
311310
})
312311
}
313312

313+
func (msw *metricSetWrapper) handleFetchError(err error, reporter mb.PushReporterV2) {
314+
switch {
315+
case err == nil:
316+
msw.stats.consecutiveFailures.Set(0)
317+
msw.module.UpdateStatus(status.Running, "")
318+
319+
case errors.As(err, &mb.PartialMetricsError{}):
320+
reporter.Error(err)
321+
msw.stats.consecutiveFailures.Set(0)
322+
// mark module as running if metrics are partially available and display the error message
323+
msw.module.UpdateStatus(status.Running, fmt.Sprintf("Error fetching data for metricset %s.%s: %v", msw.module.Name(), msw.MetricSet.Name(), err))
324+
logp.Err("Error fetching data for metricset %s.%s: %s", msw.module.Name(), msw.Name(), err)
325+
326+
default:
327+
reporter.Error(err)
328+
msw.stats.consecutiveFailures.Inc()
329+
if msw.failureThreshold > 0 && msw.stats.consecutiveFailures != nil && uint(msw.stats.consecutiveFailures.Get()) >= msw.failureThreshold {
330+
// mark it as degraded for any other issue encountered
331+
msw.module.UpdateStatus(status.Degraded, fmt.Sprintf("Error fetching data for metricset %s.%s: %v", msw.module.Name(), msw.MetricSet.Name(), err))
332+
}
333+
logp.Err("Error fetching data for metricset %s.%s: %s", msw.module.Name(), msw.Name(), err)
334+
335+
}
336+
}
337+
314338
type reporter interface {
315339
StartFetchTimer()
316340
V1() mb.PushReporter //nolint:staticcheck // PushReporter is deprecated but not removed
@@ -437,11 +461,12 @@ func getMetricSetStats(module, name string) *stats {
437461

438462
reg := monitoring.Default.NewRegistry(key)
439463
s := &stats{
440-
key: key,
441-
ref: 1,
442-
success: monitoring.NewInt(reg, successesKey),
443-
failures: monitoring.NewInt(reg, failuresKey),
444-
events: monitoring.NewInt(reg, eventsKey),
464+
key: key,
465+
ref: 1,
466+
success: monitoring.NewInt(reg, successesKey),
467+
failures: monitoring.NewInt(reg, failuresKey),
468+
events: monitoring.NewInt(reg, eventsKey),
469+
consecutiveFailures: monitoring.NewUint(reg, consecutiveFailuresKey),
445470
}
446471

447472
fetches[key] = s

0 commit comments

Comments
 (0)