Skip to content

Commit 6a45256

Browse files
Introduce agent.monitoring.metrics_period (elastic#4961)
* feat: introduce agent.monitoring.metrics_period * doc: add changelog/fragments * fix: TestDiagnosticLocalConfig unit-test * doc: reword summary in changelog fragment
1 parent 6b78791 commit 6a45256

11 files changed

+206
-74
lines changed

_meta/config/common.p2.yml.tmpl

+3-1
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,8 @@ inputs:
6060
# logs: true
6161
# # enables metrics monitoring
6262
# metrics: true
63+
# # metrics_period defines how frequently monitoring metrics are sampled. Default is 60 seconds.
64+
# metrics_period: 60s
6365
# # exposes /debug/pprof/ endpoints
6466
# # recommended that these endpoints are only enabled if the monitoring endpoint is set to localhost
6567
# pprof.enabled: false
@@ -77,7 +79,7 @@ inputs:
7779
# # The possible values for `failon` are:
7880
# # `degraded`: return an error if a component is in a degraded state or failed state, or if the agent coordinator is unresponsive.
7981
# # `failed`: return an error if a unit is in a failed state, or if the agent coordinator is unresponsive.
80-
# # `heartbeat`: return an error only if the agent coordinator is unresponsive.
82+
# # `heartbeat`: return an error only if the agent coordinator is unresponsive.
8183
# # If no `failon` parameter is provided, the default behavior is `failon=heartbeat`
8284
# http:
8385
# # enables http endpoint

_meta/config/common.reference.p2.yml.tmpl

+3-1
Original file line numberDiff line numberDiff line change
@@ -139,6 +139,8 @@ inputs:
139139
# logs: false
140140
# # enables metrics monitoring
141141
# metrics: false
142+
# # metrics_period defines how frequently monitoring metrics are sampled. Default is 60 seconds.
143+
# metrics_period: 60s
142144
# # exposes /debug/pprof/ endpoints
143145
# # recommended that these endpoints are only enabled if the monitoring endpoint is set to localhost
144146
# pprof.enabled: false
@@ -156,7 +158,7 @@ inputs:
156158
# # The possible values for `failon` are:
157159
# # `degraded`: return an error if a component is in a degraded state or failed state, or if the agent coordinator is unresponsive.
158160
# # `failed`: return an error if a unit is in a failed state, or if the agent coordinator is unresponsive.
159-
# # `heartbeat`: return an error only if the agent coordinator is unresponsive.
161+
# # `heartbeat`: return an error only if the agent coordinator is unresponsive.
160162
# # If no `failon` parameter is provided, the default behavior is `failon=heartbeat`
161163
# http:
162164
# # enables http endpoint

_meta/config/elastic-agent.docker.yml.tmpl

+7-5
Original file line numberDiff line numberDiff line change
@@ -18,17 +18,17 @@ inputs:
1818
data_stream.namespace: default
1919
use_output: default
2020
streams:
21-
- metricsets:
21+
- metricsets:
2222
- cpu
2323
# Dataset name must conform to the naming conventions for Elasticsearch indices, cannot contain dashes (-), and cannot exceed 100 bytes
2424
data_stream.dataset: system.cpu
25-
- metricsets:
25+
- metricsets:
2626
- memory
2727
data_stream.dataset: system.memory
28-
- metricsets:
28+
- metricsets:
2929
- network
3030
data_stream.dataset: system.network
31-
- metricsets:
31+
- metricsets:
3232
- filesystem
3333
data_stream.dataset: system.filesystem
3434

@@ -112,6 +112,8 @@ inputs:
112112
# logs: false
113113
# # enables metrics monitoring
114114
# metrics: false
115+
# # metrics_period defines how frequently monitoring metrics are sampled. Default is 60 seconds.
116+
# metrics_period: 60s
115117
# # exposes /debug/pprof/ endpoints
116118
# # recommended that these endpoints are only enabled if the monitoring endpoint is set to localhost
117119
# pprof.enabled: false
@@ -127,7 +129,7 @@ inputs:
127129
# # The possible values for `failon` are:
128130
# # `degraded`: return an error if a component is in a degraded state or failed state, or if the agent coordinator is unresponsive.
129131
# # `failed`: return an error if a unit is in a failed state, or if the agent coordinator is unresponsive.
130-
# # `heartbeat`: return an error only if the agent coordinator is unresponsive.
132+
# # `heartbeat`: return an error only if the agent coordinator is unresponsive.
131133
# # If no `failon` parameter is provided, the default behavior is `failon=heartbeat`
132134
# http:
133135
# # enables http endpoint
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
# Kind can be one of:
2+
# - breaking-change: a change to previously-documented behavior
3+
# - deprecation: functionality that is being removed in a later release
4+
# - bug-fix: fixes a problem in a previous version
5+
# - enhancement: extends functionality but does not break or fix existing behavior
6+
# - feature: new functionality
7+
# - known-issue: problems that we are aware of in a given version
8+
# - security: impacts on the security of a product or a user’s deployment.
9+
# - upgrade: important information for someone upgrading from a prior version
10+
# - other: does not fit into any of the other categories
11+
kind: feature
12+
13+
# Change summary; a description of the change, roughly 80 characters long.
14+
summary: Allow configuring `agent.monitoring.metrics_period`.
15+
16+
# Long description; in case the summary is not enough to describe the change
17+
# this field accommodates a description without length limits.
18+
# NOTE: This field will be rendered only for breaking-change and known-issue kinds at the moment.
19+
#description:
20+
21+
# Affected component; usually one of "elastic-agent", "fleet-server", "filebeat", "metricbeat", "auditbeat", "all", etc.
22+
component: elastic-agent
23+
24+
# PR URL; optional; the PR number that added the changeset.
25+
# If not present, it is automatically filled in by the tooling, which finds the PR where this changelog fragment was added.
26+
# NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number.
27+
# Please provide it if you are adding a fragment for a different PR.
28+
pr: https://github.com/elastic/elastic-agent/pull/4961
29+
30+
# Issue URL; optional; the GitHub issue related to this changeset (either closes or is part of).
31+
# If not present, it is automatically filled in by the tooling with the issue linked to the PR number.
32+
#issue: https://github.com/owner/repo/1234

elastic-agent.docker.yml

+7-5
Original file line numberDiff line numberDiff line change
@@ -18,17 +18,17 @@ inputs:
1818
data_stream.namespace: default
1919
use_output: default
2020
streams:
21-
- metricsets:
21+
- metricsets:
2222
- cpu
2323
# Dataset name must conform to the naming conventions for Elasticsearch indices, cannot contain dashes (-), and cannot exceed 100 bytes
2424
data_stream.dataset: system.cpu
25-
- metricsets:
25+
- metricsets:
2626
- memory
2727
data_stream.dataset: system.memory
28-
- metricsets:
28+
- metricsets:
2929
- network
3030
data_stream.dataset: system.network
31-
- metricsets:
31+
- metricsets:
3232
- filesystem
3333
data_stream.dataset: system.filesystem
3434

@@ -112,6 +112,8 @@ inputs:
112112
# logs: false
113113
# # enables metrics monitoring
114114
# metrics: false
115+
# # metrics_period defines how frequently monitoring metrics are sampled. Default is 60 seconds.
116+
# metrics_period: 60s
115117
# # exposes /debug/pprof/ endpoints
116118
# # recommended that these endpoints are only enabled if the monitoring endpoint is set to localhost
117119
# pprof.enabled: false
@@ -127,7 +129,7 @@ inputs:
127129
# # The possible values for `failon` are:
128130
# # `degraded`: return an error if a component is in a degraded state or failed state, or if the agent coordinator is unresponsive.
129131
# # `failed`: return an error if a unit is in a failed state, or if the agent coordinator is unresponsive.
130-
# # `heartbeat`: return an error only if the agent coordinator is unresponsive.
132+
# # `heartbeat`: return an error only if the agent coordinator is unresponsive.
131133
# # If no `failon` parameter is provided, the default behavior is `failon=heartbeat`
132134
# http:
133135
# # enables http endpoint

elastic-agent.reference.yml

+3-1
Original file line numberDiff line numberDiff line change
@@ -145,6 +145,8 @@ inputs:
145145
# logs: false
146146
# # enables metrics monitoring
147147
# metrics: false
148+
# # metrics_period defines how frequently monitoring metrics are sampled. Default is 60 seconds.
149+
# metrics_period: 60s
148150
# # exposes /debug/pprof/ endpoints
149151
# # recommended that these endpoints are only enabled if the monitoring endpoint is set to localhost
150152
# pprof.enabled: false
@@ -162,7 +164,7 @@ inputs:
162164
# # The possible values for `failon` are:
163165
# # `degraded`: return an error if a component is in a degraded state or failed state, or if the agent coordinator is unresponsive.
164166
# # `failed`: return an error if a unit is in a failed state, or if the agent coordinator is unresponsive.
165-
# # `heartbeat`: return an error only if the agent coordinator is unresponsive.
167+
# # `heartbeat`: return an error only if the agent coordinator is unresponsive.
166168
# # If no `failon` parameter is provided, the default behavior is `failon=heartbeat`
167169
# http:
168170
# # enables http endpoint

elastic-agent.yml

+3-1
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,8 @@ inputs:
6666
# logs: true
6767
# # enables metrics monitoring
6868
# metrics: true
69+
# # metrics_period defines how frequently monitoring metrics are sampled. Default is 60 seconds.
70+
# metrics_period: 60s
6971
# # exposes /debug/pprof/ endpoints
7072
# # recommended that these endpoints are only enabled if the monitoring endpoint is set to localhost
7173
# pprof.enabled: false
@@ -83,7 +85,7 @@ inputs:
8385
# # The possible values for `failon` are:
8486
# # `degraded`: return an error if a component is in a degraded state or failed state, or if the agent coordinator is unresponsive.
8587
# # `failed`: return an error if a unit is in a failed state, or if the agent coordinator is unresponsive.
86-
# # `heartbeat`: return an error only if the agent coordinator is unresponsive.
88+
# # `heartbeat`: return an error only if the agent coordinator is unresponsive.
8789
# # If no `failon` parameter is provided, the default behavior is `failon=heartbeat`
8890
# http:
8991
# # enables http endpoint

internal/pkg/agent/application/coordinator/diagnostics_test.go

+1
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,7 @@ agent:
9797
http: null
9898
logs: false
9999
metrics: false
100+
metrics_period: ""
100101
namespace: ""
101102
pprof: null
102103
traces: true

internal/pkg/agent/application/monitoring/v1_monitor.go

+20-4
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ const (
4747
agentKey = "agent"
4848
monitoringKey = "monitoring"
4949
useOutputKey = "use_output"
50+
monitoringMetricsPeriodKey = "metrics_period"
5051
monitoringOutput = "monitoring"
5152
defaultMonitoringNamespace = "default"
5253
agentName = "elastic-agent"
@@ -58,7 +59,7 @@ const (
5859

5960
// metricset execution period used for the monitoring metrics inputs
6061
// we set this to 60s to reduce the load/data volume on the monitoring cluster
61-
metricsCollectionInterval = 60 * time.Second
62+
defaultMetricsCollectionInterval = 60 * time.Second
6263
)
6364

6465
var (
@@ -129,6 +130,7 @@ func (b *BeatsMonitor) MonitoringConfig(
129130
cfg := make(map[string]interface{})
130131

131132
monitoringOutputName := defaultOutputName
133+
metricsCollectionIntervalString := b.config.C.MetricsPeriod
132134
if agentCfg, found := policy[agentKey]; found {
133135
// The agent section is required for feature flags
134136
cfg[agentKey] = agentCfg
@@ -143,6 +145,12 @@ func (b *BeatsMonitor) MonitoringConfig(
143145
monitoringOutputName = useStr
144146
}
145147
}
148+
149+
if metricsPeriod, found := monitoringMap[monitoringMetricsPeriodKey]; found {
150+
if metricsPeriodStr, ok := metricsPeriod.(string); ok {
151+
metricsCollectionIntervalString = metricsPeriodStr
152+
}
153+
}
146154
}
147155
}
148156
}
@@ -165,7 +173,7 @@ func (b *BeatsMonitor) MonitoringConfig(
165173
}
166174

167175
if b.config.C.MonitorMetrics {
168-
if err := b.injectMetricsInput(cfg, componentIDToBinary, components, componentIDPidMap); err != nil {
176+
if err := b.injectMetricsInput(cfg, componentIDToBinary, components, componentIDPidMap, metricsCollectionIntervalString); err != nil {
169177
return nil, errors.New(err, "failed to inject monitoring output")
170178
}
171179
}
@@ -542,8 +550,16 @@ func (b *BeatsMonitor) monitoringNamespace() string {
542550
}
543551

544552
// injectMetricsInput injects monitoring config for agent monitoring to the `cfg` object.
545-
func (b *BeatsMonitor) injectMetricsInput(cfg map[string]interface{}, componentIDToBinary map[string]string, componentList []component.Component, existingStateServicePids map[string]uint64) error {
546-
metricsCollectionIntervalString := metricsCollectionInterval.String()
553+
func (b *BeatsMonitor) injectMetricsInput(
554+
cfg map[string]interface{},
555+
componentIDToBinary map[string]string,
556+
componentList []component.Component,
557+
existingStateServicePids map[string]uint64,
558+
metricsCollectionIntervalString string,
559+
) error {
560+
if metricsCollectionIntervalString == "" {
561+
metricsCollectionIntervalString = defaultMetricsCollectionInterval.String()
562+
}
547563
monitoringNamespace := b.monitoringNamespace()
548564
fixedAgentName := strings.ReplaceAll(agentName, "-", "_")
549565
beatsStreams := make([]interface{}, 0, len(componentIDToBinary))

0 commit comments

Comments
 (0)