Introduce agent.monitoring.metrics_period (elastic#4961)

pkoutsovasilis · web-flow · commit 6a452564dd42 · 2024-06-20T13:46:45.000+03:00
* feat: introduce agent.monitoring.metrics_period

* doc: add changelog/fragments

* fix: TestDiagnosticLocalConfig unit-test

* doc: reword summary in changelog fragment
diff --git a/_meta/config/common.p2.yml.tmpl b/_meta/config/common.p2.yml.tmpl
@@ -60,6 +60,8 @@ inputs:
 #   logs: true
 #   # enables metrics monitoring
 #   metrics: true
+#   # metrics_period defines how frequently we should sample monitoring metrics. Default is 60 seconds.
+#   metrics_period: 60s
 #   # exposes /debug/pprof/ endpoints
 #   # recommended that these endpoints are only enabled if the monitoring endpoint is set to localhost
 #   pprof.enabled: false
@@ -77,7 +79,7 @@ inputs:
 #   # The possible values for `failon` are:
 #   # `degraded`: return an error if a component is in a degraded state or failed state, or if the agent coordinator is unresponsive.
 #   # `failed`: return an error if a unit is in a failed state, or if the agent coordinator is unresponsive.
-#   # `heartbeat`: return an error only if the agent coordinator is unresponsive. 
+#   # `heartbeat`: return an error only if the agent coordinator is unresponsive.
 #   # If no `failon` parameter is provided, the default behavior is `failon=heartbeat`
 #   http:
 #       # enables http endpoint
diff --git a/_meta/config/common.reference.p2.yml.tmpl b/_meta/config/common.reference.p2.yml.tmpl
@@ -139,6 +139,8 @@ inputs:
 #   logs: false
 #   # enables metrics monitoring
 #   metrics: false
+#   # metrics_period defines how frequently we should sample monitoring metrics. Default is 60 seconds.
+#   metrics_period: 60s
 #   # exposes /debug/pprof/ endpoints
 #   # recommended that these endpoints are only enabled if the monitoring endpoint is set to localhost
 #   pprof.enabled: false
@@ -156,7 +158,7 @@ inputs:
 #   # The possible values for `failon` are:
 #   # `degraded`: return an error if a component is in a degraded state or failed state, or if the agent coordinator is unresponsive.
 #   # `failed`: return an error if a unit is in a failed state, or if the agent coordinator is unresponsive.
-#   # `heartbeat`: return an error only if the agent coordinator is unresponsive. 
+#   # `heartbeat`: return an error only if the agent coordinator is unresponsive.
 #   # If no `failon` parameter is provided, the default behavior is `failon=heartbeat`
 #   http:
 #       # enables http endpoint
diff --git a/_meta/config/elastic-agent.docker.yml.tmpl b/_meta/config/elastic-agent.docker.yml.tmpl
@@ -18,17 +18,17 @@ inputs:
     data_stream.namespace: default
     use_output: default
     streams:
-      - metricsets: 
+      - metricsets:
         - cpu
         # Dataset name must conform to the naming conventions for Elasticsearch indices, cannot contain dashes (-), and cannot exceed 100 bytes
         data_stream.dataset: system.cpu
-      - metricsets: 
+      - metricsets:
         - memory
         data_stream.dataset: system.memory
-      - metricsets: 
+      - metricsets:
         - network
         data_stream.dataset: system.network
-      - metricsets: 
+      - metricsets:
         - filesystem
         data_stream.dataset: system.filesystem
 
@@ -112,6 +112,8 @@ inputs:
 #   logs: false
 #   # enables metrics monitoring
 #   metrics: false
+#   # metrics_period defines how frequently we should sample monitoring metrics. Default is 60 seconds.
+#   metrics_period: 60s
 #   # exposes /debug/pprof/ endpoints
 #   # recommended that these endpoints are only enabled if the monitoring endpoint is set to localhost
 #   pprof.enabled: false
@@ -127,7 +129,7 @@ inputs:
 #   # The possible values for `failon` are:
 #   # `degraded`: return an error if a component is in a degraded state or failed state, or if the agent coordinator is unresponsive.
 #   # `failed`: return an error if a unit is in a failed state, or if the agent coordinator is unresponsive.
-#   # `heartbeat`: return an error only if the agent coordinator is unresponsive. 
+#   # `heartbeat`: return an error only if the agent coordinator is unresponsive.
 #   # If no `failon` parameter is provided, the default behavior is `failon=heartbeat`
 #   http:
 #       # enables http endpoint
diff --git a/changelog/fragments/1718818524-support-monitoring-metrics-interval.yaml b/changelog/fragments/1718818524-support-monitoring-metrics-interval.yaml
@@ -0,0 +1,32 @@
+# Kind can be one of:
+# - breaking-change: a change to previously-documented behavior
+# - deprecation: functionality that is being removed in a later release
+# - bug-fix: fixes a problem in a previous version
+# - enhancement: extends functionality but does not break or fix existing behavior
+# - feature: new functionality
+# - known-issue: problems that we are aware of in a given version
+# - security: impacts on the security of a product or a user’s deployment.
+# - upgrade: important information for someone upgrading from a prior version
+# - other: does not fit into any of the other categories
+kind: feature
+
+# Change summary; an 80ish-character description of the change.
+summary: Allow configuring `agent.monitoring.metrics_period`.
+
+# Long description; in case the summary is not enough to describe the change,
+# this field accommodates a description without length limits.
+# NOTE: This field will be rendered only for breaking-change and known-issue kinds at the moment.
+#description:
+
+# Affected component; usually one of "elastic-agent", "fleet-server", "filebeat", "metricbeat", "auditbeat", "all", etc.
+component: elastic-agent
+
+# PR URL; optional; the PR number that added the changeset.
+# If not present, it is automatically filled in by the tooling, which finds the PR where this changelog fragment was added.
+# NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number.
+# Please provide it if you are adding a fragment for a different PR.
+pr: https://github.com/elastic/elastic-agent/pull/4961
+
+# Issue URL; optional; the GitHub issue related to this changeset (either closes or is part of).
+# If not present, it is automatically filled in by the tooling with the issue linked to the PR number.
+#issue: https://github.com/owner/repo/1234
diff --git a/elastic-agent.docker.yml b/elastic-agent.docker.yml
@@ -18,17 +18,17 @@ inputs:
     data_stream.namespace: default
     use_output: default
     streams:
-      - metricsets: 
+      - metricsets:
         - cpu
         # Dataset name must conform to the naming conventions for Elasticsearch indices, cannot contain dashes (-), and cannot exceed 100 bytes
         data_stream.dataset: system.cpu
-      - metricsets: 
+      - metricsets:
         - memory
         data_stream.dataset: system.memory
-      - metricsets: 
+      - metricsets:
         - network
         data_stream.dataset: system.network
-      - metricsets: 
+      - metricsets:
         - filesystem
         data_stream.dataset: system.filesystem
 
@@ -112,6 +112,8 @@ inputs:
 #   logs: false
 #   # enables metrics monitoring
 #   metrics: false
+#   # metrics_period defines how frequently we should sample monitoring metrics. Default is 60 seconds.
+#   metrics_period: 60s
 #   # exposes /debug/pprof/ endpoints
 #   # recommended that these endpoints are only enabled if the monitoring endpoint is set to localhost
 #   pprof.enabled: false
@@ -127,7 +129,7 @@ inputs:
 #   # The possible values for `failon` are:
 #   # `degraded`: return an error if a component is in a degraded state or failed state, or if the agent coordinator is unresponsive.
 #   # `failed`: return an error if a unit is in a failed state, or if the agent coordinator is unresponsive.
-#   # `heartbeat`: return an error only if the agent coordinator is unresponsive. 
+#   # `heartbeat`: return an error only if the agent coordinator is unresponsive.
 #   # If no `failon` parameter is provided, the default behavior is `failon=heartbeat`
 #   http:
 #       # enables http endpoint
diff --git a/elastic-agent.reference.yml b/elastic-agent.reference.yml
@@ -145,6 +145,8 @@ inputs:
 #   logs: false
 #   # enables metrics monitoring
 #   metrics: false
+#   # metrics_period defines how frequently we should sample monitoring metrics. Default is 60 seconds.
+#   metrics_period: 60s
 #   # exposes /debug/pprof/ endpoints
 #   # recommended that these endpoints are only enabled if the monitoring endpoint is set to localhost
 #   pprof.enabled: false
@@ -162,7 +164,7 @@ inputs:
 #   # The possible values for `failon` are:
 #   # `degraded`: return an error if a component is in a degraded state or failed state, or if the agent coordinator is unresponsive.
 #   # `failed`: return an error if a unit is in a failed state, or if the agent coordinator is unresponsive.
-#   # `heartbeat`: return an error only if the agent coordinator is unresponsive. 
+#   # `heartbeat`: return an error only if the agent coordinator is unresponsive.
 #   # If no `failon` parameter is provided, the default behavior is `failon=heartbeat`
 #   http:
 #       # enables http endpoint
diff --git a/elastic-agent.yml b/elastic-agent.yml
@@ -66,6 +66,8 @@ inputs:
 #   logs: true
 #   # enables metrics monitoring
 #   metrics: true
+#   # metrics_period defines how frequently we should sample monitoring metrics. Default is 60 seconds.
+#   metrics_period: 60s
 #   # exposes /debug/pprof/ endpoints
 #   # recommended that these endpoints are only enabled if the monitoring endpoint is set to localhost
 #   pprof.enabled: false
@@ -83,7 +85,7 @@ inputs:
 #   # The possible values for `failon` are:
 #   # `degraded`: return an error if a component is in a degraded state or failed state, or if the agent coordinator is unresponsive.
 #   # `failed`: return an error if a unit is in a failed state, or if the agent coordinator is unresponsive.
-#   # `heartbeat`: return an error only if the agent coordinator is unresponsive. 
+#   # `heartbeat`: return an error only if the agent coordinator is unresponsive.
 #   # If no `failon` parameter is provided, the default behavior is `failon=heartbeat`
 #   http:
 #       # enables http endpoint
diff --git a/internal/pkg/agent/application/coordinator/diagnostics_test.go b/internal/pkg/agent/application/coordinator/diagnostics_test.go
@@ -97,6 +97,7 @@ agent:
     http: null
     logs: false
     metrics: false
+    metrics_period: ""
     namespace: ""
     pprof: null
     traces: true
diff --git a/internal/pkg/agent/application/monitoring/v1_monitor.go b/internal/pkg/agent/application/monitoring/v1_monitor.go
@@ -47,6 +47,7 @@ const (
 	agentKey                   = "agent"
 	monitoringKey              = "monitoring"
 	useOutputKey               = "use_output"
+	monitoringMetricsPeriodKey = "metrics_period"
 	monitoringOutput           = "monitoring"
 	defaultMonitoringNamespace = "default"
 	agentName                  = "elastic-agent"
@@ -58,7 +59,7 @@ const (
 
 	// metricset execution period used for the monitoring metrics inputs
 	// we set this to 60s to reduce the load/data volume on the monitoring cluster
-	metricsCollectionInterval = 60 * time.Second
+	defaultMetricsCollectionInterval = 60 * time.Second
 )
 
 var (
@@ -129,6 +130,7 @@ func (b *BeatsMonitor) MonitoringConfig(
 	cfg := make(map[string]interface{})
 
 	monitoringOutputName := defaultOutputName
+	metricsCollectionIntervalString := b.config.C.MetricsPeriod
 	if agentCfg, found := policy[agentKey]; found {
 		// The agent section is required for feature flags
 		cfg[agentKey] = agentCfg
@@ -143,6 +145,12 @@ func (b *BeatsMonitor) MonitoringConfig(
 							monitoringOutputName = useStr
 						}
 					}
+
+					if metricsPeriod, found := monitoringMap[monitoringMetricsPeriodKey]; found {
+						if metricsPeriodStr, ok := metricsPeriod.(string); ok {
+							metricsCollectionIntervalString = metricsPeriodStr
+						}
+					}
 				}
 			}
 		}
@@ -165,7 +173,7 @@ func (b *BeatsMonitor) MonitoringConfig(
 	}
 
 	if b.config.C.MonitorMetrics {
-		if err := b.injectMetricsInput(cfg, componentIDToBinary, components, componentIDPidMap); err != nil {
+		if err := b.injectMetricsInput(cfg, componentIDToBinary, components, componentIDPidMap, metricsCollectionIntervalString); err != nil {
 			return nil, errors.New(err, "failed to inject monitoring output")
 		}
 	}
@@ -542,8 +550,16 @@ func (b *BeatsMonitor) monitoringNamespace() string {
 }
 
 // injectMetricsInput injects monitoring config for agent monitoring to the `cfg` object.
-func (b *BeatsMonitor) injectMetricsInput(cfg map[string]interface{}, componentIDToBinary map[string]string, componentList []component.Component, existingStateServicePids map[string]uint64) error {
-	metricsCollectionIntervalString := metricsCollectionInterval.String()
+func (b *BeatsMonitor) injectMetricsInput(
+	cfg map[string]interface{},
+	componentIDToBinary map[string]string,
+	componentList []component.Component,
+	existingStateServicePids map[string]uint64,
+	metricsCollectionIntervalString string,
+) error {
+	if metricsCollectionIntervalString == "" {
+		metricsCollectionIntervalString = defaultMetricsCollectionInterval.String()
+	}
 	monitoringNamespace := b.monitoringNamespace()
 	fixedAgentName := strings.ReplaceAll(agentName, "-", "_")
 	beatsStreams := make([]interface{}, 0, len(componentIDToBinary))
diff --git a/internal/pkg/agent/application/monitoring/v1_monitor_test.go b/internal/pkg/agent/application/monitoring/v1_monitor_test.go
diff --git a/internal/pkg/core/monitoring/config/config.go b/internal/pkg/core/monitoring/config/config.go