Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow agent to monitor endpoint #4789

Merged
merged 39 commits into from
Jun 16, 2024
Merged
Changes from 1 commit
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
e0621fa
first attempt
fearful-symmetry May 7, 2024
3d21818
Merge remote-tracking branch 'upstream/main' into service-monitoring
fearful-symmetry May 7, 2024
6aad6e1
Merge remote-tracking branch 'upstream/main' into service-monitoring
fearful-symmetry May 7, 2024
f165295
still tinkering
fearful-symmetry May 15, 2024
c869512
Merge remote-tracking branch 'upstream/main' into service-monitoring
fearful-symmetry May 15, 2024
99de46c
first draft change
fearful-symmetry May 21, 2024
30028ff
cleanup
fearful-symmetry May 21, 2024
1359938
clean up
fearful-symmetry May 21, 2024
354b7d0
add changelog
fearful-symmetry May 21, 2024
5f5a7a3
format
fearful-symmetry May 22, 2024
a09b83d
Merge remote-tracking branch 'upstream/main' into service-monitoring
fearful-symmetry May 22, 2024
704d9ea
fix component lookup
fearful-symmetry May 22, 2024
2e08d79
fix state tests
fearful-symmetry May 22, 2024
f806c93
complete integration tests
fearful-symmetry May 23, 2024
71b5815
Merge remote-tracking branch 'upstream/main' into service-monitoring
fearful-symmetry May 23, 2024
04b64fb
Merge remote-tracking branch 'upstream/main' into service-monitoring
fearful-symmetry May 31, 2024
3451d88
basic cleanup
fearful-symmetry Jun 3, 2024
6771de5
Merge remote-tracking branch 'upstream/main' into service-monitoring
fearful-symmetry Jun 3, 2024
824cc7b
update metrics setup
fearful-symmetry Jun 3, 2024
937f7ac
remove mage changes
fearful-symmetry Jun 3, 2024
4853383
spelling
fearful-symmetry Jun 3, 2024
4d417cd
fix test
fearful-symmetry Jun 4, 2024
5f7d633
Merge remote-tracking branch 'upstream/main' into service-monitoring
fearful-symmetry Jun 4, 2024
baafc48
figured out weird test failures
fearful-symmetry Jun 4, 2024
6f17e4e
still fixing unit tests
fearful-symmetry Jun 4, 2024
8aaa074
major test improvements
fearful-symmetry Jun 4, 2024
1c3121d
Merge remote-tracking branch 'upstream/main' into service-monitoring
fearful-symmetry Jun 5, 2024
e15fba3
finish up tests
fearful-symmetry Jun 5, 2024
0e35f59
remove log line
fearful-symmetry Jun 7, 2024
aa4c5fc
update name
fearful-symmetry Jun 7, 2024
7ed6390
Merge remote-tracking branch 'upstream/main' into service-monitoring
fearful-symmetry Jun 10, 2024
76fea5e
fix comments
fearful-symmetry Jun 10, 2024
9bcdb4f
fix bool logic
fearful-symmetry Jun 10, 2024
88d9bba
fix tests
fearful-symmetry Jun 10, 2024
c381be2
Merge remote-tracking branch 'upstream/main' into service-monitoring
fearful-symmetry Jun 12, 2024
3e3aa15
fix merge
fearful-symmetry Jun 12, 2024
f1d0e5a
add warning and mock output to inspect
fearful-symmetry Jun 13, 2024
0346c58
Merge remote-tracking branch 'upstream/main' into service-monitoring
fearful-symmetry Jun 13, 2024
2147418
Merge remote-tracking branch 'upstream/main' into service-monitoring
fearful-symmetry Jun 14, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
update metrics setup
fearful-symmetry committed Jun 3, 2024
commit 824cc7b85e39d3633b18e7e8cbea7fefb58fba36
39 changes: 27 additions & 12 deletions internal/pkg/agent/application/coordinator/coordinator.go
Original file line number Diff line number Diff line change
@@ -10,6 +10,7 @@ import (
"fmt"
"reflect"
"strings"
"sync/atomic"
"time"

"github.com/hashicorp/go-multierror"
@@ -285,7 +286,13 @@ type Coordinator struct {
// Should only be interacted with via CoordinatorActive() or runLoopIteration()
heartbeatChan chan struct{}

compPidUpdate chan struct{}
// If a component (mostly endpoint) has a new PID, we need to update
// the monitoring components so they have a PID to monitor.
// However, if endpoint is in some kind of restart loop,
// updating on every PID change could DoS the config system. Instead,
// run a ticker that checks to see if we have a new PID.
componentPIDTicker *time.Ticker
componentPidRequiresUpdate *atomic.Bool
}

// The channels Coordinator reads to receive updates from the various managers.
@@ -376,11 +383,12 @@ func New(logger *logger.Logger, cfg *configuration.Configuration, logLevel logp.
// synchronization in the subscriber API, just set the input buffer to 0.
stateBroadcaster: broadcaster.New(state, 64, 32),

logLevelCh: make(chan logp.Level),
overrideStateChan: make(chan *coordinatorOverrideState),
upgradeDetailsChan: make(chan *details.Details),
heartbeatChan: make(chan struct{}),
compPidUpdate: make(chan struct{}, 1),
logLevelCh: make(chan logp.Level),
overrideStateChan: make(chan *coordinatorOverrideState),
upgradeDetailsChan: make(chan *details.Details),
heartbeatChan: make(chan struct{}),
componentPIDTicker: time.NewTicker(time.Second * 30),
componentPidRequiresUpdate: &atomic.Bool{},
}
// Setup communication channels for any non-nil components. This pattern
// lets us transparently accept nil managers / simulated events during
@@ -923,6 +931,8 @@ func (c *Coordinator) runner(ctx context.Context) error {
ctx, cancel := context.WithCancel(ctx)
defer cancel()

defer c.componentPIDTicker.Stop()

// We run nil checks before starting the various managers so that unit tests
// only have to initialize / mock the specific components they're testing.
// If a manager is nil, we prebuffer its return channel with nil also so
@@ -1035,12 +1045,17 @@ func (c *Coordinator) runLoopIteration(ctx context.Context) {

case c.heartbeatChan <- struct{}{}:

case <-c.compPidUpdate:
err := c.refreshComponentModel(ctx)
if err != nil {
err = fmt.Errorf("error refreshing component model for PID update: %w", err)
c.setConfigManagerError(err)
c.logger.Errorf("%s", err)
case <-c.componentPIDTicker.C:
// if we hit the ticker and we've got a new PID,
// reload the component model
if c.componentPidRequiresUpdate.Load() {
c.componentPidRequiresUpdate.Store(false)
err := c.refreshComponentModel(ctx)
if err != nil {
err = fmt.Errorf("error refreshing component model for PID update: %w", err)
c.setConfigManagerError(err)
c.logger.Errorf("%s", err)
}
}

case componentState := <-c.managerChans.runtimeManagerUpdate:
Original file line number Diff line number Diff line change
@@ -136,6 +136,7 @@ func (c *Coordinator) refreshState() {
// Must be called on the main Coordinator goroutine.
func (c *Coordinator) applyComponentState(state runtime.ComponentComponentState) {
found := false
// check for any component updates to the PID, so we can update the component monitoring
pidRequiresUpdate := false
for i, other := range c.state.Components {
if other.Component.ID == state.Component.ID {
@@ -169,7 +170,7 @@ func (c *Coordinator) applyComponentState(state runtime.ComponentComponentState)
c.stateNeedsRefresh = true

if pidRequiresUpdate {
c.compPidUpdate <- struct{}{}
c.componentPidRequiresUpdate.Store(true)
}
}

133 changes: 69 additions & 64 deletions internal/pkg/agent/application/monitoring/v1_monitor.go
Original file line number Diff line number Diff line change
@@ -910,84 +910,89 @@ func (b *BeatsMonitor) injectMetricsInput(cfg map[string]interface{}, componentI
},
}

for _, comp := range componentList {
logp.L().Infof("input component %s: %#v", comp.ID, comp.InputSpec.Spec.Service)
}

// add system/process metrics for services that can't be monitored via json/beats metrics
// If there's a checkin PID and it contains the "endpoint" string, assume we want to monitor it
for id, comp := range existingStateServicePids {
logp.L().Infof("component/pid monitoring map is: %#v", existingStateServicePids)
if comp != 0 && strings.Contains(id, "endpoint") {
logp.L().Infof("creating system/process watcher for pid %d", comp)
inputs = append(inputs, map[string]interface{}{
idKey: fmt.Sprintf("%s-endpoint_security", monitoringMetricsUnitID),
"name": fmt.Sprintf("%s-endpoint_security", monitoringMetricsUnitID),
"type": "system/metrics",
useOutputKey: monitoringOutput,
"data_stream": map[string]interface{}{
"namespace": monitoringNamespace,
},
"streams": []interface{}{
map[string]interface{}{
idKey: fmt.Sprintf("%s-endpoint_security", monitoringMetricsUnitID),
"data_stream": map[string]interface{}{
"type": "metrics",
"dataset": "elastic_agent.endpoint_security",
"namespace": monitoringNamespace,
},
"metricsets": []interface{}{"process"},
"period": metricsCollectionIntervalString,
"index": fmt.Sprintf("metrics-elastic_agent.endpoint_security-%s", monitoringNamespace),
"process.pid": comp,
"process.cgroups.enabled": false,
"processors": []interface{}{
map[string]interface{}{
"add_fields": map[string]interface{}{
"target": "data_stream",
"fields": map[string]interface{}{
"type": "metrics",
"dataset": "elastic_agent.endpoint_security",
"namespace": monitoringNamespace,
// If there's a checkin PID and the corresponding component has a service spec section, add a system/process config
for _, compState := range componentList {
if compState.InputSpec != nil && compState.InputSpec.Spec.Service != nil {
if comp, ok := existingStateServicePids[compState.ID]; ok && comp != 0 {
inputs = append(inputs, map[string]interface{}{
idKey: fmt.Sprintf("%s-endpoint_security", monitoringMetricsUnitID),
"name": fmt.Sprintf("%s-endpoint_security", monitoringMetricsUnitID),
"type": "system/metrics",
useOutputKey: monitoringOutput,
"data_stream": map[string]interface{}{
"namespace": monitoringNamespace,
},
"streams": []interface{}{
map[string]interface{}{
idKey: fmt.Sprintf("%s-endpoint_security", monitoringMetricsUnitID),
"data_stream": map[string]interface{}{
"type": "metrics",
"dataset": "elastic_agent.endpoint_security",
"namespace": monitoringNamespace,
},
"metricsets": []interface{}{"process"},
"period": metricsCollectionIntervalString,
"index": fmt.Sprintf("metrics-elastic_agent.endpoint_security-%s", monitoringNamespace),
"process.pid": comp,
"process.cgroups.enabled": false,
"processors": []interface{}{
map[string]interface{}{
"add_fields": map[string]interface{}{
"target": "data_stream",
"fields": map[string]interface{}{
"type": "metrics",
"dataset": "elastic_agent.endpoint_security",
"namespace": monitoringNamespace,
},
},
},
},
map[string]interface{}{
"add_fields": map[string]interface{}{
"target": "event",
"fields": map[string]interface{}{
"dataset": "elastic_agent.endpoint_security",
map[string]interface{}{
"add_fields": map[string]interface{}{
"target": "event",
"fields": map[string]interface{}{
"dataset": "elastic_agent.endpoint_security",
},
},
},
},
map[string]interface{}{
"add_fields": map[string]interface{}{
"target": "elastic_agent",
"fields": map[string]interface{}{
"id": b.agentInfo.AgentID(),
"version": b.agentInfo.Version(),
"snapshot": b.agentInfo.Snapshot(),
"process": "endpoint_security",
map[string]interface{}{
"add_fields": map[string]interface{}{
"target": "elastic_agent",
"fields": map[string]interface{}{
"id": b.agentInfo.AgentID(),
"version": b.agentInfo.Version(),
"snapshot": b.agentInfo.Snapshot(),
"process": "endpoint_security",
},
},
},
},
map[string]interface{}{
"add_fields": map[string]interface{}{
"target": "agent",
"fields": map[string]interface{}{
"id": b.agentInfo.AgentID(),
map[string]interface{}{
"add_fields": map[string]interface{}{
"target": "agent",
"fields": map[string]interface{}{
"id": b.agentInfo.AgentID(),
},
},
},
},
map[string]interface{}{
"add_fields": map[string]interface{}{
"target": "component",
"fields": map[string]interface{}{
"binary": "endpoint_security",
"id": id,
map[string]interface{}{
"add_fields": map[string]interface{}{
"target": "component",
"fields": map[string]interface{}{
"binary": "endpoint_security",
"id": compState.ID,
},
},
},
},
},
},
},
})
})
}

}
}

14 changes: 12 additions & 2 deletions internal/pkg/agent/application/monitoring/v1_monitor_test.go
Original file line number Diff line number Diff line change
@@ -23,7 +23,6 @@ import (
)

func TestMonitoringWithEndpoint(t *testing.T) {

agentInfo, err := info.NewAgentInfo(context.Background(), false)
require.NoError(t, err, "Error creating agent info")

@@ -59,7 +58,18 @@ func TestMonitoringWithEndpoint(t *testing.T) {
// manually declaring all the MonitoringConfig() args since there's a lot of them, and this makes
// the test a little more self-describing

var compList []component.Component
compList := []component.Component{
{
ID: "endpoint-default",
InputSpec: &component.InputRuntimeSpec{
Spec: component.InputSpec{
Service: &component.ServiceSpec{
CPort: 7688,
},
},
},
},
}

compIdToBinary := map[string]string{
"endpoint-default": "endpoint-security",
Loading