Skip to content

Commit 6cdb8af

Browse files
Fix liveness reload config handling (#4586)
* Reapply "Add `/liveness` endpoint to elastic-agent (#4499)" (#4583) This reverts commit eca5bc7. * add behavior to not disable http monitor on reload with nil config, add tests * improve comments * linter * more linter... * fix spelling * check original config state when reloading config * change behavior of config set from overrides * fix tests * add second test to make sure old behavior with hard-coded monitoring config still works * rename method
1 parent adef7f7 commit 6cdb8af

24 files changed

+1397
-103
lines changed

_meta/config/common.p2.yml.tmpl

+13
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,19 @@ inputs:
6666
# # The name of the output to use for monitoring data.
6767
# use_output: monitoring
6868
# # exposes agent metrics using http, by default sockets and named pipes are used
69+
# #
70+
# # `http` Also exposes a /liveness endpoint that will return an HTTP code depending on agent status:
71+
# # 200: Agent is healthy
72+
# # 500: A component or unit is in a failed state
73+
# # 503: The agent coordinator is unresponsive
74+
# #
75+
# # You can pass a `failon` parameter to the /liveness endpoint to determine what component state will result in a 500.
76+
# # For example: `curl 'localhost:6792/liveness?failon=degraded'` will return 500 if a component is in a degraded state.
77+
# # The possible values for `failon` are:
78+
# # `degraded`: return an error if a component is in a degraded state or failed state, or if the agent coordinator is unresponsive.
79+
# # `failed`: return an error if a unit is in a failed state, or if the agent coordinator is unresponsive.
80+
# # `heartbeat`: return an error only if the agent coordinator is unresponsive.
81+
# # If no `failon` parameter is provided, the default behavior is `failon=heartbeat`
6982
# http:
7083
# # enables http endpoint
7184
# enabled: false

_meta/config/common.reference.p2.yml.tmpl

+14-1
Original file line numberDiff line numberDiff line change
@@ -144,7 +144,20 @@ inputs:
144144
# pprof.enabled: false
145145
# # The name of the output to use for monitoring data.
146146
# use_output: monitoring
147-
# # exposes agent metrics using http, by default sockets and named pipes are used
147+
# # Exposes agent metrics using http, by default sockets and named pipes are used.
148+
# #
149+
# # `http` Also exposes a /liveness endpoint that will return an HTTP code depending on agent status:
150+
# # 200: Agent is healthy
151+
# # 500: A component or unit is in a failed state
152+
# # 503: The agent coordinator is unresponsive
153+
# #
154+
# # You can pass a `failon` parameter to the /liveness endpoint to determine what component state will result in a 500.
155+
# # For example: `curl 'localhost:6792/liveness?failon=degraded'` will return 500 if a component is in a degraded state.
156+
# # The possible values for `failon` are:
157+
# # `degraded`: return an error if a component is in a degraded state or failed state, or if the agent coordinator is unresponsive.
158+
# # `failed`: return an error if a unit is in a failed state, or if the agent coordinator is unresponsive.
159+
# # `heartbeat`: return an error only if the agent coordinator is unresponsive.
160+
# # If no `failon` parameter is provided, the default behavior is `failon=heartbeat`
148161
# http:
149162
# # enables http endpoint
150163
# enabled: false

_meta/config/elastic-agent.docker.yml.tmpl

+13
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,19 @@ inputs:
116116
# # recommended that these endpoints are only enabled if the monitoring endpoint is set to localhost
117117
# pprof.enabled: false
118118
# # exposes agent metrics using http, by default sockets and named pipes are used
119+
# #
120+
# # `http` Also exposes a /liveness endpoint that will return an HTTP code depending on agent status:
121+
# # 200: Agent is healthy
122+
# # 500: A component or unit is in a failed state
123+
# # 503: The agent coordinator is unresponsive
124+
# #
125+
# # You can pass a `failon` parameter to the /liveness endpoint to determine what component state will result in a 500.
126+
# # For example: `curl 'localhost:6792/liveness?failon=degraded'` will return 500 if a component is in a degraded state.
127+
# # The possible values for `failon` are:
128+
# # `degraded`: return an error if a component is in a degraded state or failed state, or if the agent coordinator is unresponsive.
129+
# # `failed`: return an error if a unit is in a failed state, or if the agent coordinator is unresponsive.
130+
# # `heartbeat`: return an error only if the agent coordinator is unresponsive.
131+
# # If no `failon` parameter is provided, the default behavior is `failon=heartbeat`
119132
# http:
120133
# # enables http endpoint
121134
# enabled: false

_meta/elastic-agent.yml

+9-1
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,15 @@ inputs:
103103
# logs: false
104104
# # enables metrics monitoring
105105
# metrics: false
106-
# # exposes agent metrics using http, by default sockets and named pipes are used
106+
# # Exposes agent metrics using http, by default sockets and named pipes are used.
107+
# # Also exposes a /liveness endpoint that will return an HTTP code depending on agent status:
108+
# # 200: Agent is healthy
109+
# # 500: A component or unit is in a failed state
110+
# # 503: The agent coordinator is unresponsive
111+
# # You can pass a `failon` parameter to the /liveness endpoint to determine what component state will result in a 500.
112+
# # For example: `curl 'localhost:6792/liveness?failon=degraded'` will return 500 if a component is in a degraded state.
113+
# # The possible values for `failon` are `degraded`, `failed`, and `heartbeat`. If no `failon` parameter is provided,
114+
# # the default behavior is `failon=heartbeat`
107115
# http:
108116
# # enables http endpoint
109117
# enabled: false
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
# Kind can be one of:
2+
# - breaking-change: a change to previously-documented behavior
3+
# - deprecation: functionality that is being removed in a later release
4+
# - bug-fix: fixes a problem in a previous version
5+
# - enhancement: extends functionality but does not break or fix existing behavior
6+
# - feature: new functionality
7+
# - known-issue: problems that we are aware of in a given version
8+
# - security: impacts on the security of a product or a user’s deployment.
9+
# - upgrade: important information for someone upgrading from a prior version
10+
# - other: does not fit into any of the other categories
11+
kind: feature
12+
13+
# Change summary; a 80ish characters long description of the change.
14+
summary: Add a configurable /liveness endpoint.
15+
16+
# Long description; in case the summary is not enough to describe the change
17+
# this field accommodates a description without length limits.
18+
# NOTE: This field will be rendered only for breaking-change and known-issue kinds at the moment.
19+
description: Adds a liveness endpoint suitable for use as a k8s liveness probe.
20+
21+
# Affected component; usually one of "elastic-agent", "fleet-server", "filebeat", "metricbeat", "auditbeat", "all", etc.
22+
component: monitoring
23+
24+
# PR URL; optional; the PR number that added the changeset.
25+
# If not present is automatically filled by the tooling finding the PR where this changelog fragment has been added.
26+
# NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number.
27+
# Please provide it if you are adding a fragment for a different PR.
28+
pr: https://github.com/elastic/elastic-agent/pull/4499
29+
30+
# Issue URL; optional; the GitHub issue related to this changeset (either closes or is part of).
31+
# If not present is automatically filled by the tooling with the issue linked to the PR number.
32+
#issue: https://github.com/owner/repo/1234

elastic-agent.docker.yml

+13
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,19 @@ inputs:
116116
# # recommended that these endpoints are only enabled if the monitoring endpoint is set to localhost
117117
# pprof.enabled: false
118118
# # exposes agent metrics using http, by default sockets and named pipes are used
119+
# #
120+
# # `http` Also exposes a /liveness endpoint that will return an HTTP code depending on agent status:
121+
# # 200: Agent is healthy
122+
# # 500: A component or unit is in a failed state
123+
# # 503: The agent coordinator is unresponsive
124+
# #
125+
# # You can pass a `failon` parameter to the /liveness endpoint to determine what component state will result in a 500.
126+
# # For example: `curl 'localhost:6792/liveness?failon=degraded'` will return 500 if a component is in a degraded state.
127+
# # The possible values for `failon` are:
128+
# # `degraded`: return an error if a component is in a degraded state or failed state, or if the agent coordinator is unresponsive.
129+
# # `failed`: return an error if a unit is in a failed state, or if the agent coordinator is unresponsive.
130+
# # `heartbeat`: return an error only if the agent coordinator is unresponsive.
131+
# # If no `failon` parameter is provided, the default behavior is `failon=heartbeat`
119132
# http:
120133
# # enables http endpoint
121134
# enabled: false

elastic-agent.reference.yml

+14-1
Original file line numberDiff line numberDiff line change
@@ -150,7 +150,20 @@ inputs:
150150
# pprof.enabled: false
151151
# # The name of the output to use for monitoring data.
152152
# use_output: monitoring
153-
# # exposes agent metrics using http, by default sockets and named pipes are used
153+
# # Exposes agent metrics using http, by default sockets and named pipes are used.
154+
# #
155+
# # `http` Also exposes a /liveness endpoint that will return an HTTP code depending on agent status:
156+
# # 200: Agent is healthy
157+
# # 500: A component or unit is in a failed state
158+
# # 503: The agent coordinator is unresponsive
159+
# #
160+
# # You can pass a `failon` parameter to the /liveness endpoint to determine what component state will result in a 500.
161+
# # For example: `curl 'localhost:6792/liveness?failon=degraded'` will return 500 if a component is in a degraded state.
162+
# # The possible values for `failon` are:
163+
# # `degraded`: return an error if a component is in a degraded state or failed state, or if the agent coordinator is unresponsive.
164+
# # `failed`: return an error if a unit is in a failed state, or if the agent coordinator is unresponsive.
165+
# # `heartbeat`: return an error only if the agent coordinator is unresponsive.
166+
# # If no `failon` parameter is provided, the default behavior is `failon=heartbeat`
154167
# http:
155168
# # enables http endpoint
156169
# enabled: false

elastic-agent.yml

+13
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,19 @@ inputs:
7272
# # The name of the output to use for monitoring data.
7373
# use_output: monitoring
7474
# # exposes agent metrics using http, by default sockets and named pipes are used
75+
# #
76+
# # `http` Also exposes a /liveness endpoint that will return an HTTP code depending on agent status:
77+
# # 200: Agent is healthy
78+
# # 500: A component or unit is in a failed state
79+
# # 503: The agent coordinator is unresponsive
80+
# #
81+
# # You can pass a `failon` parameter to the /liveness endpoint to determine what component state will result in a 500.
82+
# # For example: `curl 'localhost:6792/liveness?failon=degraded'` will return 500 if a component is in a degraded state.
83+
# # The possible values for `failon` are:
84+
# # `degraded`: return an error if a component is in a degraded state or failed state, or if the agent coordinator is unresponsive.
85+
# # `failed`: return an error if a unit is in a failed state, or if the agent coordinator is unresponsive.
86+
# # `heartbeat`: return an error only if the agent coordinator is unresponsive.
87+
# # If no `failon` parameter is provided, the default behavior is `failon=heartbeat`
7588
# http:
7689
# # enables http endpoint
7790
# enabled: false

internal/pkg/agent/application/coordinator/coordinator.go

+24
Original file line numberDiff line numberDiff line change
@@ -279,6 +279,11 @@ type Coordinator struct {
279279

280280
// mx sync.RWMutex
281281
// protection protection.Config
282+
283+
// a sync channel that can be called by other components to check if the main coordinator
284+
// loop in runLoopIteration() is active and listening.
285+
// Should only be interacted with via IsActive() or runLoopIteration()
286+
heartbeatChan chan struct{}
282287
}
283288

284289
// The channels Coordinator reads to receive updates from the various managers.
@@ -372,6 +377,7 @@ func New(logger *logger.Logger, cfg *configuration.Configuration, logLevel logp.
372377
logLevelCh: make(chan logp.Level),
373378
overrideStateChan: make(chan *coordinatorOverrideState),
374379
upgradeDetailsChan: make(chan *details.Details),
380+
heartbeatChan: make(chan struct{}),
375381
}
376382
// Setup communication channels for any non-nil components. This pattern
377383
// lets us transparently accept nil managers / simulated events during
@@ -412,6 +418,22 @@ func (c *Coordinator) State() State {
412418
return c.stateBroadcaster.Get()
413419
}
414420

421+
// IsActive is a blocking method that waits for a channel response
422+
// from the coordinator loop. This can be used to as a basic health check,
423+
// as we'll timeout and return false if the coordinator run loop doesn't
424+
// respond to our channel.
425+
func (c *Coordinator) IsActive(timeout time.Duration) bool {
426+
ctx, cancel := context.WithTimeout(context.Background(), timeout)
427+
defer cancel()
428+
429+
select {
430+
case <-c.heartbeatChan:
431+
return true
432+
case <-ctx.Done():
433+
return false
434+
}
435+
}
436+
415437
func (c *Coordinator) RegisterMonitoringServer(s configReloader) {
416438
c.monitoringServerReloader = s
417439
}
@@ -977,6 +999,8 @@ func (c *Coordinator) runLoopIteration(ctx context.Context) {
977999
case upgradeDetails := <-c.upgradeDetailsChan:
9781000
c.setUpgradeDetails(upgradeDetails)
9791001

1002+
case c.heartbeatChan <- struct{}{}:
1003+
9801004
case componentState := <-c.managerChans.runtimeManagerUpdate:
9811005
// New component change reported by the runtime manager via
9821006
// Coordinator.watchRuntimeComponents(), merge it with the

internal/pkg/agent/application/coordinator/coordinator_unit_test.go

+6-1
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ import (
1414
"context"
1515
"errors"
1616
"fmt"
17+
"net"
1718
"testing"
1819
"time"
1920

@@ -570,7 +571,7 @@ func TestCoordinatorPolicyChangeUpdatesMonitorReloader(t *testing.T) {
570571
}
571572

572573
monitoringServer := &fakeMonitoringServer{}
573-
newServerFn := func() (reload.ServerController, error) {
574+
newServerFn := func(*monitoringCfg.MonitoringConfig) (reload.ServerController, error) {
574575
return monitoringServer, nil
575576
}
576577
monitoringReloader := reload.NewServerReloader(newServerFn, logger, monitoringCfg.DefaultConfig())
@@ -1054,3 +1055,7 @@ func (fs *fakeMonitoringServer) Reset() {
10541055
fs.stopTriggered = false
10551056
fs.startTriggered = false
10561057
}
1058+
1059+
// Addr always returns nil; the fake monitoring server reports no address.
func (fs *fakeMonitoringServer) Addr() net.Addr {
1060+
return nil
1061+
}

internal/pkg/agent/application/monitoring/handler.go

+11-1
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,9 @@ import (
88
"encoding/json"
99
"fmt"
1010
"net/http"
11+
"time"
12+
13+
"github.com/elastic/elastic-agent/internal/pkg/agent/application/coordinator"
1114
)
1215

1316
const errTypeUnexpected = "UNEXPECTED"
@@ -16,6 +19,13 @@ type apiError interface {
1619
Status() int
1720
}
1821

22+
// CoordinatorState is used by the HTTP handlers that take a coordinator object.
23+
// This interface exists to help make testing easier.
24+
type CoordinatorState interface {
25+
// State returns the coordinator state that the handlers inspect.
State() coordinator.State
26+
// IsActive reports whether the coordinator responded before timeout elapsed.
IsActive(timeout time.Duration) bool
27+
}
28+
1929
func createHandler(fn func(w http.ResponseWriter, r *http.Request) error) *apiHandler {
2030
return &apiHandler{
2131
innerFn: fn,
@@ -30,7 +40,7 @@ type apiHandler struct {
3040
func (h *apiHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
3141
err := h.innerFn(w, r)
3242
if err != nil {
33-
switch e := err.(type) { // nolint:errorlint // Will need refactor.
43+
switch e := err.(type) { //nolint:errorlint // Will need refactor.
3444
case apiError:
3545
w.WriteHeader(e.Status())
3646
default:
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
// Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
2+
// or more contributor license agreements. Licensed under the Elastic License;
3+
// you may not use this file except in compliance with the Elastic License.
4+
5+
package monitoring
6+
7+
import (
8+
"fmt"
9+
"net/http"
10+
"time"
11+
12+
"github.com/elastic/elastic-agent-client/v7/pkg/client"
13+
)
14+
15+
// formValueKey is the only form/query key the /liveness endpoint accepts.
const formValueKey = "failon"

// LivenessFailConfig describes which conditions the /liveness endpoint should
// treat as a failure for a given request.
type LivenessFailConfig struct {
	// Degraded marks a component in the degraded state as a failure.
	Degraded bool `yaml:"degraded" config:"degraded"`
	// Failed marks a component in the failed state as a failure.
	Failed bool `yaml:"failed" config:"failed"`
	// Heartbeat is set for every `failon` value accepted by handleFormValues.
	Heartbeat bool `yaml:"heartbeat" config:"heartbeat"`
}

// handleFormValues translates the form values of an HTTP request into a
// LivenessFailConfig.
//
// The only accepted key is `failon`, with these values:
//   - "failed":    Failed and Heartbeat are set
//   - "degraded":  Degraded, Failed and Heartbeat are set
//   - "heartbeat": only Heartbeat is set
//   - "" (absent): same as "heartbeat"
//
// An error is returned if the form cannot be parsed, if any key other than
// `failon` is present, or if `failon` carries any other value. When several
// `failon` values are supplied only the first one is considered.
func handleFormValues(req *http.Request) (LivenessFailConfig, error) {
	if err := req.ParseForm(); err != nil {
		return LivenessFailConfig{}, fmt.Errorf("error parsing form: %w", err)
	}

	defaultUserCfg := LivenessFailConfig{Heartbeat: true}

	// Reject unknown keys outright, so a misspelled parameter is reported
	// instead of silently falling back to the default behavior.
	for formKey := range req.Form {
		if formKey != formValueKey {
			return defaultUserCfg, fmt.Errorf("got invalid HTTP form key: '%s'", formKey)
		}
	}

	switch userConfig := req.Form.Get(formValueKey); userConfig {
	case "failed":
		return LivenessFailConfig{Failed: true, Heartbeat: true}, nil
	case "degraded":
		return LivenessFailConfig{Degraded: true, Failed: true, Heartbeat: true}, nil
	case "heartbeat", "":
		return defaultUserCfg, nil
	default:
		return defaultUserCfg, fmt.Errorf("got unexpected value for `%s` attribute: %s", formValueKey, userConfig)
	}
}
50+
51+
func livenessHandler(coord CoordinatorState) func(http.ResponseWriter, *http.Request) error {
52+
return func(w http.ResponseWriter, r *http.Request) error {
53+
w.Header().Set("Content-Type", "application/json; charset=utf-8")
54+
55+
state := coord.State()
56+
isUp := coord.IsActive(time.Second * 10)
57+
// the coordinator check is always on, so if that fails, always return false
58+
if !isUp {
59+
w.WriteHeader(http.StatusServiceUnavailable)
60+
return nil
61+
}
62+
63+
failConfig, err := handleFormValues(r)
64+
if err != nil {
65+
return fmt.Errorf("error handling form values: %w", err)
66+
}
67+
68+
// if user has requested `coordinator` mode, just revert to that, skip everything else
69+
if !failConfig.Degraded && !failConfig.Failed && failConfig.Heartbeat {
70+
if !isUp {
71+
w.WriteHeader(http.StatusServiceUnavailable)
72+
return nil
73+
}
74+
}
75+
76+
unhealthyComponent := false
77+
for _, comp := range state.Components {
78+
if (failConfig.Failed && comp.State.State == client.UnitStateFailed) || (failConfig.Degraded && comp.State.State == client.UnitStateDegraded) {
79+
unhealthyComponent = true
80+
}
81+
}
82+
// bias towards the coordinator check, since it can be otherwise harder to diagnose
83+
if unhealthyComponent {
84+
w.WriteHeader(http.StatusInternalServerError)
85+
}
86+
return nil
87+
}
88+
}

0 commit comments

Comments
 (0)