elastic
diff --git a/‎_meta/config/common.p2.yml.tmpl
+13 b/‎_meta/config/common.p2.yml.tmpl
+13
diff --git a/‎_meta/config/common.reference.p2.yml.tmpl
+14-1 b/‎_meta/config/common.reference.p2.yml.tmpl
+14-1
diff --git a/‎_meta/config/elastic-agent.docker.yml.tmpl
+13 b/‎_meta/config/elastic-agent.docker.yml.tmpl
+13
diff --git a/‎_meta/elastic-agent.yml
+9-1 b/‎_meta/elastic-agent.yml
+9-1
diff --git a/‎changelog/fragments/1711653910-add-liveness-endpoint.yaml
+32 b/‎changelog/fragments/1711653910-add-liveness-endpoint.yaml
+32
diff --git a/‎elastic-agent.docker.yml
+13 b/‎elastic-agent.docker.yml
+13
diff --git a/‎elastic-agent.reference.yml
+14-1 b/‎elastic-agent.reference.yml
+14-1
diff --git a/‎elastic-agent.yml
+13 b/‎elastic-agent.yml
+13
diff --git a/‎internal/pkg/agent/application/coordinator/coordinator.go
+24 b/‎internal/pkg/agent/application/coordinator/coordinator.go
+24
diff --git a/‎internal/pkg/agent/application/coordinator/coordinator_unit_test.go
+6-1 b/‎internal/pkg/agent/application/coordinator/coordinator_unit_test.go
+6-1
diff --git a/‎internal/pkg/agent/application/monitoring/handler.go
+11-1 b/‎internal/pkg/agent/application/monitoring/handler.go
+11-1
diff --git a/‎internal/pkg/agent/application/monitoring/liveness.go
+88 b/‎internal/pkg/agent/application/monitoring/liveness.go
+88
@@ -66,6 +66,19 @@ inputs:
 #   # The name of the output to use for monitoring data.
 #   use_output: monitoring
 #   # exposes agent metrics using http, by default sockets and named pipes are used
+#   #
+#   # `http` Also exposes a /liveness endpoint that will return an HTTP code depending on agent status:
+#   # 200: Agent is healthy
+#   # 500: A component or unit is in a failed state
+#   # 503: The agent coordinator is unresponsive
+#   #
+#   # You can pass a `failon` parameter to the /liveness endpoint to determine what component state will result in a 500.
+#   # For example: `curl 'localhost:6792/liveness?failon=degraded'` will return 500 if a component is in a degraded state.
+#   # The possible values for `failon` are:
+#   # `degraded`: return an error if a component is in a degraded state or failed state, or if the agent coordinator is unresponsive.
+#   # `failed`: return an error if a unit is in a failed state, or if the agent coordinator is unresponsive.
+#   # `heartbeat`: return an error only if the agent coordinator is unresponsive. 
+#   # If no `failon` parameter is provided, the default behavior is `failon=heartbeat`
 #   http:
 #       # enables http endpoint
 #       enabled: false
 
@@ -144,7 +144,20 @@ inputs:
 #   pprof.enabled: false
 #   # The name of the output to use for monitoring data.
 #   use_output: monitoring
-#   # exposes agent metrics using http, by default sockets and named pipes are used
+#   # Exposes agent metrics using http, by default sockets and named pipes are used.
+#   #
+#   # `http` Also exposes a /liveness endpoint that will return an HTTP code depending on agent status:
+#   # 200: Agent is healthy
+#   # 500: A component or unit is in a failed state
+#   # 503: The agent coordinator is unresponsive
+#   #
+#   # You can pass a `failon` parameter to the /liveness endpoint to determine what component state will result in a 500.
+#   # For example: `curl 'localhost:6792/liveness?failon=degraded'` will return 500 if a component is in a degraded state.
+#   # The possible values for `failon` are:
+#   # `degraded`: return an error if a component is in a degraded state or failed state, or if the agent coordinator is unresponsive.
+#   # `failed`: return an error if a unit is in a failed state, or if the agent coordinator is unresponsive.
+#   # `heartbeat`: return an error only if the agent coordinator is unresponsive. 
+#   # If no `failon` parameter is provided, the default behavior is `failon=heartbeat`
 #   http:
 #       # enables http endpoint
 #       enabled: false
 
@@ -116,6 +116,19 @@ inputs:
 #   # recommended that these endpoints are only enabled if the monitoring endpoint is set to localhost
 #   pprof.enabled: false
 #   # exposes agent metrics using http, by default sockets and named pipes are used
+#   #
+#   # `http` Also exposes a /liveness endpoint that will return an HTTP code depending on agent status:
+#   # 200: Agent is healthy
+#   # 500: A component or unit is in a failed state
+#   # 503: The agent coordinator is unresponsive
+#   #
+#   # You can pass a `failon` parameter to the /liveness endpoint to determine what component state will result in a 500.
+#   # For example: `curl 'localhost:6792/liveness?failon=degraded'` will return 500 if a component is in a degraded state.
+#   # The possible values for `failon` are:
+#   # `degraded`: return an error if a component is in a degraded state or failed state, or if the agent coordinator is unresponsive.
+#   # `failed`: return an error if a unit is in a failed state, or if the agent coordinator is unresponsive.
+#   # `heartbeat`: return an error only if the agent coordinator is unresponsive. 
+#   # If no `failon` parameter is provided, the default behavior is `failon=heartbeat`
 #   http:
 #       # enables http endpoint
 #       enabled: false
 
@@ -103,7 +103,15 @@ inputs:
 #   logs: false
 #   # enables metrics monitoring
 #   metrics: false
-#   # exposes agent metrics using http, by default sockets and named pipes are used
+#   # Exposes agent metrics using http, by default sockets and named pipes are used.
+#   # Also exposes a /liveness endpoint that will return an HTTP code depending on agent status:
+#   # 200: Agent is healthy
+#   # 500: A component or unit is in a failed state
+#   # 503: The agent coordinator is unresponsive
+#   # You can pass a `failon` parameter to the /liveness endpoint to determine what component state will result in a 500.
+#   # For example: `curl 'localhost:6792/liveness?failon=degraded'` will return 500 if a component is in a degraded state.
+#   # The possible values for `failon` are `degraded`, `failed`, and `heartbeat`. If no `failon` parameter is
+#   # provided, the default behavior is `failon=heartbeat`
 #   http:
 #       # enables http endpoint
 #       enabled: false
 
@@ -0,0 +1,32 @@
+# Kind can be one of:
+# - breaking-change: a change to previously-documented behavior
+# - deprecation: functionality that is being removed in a later release
+# - bug-fix: fixes a problem in a previous version
+# - enhancement: extends functionality but does not break or fix existing behavior
+# - feature: new functionality
+# - known-issue: problems that we are aware of in a given version
+# - security: impacts on the security of a product or a user’s deployment.
+# - upgrade: important information for someone upgrading from a prior version
+# - other: does not fit into any of the other categories
+kind: feature
+
+# Change summary; a 80ish characters long description of the change.
+summary: Add a configurable /liveness endpoint.
+
+# Long description; in case the summary is not enough to describe the change
+# this field accommodate a description without length limits.
+# NOTE: This field will be rendered only for breaking-change and known-issue kinds at the moment.
+description: Adds a liveness endpoint suitable for use as a k8s liveness probe.
+
+# Affected component; usually one of "elastic-agent", "fleet-server", "filebeat", "metricbeat", "auditbeat", "all", etc.
+component: monitoring
+
+# PR URL; optional; the PR number that added the changeset.
+# If not present is automatically filled by the tooling finding the PR where this changelog fragment has been added.
+# NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number.
+# Please provide it if you are adding a fragment for a different PR.
+pr: https://github.com/elastic/elastic-agent/pull/4499
+
+# Issue URL; optional; the GitHub issue related to this changeset (either closes or is part of).
+# If not present is automatically filled by the tooling with the issue linked to the PR number.
+#issue: https://github.com/owner/repo/1234
@@ -116,6 +116,19 @@ inputs:
 #   # recommended that these endpoints are only enabled if the monitoring endpoint is set to localhost
 #   pprof.enabled: false
 #   # exposes agent metrics using http, by default sockets and named pipes are used
+#   #
+#   # `http` Also exposes a /liveness endpoint that will return an HTTP code depending on agent status:
+#   # 200: Agent is healthy
+#   # 500: A component or unit is in a failed state
+#   # 503: The agent coordinator is unresponsive
+#   #
+#   # You can pass a `failon` parameter to the /liveness endpoint to determine what component state will result in a 500.
+#   # For example: `curl 'localhost:6792/liveness?failon=degraded'` will return 500 if a component is in a degraded state.
+#   # The possible values for `failon` are:
+#   # `degraded`: return an error if a component is in a degraded state or failed state, or if the agent coordinator is unresponsive.
+#   # `failed`: return an error if a unit is in a failed state, or if the agent coordinator is unresponsive.
+#   # `heartbeat`: return an error only if the agent coordinator is unresponsive. 
+#   # If no `failon` parameter is provided, the default behavior is `failon=heartbeat`
 #   http:
 #       # enables http endpoint
 #       enabled: false
 
@@ -150,7 +150,20 @@ inputs:
 #   pprof.enabled: false
 #   # The name of the output to use for monitoring data.
 #   use_output: monitoring
-#   # exposes agent metrics using http, by default sockets and named pipes are used
+#   # Exposes agent metrics using http, by default sockets and named pipes are used.
+#   #
+#   # `http` Also exposes a /liveness endpoint that will return an HTTP code depending on agent status:
+#   # 200: Agent is healthy
+#   # 500: A component or unit is in a failed state
+#   # 503: The agent coordinator is unresponsive
+#   #
+#   # You can pass a `failon` parameter to the /liveness endpoint to determine what component state will result in a 500.
+#   # For example: `curl 'localhost:6792/liveness?failon=degraded'` will return 500 if a component is in a degraded state.
+#   # The possible values for `failon` are:
+#   # `degraded`: return an error if a component is in a degraded state or failed state, or if the agent coordinator is unresponsive.
+#   # `failed`: return an error if a unit is in a failed state, or if the agent coordinator is unresponsive.
+#   # `heartbeat`: return an error only if the agent coordinator is unresponsive. 
+#   # If no `failon` parameter is provided, the default behavior is `failon=heartbeat`
 #   http:
 #       # enables http endpoint
 #       enabled: false
 
@@ -72,6 +72,19 @@ inputs:
 #   # The name of the output to use for monitoring data.
 #   use_output: monitoring
 #   # exposes agent metrics using http, by default sockets and named pipes are used
+#   #
+#   # `http` Also exposes a /liveness endpoint that will return an HTTP code depending on agent status:
+#   # 200: Agent is healthy
+#   # 500: A component or unit is in a failed state
+#   # 503: The agent coordinator is unresponsive
+#   #
+#   # You can pass a `failon` parameter to the /liveness endpoint to determine what component state will result in a 500.
+#   # For example: `curl 'localhost:6792/liveness?failon=degraded'` will return 500 if a component is in a degraded state.
+#   # The possible values for `failon` are:
+#   # `degraded`: return an error if a component is in a degraded state or failed state, or if the agent coordinator is unresponsive.
+#   # `failed`: return an error if a unit is in a failed state, or if the agent coordinator is unresponsive.
+#   # `heartbeat`: return an error only if the agent coordinator is unresponsive. 
+#   # If no `failon` parameter is provided, the default behavior is `failon=heartbeat`
 #   http:
 #       # enables http endpoint
 #       enabled: false
 
@@ -279,6 +279,11 @@ type Coordinator struct {
 
 	// mx         sync.RWMutex
 	// protection protection.Config
+
+	// heartbeatChan is an unbuffered channel that other components can receive from to check
+	// that the main coordinator loop in runLoopIteration() is active and listening.
+	// Should only be interacted with via IsActive() or runLoopIteration()
+	heartbeatChan chan struct{}
 }
 
 // The channels Coordinator reads to receive updates from the various managers.
@@ -372,6 +377,7 @@ func New(logger *logger.Logger, cfg *configuration.Configuration, logLevel logp.
 		logLevelCh:         make(chan logp.Level),
 		overrideStateChan:  make(chan *coordinatorOverrideState),
 		upgradeDetailsChan: make(chan *details.Details),
+		heartbeatChan:      make(chan struct{}),
 	}
 	// Setup communication channels for any non-nil components. This pattern
 	// lets us transparently accept nil managers / simulated events during
@@ -412,6 +418,22 @@ func (c *Coordinator) State() State {
 	return c.stateBroadcaster.Get()
 }
 
+// IsActive reports whether the coordinator run loop is responsive. It blocks
+// until a value is received from the heartbeat channel, which yields true, or
+// until timeout elapses, which yields false. This makes it usable as a basic
+// health check for the coordinator.
+func (c *Coordinator) IsActive(timeout time.Duration) bool {
+	deadline, stop := context.WithTimeout(context.Background(), timeout)
+	defer stop()
+
+	select {
+	case <-deadline.Done():
+		return false
+	case <-c.heartbeatChan:
+		return true
+	}
+}
+
 func (c *Coordinator) RegisterMonitoringServer(s configReloader) {
 	c.monitoringServerReloader = s
 }
@@ -977,6 +999,8 @@ func (c *Coordinator) runLoopIteration(ctx context.Context) {
 	case upgradeDetails := <-c.upgradeDetailsChan:
 		c.setUpgradeDetails(upgradeDetails)
 
+	case c.heartbeatChan <- struct{}{}:
+
 	case componentState := <-c.managerChans.runtimeManagerUpdate:
 		// New component change reported by the runtime manager via
 		// Coordinator.watchRuntimeComponents(), merge it with the
 
@@ -14,6 +14,7 @@ import (
 	"context"
 	"errors"
 	"fmt"
+	"net"
 	"testing"
 	"time"
 
@@ -570,7 +571,7 @@ func TestCoordinatorPolicyChangeUpdatesMonitorReloader(t *testing.T) {
 	}
 
 	monitoringServer := &fakeMonitoringServer{}
-	newServerFn := func() (reload.ServerController, error) {
+	newServerFn := func(*monitoringCfg.MonitoringConfig) (reload.ServerController, error) {
 		return monitoringServer, nil
 	}
 	monitoringReloader := reload.NewServerReloader(newServerFn, logger, monitoringCfg.DefaultConfig())
@@ -1054,3 +1055,7 @@ func (fs *fakeMonitoringServer) Reset() {
 	fs.stopTriggered = false
 	fs.startTriggered = false
 }
+
+func (fs *fakeMonitoringServer) Addr() net.Addr {
+	return nil // the fake reports no listen address
+}
@@ -8,6 +8,9 @@ import (
 	"encoding/json"
 	"fmt"
 	"net/http"
+	"time"
+
+	"github.com/elastic/elastic-agent/internal/pkg/agent/application/coordinator"
 )
 
 const errTypeUnexpected = "UNEXPECTED"
@@ -16,6 +19,13 @@ type apiError interface {
 	Status() int
 }
 
+// CoordinatorState is the part of the coordinator used by the HTTP handlers: State() for the
+// current state and IsActive() as a responsiveness check. It is an interface so tests can fake it.
+type CoordinatorState interface {
+	State() coordinator.State
+	IsActive(timeout time.Duration) bool
+}
+
 func createHandler(fn func(w http.ResponseWriter, r *http.Request) error) *apiHandler {
 	return &apiHandler{
 		innerFn: fn,
@@ -30,7 +40,7 @@ type apiHandler struct {
 func (h *apiHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
 	err := h.innerFn(w, r)
 	if err != nil {
-		switch e := err.(type) { // nolint:errorlint // Will need refactor.
+		switch e := err.(type) { //nolint:errorlint // Will need refactor.
 		case apiError:
 			w.WriteHeader(e.Status())
 		default:
 
@@ -0,0 +1,88 @@
+// Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+// or more contributor license agreements. Licensed under the Elastic License;
+// you may not use this file except in compliance with the Elastic License.
+
+package monitoring
+
+import (
+	"fmt"
+	"net/http"
+	"time"
+
+	"github.com/elastic/elastic-agent-client/v7/pkg/client"
+)
+
+const formValueKey = "failon"
+
+type LivenessFailConfig struct {
+	Degraded  bool `yaml:"degraded" config:"degraded"`   // treat components in the degraded state as unhealthy
+	Failed    bool `yaml:"failed" config:"failed"`       // treat components in the failed state as unhealthy
+	Heartbeat bool `yaml:"heartbeat" config:"heartbeat"` // set for every accepted failon value by handleFormValues
+}
+
+// handleFormValues maps the `failon` form value of req to a LivenessFailConfig; a missing or
+// empty value means `heartbeat`, and any other form key or unknown value yields an error.
+func handleFormValues(req *http.Request) (LivenessFailConfig, error) {
+	if parseErr := req.ParseForm(); parseErr != nil {
+		return LivenessFailConfig{}, fmt.Errorf("Error parsing form: %w", parseErr)
+	}
+
+	// heartbeat-only is both the default and the value returned alongside validation errors
+	heartbeatOnly := LivenessFailConfig{Heartbeat: true}
+
+	for key := range req.Form {
+		if key != formValueKey {
+			return heartbeatOnly, fmt.Errorf("got invalid HTTP form key: '%s'", key)
+		}
+	}
+
+	switch mode := req.Form.Get(formValueKey); mode {
+	case "", "heartbeat":
+		return heartbeatOnly, nil
+	case "failed":
+		return LivenessFailConfig{Failed: true, Heartbeat: true}, nil
+	case "degraded":
+		return LivenessFailConfig{Degraded: true, Failed: true, Heartbeat: true}, nil
+	default:
+		return heartbeatOnly, fmt.Errorf("got unexpected value for `%s` attribute: %s", formValueKey, mode)
+	}
+}
+
+// livenessHandler returns the handler behind the /liveness endpoint.
+//
+// The handler writes only a status code:
+//   - 503 if coord.IsActive does not report the coordinator as active within 10 seconds,
+//   - 500 if any component is in a state selected by the `failon` form value,
+//   - otherwise no status is written explicitly, which net/http sends as 200.
+//
+// An invalid form is returned as an error for the wrapping handler to report.
+func livenessHandler(coord CoordinatorState) func(http.ResponseWriter, *http.Request) error {
+	return func(w http.ResponseWriter, r *http.Request) error {
+		w.Header().Set("Content-Type", "application/json; charset=utf-8")
+
+		state := coord.State()
+		// The coordinator check applies to every `failon` mode, so an unresponsive
+		// coordinator always results in a 503, before the request form is parsed.
+		if !coord.IsActive(time.Second * 10) {
+			w.WriteHeader(http.StatusServiceUnavailable)
+			return nil
+		}
+
+		failConfig, err := handleFormValues(r)
+		if err != nil {
+			return fmt.Errorf("error handling form values: %w", err)
+		}
+
+		// In heartbeat-only mode neither flag is set, so no component can match below.
+		// NOTE(review): only the component-level state is inspected here, not the
+		// state of individual units — confirm this matches the documented behavior.
+		for _, comp := range state.Components {
+			failed := failConfig.Failed && comp.State.State == client.UnitStateFailed
+			degraded := failConfig.Degraded && comp.State.State == client.UnitStateDegraded
+			if failed || degraded {
+				// one unhealthy component is enough; no need to scan the rest
+				w.WriteHeader(http.StatusInternalServerError)
+				return nil
+			}
+		}
+		return nil
+	}
+}