Skip to content

Commit f7e558f

Browse files
authored
install fails if enroll fails (#3554)
* fix install/enroll cmd not failing when agent restart fails * surface errors that might occur during enroll * fail install command if agent cannot be restarted * do not print success message if there was an enroll error. Print an error message and the error instead * add logs to show the different enroll attempts * add more context t errors * refactor internal/pkg/agent/install/perms_unix.go and add more context to errors restore main version * ignore agent restart error on enroll tests as there is no agent to be restarted * daemonReloadWithBackoff does not retry on context deadline exceeded and context cancelled * fix typos
1 parent fa357a8 commit f7e558f

File tree

7 files changed

+156
-55
lines changed

7 files changed

+156
-55
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
# Kind can be one of:
2+
# - breaking-change: a change to previously-documented behavior
3+
# - deprecation: functionality that is being removed in a later release
4+
# - bug-fix: fixes a problem in a previous version
5+
# - enhancement: extends functionality but does not break or fix existing behavior
6+
# - feature: new functionality
7+
# - known-issue: problems that we are aware of in a given version
8+
# - security: impacts on the security of a product or a user’s deployment.
9+
# - upgrade: important information for someone upgrading from a prior version
10+
# - other: does not fit into any of the other categories
11+
kind: bug-fix
12+
13+
# Change summary; a 80ish characters long description of the change.
14+
summary: Surface errors during Agent's enroll process, failing if any happens.
15+
16+
# Long description; in case the summary is not enough to describe the change
17+
# this field accommodate a description without length limits.
18+
# NOTE: This field will be rendered only for breaking-change and known-issue kinds at the moment.
19+
#description:
20+
21+
# Affected component; a word indicating the component this changeset affects.
22+
component: install/enroll
23+
24+
# PR URL; optional; the PR number that added the changeset.
25+
# If not present is automatically filled by the tooling finding the PR where this changelog fragment has been added.
26+
# NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number.
27+
# Please provide it if you are adding a fragment for a different PR.
28+
pr: https://github.com/elastic/elastic-agent/pull/3207
29+
30+
# Issue URL; optional; the GitHub issue related to this changeset (either closes or is part of).
31+
# If not present is automatically filled by the tooling with the issue linked to the PR number.
32+
#issue: https://github.com/owner/repo/1234

dev-tools/mage/godaemon.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ var (
2121
}
2222
)
2323

24-
// BuildGoDaemon builds the go-deamon binary.
24+
// BuildGoDaemon builds the go-daemon binary.
2525
func BuildGoDaemon() error {
2626
if GOOS != "linux" {
2727
return errors.New("go-daemon only builds for linux")

internal/pkg/agent/cmd/enroll.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -351,7 +351,7 @@ func enroll(streams *cli.IOStreams, cmd *cobra.Command) error {
351351
// Error: failed to fix permissions: chown /Library/Elastic/Agent/data/elastic-agent-c13f91/elastic-agent.app: operation not permitted
352352
// This is because we are fixing permissions twice, once during installation and again during the enrollment step.
353353
// When we are enrolling as part of installation on MacOS, skip the second attempt to fix permissions.
354-
var fixPermissions bool = fromInstall
354+
fixPermissions := fromInstall
355355
if runtime.GOOS == "darwin" {
356356
fixPermissions = false
357357
}

internal/pkg/agent/cmd/enroll_cmd.go

+49-18
Original file line numberDiff line numberDiff line change
@@ -172,7 +172,7 @@ func newEnrollCmd(
172172
)
173173
}
174174

175-
// newEnrollCmdWithStore creates an new enrollment and accept a custom store.
175+
// newEnrollCmdWithStore creates a new enrollment and accept a custom store.
176176
func newEnrollCmdWithStore(
177177
log *logger.Logger,
178178
options *enrollCmdOption,
@@ -187,10 +187,11 @@ func newEnrollCmdWithStore(
187187
}, nil
188188
}
189189

190-
// Execute tries to enroll the agent into Fleet.
190+
// Execute enrolls the agent into Fleet.
191191
func (c *enrollCmd) Execute(ctx context.Context, streams *cli.IOStreams) error {
192192
var err error
193193
defer c.stopAgent() // ensure its stopped no matter what
194+
194195
span, ctx := apm.StartSpan(ctx, "enroll", "app.internal")
195196
defer func() {
196197
apm.CaptureError(ctx, err).Send()
@@ -235,7 +236,7 @@ func (c *enrollCmd) Execute(ctx context.Context, streams *cli.IOStreams) error {
235236
// Ensure that the agent does not use a proxy configuration
236237
// when connecting to the local fleet server.
237238
// Note that when running fleet-server the enroll request will be sent to :8220,
238-
// however when the agent is running afterwards requests will be sent to :8221
239+
// however when the agent is running afterward requests will be sent to :8221
239240
c.remoteConfig.Transport.Proxy.Disable = true
240241
}
241242

@@ -256,7 +257,7 @@ func (c *enrollCmd) Execute(ctx context.Context, streams *cli.IOStreams) error {
256257

257258
err = c.enrollWithBackoff(ctx, persistentConfig)
258259
if err != nil {
259-
return errors.New(err, "fail to enroll")
260+
return fmt.Errorf("fail to enroll: %w", err)
260261
}
261262

262263
if c.options.FixPermissions {
@@ -267,17 +268,23 @@ func (c *enrollCmd) Execute(ctx context.Context, streams *cli.IOStreams) error {
267268
}
268269

269270
defer func() {
270-
fmt.Fprintln(streams.Out, "Successfully enrolled the Elastic Agent.")
271+
if err != nil {
272+
fmt.Fprintf(streams.Err, "Something went wrong while enrolling the Elastic Agent: %v\n", err)
273+
} else {
274+
fmt.Fprintln(streams.Out, "Successfully enrolled the Elastic Agent.")
275+
}
271276
}()
272277

273278
if c.agentProc == nil {
274-
if err := c.daemonReload(ctx); err != nil {
275-
c.log.Infow("Elastic Agent might not be running; unable to trigger restart", "error", err)
276-
} else {
277-
c.log.Info("Successfully triggered restart on running Elastic Agent.")
279+
if err = c.daemonReloadWithBackoff(ctx); err != nil {
280+
c.log.Errorf("Elastic Agent might not be running; unable to trigger restart: %v", err)
281+
return fmt.Errorf("could not reload agent daemon, unable to trigger restart: %w", err)
278282
}
283+
284+
c.log.Info("Successfully triggered restart on running Elastic Agent.")
279285
return nil
280286
}
287+
281288
c.log.Info("Elastic Agent has been enrolled; start Elastic Agent")
282289
return nil
283290
}
@@ -443,24 +450,35 @@ func (c *enrollCmd) prepareFleetTLS() error {
443450

444451
func (c *enrollCmd) daemonReloadWithBackoff(ctx context.Context) error {
445452
err := c.daemonReload(ctx)
453+
if err != nil &&
454+
(errors.Is(err, context.DeadlineExceeded) ||
455+
errors.Is(err, context.Canceled)) {
456+
return fmt.Errorf("could not reload daemon: %w", err)
457+
}
446458
if err == nil {
447459
return nil
448460
}
449461

450462
signal := make(chan struct{})
463+
defer close(signal)
451464
backExp := backoff.NewExpBackoff(signal, 10*time.Second, 1*time.Minute)
452465

453-
for i := 5; i >= 0; i-- {
466+
for i := 0; i < 5; i++ {
454467
backExp.Wait()
455468
c.log.Info("Retrying to restart...")
456469
err = c.daemonReload(ctx)
470+
if err != nil &&
471+
(errors.Is(err, context.DeadlineExceeded) ||
472+
errors.Is(err, context.Canceled)) {
473+
return fmt.Errorf("could not reload daemon after %d retries: %w",
474+
i+1, err)
475+
}
457476
if err == nil {
458-
break
477+
return nil
459478
}
460479
}
461480

462-
close(signal)
463-
return err
481+
return fmt.Errorf("could not reload agent's daemon, all retries failed. Last error: %w", err)
464482
}
465483

466484
func (c *enrollCmd) daemonReload(ctx context.Context) error {
@@ -478,8 +496,20 @@ func (c *enrollCmd) enrollWithBackoff(ctx context.Context, persistentConfig map[
478496

479497
c.log.Infof("Starting enrollment to URL: %s", c.client.URI())
480498
err := c.enroll(ctx, persistentConfig)
499+
if err == nil {
500+
return nil
501+
}
502+
503+
const deadline = 10 * time.Minute
504+
const frequency = 60 * time.Second
505+
506+
c.log.Infof("1st enrollment attempt failed, retrying for %s, every %s enrolling to URL: %s",
507+
deadline,
508+
frequency,
509+
c.client.URI())
481510
signal := make(chan struct{})
482-
backExp := backoff.NewExpBackoff(signal, 60*time.Second, 10*time.Minute)
511+
defer close(signal)
512+
backExp := backoff.NewExpBackoff(signal, frequency, deadline)
483513

484514
for {
485515
retry := false
@@ -498,7 +528,6 @@ func (c *enrollCmd) enrollWithBackoff(ctx context.Context, persistentConfig map[
498528
err = c.enroll(ctx, persistentConfig)
499529
}
500530

501-
close(signal)
502531
return err
503532
}
504533

@@ -547,8 +576,10 @@ func (c *enrollCmd) enroll(ctx context.Context, persistentConfig map[string]inte
547576
c.options.FleetServer.ElasticsearchInsecure,
548577
)
549578
if err != nil {
550-
return err
579+
return fmt.Errorf(
580+
"failed creating fleet-server bootstrap config: %w", err)
551581
}
582+
552583
// no longer need bootstrap at this point
553584
serverConfig.Server.Bootstrap = false
554585
fleetConfig.Server = serverConfig.Server
@@ -568,11 +599,11 @@ func (c *enrollCmd) enroll(ctx context.Context, persistentConfig map[string]inte
568599

569600
reader, err := yamlToReader(configToStore)
570601
if err != nil {
571-
return err
602+
return fmt.Errorf("yamlToReader failed: %w", err)
572603
}
573604

574605
if err := safelyStoreAgentInfo(c.configStore, reader); err != nil {
575-
return err
606+
return fmt.Errorf("failed to store agent config: %w", err)
576607
}
577608

578609
// clear action store

internal/pkg/agent/cmd/enroll_cmd_test.go

+50-22
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,11 @@ import (
1616
"os"
1717
"runtime"
1818
"strconv"
19+
"strings"
1920
"testing"
21+
"time"
2022

23+
"github.com/stretchr/testify/assert"
2124
"github.com/stretchr/testify/require"
2225

2326
"github.com/elastic/elastic-agent/internal/pkg/agent/configuration"
@@ -159,14 +162,23 @@ func TestEnroll(t *testing.T) {
159162
require.NoError(t, err)
160163

161164
streams, _, _, _ := cli.NewTestingIOStreams()
162-
err = cmd.Execute(context.Background(), streams)
163-
require.NoError(t, err)
164-
165+
ctx, cancel := context.WithTimeout(context.Background(), 1*time.Minute)
166+
defer cancel()
167+
err = cmd.Execute(ctx, streams)
168+
169+
// There is no agent running, therefore nothing to be restarted.
170+
// However, this will cause the Enroll command to return an error
171+
// which we'll ignore here.
172+
require.ErrorContainsf(t, err,
173+
"could not reload agent daemon, unable to trigger restart",
174+
"enroll command returned an unexpected error")
175+
require.ErrorContainsf(t, err, context.DeadlineExceeded.Error(),
176+
"it should fail only due to %q", context.DeadlineExceeded)
165177
config, err := readConfig(store.Content)
166-
167178
require.NoError(t, err)
168-
require.Equal(t, "my-access-api-key", config.AccessAPIKey)
169-
require.Equal(t, host, config.Client.Host)
179+
180+
assert.Equal(t, "my-access-api-key", config.AccessAPIKey)
181+
assert.Equal(t, host, config.Client.Host)
170182
},
171183
))
172184

@@ -216,16 +228,24 @@ func TestEnroll(t *testing.T) {
216228
require.NoError(t, err)
217229

218230
streams, _, _, _ := cli.NewTestingIOStreams()
219-
err = cmd.Execute(context.Background(), streams)
220-
require.NoError(t, err)
221-
222-
require.True(t, store.Called)
223-
231+
ctx, cancel := context.WithTimeout(context.Background(), 1*time.Minute)
232+
defer cancel()
233+
err = cmd.Execute(ctx, streams)
234+
if err != nil &&
235+
// There is no agent running, therefore nothing to be restarted.
236+
// However, this will cause the Enroll command to return an error
237+
// which we'll ignore here.
238+
!strings.Contains(err.Error(),
239+
"could not reload agent daemon, unable to trigger restart") {
240+
t.Fatalf("enrrol coms returned and unexpected error: %v", err)
241+
}
242+
243+
assert.True(t, store.Called)
224244
config, err := readConfig(store.Content)
225245

226-
require.NoError(t, err)
227-
require.Equal(t, "my-access-api-key", config.AccessAPIKey)
228-
require.Equal(t, host, config.Client.Host)
246+
assert.NoError(t, err)
247+
assert.Equal(t, "my-access-api-key", config.AccessAPIKey)
248+
assert.Equal(t, host, config.Client.Host)
229249
},
230250
))
231251

@@ -275,16 +295,24 @@ func TestEnroll(t *testing.T) {
275295
require.NoError(t, err)
276296

277297
streams, _, _, _ := cli.NewTestingIOStreams()
278-
err = cmd.Execute(context.Background(), streams)
279-
require.NoError(t, err)
280-
281-
require.True(t, store.Called)
282-
298+
ctx, cancel := context.WithTimeout(context.Background(), 1*time.Minute)
299+
defer cancel()
300+
err = cmd.Execute(ctx, streams)
301+
302+
if err != nil &&
303+
// There is no agent running, therefore nothing to be restarted.
304+
// However, this will cause the Enroll command to return an error
305+
// which we'll ignore here.
306+
!strings.Contains(err.Error(),
307+
"could not reload agent daemon, unable to trigger restart") {
308+
t.Fatalf("enrrol coms returned and unexpected error: %v", err)
309+
}
310+
311+
assert.True(t, store.Called)
283312
config, err := readConfig(store.Content)
284-
285313
require.NoError(t, err)
286-
require.Equal(t, "my-access-api-key", config.AccessAPIKey)
287-
require.Equal(t, host, config.Client.Host)
314+
assert.Equal(t, "my-access-api-key", config.AccessAPIKey)
315+
assert.Equal(t, host, config.Client.Host)
288316
},
289317
))
290318

internal/pkg/agent/cmd/install.go

+3-1
Original file line numberDiff line numberDiff line change
@@ -154,7 +154,7 @@ func installCmd(streams *cli.IOStreams, cmd *cobra.Command) error {
154154
return fmt.Errorf("problem reading prompt response")
155155
}
156156
if url == "" {
157-
fmt.Fprintf(streams.Out, "Enrollment cancelled because no URL was provided.\n")
157+
fmt.Fprintln(streams.Out, "Enrollment cancelled because no URL was provided.")
158158
return nil
159159
}
160160
}
@@ -224,6 +224,8 @@ func installCmd(streams *cli.IOStreams, cmd *cobra.Command) error {
224224
}
225225
}()
226226
}
227+
228+
fmt.Fprintln(streams.Out, "Elastic Agent successfully installed, starting enrollment.")
227229
}
228230

229231
if enroll {

internal/pkg/agent/install/perms_unix.go

+20-12
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ package install
88

99
import (
1010
"errors"
11+
"fmt"
1112
"io/fs"
1213
"os"
1314
"path/filepath"
@@ -18,19 +19,26 @@ func fixPermissions(topPath string) error {
1819
return recursiveRootPermissions(topPath)
1920
}
2021

21-
func recursiveRootPermissions(path string) error {
22-
return filepath.Walk(path, func(name string, info fs.FileInfo, err error) error {
23-
if err == nil {
24-
// all files should be owned by root:root
25-
err = os.Chown(name, 0, 0)
26-
if err != nil {
27-
return err
28-
}
29-
// remove any world permissions from the file
30-
err = os.Chmod(name, info.Mode().Perm()&0770)
31-
} else if errors.Is(err, fs.ErrNotExist) {
22+
func recursiveRootPermissions(root string) error {
23+
return filepath.Walk(root, func(path string, info fs.FileInfo, err error) error {
24+
if errors.Is(err, fs.ErrNotExist) {
3225
return nil
3326
}
34-
return err
27+
if err != nil {
28+
return fmt.Errorf("walk on %q failed: %w", path, err)
29+
}
30+
31+
// all files should be owned by root:root
32+
err = os.Chown(path, 0, 0)
33+
if err != nil {
34+
return fmt.Errorf("could not fix ownership of %q: %w", path, err)
35+
}
36+
// remove any world permissions from the file
37+
err = os.Chmod(path, info.Mode().Perm()&0770)
38+
if err != nil {
39+
return fmt.Errorf("could not fix permissions of %q: %w", path, err)
40+
}
41+
42+
return nil
3543
})
3644
}

0 commit comments

Comments
 (0)