Skip to content

Commit d8b9f6d

Browse files
committed
Revert "Revert "install fails if enroll fails (elastic#3554)" (elastic#3629)"
This reverts commit 1fd44fe.
1 parent bbac783 commit d8b9f6d

File tree

7 files changed

+156
-55
lines changed

7 files changed

+156
-55
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
# Kind can be one of:
2+
# - breaking-change: a change to previously-documented behavior
3+
# - deprecation: functionality that is being removed in a later release
4+
# - bug-fix: fixes a problem in a previous version
5+
# - enhancement: extends functionality but does not break or fix existing behavior
6+
# - feature: new functionality
7+
# - known-issue: problems that we are aware of in a given version
8+
# - security: impacts on the security of a product or a user’s deployment.
9+
# - upgrade: important information for someone upgrading from a prior version
10+
# - other: does not fit into any of the other categories
11+
kind: bug-fix
12+
13+
# Change summary; a 80ish characters long description of the change.
14+
summary: Surface errors during Agent's enroll process, failing if any happens.
15+
16+
# Long description; in case the summary is not enough to describe the change
17+
# this field accommodate a description without length limits.
18+
# NOTE: This field will be rendered only for breaking-change and known-issue kinds at the moment.
19+
#description:
20+
21+
# Affected component; a word indicating the component this changeset affects.
22+
component: install/enroll
23+
24+
# PR URL; optional; the PR number that added the changeset.
25+
# If not present is automatically filled by the tooling finding the PR where this changelog fragment has been added.
26+
# NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number.
27+
# Please provide it if you are adding a fragment for a different PR.
28+
pr: https://github.com/elastic/elastic-agent/pull/3207
29+
30+
# Issue URL; optional; the GitHub issue related to this changeset (either closes or is part of).
31+
# If not present is automatically filled by the tooling with the issue linked to the PR number.
32+
#issue: https://github.com/owner/repo/1234

dev-tools/mage/godaemon.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ var (
2121
}
2222
)
2323

24-
// BuildGoDaemon builds the go-deamon binary.
24+
// BuildGoDaemon builds the go-daemon binary.
2525
func BuildGoDaemon() error {
2626
if GOOS != "linux" {
2727
return errors.New("go-daemon only builds for linux")

internal/pkg/agent/cmd/enroll.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -351,7 +351,7 @@ func enroll(streams *cli.IOStreams, cmd *cobra.Command) error {
351351
// Error: failed to fix permissions: chown /Library/Elastic/Agent/data/elastic-agent-c13f91/elastic-agent.app: operation not permitted
352352
// This is because we are fixing permissions twice, once during installation and again during the enrollment step.
353353
// When we are enrolling as part of installation on MacOS, skip the second attempt to fix permissions.
354-
var fixPermissions bool = fromInstall
354+
fixPermissions := fromInstall
355355
if runtime.GOOS == "darwin" {
356356
fixPermissions = false
357357
}

internal/pkg/agent/cmd/enroll_cmd.go

+49-18
Original file line numberDiff line numberDiff line change
@@ -189,7 +189,7 @@ func newEnrollCmd(
189189
)
190190
}
191191

192-
// newEnrollCmdWithStore creates an new enrollment and accept a custom store.
192+
// newEnrollCmdWithStore creates a new enrollment and accept a custom store.
193193
func newEnrollCmdWithStore(
194194
log *logger.Logger,
195195
options *enrollCmdOption,
@@ -204,10 +204,11 @@ func newEnrollCmdWithStore(
204204
}, nil
205205
}
206206

207-
// Execute tries to enroll the agent into Fleet.
207+
// Execute enrolls the agent into Fleet.
208208
func (c *enrollCmd) Execute(ctx context.Context, streams *cli.IOStreams) error {
209209
var err error
210210
defer c.stopAgent() // ensure its stopped no matter what
211+
211212
span, ctx := apm.StartSpan(ctx, "enroll", "app.internal")
212213
defer func() {
213214
apm.CaptureError(ctx, err).Send()
@@ -252,7 +253,7 @@ func (c *enrollCmd) Execute(ctx context.Context, streams *cli.IOStreams) error {
252253
// Ensure that the agent does not use a proxy configuration
253254
// when connecting to the local fleet server.
254255
// Note that when running fleet-server the enroll request will be sent to :8220,
255-
// however when the agent is running afterwards requests will be sent to :8221
256+
// however when the agent is running afterward requests will be sent to :8221
256257
c.remoteConfig.Transport.Proxy.Disable = true
257258
}
258259

@@ -273,7 +274,7 @@ func (c *enrollCmd) Execute(ctx context.Context, streams *cli.IOStreams) error {
273274

274275
err = c.enrollWithBackoff(ctx, persistentConfig)
275276
if err != nil {
276-
return errors.New(err, "fail to enroll")
277+
return fmt.Errorf("fail to enroll: %w", err)
277278
}
278279

279280
if c.options.FixPermissions {
@@ -284,17 +285,23 @@ func (c *enrollCmd) Execute(ctx context.Context, streams *cli.IOStreams) error {
284285
}
285286

286287
defer func() {
287-
fmt.Fprintln(streams.Out, "Successfully enrolled the Elastic Agent.")
288+
if err != nil {
289+
fmt.Fprintf(streams.Err, "Something went wrong while enrolling the Elastic Agent: %v\n", err)
290+
} else {
291+
fmt.Fprintln(streams.Out, "Successfully enrolled the Elastic Agent.")
292+
}
288293
}()
289294

290295
if c.agentProc == nil {
291-
if err := c.daemonReload(ctx); err != nil {
292-
c.log.Infow("Elastic Agent might not be running; unable to trigger restart", "error", err)
293-
} else {
294-
c.log.Info("Successfully triggered restart on running Elastic Agent.")
296+
if err = c.daemonReloadWithBackoff(ctx); err != nil {
297+
c.log.Errorf("Elastic Agent might not be running; unable to trigger restart: %v", err)
298+
return fmt.Errorf("could not reload agent daemon, unable to trigger restart: %w", err)
295299
}
300+
301+
c.log.Info("Successfully triggered restart on running Elastic Agent.")
296302
return nil
297303
}
304+
298305
c.log.Info("Elastic Agent has been enrolled; start Elastic Agent")
299306
return nil
300307
}
@@ -460,24 +467,35 @@ func (c *enrollCmd) prepareFleetTLS() error {
460467

461468
func (c *enrollCmd) daemonReloadWithBackoff(ctx context.Context) error {
462469
err := c.daemonReload(ctx)
470+
if err != nil &&
471+
(errors.Is(err, context.DeadlineExceeded) ||
472+
errors.Is(err, context.Canceled)) {
473+
return fmt.Errorf("could not reload daemon: %w", err)
474+
}
463475
if err == nil {
464476
return nil
465477
}
466478

467479
signal := make(chan struct{})
480+
defer close(signal)
468481
backExp := backoff.NewExpBackoff(signal, 10*time.Second, 1*time.Minute)
469482

470-
for i := 5; i >= 0; i-- {
483+
for i := 0; i < 5; i++ {
471484
backExp.Wait()
472485
c.log.Info("Retrying to restart...")
473486
err = c.daemonReload(ctx)
487+
if err != nil &&
488+
(errors.Is(err, context.DeadlineExceeded) ||
489+
errors.Is(err, context.Canceled)) {
490+
return fmt.Errorf("could not reload daemon after %d retries: %w",
491+
i+1, err)
492+
}
474493
if err == nil {
475-
break
494+
return nil
476495
}
477496
}
478497

479-
close(signal)
480-
return err
498+
return fmt.Errorf("could not reload agent's daemon, all retries failed. Last error: %w", err)
481499
}
482500

483501
func (c *enrollCmd) daemonReload(ctx context.Context) (err error) {
@@ -501,8 +519,20 @@ func (c *enrollCmd) enrollWithBackoff(ctx context.Context, persistentConfig map[
501519

502520
c.log.Infof("Starting enrollment to URL: %s", c.client.URI())
503521
err := c.enroll(ctx, persistentConfig)
522+
if err == nil {
523+
return nil
524+
}
525+
526+
const deadline = 10 * time.Minute
527+
const frequency = 60 * time.Second
528+
529+
c.log.Infof("1st enrollment attempt failed, retrying for %s, every %s enrolling to URL: %s",
530+
deadline,
531+
frequency,
532+
c.client.URI())
504533
signal := make(chan struct{})
505-
backExp := backoff.NewExpBackoff(signal, 60*time.Second, 10*time.Minute)
534+
defer close(signal)
535+
backExp := backoff.NewExpBackoff(signal, frequency, deadline)
506536

507537
for {
508538
retry := false
@@ -521,7 +551,6 @@ func (c *enrollCmd) enrollWithBackoff(ctx context.Context, persistentConfig map[
521551
err = c.enroll(ctx, persistentConfig)
522552
}
523553

524-
close(signal)
525554
return err
526555
}
527556

@@ -570,8 +599,10 @@ func (c *enrollCmd) enroll(ctx context.Context, persistentConfig map[string]inte
570599
c.options.FleetServer.ElasticsearchInsecure,
571600
)
572601
if err != nil {
573-
return err
602+
return fmt.Errorf(
603+
"failed creating fleet-server bootstrap config: %w", err)
574604
}
605+
575606
// no longer need bootstrap at this point
576607
serverConfig.Server.Bootstrap = false
577608
fleetConfig.Server = serverConfig.Server
@@ -591,11 +622,11 @@ func (c *enrollCmd) enroll(ctx context.Context, persistentConfig map[string]inte
591622

592623
reader, err := yamlToReader(configToStore)
593624
if err != nil {
594-
return err
625+
return fmt.Errorf("yamlToReader failed: %w", err)
595626
}
596627

597628
if err := safelyStoreAgentInfo(c.configStore, reader); err != nil {
598-
return err
629+
return fmt.Errorf("failed to store agent config: %w", err)
599630
}
600631

601632
// clear action store

internal/pkg/agent/cmd/enroll_cmd_test.go

+50-22
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,11 @@ import (
1717
"os"
1818
"runtime"
1919
"strconv"
20+
"strings"
2021
"testing"
22+
"time"
2123

24+
"github.com/stretchr/testify/assert"
2225
"github.com/stretchr/testify/require"
2326

2427
"github.com/elastic/elastic-agent/internal/pkg/agent/configuration"
@@ -160,14 +163,23 @@ func TestEnroll(t *testing.T) {
160163
require.NoError(t, err)
161164

162165
streams, _, _, _ := cli.NewTestingIOStreams()
163-
err = cmd.Execute(context.Background(), streams)
164-
require.NoError(t, err)
165-
166+
ctx, cancel := context.WithTimeout(context.Background(), 1*time.Minute)
167+
defer cancel()
168+
err = cmd.Execute(ctx, streams)
169+
170+
// There is no agent running, therefore nothing to be restarted.
171+
// However, this will cause the Enroll command to return an error
172+
// which we'll ignore here.
173+
require.ErrorContainsf(t, err,
174+
"could not reload agent daemon, unable to trigger restart",
175+
"enroll command returned an unexpected error")
176+
require.ErrorContainsf(t, err, context.DeadlineExceeded.Error(),
177+
"it should fail only due to %q", context.DeadlineExceeded)
166178
config, err := readConfig(store.Content)
167-
168179
require.NoError(t, err)
169-
require.Equal(t, "my-access-api-key", config.AccessAPIKey)
170-
require.Equal(t, host, config.Client.Host)
180+
181+
assert.Equal(t, "my-access-api-key", config.AccessAPIKey)
182+
assert.Equal(t, host, config.Client.Host)
171183
},
172184
))
173185

@@ -217,16 +229,24 @@ func TestEnroll(t *testing.T) {
217229
require.NoError(t, err)
218230

219231
streams, _, _, _ := cli.NewTestingIOStreams()
220-
err = cmd.Execute(context.Background(), streams)
221-
require.NoError(t, err)
222-
223-
require.True(t, store.Called)
224-
232+
ctx, cancel := context.WithTimeout(context.Background(), 1*time.Minute)
233+
defer cancel()
234+
err = cmd.Execute(ctx, streams)
235+
if err != nil &&
236+
// There is no agent running, therefore nothing to be restarted.
237+
// However, this will cause the Enroll command to return an error
238+
// which we'll ignore here.
239+
!strings.Contains(err.Error(),
240+
"could not reload agent daemon, unable to trigger restart") {
241+
t.Fatalf("enrrol coms returned and unexpected error: %v", err)
242+
}
243+
244+
assert.True(t, store.Called)
225245
config, err := readConfig(store.Content)
226246

227-
require.NoError(t, err)
228-
require.Equal(t, "my-access-api-key", config.AccessAPIKey)
229-
require.Equal(t, host, config.Client.Host)
247+
assert.NoError(t, err)
248+
assert.Equal(t, "my-access-api-key", config.AccessAPIKey)
249+
assert.Equal(t, host, config.Client.Host)
230250
},
231251
))
232252

@@ -276,16 +296,24 @@ func TestEnroll(t *testing.T) {
276296
require.NoError(t, err)
277297

278298
streams, _, _, _ := cli.NewTestingIOStreams()
279-
err = cmd.Execute(context.Background(), streams)
280-
require.NoError(t, err)
281-
282-
require.True(t, store.Called)
283-
299+
ctx, cancel := context.WithTimeout(context.Background(), 1*time.Minute)
300+
defer cancel()
301+
err = cmd.Execute(ctx, streams)
302+
303+
if err != nil &&
304+
// There is no agent running, therefore nothing to be restarted.
305+
// However, this will cause the Enroll command to return an error
306+
// which we'll ignore here.
307+
!strings.Contains(err.Error(),
308+
"could not reload agent daemon, unable to trigger restart") {
309+
t.Fatalf("enrrol coms returned and unexpected error: %v", err)
310+
}
311+
312+
assert.True(t, store.Called)
284313
config, err := readConfig(store.Content)
285-
286314
require.NoError(t, err)
287-
require.Equal(t, "my-access-api-key", config.AccessAPIKey)
288-
require.Equal(t, host, config.Client.Host)
315+
assert.Equal(t, "my-access-api-key", config.AccessAPIKey)
316+
assert.Equal(t, host, config.Client.Host)
289317
},
290318
))
291319

internal/pkg/agent/cmd/install.go

+3-1
Original file line numberDiff line numberDiff line change
@@ -154,7 +154,7 @@ func installCmd(streams *cli.IOStreams, cmd *cobra.Command) error {
154154
return fmt.Errorf("problem reading prompt response")
155155
}
156156
if url == "" {
157-
fmt.Fprintf(streams.Out, "Enrollment cancelled because no URL was provided.\n")
157+
fmt.Fprintln(streams.Out, "Enrollment cancelled because no URL was provided.")
158158
return nil
159159
}
160160
}
@@ -224,6 +224,8 @@ func installCmd(streams *cli.IOStreams, cmd *cobra.Command) error {
224224
}
225225
}()
226226
}
227+
228+
fmt.Fprintln(streams.Out, "Elastic Agent successfully installed, starting enrollment.")
227229
}
228230

229231
if enroll {

internal/pkg/agent/install/perms_unix.go

+20-12
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ package install
88

99
import (
1010
"errors"
11+
"fmt"
1112
"io/fs"
1213
"os"
1314
"path/filepath"
@@ -18,19 +19,26 @@ func fixPermissions(topPath string) error {
1819
return recursiveRootPermissions(topPath)
1920
}
2021

21-
func recursiveRootPermissions(path string) error {
22-
return filepath.Walk(path, func(name string, info fs.FileInfo, err error) error {
23-
if err == nil {
24-
// all files should be owned by root:root
25-
err = os.Chown(name, 0, 0)
26-
if err != nil {
27-
return err
28-
}
29-
// remove any world permissions from the file
30-
err = os.Chmod(name, info.Mode().Perm()&0770)
31-
} else if errors.Is(err, fs.ErrNotExist) {
22+
func recursiveRootPermissions(root string) error {
23+
return filepath.Walk(root, func(path string, info fs.FileInfo, err error) error {
24+
if errors.Is(err, fs.ErrNotExist) {
3225
return nil
3326
}
34-
return err
27+
if err != nil {
28+
return fmt.Errorf("walk on %q failed: %w", path, err)
29+
}
30+
31+
// all files should be owned by root:root
32+
err = os.Chown(path, 0, 0)
33+
if err != nil {
34+
return fmt.Errorf("could not fix ownership of %q: %w", path, err)
35+
}
36+
// remove any world permissions from the file
37+
err = os.Chmod(path, info.Mode().Perm()&0770)
38+
if err != nil {
39+
return fmt.Errorf("could not fix permissions of %q: %w", path, err)
40+
}
41+
42+
return nil
3543
})
3644
}

0 commit comments

Comments
 (0)