Skip to content

Commit 272c8e4

Browse files
authored
Implementing RetryUntil and RetryError fields in Upgrade Details metadata (#3845)
* Implementing RetryUntil and RetryableError fields in Upgrade Details metadata * Add unit test case * Adding CHANGELOG entry * Adding unit test case * Update unit test * (re)tried -> retried * retryable_error_msg/RetryableErrorMsg -> retry_error_msg/RetryErrorMsg * Adding godoc for mockUpgradeDetails
1 parent d80cadd commit 272c8e4

File tree

12 files changed

+505
-219
lines changed

12 files changed

+505
-219
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
# Kind can be one of:
2+
# - breaking-change: a change to previously-documented behavior
3+
# - deprecation: functionality that is being removed in a later release
4+
# - bug-fix: fixes a problem in a previous version
5+
# - enhancement: extends functionality but does not break or fix existing behavior
6+
# - feature: new functionality
7+
# - known-issue: problems that we are aware of in a given version
8+
# - security: impacts on the security of a product or a user’s deployment.
9+
# - upgrade: important information for someone upgrading from a prior version
10+
# - other: does not fit into any of the other categories
11+
kind: feature
12+
13+
# Change summary; a 80ish characters long description of the change.
14+
summary: Add metadata for retryable upgrade steps to upgrade details
15+
16+
# Long description; in case the summary is not enough to describe the change
17+
# this field accommodate a description without length limits.
18+
# NOTE: This field will be rendered only for breaking-change and known-issue kinds at the moment.
19+
#description:
20+
21+
# Affected component; usually one of "elastic-agent", "fleet-server", "filebeat", "metricbeat", "auditbeat", "all", etc.
22+
component: elastic-agent
23+
24+
# PR URL; optional; the PR number that added the changeset.
25+
# If not present is automatically filled by the tooling finding the PR where this changelog fragment has been added.
26+
# NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number.
27+
# Please provide it if you are adding a fragment for a different PR.
28+
pr: https://github.com/elastic/elastic-agent/pull/3845
29+
30+
# Issue URL; optional; the GitHub issue related to this changeset (either closes or is part of).
31+
# If not present is automatically filled by the tooling with the issue linked to the PR number.
32+
issue: https://github.com/elastic/elastic-agent/issues/3818

control_v2.proto

+9-1
Original file line numberDiff line numberDiff line change
@@ -213,7 +213,7 @@ message UpgradeDetails {
213213
message UpgradeDetailsMetadata {
214214
// If the upgrade is a scheduled upgrade, the timestamp of when the
215215
// upgrade is expected to start.
216-
google.protobuf.Timestamp scheduled_at = 1;
216+
string scheduled_at = 1;
217217

218218
// If the upgrade is in the UPG_DOWNLOADING state, the percentage of
219219
// the Elastic Agent artifact that has already been downloaded, to
@@ -225,6 +225,14 @@ message UpgradeDetailsMetadata {
225225

226226
// Any error encountered during the upgrade process.
227227
string error_msg = 4;
228+
229+
// Any error message that is a result of a retryable upgrade
230+
// step, e.g. the download step, being retried.
231+
string retry_error_msg = 5;
232+
233+
// The deadline until when a retryable upgrade step, e.g. the download
234+
// step, will be retried.
235+
string retry_until = 6;
228236
}
229237

230238
// DiagnosticFileResult is a file result from a diagnostic result.

internal/pkg/agent/application/coordinator/diagnostics_test.go

+3-1
Original file line numberDiff line numberDiff line change
@@ -441,6 +441,7 @@ func TestDiagnosticState(t *testing.T) {
441441
DownloadPercent: 0.17469,
442442
ScheduledAt: &now,
443443
DownloadRate: 123.56,
444+
RetryUntil: &now,
444445
},
445446
},
446447
}
@@ -470,7 +471,8 @@ upgrade_details:
470471
download_percent: 0.17469
471472
scheduled_at: %s
472473
download_rate: 123.56
473-
`, now.Format(time.RFC3339Nano))
474+
retry_until: %s
475+
`, now.Format(time.RFC3339Nano), now.Format(time.RFC3339Nano))
474476

475477
coord := &Coordinator{
476478
// This test needs a broadcaster since the components-actual diagnostic

internal/pkg/agent/application/upgrade/details/details.go

+33-1
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,14 @@ type Metadata struct {
3737
// is progressing.
3838
DownloadRate details.DownloadRate `json:"download_rate,omitempty" yaml:"download_rate,omitempty"`
3939

40+
// RetryErrorMsg is any error message that is a result of a retryable upgrade
41+
// step, e.g. the download step, being retried.
42+
RetryErrorMsg string `json:"retry_error_msg,omitempty" yaml:"retry_error_msg,omitempty"`
43+
44+
// RetryUntil is the deadline until when a retryable upgrade step, e.g. the download
45+
// step, will be retried.
46+
RetryUntil *time.Time `json:"retry_until,omitempty" yaml:"retry_until"`
47+
4048
// FailedState is the state an upgrade was in if/when it failed. Use the
4149
// Fail() method of UpgradeDetails to correctly record details when
4250
// an upgrade fails.
@@ -89,6 +97,28 @@ func (d *Details) SetDownloadProgress(percent, rateBytesPerSecond float64) {
8997
d.notifyObservers()
9098
}
9199

100+
// SetRetryableError sets the RetryErrorMsg metadata field.
101+
func (d *Details) SetRetryableError(retryableError error) {
102+
d.mu.Lock()
103+
defer d.mu.Unlock()
104+
105+
if retryableError == nil {
106+
d.Metadata.RetryErrorMsg = ""
107+
} else {
108+
d.Metadata.RetryErrorMsg = retryableError.Error()
109+
}
110+
d.notifyObservers()
111+
}
112+
113+
// SetRetryUntil sets the RetryUntil metadata field.
114+
func (d *Details) SetRetryUntil(retryUntil *time.Time) {
115+
d.mu.Lock()
116+
defer d.mu.Unlock()
117+
118+
d.Metadata.RetryUntil = retryUntil
119+
d.notifyObservers()
120+
}
121+
92122
// Fail is a convenience method to set the state of the upgrade
93123
// to StateFailed, set metadata associated with the failure, and
94124
// notify all observers.
@@ -163,7 +193,9 @@ func (m Metadata) Equals(otherM Metadata) bool {
163193
m.FailedState == otherM.FailedState &&
164194
m.ErrorMsg == otherM.ErrorMsg &&
165195
m.DownloadPercent == otherM.DownloadPercent &&
166-
m.DownloadRate == otherM.DownloadRate
196+
m.DownloadRate == otherM.DownloadRate &&
197+
equalTimePointers(m.RetryUntil, otherM.RetryUntil) &&
198+
m.RetryErrorMsg == otherM.RetryErrorMsg
167199
}
168200

169201
func equalTimePointers(t, otherT *time.Time) bool {

internal/pkg/agent/application/upgrade/details/details_test.go

+7
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ import (
99
"errors"
1010
"math"
1111
"testing"
12+
"time"
1213

1314
"github.com/stretchr/testify/require"
1415
)
@@ -102,10 +103,16 @@ func TestDetailsDownloadRateJSON(t *testing.T) {
102103
func TestEquals(t *testing.T) {
103104
details1 := NewDetails("8.12.0", StateDownloading, "foobar")
104105
details1.SetDownloadProgress(0.1234, 34.56)
106+
details1.SetRetryableError(errors.New("retryable error"))
107+
retryUntil1 := time.Date(2023, 11, 29, 11, 00, 32, 0, time.UTC)
108+
details1.SetRetryUntil(&retryUntil1)
105109
details1.Fail(errors.New("download failed"))
106110

107111
details2 := NewDetails("8.12.0", StateDownloading, "foobar")
108112
details2.SetDownloadProgress(0.1234, 34.56)
113+
details2.SetRetryableError(errors.New("retryable error"))
114+
retryUntil2 := time.Date(2023, 11, 29, 11, 00, 32, 0, time.UTC)
115+
details2.SetRetryUntil(&retryUntil2)
109116
details2.Fail(errors.New("download failed"))
110117

111118
details3 := NewDetails("8.12.0", StateDownloading, "foobar")

internal/pkg/agent/application/upgrade/step_download.go

+9-1
Original file line numberDiff line numberDiff line change
@@ -235,9 +235,12 @@ func (u *Upgrader) downloadWithRetries(
235235
settings *artifact.Config,
236236
upgradeDetails *details.Details,
237237
) (string, error) {
238-
cancelCtx, cancel := context.WithTimeout(ctx, settings.Timeout)
238+
cancelDeadline := time.Now().Add(settings.Timeout)
239+
cancelCtx, cancel := context.WithDeadline(ctx, cancelDeadline)
239240
defer cancel()
240241

242+
upgradeDetails.SetRetryUntil(&cancelDeadline)
243+
241244
expBo := backoff.NewExponentialBackOff()
242245
expBo.InitialInterval = settings.RetrySleepInitDuration
243246
boCtx := backoff.WithContext(expBo, cancelCtx)
@@ -259,11 +262,16 @@ func (u *Upgrader) downloadWithRetries(
259262
opFailureNotificationFn := func(err error, retryAfter time.Duration) {
260263
u.log.Warnf("download attempt %d failed: %s; retrying in %s.",
261264
attempt, err.Error(), retryAfter)
265+
upgradeDetails.SetRetryableError(err)
262266
}
263267

264268
if err := backoff.RetryNotify(opFn, boCtx, opFailureNotificationFn); err != nil {
265269
return "", err
266270
}
267271

272+
// Clear retry details upon success
273+
upgradeDetails.SetRetryableError(nil)
274+
upgradeDetails.SetRetryUntil(nil)
275+
268276
return path, nil
269277
}

internal/pkg/agent/application/upgrade/step_download_test.go

+95-4
Original file line numberDiff line numberDiff line change
@@ -94,14 +94,27 @@ func TestDownloadWithRetries(t *testing.T) {
9494

9595
parsedVersion, err := agtversion.ParseVersion("8.9.0")
9696
require.NoError(t, err)
97-
upgradeDetails := details.NewDetails(parsedVersion.String(), details.StateRequested, "")
97+
98+
upgradeDetails, upgradeDetailsRetryUntil, upgradeDetailsRetryUntilWasUnset, upgradeDetailsRetryErrorMsg := mockUpgradeDetails(parsedVersion)
99+
minRetryDeadline := time.Now().Add(settings.Timeout)
100+
98101
path, err := u.downloadWithRetries(context.Background(), mockDownloaderCtor, parsedVersion, &settings, upgradeDetails)
99102
require.NoError(t, err)
100103
require.Equal(t, expectedDownloadPath, path)
101104

102105
logs := obs.TakeAll()
103106
require.Len(t, logs, 1)
104107
require.Equal(t, "download attempt 1", logs[0].Message)
108+
109+
// Check that upgradeDetails.Metadata.RetryUntil was set at some point
110+
// during the retryable download and then check that it was unset upon
111+
// successful download.
112+
require.GreaterOrEqual(t, *upgradeDetailsRetryUntil, minRetryDeadline)
113+
require.True(t, *upgradeDetailsRetryUntilWasUnset)
114+
require.Nil(t, upgradeDetails.Metadata.RetryUntil)
115+
116+
// Check that upgradeDetails.Metadata.RetryErrorMsg was never set.
117+
require.Empty(t, *upgradeDetailsRetryErrorMsg)
105118
})
106119

107120
// Downloader constructor failing on first attempt, but succeeding on second attempt (= first retry)
@@ -131,7 +144,10 @@ func TestDownloadWithRetries(t *testing.T) {
131144

132145
parsedVersion, err := agtversion.ParseVersion("8.9.0")
133146
require.NoError(t, err)
134-
upgradeDetails := details.NewDetails(parsedVersion.String(), details.StateRequested, "")
147+
148+
upgradeDetails, upgradeDetailsRetryUntil, upgradeDetailsRetryUntilWasUnset, upgradeDetailsRetryErrorMsg := mockUpgradeDetails(parsedVersion)
149+
minRetryDeadline := time.Now().Add(settings.Timeout)
150+
135151
path, err := u.downloadWithRetries(context.Background(), mockDownloaderCtor, parsedVersion, &settings, upgradeDetails)
136152
require.NoError(t, err)
137153
require.Equal(t, expectedDownloadPath, path)
@@ -141,6 +157,19 @@ func TestDownloadWithRetries(t *testing.T) {
141157
require.Equal(t, "download attempt 1", logs[0].Message)
142158
require.Contains(t, logs[1].Message, "unable to create fetcher: failed to construct downloader")
143159
require.Equal(t, "download attempt 2", logs[2].Message)
160+
161+
// Check that upgradeDetails.Metadata.RetryUntil was set at some point
162+
// during the retryable download and then check that it was unset upon
163+
// successful download.
164+
require.GreaterOrEqual(t, *upgradeDetailsRetryUntil, minRetryDeadline)
165+
require.True(t, *upgradeDetailsRetryUntilWasUnset)
166+
require.Nil(t, upgradeDetails.Metadata.RetryUntil)
167+
168+
// Check that upgradeDetails.Metadata.RetryErrorMsg was set at some point
169+
// during the retryable download and then check that it was unset upon
170+
// successful download.
171+
require.NotEmpty(t, *upgradeDetailsRetryErrorMsg)
172+
require.Empty(t, upgradeDetails.Metadata.RetryErrorMsg)
144173
})
145174

146175
// Download failing on first attempt, but succeeding on second attempt (= first retry)
@@ -170,7 +199,10 @@ func TestDownloadWithRetries(t *testing.T) {
170199

171200
parsedVersion, err := agtversion.ParseVersion("8.9.0")
172201
require.NoError(t, err)
173-
upgradeDetails := details.NewDetails(parsedVersion.String(), details.StateRequested, "")
202+
203+
upgradeDetails, upgradeDetailsRetryUntil, upgradeDetailsRetryUntilWasUnset, upgradeDetailsRetryErrorMsg := mockUpgradeDetails(parsedVersion)
204+
minRetryDeadline := time.Now().Add(settings.Timeout)
205+
174206
path, err := u.downloadWithRetries(context.Background(), mockDownloaderCtor, parsedVersion, &settings, upgradeDetails)
175207
require.NoError(t, err)
176208
require.Equal(t, expectedDownloadPath, path)
@@ -180,6 +212,19 @@ func TestDownloadWithRetries(t *testing.T) {
180212
require.Equal(t, "download attempt 1", logs[0].Message)
181213
require.Contains(t, logs[1].Message, "unable to download package: download failed; retrying")
182214
require.Equal(t, "download attempt 2", logs[2].Message)
215+
216+
// Check that upgradeDetails.Metadata.RetryUntil was set at some point
217+
// during the retryable download and then check that it was unset upon
218+
// successful download.
219+
require.GreaterOrEqual(t, *upgradeDetailsRetryUntil, minRetryDeadline)
220+
require.True(t, *upgradeDetailsRetryUntilWasUnset)
221+
require.Nil(t, upgradeDetails.Metadata.RetryUntil)
222+
223+
// Check that upgradeDetails.Metadata.RetryErrorMsg was set at some point
224+
// during the retryable download and then check that it was unset upon
225+
// successful download.
226+
require.NotEmpty(t, *upgradeDetailsRetryErrorMsg)
227+
require.Empty(t, upgradeDetails.Metadata.RetryErrorMsg)
183228
})
184229

185230
// Download timeout expired (before all retries are exhausted)
@@ -197,7 +242,10 @@ func TestDownloadWithRetries(t *testing.T) {
197242

198243
parsedVersion, err := agtversion.ParseVersion("8.9.0")
199244
require.NoError(t, err)
200-
upgradeDetails := details.NewDetails(parsedVersion.String(), details.StateRequested, "")
245+
246+
upgradeDetails, upgradeDetailsRetryUntil, upgradeDetailsRetryUntilWasUnset, upgradeDetailsRetryErrorMsg := mockUpgradeDetails(parsedVersion)
247+
minRetryDeadline := time.Now().Add(testCaseSettings.Timeout)
248+
201249
path, err := u.downloadWithRetries(context.Background(), mockDownloaderCtor, parsedVersion, &testCaseSettings, upgradeDetails)
202250
require.Equal(t, "context deadline exceeded", err.Error())
203251
require.Equal(t, "", path)
@@ -209,5 +257,48 @@ func TestDownloadWithRetries(t *testing.T) {
209257
require.Equal(t, fmt.Sprintf("download attempt %d", i+1), logs[(2*i)].Message)
210258
require.Contains(t, logs[(2*i+1)].Message, "unable to download package: download failed; retrying")
211259
}
260+
261+
// Check that upgradeDetails.Metadata.RetryUntil was set at some point
262+
// during the retryable download and then check that it was never unset,
263+
// since we didn't have a successful download.
264+
require.GreaterOrEqual(t, *upgradeDetailsRetryUntil, minRetryDeadline)
265+
require.False(t, *upgradeDetailsRetryUntilWasUnset)
266+
require.Equal(t, *upgradeDetailsRetryUntil, *upgradeDetails.Metadata.RetryUntil)
267+
268+
// Check that upgradeDetails.Metadata.RetryErrorMsg was set at some point
269+
// during the retryable download and then check that it was never unset,
270+
//since we didn't have a successful download.
271+
require.NotEmpty(t, *upgradeDetailsRetryErrorMsg)
272+
require.Equal(t, *upgradeDetailsRetryErrorMsg, upgradeDetails.Metadata.RetryErrorMsg)
212273
})
213274
}
275+
276+
// mockUpgradeDetails returns a *details.Details value that has an observer registered on it for inspecting
277+
// certain properties of the object being set and unset. It also returns:
278+
// - a *time.Time value, which will be not nil if Metadata.RetryUntil is set on the mock value,
279+
// - a *bool value, which will be true if Metadata.RetryUntil is set and then unset on the mock value,
280+
// - a *string value, which will be non-empty if Metadata.RetryErrorMsg is set on the mock value.
281+
func mockUpgradeDetails(parsedVersion *agtversion.ParsedSemVer) (*details.Details, *time.Time, *bool, *string) {
282+
var upgradeDetailsRetryUntil time.Time
283+
var upgradeDetailsRetryUntilWasUnset bool
284+
var upgradeDetailsRetryErrorMsg string
285+
286+
upgradeDetails := details.NewDetails(parsedVersion.String(), details.StateRequested, "")
287+
upgradeDetails.RegisterObserver(func(details *details.Details) {
288+
if details.Metadata.RetryUntil != nil {
289+
upgradeDetailsRetryUntil = *details.Metadata.RetryUntil
290+
}
291+
292+
if !upgradeDetailsRetryUntil.IsZero() && details.Metadata.RetryUntil == nil {
293+
upgradeDetailsRetryUntilWasUnset = true
294+
}
295+
296+
if details.Metadata.RetryErrorMsg != "" {
297+
upgradeDetailsRetryErrorMsg = details.Metadata.RetryErrorMsg
298+
}
299+
})
300+
301+
return upgradeDetails,
302+
&upgradeDetailsRetryUntil, &upgradeDetailsRetryUntilWasUnset,
303+
&upgradeDetailsRetryErrorMsg
304+
}

internal/pkg/agent/cmd/status.go

+19-2
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ import (
1414
"time"
1515

1616
"github.com/elastic/elastic-agent/internal/pkg/agent/application/upgrade/details"
17+
"github.com/elastic/elastic-agent/pkg/control"
1718
"github.com/elastic/elastic-agent/pkg/control/v2/client"
1819
"github.com/elastic/elastic-agent/pkg/control/v2/cproto"
1920

@@ -168,8 +169,8 @@ func listUpgradeDetails(l list.Writer, upgradeDetails *cproto.UpgradeDetails) {
168169
if upgradeDetails.Metadata != nil {
169170
l.AppendItem("metadata")
170171
l.Indent()
171-
if upgradeDetails.Metadata.ScheduledAt != nil && !upgradeDetails.Metadata.ScheduledAt.AsTime().IsZero() {
172-
l.AppendItem("scheduled_at: " + upgradeDetails.Metadata.ScheduledAt.AsTime().UTC().Format(time.RFC3339))
172+
if upgradeDetails.Metadata.ScheduledAt != "" {
173+
l.AppendItem("scheduled_at: " + upgradeDetails.Metadata.ScheduledAt)
173174
}
174175
if upgradeDetails.Metadata.FailedState != "" {
175176
l.AppendItem("failed_state: " + upgradeDetails.Metadata.FailedState)
@@ -180,6 +181,12 @@ func listUpgradeDetails(l list.Writer, upgradeDetails *cproto.UpgradeDetails) {
180181
if upgradeDetails.State == string(details.StateDownloading) {
181182
l.AppendItem(fmt.Sprintf("download_percent: %.2f%%", upgradeDetails.Metadata.DownloadPercent*100))
182183
}
184+
if upgradeDetails.Metadata.RetryUntil != "" {
185+
l.AppendItem("retry_until: " + humanDurationUntil(upgradeDetails.Metadata.RetryUntil, time.Now()))
186+
}
187+
if upgradeDetails.Metadata.RetryErrorMsg != "" {
188+
l.AppendItem("retry_error_msg: " + upgradeDetails.Metadata.RetryErrorMsg)
189+
}
183190
l.UnIndent()
184191
}
185192

@@ -236,3 +243,13 @@ func yamlOutput(w io.Writer, out interface{}) error {
236243
fmt.Fprintf(w, "%s\n", bytes)
237244
return nil
238245
}
246+
247+
func humanDurationUntil(targetTime string, from time.Time) string {
248+
target, err := time.Parse(control.TimeFormat(), targetTime)
249+
if err != nil {
250+
return targetTime
251+
}
252+
253+
until := target.Sub(from)
254+
return until.String()
255+
}

0 commit comments

Comments
 (0)