Skip to content

Commit b272a93

Browse files
authored
Switch to CFT region and add more robust tracking and cleanup of stacks. (#3701)
* Switch to CFT region and add more robust tracking and cleanup of stacks. * Fix tests. * Adjust integration tests. * Fix lint in serverless provider. * Fix serverless. * Fix comment and typo. * Fix serverless. * More serverless fixes. * Fix check loop in serverless. * Fix lint. * Code review fixes. * Fix a few missed error messages.
1 parent 697a1bc commit b272a93

12 files changed

+386
-310
lines changed

.buildkite/hooks/pre-command

+2-3
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,7 @@ DOCKER_REGISTRY="docker.elastic.co"
1717
DOCKER_REGISTRY_SECRET_PATH="kv/ci-shared/platform-ingest/docker_registry_prod"
1818
CI_DRA_ROLE_PATH="kv/ci-shared/release/dra-role"
1919
CI_GCP_OBS_PATH="kv/ci-shared/observability-ingest/cloud/gcp"
20-
# CI_AGENT_QA_OBS_PATH="kv/ci-shared/observability-ingest/elastic-agent-ess-qa"
21-
CI_ESS_STAGING_PATH="kv/ci-shared/platform-ingest/platform-ingest-ec-staging"
20+
CI_ESS_PATH="kv/ci-shared/platform-ingest/platform-ingest-ec-prod"
2221
CI_DRA_ROLE_PATH="kv/ci-shared/release/dra-role"
2322

2423

@@ -55,7 +54,7 @@ if [[ "$BUILDKITE_PIPELINE_SLUG" == "elastic-agent" && "$BUILDKITE_STEP_KEY" ==
5554
export TEST_INTEG_AUTH_GCP_SERVICE_TOKEN_FILE=$(realpath ./gcp.json)
5655

5756
# ESS credentials
58-
export API_KEY_TOKEN=$(vault kv get -field apiKey ${CI_ESS_STAGING_PATH})
57+
export API_KEY_TOKEN=$(vault kv get -field apiKey ${CI_ESS_PATH})
5958
echo ${API_KEY_TOKEN} > ./apiKey
6059
export TEST_INTEG_AUTH_ESS_APIKEY_FILE=$(realpath ./apiKey)
6160
fi

.buildkite/hooks/pre-exit

+5-1
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,11 @@ if [[ "$BUILDKITE_PIPELINE_SLUG" == "elastic-agent" && "$BUILDKITE_STEP_KEY" ==
1010

1111
# Perform cleanup of integration tests resources
1212
echo "--- Cleaning up integration test resources"
13-
TEST_INTEG_AUTH_ESS_REGION=us-east-1 SNAPSHOT=true mage integration:clean
13+
if [[ "$BUILDKITE_STEP_KEY" == "serverless-integration-tests" ]]; then
14+
STACK_PROVISIONER=serverless SNAPSHOT=true mage integration:clean
15+
else
16+
SNAPSHOT=true mage integration:clean
17+
fi
1418
fi
1519

1620
if [ -n "$GOOGLE_APPLICATION_CREDENTIALS" ]; then

.buildkite/pipeline.yml

-4
Original file line numberDiff line numberDiff line change
@@ -201,8 +201,6 @@ steps:
201201

202202
- label: "Serverless integration test"
203203
key: "serverless-integration-tests"
204-
env:
205-
TEST_INTEG_AUTH_ESS_REGION: us-east-1
206204
command: ".buildkite/scripts/steps/integration_tests.sh serverless integration:single TestLogIngestionFleetManaged" #right now, run a single test in serverless mode as a sort of smoke test, instead of re-running the entire suite
207205
artifact_paths:
208206
- "build/TEST-**"
@@ -213,8 +211,6 @@ steps:
213211

214212
- label: "Integration tests"
215213
key: "integration-tests"
216-
env:
217-
TEST_INTEG_AUTH_ESS_REGION: us-east-1
218214
command: ".buildkite/scripts/steps/integration_tests.sh stateful"
219215
artifact_paths:
220216
- "build/TEST-**"

magefile.go

+5-4
Original file line numberDiff line numberDiff line change
@@ -1750,15 +1750,16 @@ func createTestRunner(matrix bool, singleTest string, goTestFlags string, batche
17501750
}
17511751
datacenter := os.Getenv("TEST_INTEG_AUTH_GCP_DATACENTER")
17521752
if datacenter == "" {
1753+
// us-central1-a is used because T2A instances required for ARM64 testing are only
1754+
// available in the central regions
17531755
datacenter = "us-central1-a"
17541756
}
17551757

1756-
// Valid values are gcp-us-central1 (default), azure-eastus2,
1757-
// aws-eu-central-1, us-east-1 (which is an AWS region but the
1758-
// "aws" CSP prefix is not used by ESS for some reason!)
1758+
// Possible to change the region for deployment, default is gcp-us-west2 which is
1759+
// the CFT region.
17591760
essRegion := os.Getenv("TEST_INTEG_AUTH_ESS_REGION")
17601761
if essRegion == "" {
1761-
essRegion = "gcp-us-central1"
1762+
essRegion = "gcp-us-west2"
17621763
}
17631764

17641765
instanceProvisionerMode := os.Getenv("INSTANCE_PROVISIONER")

pkg/testing/ess/config.go

+6-1
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,13 @@ type Config struct {
1717
}
1818

1919
func defaultConfig() *Config {
20+
baseURL := os.Getenv("TEST_INTEG_AUTH_ESS_URL")
21+
if baseURL == "" {
22+
baseURL = "https://cloud.elastic.co"
23+
}
24+
url := strings.TrimRight(baseURL, "/") + "/api/v1"
2025
return &Config{
21-
BaseUrl: `https://staging.found.no/api/v1`,
26+
BaseUrl: url,
2227
}
2328
}
2429

pkg/testing/ess/provisioner.go

+65-84
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,6 @@ import (
1111
"strings"
1212
"time"
1313

14-
"golang.org/x/sync/errgroup"
15-
1614
"github.com/elastic/elastic-agent/pkg/testing/runner"
1715
)
1816

@@ -62,89 +60,77 @@ func (p *provisioner) SetLogger(l runner.Logger) {
6260
p.logger = l
6361
}
6462

65-
func (p *provisioner) Provision(ctx context.Context, requests []runner.StackRequest) ([]runner.Stack, error) {
66-
results := make(map[runner.StackRequest]*CreateDeploymentResponse)
67-
for _, r := range requests {
68-
// allow up to 2 minutes for each create request
69-
createCtx, createCancel := context.WithTimeout(ctx, 2*time.Minute)
70-
resp, err := p.createDeployment(createCtx, r,
71-
map[string]string{
72-
"division": "engineering",
73-
"org": "ingest",
74-
"team": "elastic-agent",
75-
"project": "elastic-agent",
76-
"integration-tests": "true",
77-
})
78-
createCancel()
79-
if err != nil {
80-
return nil, err
81-
}
82-
results[r] = resp
83-
}
63+
// Create creates a stack.
64+
func (p *provisioner) Create(ctx context.Context, request runner.StackRequest) (runner.Stack, error) {
65+
// allow up to 2 minutes for request
66+
createCtx, createCancel := context.WithTimeout(ctx, 2*time.Minute)
67+
defer createCancel()
68+
resp, err := p.createDeployment(createCtx, request,
69+
map[string]string{
70+
"division": "engineering",
71+
"org": "ingest",
72+
"team": "elastic-agent",
73+
"project": "elastic-agent",
74+
"integration-tests": "true",
75+
})
76+
if err != nil {
77+
return runner.Stack{}, err
78+
}
79+
return runner.Stack{
80+
ID: request.ID,
81+
Version: request.Version,
82+
Elasticsearch: resp.ElasticsearchEndpoint,
83+
Kibana: resp.KibanaEndpoint,
84+
Username: resp.Username,
85+
Password: resp.Password,
86+
Internal: map[string]interface{}{
87+
"deployment_id": resp.ID,
88+
},
89+
Ready: false,
90+
}, nil
91+
}
8492

85-
// set a long timeout
86-
// this context travels up to the magefile, clients that want a shorter timeout can set
87-
// it via mage's -t flag
88-
readyCtx, readyCancel := context.WithTimeout(ctx, 25*time.Minute)
89-
defer readyCancel()
90-
91-
g, gCtx := errgroup.WithContext(readyCtx)
92-
for req, resp := range results {
93-
g.Go(func(req runner.StackRequest, resp *CreateDeploymentResponse) func() error {
94-
return func() error {
95-
ready, err := p.client.DeploymentIsReady(gCtx, resp.ID, 30*time.Second)
96-
if err != nil {
97-
return fmt.Errorf("failed to check for cloud %s to be ready: %w", req.Version, err)
98-
}
99-
if !ready {
100-
return fmt.Errorf("cloud %s never became ready: %w", req.Version, err)
101-
}
102-
return nil
103-
}
104-
}(req, resp))
93+
// WaitForReady should block until the stack is ready or the context is cancelled.
94+
func (p *provisioner) WaitForReady(ctx context.Context, stack runner.Stack) (runner.Stack, error) {
95+
deploymentID, err := p.getDeploymentID(stack)
96+
if err != nil {
97+
return stack, fmt.Errorf("failed to get deployment ID from the stack: %w", err)
10598
}
106-
err := g.Wait()
99+
// allow up to 10 minutes for it to become ready
100+
ctx, cancel := context.WithTimeout(ctx, 10*time.Minute)
101+
defer cancel()
102+
p.logger.Logf("Waiting for cloud stack %s to be ready [stack_id: %s, deployment_id: %s]", stack.Version, stack.ID, deploymentID)
103+
ready, err := p.client.DeploymentIsReady(ctx, deploymentID, 30*time.Second)
107104
if err != nil {
108-
return nil, err
105+
return stack, fmt.Errorf("failed to check for cloud %s [stack_id: %s, deployment_id: %s] to be ready: %w", stack.Version, stack.ID, deploymentID, err)
109106
}
110-
111-
var stacks []runner.Stack
112-
for req, resp := range results {
113-
stacks = append(stacks, runner.Stack{
114-
ID: req.ID,
115-
Version: req.Version,
116-
Elasticsearch: resp.ElasticsearchEndpoint,
117-
Kibana: resp.KibanaEndpoint,
118-
Username: resp.Username,
119-
Password: resp.Password,
120-
Internal: map[string]interface{}{
121-
"deployment_id": resp.ID,
122-
},
123-
})
107+
if !ready {
108+
return stack, fmt.Errorf("cloud %s [stack_id: %s, deployment_id: %s] never became ready: %w", stack.Version, stack.ID, deploymentID, err)
124109
}
125-
return stacks, nil
110+
stack.Ready = true
111+
return stack, nil
126112
}
127113

128-
// Clean cleans up all provisioned resources.
129-
func (p *provisioner) Clean(ctx context.Context, stacks []runner.Stack) error {
130-
var errs []error
131-
for _, s := range stacks {
132-
err := p.destroyDeployment(ctx, s)
133-
if err != nil {
134-
errs = append(errs, fmt.Errorf("failed to destroy stack %s (%s): %w", s.Version, s.ID, err))
135-
}
136-
}
137-
if len(errs) > 0 {
138-
return errors.Join(errs...)
114+
// Delete deletes a stack.
115+
func (p *provisioner) Delete(ctx context.Context, stack runner.Stack) error {
116+
deploymentID, err := p.getDeploymentID(stack)
117+
if err != nil {
118+
return err
139119
}
140-
return nil
120+
121+
// allow up to 1 minute for request
122+
ctx, cancel := context.WithTimeout(ctx, 1*time.Minute)
123+
defer cancel()
124+
125+
p.logger.Logf("Destroying cloud stack %s [stack_id: %s, deployment_id: %s]", stack.Version, stack.ID, deploymentID)
126+
return p.client.ShutdownDeployment(ctx, deploymentID)
141127
}
142128

143129
func (p *provisioner) createDeployment(ctx context.Context, r runner.StackRequest, tags map[string]string) (*CreateDeploymentResponse, error) {
144130
ctx, cancel := context.WithTimeout(ctx, 1*time.Minute)
145131
defer cancel()
146132

147-
p.logger.Logf("Creating stack %s (%s)", r.Version, r.ID)
133+
p.logger.Logf("Creating cloud stack %s [stack_id: %s]", r.Version, r.ID)
148134
name := fmt.Sprintf("%s-%s", strings.Replace(p.cfg.Identifier, ".", "-", -1), r.ID)
149135

150136
// prepare tags
@@ -168,26 +154,21 @@ func (p *provisioner) createDeployment(ctx context.Context, r runner.StackReques
168154
p.logger.Logf("Failed to create ESS cloud %s: %s", r.Version, err)
169155
return nil, fmt.Errorf("failed to create ESS cloud for version %s: %w", r.Version, err)
170156
}
171-
p.logger.Logf("Created stack %s (%s) [id: %s]", r.Version, r.ID, resp.ID)
157+
p.logger.Logf("Created cloud stack %s [stack_id: %s, deployment_id: %s]", r.Version, r.ID, resp.ID)
172158
return resp, nil
173159
}
174160

175-
func (p *provisioner) destroyDeployment(ctx context.Context, s runner.Stack) error {
176-
if s.Internal == nil {
177-
return fmt.Errorf("missing internal information")
161+
func (p *provisioner) getDeploymentID(stack runner.Stack) (string, error) {
162+
if stack.Internal == nil {
163+
return "", fmt.Errorf("missing internal information")
178164
}
179-
deploymentIDRaw, ok := s.Internal["deployment_id"]
165+
deploymentIDRaw, ok := stack.Internal["deployment_id"]
180166
if !ok {
181-
return fmt.Errorf("missing internal deployment_id")
167+
return "", fmt.Errorf("missing internal deployment_id")
182168
}
183169
deploymentID, ok := deploymentIDRaw.(string)
184170
if !ok {
185-
return fmt.Errorf("internal deployment_id not a string")
171+
return "", fmt.Errorf("internal deployment_id not a string")
186172
}
187-
188-
ctx, cancel := context.WithTimeout(ctx, 1*time.Minute)
189-
defer cancel()
190-
191-
p.logger.Logf("Destroying stack %s (%s)", s.Version, s.ID)
192-
return p.client.ShutdownDeployment(ctx, deploymentID)
173+
return deploymentID, nil
193174
}

pkg/testing/ess/serverless.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ import (
1717
"github.com/elastic/elastic-agent/pkg/testing/runner"
1818
)
1919

20-
var serverlessURL = "https://staging.found.no"
20+
var serverlessURL = "https://cloud.elastic.co"
2121

2222
// ServerlessClient is the handler for the serverless ES instance
2323
type ServerlessClient struct {

0 commit comments

Comments
 (0)