Skip to content

Commit fcb924c

Browse files
committed
[ci] Move monitoring check from github action to code
This avoids requiring that downstream repos be able to produce a tmpnetctl binary.
1 parent 73ee842 commit fcb924c

File tree

6 files changed

+65
-37
lines changed

6 files changed

+65
-37
lines changed

.github/actions/run-monitored-tmpnet-cmd/action.yml

+1 -27
Original file line number | Diff line number | Diff line change
@@ -65,6 +65,7 @@ runs:
6565
run: ${{ inputs.run_env }} nix develop --impure --command bash -x ${{ inputs.run }}
6666
env:
6767
TMPNET_START_COLLECTORS: true
68+
TMPNET_CHECK_MONITORING: true
6869
LOKI_USERNAME: ${{ inputs.loki_username }}
6970
LOKI_PASSWORD: ${{ inputs.loki_password }}
7071
PROMETHEUS_USERNAME: ${{ inputs.prometheus_username }}
@@ -89,30 +90,3 @@ runs:
8990
~/.tmpnet/prometheus/prometheus.log
9091
~/.tmpnet/promtail/promtail.log
9192
if-no-files-found: error
92-
# TODO(marun) Maybe optionally run these checks in an AfterSuite step?
93-
- name: Check that logs were collected
94-
if: always()
95-
shell: bash
96-
run: go run github.com/ava-labs/avalanchego/tests/fixture/tmpnet/cmd check-logs
97-
env:
98-
LOKI_USERNAME: ${{ inputs.loki_username }}
99-
LOKI_PASSWORD: ${{ inputs.loki_password }}
100-
GH_REPO: ${{ inputs.repository_owner }}/${{ inputs.repository_name }}
101-
GH_WORKFLOW: ${{ inputs.workflow }}
102-
GH_RUN_ID: ${{ inputs.run_id }}
103-
GH_RUN_NUMBER: ${{ inputs.run_number }}
104-
GH_RUN_ATTEMPT: ${{ inputs.run_attempt }}
105-
GH_JOB_ID: ${{ inputs.job }}
106-
- name: Check that metrics were collected
107-
if: always()
108-
shell: bash
109-
run: go run github.com/ava-labs/avalanchego/tests/fixture/tmpnet/cmd check-metrics
110-
env:
111-
PROMETHEUS_USERNAME: ${{ inputs.prometheus_username }}
112-
PROMETHEUS_PASSWORD: ${{ inputs.prometheus_password }}
113-
GH_REPO: ${{ inputs.repository_owner }}/${{ inputs.repository_name }}
114-
GH_WORKFLOW: ${{ inputs.workflow }}
115-
GH_RUN_ID: ${{ inputs.run_id }}
116-
GH_RUN_NUMBER: ${{ inputs.run_number }}
117-
GH_RUN_ATTEMPT: ${{ inputs.run_attempt }}
118-
GH_JOB_ID: ${{ inputs.job }}

tests/fixture/e2e/env.go

+20 -4
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,7 @@
44
package e2e
55

66
import (
7+
"context"
78
"encoding/json"
89
"errors"
910
"math/rand"
@@ -79,12 +80,27 @@ func (te *TestEnvironment) Marshal() []byte {
7980
func NewTestEnvironment(tc tests.TestContext, flagVars *FlagVars, desiredNetwork *tmpnet.Network) *TestEnvironment {
8081
require := require.New(tc)
8182

82-
// Start collectors for any command but stop
83-
if flagVars.StartCollectors() && !flagVars.StopNetwork() {
84-
require.NoError(tmpnet.StartCollectors(tc.DefaultContext(), tc.Log()))
83+
var network *tmpnet.Network
84+
85+
// Consider monitoring flags for any command but stop
86+
if !flagVars.StopNetwork() {
87+
if flagVars.StartCollectors() {
88+
require.NoError(tmpnet.StartCollectors(tc.DefaultContext(), tc.Log()))
89+
}
90+
if flagVars.CheckMonitoring() {
91+
// Register cleanup before network start to ensure it runs after the network is stopped (LIFO)
92+
tc.DeferCleanup(func() {
93+
if network == nil {
94+
tc.Log().Warn("unable to check that logs and metrics were collected from an uninitialized network")
95+
return
96+
}
97+
ctx, cancel := context.WithTimeout(context.Background(), DefaultTimeout)
98+
defer cancel()
99+
require.NoError(tmpnet.CheckMonitoring(ctx, tc.Log(), network.UUID))
100+
})
101+
}
85102
}
86103

87-
var network *tmpnet.Network
88104
// Need to load the network if it is being stopped or reused
89105
if flagVars.StopNetwork() || flagVars.ReuseNetwork() {
90106
networkDir := flagVars.NetworkDir()

tests/fixture/e2e/flags.go

+17 -3
Original file line number | Diff line number | Diff line change
@@ -21,6 +21,7 @@ type FlagVars struct {
2121
networkDir string
2222
reuseNetwork bool
2323
startCollectors bool
24+
checkMonitoring bool
2425
startNetwork bool
2526
stopNetwork bool
2627
restartNetwork bool
@@ -77,6 +78,10 @@ func (v *FlagVars) StartCollectors() bool {
7778
return v.startCollectors
7879
}
7980

81+
func (v *FlagVars) CheckMonitoring() bool {
82+
return v.checkMonitoring
83+
}
84+
8085
func (v *FlagVars) NetworkShutdownDelay() time.Duration {
8186
if v.startCollectors {
8287
// Only return a non-zero value if we want to ensure the collectors have
@@ -140,7 +145,10 @@ func RegisterFlags() *FlagVars {
140145
false,
141146
"[optional] restart an existing network previously started with --reuse-network. Useful for ensuring a network is running with the current state of binaries on disk. Ignored if a network is not already running or --stop-network is provided.",
142147
)
143-
SetStartCollectorsFlag(&vars.startCollectors)
148+
SetMonitoringFlags(
149+
&vars.startCollectors,
150+
&vars.checkMonitoring,
151+
)
144152
flag.BoolVar(
145153
&vars.startNetwork,
146154
"start-network",
@@ -170,11 +178,17 @@ func RegisterFlags() *FlagVars {
170178
}
171179

172180
// Enable reuse by the upgrade job
173-
func SetStartCollectorsFlag(p *bool) {
181+
func SetMonitoringFlags(startCollectors *bool, checkMonitoring *bool) {
174182
flag.BoolVar(
175-
p,
183+
startCollectors,
176184
"start-collectors",
177185
cast.ToBool(tmpnet.GetEnvWithDefault("TMPNET_START_COLLECTORS", "false")),
178186
"[optional] whether to start collectors of logs and metrics from nodes of the temporary network.",
179187
)
188+
flag.BoolVar(
189+
checkMonitoring,
190+
"check-monitoring",
191+
cast.ToBool(tmpnet.GetEnvWithDefault("TMPNET_CHECK_MONITORING", "false")),
192+
"[optional] whether to check that logs and metrics have been collected from nodes of the temporary network.",
193+
)
180194
}

tests/fixture/tmpnet/check_monitoring.go

+11 -2
Original file line number | Diff line number | Diff line change
@@ -27,6 +27,15 @@ import (
2727

2828
type getCountFunc func() (int, error)
2929

30+
// CheckMonitoring checks if logs and metrics exist for the given network. Github labels
31+
// are also used as filters if provided as env vars (GH_*).
32+
func CheckMonitoring(ctx context.Context, log logging.Logger, networkUUID string) error {
33+
return errors.Join(
34+
CheckLogsExist(ctx, log, networkUUID),
35+
CheckMetricsExist(ctx, log, networkUUID),
36+
)
37+
}
38+
3039
// waitForCount waits until the provided function returns greater than zero.
3140
func waitForCount(ctx context.Context, log logging.Logger, name string, getCount getCountFunc) error {
3241
err := pollUntilContextCancel(
@@ -56,7 +65,7 @@ func waitForCount(ctx context.Context, log logging.Logger, name string, getCount
5665
}
5766

5867
// CheckLogsExist checks if logs exist for the given network. Github labels are also
59-
// included if provided as env vars (GH_*).
68+
// used as filters if provided as env vars (GH_*).
6069
func CheckLogsExist(ctx context.Context, log logging.Logger, networkUUID string) error {
6170
username, password, err := getCollectorCredentials(promtailCmd)
6271
if err != nil {
@@ -163,7 +172,7 @@ func queryLoki(
163172
}
164173

165174
// CheckMetricsExist checks if metrics exist for the given network. Github labels are also
166-
// included if provided as env vars (GH_*).
175+
// used as filters if provided as env vars (GH_*).
167176
func CheckMetricsExist(ctx context.Context, log logging.Logger, networkUUID string) error {
168177
username, password, err := getCollectorCredentials(prometheusCmd)
169178
if err != nil {

tests/fixture/tmpnet/network.go

+1
Original file line number | Diff line number | Diff line change
@@ -121,6 +121,7 @@ type Network struct {
121121

122122
func NewDefaultNetwork(owner string) *Network {
123123
return &Network{
124+
UUID: uuid.NewString(),
124125
Owner: owner,
125126
Nodes: NewNodesOrPanic(DefaultNodeCount),
126127
}

tests/upgrade/upgrade_test.go

+15 -1
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,7 @@
44
package upgrade
55

66
import (
7+
"context"
78
"flag"
89
"fmt"
910
"testing"
@@ -24,6 +25,7 @@ var (
2425
avalancheGoExecPath string
2526
avalancheGoExecPathToUpgradeTo string
2627
startCollectors bool
28+
checkMonitoring bool
2729
)
2830

2931
func init() {
@@ -39,7 +41,10 @@ func init() {
3941
"",
4042
"avalanchego executable path to upgrade to",
4143
)
42-
e2e.SetStartCollectorsFlag(&startCollectors)
44+
e2e.SetMonitoringFlags(
45+
&startCollectors,
46+
&checkMonitoring,
47+
)
4348
}
4449

4550
var _ = ginkgo.Describe("[Upgrade]", func() {
@@ -59,6 +64,15 @@ var _ = ginkgo.Describe("[Upgrade]", func() {
5964
require.NoError(tmpnet.StartCollectors(tc.DefaultContext(), tc.Log()))
6065
shutdownDelay = tmpnet.NetworkShutdownDelay // Ensure a final metrics scrape
6166
}
67+
if checkMonitoring {
68+
// Since cleanups are run in LIFO order, adding this cleanup before
69+
// StartNetwork is called ensures network shutdown will be called first.
70+
tc.DeferCleanup(func() {
71+
ctx, cancel := context.WithTimeout(context.Background(), e2e.DefaultTimeout)
72+
defer cancel()
73+
require.NoError(tmpnet.CheckMonitoring(ctx, tc.Log(), network.UUID))
74+
})
75+
}
6276

6377
e2e.StartNetwork(
6478
tc,

0 commit comments

Comments
 (0)