Skip to content

Commit 3a2e53d

Browse files
authored
Merge branch 'master' into ci-check-monitoring-from-code
2 parents ea6006f + a83e692 commit 3a2e53d

21 files changed

+304
-122
lines changed

RELEASES.md

+9
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,14 @@
11
# Release Notes
22

3+
## [v1.12.3] (pending)
4+
5+
- Extended the network health check by also alerting if a primary network validator has no nodes connected to it. Runs a configurable time after startup or 10 minutes by default.
6+
7+
### Configs
8+
- How long after startup the aforementioned health check runs can be configured via:
9+
`--network-no-ingress-connections-grace-period`
10+
11+
312
## [v1.12.2](https://github.com/ava-labs/avalanchego/releases/tag/v1.12.2)
413

514
This version is backwards compatible to [v1.12.0](https://github.com/ava-labs/avalanchego/releases/tag/v1.12.0). It is optional, but encouraged.

config/config.go

+8-7
Original file line numberDiff line numberDiff line change
@@ -327,13 +327,14 @@ func getNetworkConfig(
327327
},
328328

329329
HealthConfig: network.HealthConfig{
330-
Enabled: sybilProtectionEnabled,
331-
MaxTimeSinceMsgSent: v.GetDuration(NetworkHealthMaxTimeSinceMsgSentKey),
332-
MaxTimeSinceMsgReceived: v.GetDuration(NetworkHealthMaxTimeSinceMsgReceivedKey),
333-
MaxPortionSendQueueBytesFull: v.GetFloat64(NetworkHealthMaxPortionSendQueueFillKey),
334-
MinConnectedPeers: v.GetUint(NetworkHealthMinPeersKey),
335-
MaxSendFailRate: v.GetFloat64(NetworkHealthMaxSendFailRateKey),
336-
SendFailRateHalflife: halflife,
330+
Enabled: sybilProtectionEnabled,
331+
MaxTimeSinceMsgSent: v.GetDuration(NetworkHealthMaxTimeSinceMsgSentKey),
332+
MaxTimeSinceMsgReceived: v.GetDuration(NetworkHealthMaxTimeSinceMsgReceivedKey),
333+
MaxPortionSendQueueBytesFull: v.GetFloat64(NetworkHealthMaxPortionSendQueueFillKey),
334+
MinConnectedPeers: v.GetUint(NetworkHealthMinPeersKey),
335+
MaxSendFailRate: v.GetFloat64(NetworkHealthMaxSendFailRateKey),
336+
SendFailRateHalflife: halflife,
337+
NoIngressValidatorConnectionGracePeriod: v.GetDuration(NetworkNoIngressValidatorConnectionsGracePeriodKey),
337338
},
338339

339340
ProxyEnabled: v.GetBool(NetworkTCPProxyEnabledKey),

config/flags.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -169,7 +169,7 @@ func addNodeFlags(fs *pflag.FlagSet) {
169169
fs.Duration(NetworkReadHandshakeTimeoutKey, constants.DefaultNetworkReadHandshakeTimeout, "Timeout value for reading handshake messages")
170170
fs.Duration(NetworkPingTimeoutKey, constants.DefaultPingPongTimeout, "Timeout value for Ping-Pong with a peer")
171171
fs.Duration(NetworkPingFrequencyKey, constants.DefaultPingFrequency, "Frequency of pinging other peers")
172-
172+
fs.Duration(NetworkNoIngressValidatorConnectionsGracePeriodKey, constants.DefaultNoIngressValidatorConnectionGracePeriod, "Time after which nodes are expected to be connected to us if we are a primary network validator, otherwise a health check fails")
173173
fs.String(NetworkCompressionTypeKey, constants.DefaultNetworkCompressionType.String(), fmt.Sprintf("Compression type for outbound messages. Must be one of [%s, %s]", compression.TypeZstd, compression.TypeNone))
174174

175175
fs.Duration(NetworkMaxClockDifferenceKey, constants.DefaultNetworkMaxClockDifference, "Max allowed clock difference value between this node and peers")

config/keys.go

+1
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,7 @@ const (
120120
NetworkInboundThrottlerMaxConnsPerSecKey = "network-inbound-connection-throttling-max-conns-per-sec"
121121
NetworkOutboundConnectionThrottlingRpsKey = "network-outbound-connection-throttling-rps"
122122
NetworkOutboundConnectionTimeoutKey = "network-outbound-connection-timeout"
123+
NetworkNoIngressValidatorConnectionsGracePeriodKey = "network-no-ingress-connections-grace-period"
123124
BenchlistFailThresholdKey = "benchlist-fail-threshold"
124125
BenchlistDurationKey = "benchlist-duration"
125126
BenchlistMinFailingDurationKey = "benchlist-min-failing-duration"

go.mod

+1-1
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ go 1.23.6
1212
require (
1313
github.com/DataDog/zstd v1.5.2
1414
github.com/NYTimes/gziphandler v1.1.1
15-
github.com/StephenButtolph/canoto v0.10.0
15+
github.com/StephenButtolph/canoto v0.13.3
1616
github.com/antithesishq/antithesis-sdk-go v0.3.8
1717
github.com/ava-labs/coreth v0.14.1-rc.2
1818
github.com/ava-labs/ledger-avalanche/go v0.0.0-20241009183145-e6f90a8a1a60

go.sum

+2-2
Original file line numberDiff line numberDiff line change
@@ -53,8 +53,8 @@ github.com/Joker/hpp v1.0.0/go.mod h1:8x5n+M1Hp5hC0g8okX3sR3vFQwynaX/UgSOM9MeBKz
5353
github.com/NYTimes/gziphandler v1.1.1 h1:ZUDjpQae29j0ryrS0u/B8HZfJBtBQHjqw2rQ2cqUQ3I=
5454
github.com/NYTimes/gziphandler v1.1.1/go.mod h1:n/CVRwUEOgIxrgPvAQhUUr9oeUtvrhMomdKFjzJNB0c=
5555
github.com/Shopify/goreferrer v0.0.0-20181106222321-ec9c9a553398/go.mod h1:a1uqRtAwp2Xwc6WNPJEufxJ7fx3npB4UV/JOLmbu5I0=
56-
github.com/StephenButtolph/canoto v0.10.0 h1:KdW85TYQXH+gwR8vOxfOUf28TRpkLU+X06Kycg1IR7s=
57-
github.com/StephenButtolph/canoto v0.10.0/go.mod h1:MxppdgKRApRBvIg4ZgO2e14m/NSBjFMuydy97OB/gYY=
56+
github.com/StephenButtolph/canoto v0.13.3 h1:tZ0BWaKK6MQr+s45Nvp9YUVa4U5aozNIKSXzQzubXA0=
57+
github.com/StephenButtolph/canoto v0.13.3/go.mod h1:IcnAHC6nJUfQFVR9y60ko2ecUqqHHSB6UwI9NnBFZnE=
5858
github.com/VictoriaMetrics/fastcache v1.12.1 h1:i0mICQuojGDL3KblA7wUNlY5lOK6a4bwt3uRKnkZU40=
5959
github.com/VictoriaMetrics/fastcache v1.12.1/go.mod h1:tX04vaqcNoQeGLD+ra5pU5sWkuxnzWhEzLwhP9w653o=
6060
github.com/aead/siphash v1.0.1/go.mod h1:Nywa3cDsYNNK3gaciGTWPwHt0wlpNV15vwmswBAUSII=

main/main.go

-3
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,6 @@ import (
1515
"github.com/ava-labs/avalanchego/app"
1616
"github.com/ava-labs/avalanchego/config"
1717
"github.com/ava-labs/avalanchego/version"
18-
"github.com/ava-labs/avalanchego/vms/platformvm/block/executor"
1918
)
2019

2120
func main() {
@@ -60,8 +59,6 @@ func main() {
6059

6160
if term.IsTerminal(int(os.Stdout.Fd())) {
6261
fmt.Println(app.Header)
63-
} else {
64-
executor.EtnaActivationWasLogged.Set(true)
6562
}
6663

6764
nodeApp, err := app.New(nodeConfig)

network/config.go

+4
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,10 @@ type HealthConfig struct {
2626
// Marks if the health check should be enabled
2727
Enabled bool `json:"-"`
2828

29+
// NoIngressValidatorConnectionGracePeriod denotes the time after which the health check fails
30+
// for primary network validators with no ingress connections.
31+
NoIngressValidatorConnectionGracePeriod time.Duration
32+
2933
// MinConnectedPeers is the minimum number of peers that the network should
3034
// be connected to be considered healthy.
3135
MinConnectedPeers uint `json:"minConnectedPeers"`

network/network.go

+34-12
Original file line numberDiff line numberDiff line change
@@ -42,10 +42,11 @@ import (
4242
)
4343

4444
const (
45-
ConnectedPeersKey = "connectedPeers"
46-
TimeSinceLastMsgReceivedKey = "timeSinceLastMsgReceived"
47-
TimeSinceLastMsgSentKey = "timeSinceLastMsgSent"
48-
SendFailRateKey = "sendFailRate"
45+
PrimaryNetworkValidatorHealthKey = "primary network validator health"
46+
ConnectedPeersKey = "connectedPeers"
47+
TimeSinceLastMsgReceivedKey = "timeSinceLastMsgReceived"
48+
TimeSinceLastMsgSentKey = "timeSinceLastMsgSent"
49+
SendFailRateKey = "sendFailRate"
4950
)
5051

5152
var (
@@ -153,6 +154,8 @@ type network struct {
153154
connectedPeers peer.Set
154155
closing bool
155156

157+
startupTime time.Time
158+
156159
// router is notified about all peer [Connected] and [Disconnected] events
157160
// as well as all non-handshake peer messages.
158161
//
@@ -257,11 +260,10 @@ func NewNetwork(
257260
}
258261

259262
peerConfig := &peer.Config{
260-
ReadBufferSize: config.PeerReadBufferSize,
261-
WriteBufferSize: config.PeerWriteBufferSize,
262-
Metrics: peerMetrics,
263-
MessageCreator: msgCreator,
264-
263+
ReadBufferSize: config.PeerReadBufferSize,
264+
WriteBufferSize: config.PeerWriteBufferSize,
265+
Metrics: peerMetrics,
266+
MessageCreator: msgCreator,
265267
Log: log,
266268
InboundMsgThrottler: inboundMsgThrottler,
267269
Network: nil, // This is set below.
@@ -284,6 +286,7 @@ func NewNetwork(
284286

285287
onCloseCtx, cancel := context.WithCancel(context.Background())
286288
n := &network{
289+
startupTime: time.Now(),
287290
config: config,
288291
peerConfig: peerConfig,
289292
metrics: metrics,
@@ -401,6 +404,15 @@ func (n *network) HealthCheck(context.Context) (interface{}, error) {
401404
details[SendFailRateKey] = sendFailRate
402405
n.metrics.sendFailRate.Set(sendFailRate)
403406

407+
reachablePrimaryNetworkValidator := true
408+
// If we're a primary network validator, make sure we have ingress connections
409+
if time.Since(n.startupTime) > n.config.NoIngressValidatorConnectionGracePeriod {
410+
connectedPrimaryValidatorInfo, isConnectedPrimaryValidatorErr := checkNoIngressConnections(n.config.MyNodeID, n, n.config.Validators)
411+
reachablePrimaryNetworkValidator = isConnectedPrimaryValidatorErr == nil
412+
details[PrimaryNetworkValidatorHealthKey] = connectedPrimaryValidatorInfo
413+
}
414+
healthy = healthy && reachablePrimaryNetworkValidator
415+
404416
// emit metrics about the lifetime of peer connections
405417
n.metrics.updatePeerConnectionLifetimeMetrics()
406418

@@ -427,9 +439,18 @@ func (n *network) HealthCheck(context.Context) (interface{}, error) {
427439
if !isMsgFailRate {
428440
errorReasons = append(errorReasons, fmt.Sprintf("messages failure send rate %g > %g", sendFailRate, n.config.HealthConfig.MaxSendFailRate))
429441
}
442+
443+
if !reachablePrimaryNetworkValidator {
444+
errorReasons = append(errorReasons, ErrNoIngressConnections.Error())
445+
}
446+
430447
return details, fmt.Errorf("network layer is unhealthy reason: %s", strings.Join(errorReasons, ", "))
431448
}
432449

450+
func (n *network) IngressConnCount() int {
451+
return int(n.peerConfig.IngressConnectionCount.Load())
452+
}
453+
433454
// Connected is called after the peer finishes the handshake.
434455
// Will not be called after [Disconnected] is called with this peer.
435456
func (n *network) Connected(nodeID ids.NodeID) {
@@ -630,7 +651,7 @@ func (n *network) Dispatch() error {
630651
zap.Stringer("peerIP", ip),
631652
)
632653

633-
if err := n.upgrade(conn, n.serverUpgrader); err != nil {
654+
if err := n.upgrade(conn, n.serverUpgrader, true); err != nil {
634655
n.peerConfig.Log.Verbo("failed to upgrade connection",
635656
zap.String("direction", "inbound"),
636657
zap.Error(err),
@@ -977,7 +998,7 @@ func (n *network) dial(nodeID ids.NodeID, ip *trackedIP) {
977998
zap.Stringer("peerIP", ip.ip),
978999
)
9791000

980-
err = n.upgrade(conn, n.clientUpgrader)
1001+
err = n.upgrade(conn, n.clientUpgrader, false)
9811002
if err != nil {
9821003
n.peerConfig.Log.Verbo(
9831004
"failed to upgrade, attempting again",
@@ -1000,7 +1021,7 @@ func (n *network) dial(nodeID ids.NodeID, ip *trackedIP) {
10001021
// If the connection is desired by the node, then the resulting upgraded
10011022
// connection will be used to create a new peer. Otherwise the connection will
10021023
// be immediately closed.
1003-
func (n *network) upgrade(conn net.Conn, upgrader peer.Upgrader) error {
1024+
func (n *network) upgrade(conn net.Conn, upgrader peer.Upgrader, isIngress bool) error {
10041025
upgradeTimeout := n.peerConfig.Clock.Time().Add(n.config.ReadHandshakeTimeout)
10051026
if err := conn.SetReadDeadline(upgradeTimeout); err != nil {
10061027
_ = conn.Close()
@@ -1100,6 +1121,7 @@ func (n *network) upgrade(conn net.Conn, upgrader peer.Upgrader) error {
11001121
n.peerConfig.Log,
11011122
n.outboundMsgThrottler,
11021123
),
1124+
isIngress,
11031125
)
11041126
n.connectingPeers.Add(peer)
11051127
n.peersLock.Unlock()

network/network_test.go

+49
Original file line numberDiff line numberDiff line change
@@ -282,6 +282,11 @@ func newFullyConnectedTestNetwork(t *testing.T, handlers []router.InboundHandler
282282
if i != 0 {
283283
config := configs[0]
284284
net.ManuallyTrack(config.MyNodeID, config.MyIPPort.Get())
285+
// Wait until the node is connected to the first node.
286+
// This forces nodes to connect to each other in a deterministic order.
287+
require.Eventually(func() bool {
288+
return len(net.PeerInfo([]ids.NodeID{config.MyNodeID})) > 0
289+
}, 10*time.Second, time.Millisecond)
285290
}
286291

287292
go func(net Network) {
@@ -306,6 +311,50 @@ func TestNewNetwork(t *testing.T) {
306311
wg.Wait()
307312
}
308313

314+
func TestIngressConnCount(t *testing.T) {
315+
require := require.New(t)
316+
317+
emptyHandler := func(context.Context, message.InboundMessage) {}
318+
319+
_, networks, wg := newFullyConnectedTestNetwork(
320+
t, []router.InboundHandler{
321+
router.InboundHandlerFunc(emptyHandler),
322+
router.InboundHandlerFunc(emptyHandler),
323+
router.InboundHandlerFunc(emptyHandler),
324+
})
325+
326+
wg.Done()
327+
328+
for _, net := range networks {
329+
net.config.NoIngressValidatorConnectionGracePeriod = 0
330+
net.config.HealthConfig.Enabled = true
331+
}
332+
333+
require.Eventually(func() bool {
334+
result := true
335+
for _, net := range networks {
336+
result = result && len(net.PeerInfo(nil)) == len(networks)-1
337+
}
338+
return result
339+
}, time.Minute, time.Millisecond*10)
340+
341+
ingressConnections := make([]int, 0, len(networks))
342+
healthCheckErrors := make([]error, 0, len(networks))
343+
344+
for _, net := range networks {
345+
ingressConnections = append(ingressConnections, net.IngressConnCount())
346+
_, err := net.HealthCheck(context.Background())
347+
healthCheckErrors = append(healthCheckErrors, err)
348+
}
349+
350+
// First node has all nodes connected to it.
351+
// Second node has only the third node connected to it.
352+
// Third node has no node connected to it, as it connects to the first and second node.
353+
require.Equal([]int{2, 1, 0}, ingressConnections)
354+
require.Equal([]error{nil, nil}, healthCheckErrors[:2])
355+
require.ErrorContains(healthCheckErrors[2], ErrNoIngressConnections.Error()) //nolint
356+
}
357+
309358
func TestSend(t *testing.T) {
310359
require := require.New(t)
311360

network/no_ingress_conn_alert.go

+39
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
// Copyright (C) 2019-2024, Ava Labs, Inc. All rights reserved.
2+
// See the file LICENSE for licensing terms.
3+
4+
package network
5+
6+
import (
7+
"errors"
8+
9+
"github.com/ava-labs/avalanchego/ids"
10+
"github.com/ava-labs/avalanchego/snow/validators"
11+
"github.com/ava-labs/avalanchego/utils/constants"
12+
)
13+
14+
// ErrNoIngressConnections denotes that no node is connected to this validator.
15+
var ErrNoIngressConnections = errors.New("primary network validator has no inbound connections")
16+
17+
type ingressConnectionCounter interface {
18+
IngressConnCount() int
19+
}
20+
21+
type validatorRetriever interface {
22+
GetValidator(subnetID ids.ID, nodeID ids.NodeID) (*validators.Validator, bool)
23+
}
24+
25+
func checkNoIngressConnections(selfID ids.NodeID, ingressConnections ingressConnectionCounter, validators validatorRetriever) (interface{}, error) {
26+
connCount := ingressConnections.IngressConnCount()
27+
_, areWeValidator := validators.GetValidator(constants.PrimaryNetworkID, selfID)
28+
29+
result := map[string]interface{}{
30+
"ingressConnectionCount": connCount,
31+
"primaryNetworkValidator": areWeValidator,
32+
}
33+
34+
if connCount > 0 || !areWeValidator {
35+
return result, nil
36+
}
37+
38+
return result, ErrNoIngressConnections
39+
}

network/no_ingress_conn_alert_test.go

+65
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
// Copyright (C) 2019-2024, Ava Labs, Inc. All rights reserved.
2+
// See the file LICENSE for licensing terms.
3+
4+
package network
5+
6+
import (
7+
"testing"
8+
9+
"github.com/stretchr/testify/require"
10+
11+
"github.com/ava-labs/avalanchego/ids"
12+
"github.com/ava-labs/avalanchego/snow/validators"
13+
)
14+
15+
type fakeValidatorRetriever struct {
16+
result bool
17+
}
18+
19+
func (m *fakeValidatorRetriever) GetValidator(ids.ID, ids.NodeID) (*validators.Validator, bool) {
20+
return nil, m.result
21+
}
22+
23+
type fakeIngressConnectionCounter struct {
24+
res int
25+
}
26+
27+
func (m *fakeIngressConnectionCounter) IngressConnCount() int {
28+
return m.res
29+
}
30+
31+
func TestNoIngressConnAlertHealthCheck(t *testing.T) {
32+
for _, testCase := range []struct {
33+
name string
34+
getValidatorResult bool
35+
ingressConnCountResult int
36+
expectedErr error
37+
expectedResult interface{}
38+
}{
39+
{
40+
name: "not a validator of a primary network",
41+
expectedResult: map[string]interface{}{"ingressConnectionCount": 0, "primaryNetworkValidator": false},
42+
},
43+
{
44+
name: "a validator of the primary network",
45+
getValidatorResult: true,
46+
expectedResult: map[string]interface{}{
47+
"ingressConnectionCount": 0, "primaryNetworkValidator": true,
48+
},
49+
expectedErr: ErrNoIngressConnections,
50+
},
51+
{
52+
name: "a validator with ingress connections",
53+
expectedResult: map[string]interface{}{"ingressConnectionCount": 42, "primaryNetworkValidator": true},
54+
expectedErr: nil,
55+
ingressConnCountResult: 42,
56+
getValidatorResult: true,
57+
},
58+
} {
59+
t.Run(testCase.name, func(t *testing.T) {
60+
result, err := checkNoIngressConnections(ids.EmptyNodeID, &fakeIngressConnectionCounter{res: testCase.ingressConnCountResult}, &fakeValidatorRetriever{result: testCase.getValidatorResult})
61+
require.Equal(t, testCase.expectedErr, err)
62+
require.Equal(t, testCase.expectedResult, result)
63+
})
64+
}
65+
}

0 commit comments

Comments
 (0)