Skip to content

Commit 91c0a94

Browse files
authored
Fix flakiness in TestManager_NoDeadlock (elastic#4104)
1 parent 9e11210 commit 91c0a94

File tree

1 file changed

+70
-101
lines changed

1 file changed

+70
-101
lines changed

pkg/component/runtime/manager_fake_input_test.go

+70-101
Original file line numberDiff line numberDiff line change
@@ -1307,35 +1307,11 @@ LOOP:
13071307
require.Equal(t, context.Canceled, err, "Run should return with context canceled, got %v", err.Error())
13081308
}
13091309

1310-
func (suite *FakeInputSuite) TestManager_NoDeadlock() {
1311-
t := suite.T()
1312-
// NOTE: This is a long-running test that spams the runtime managers `Update` function to try and
1313-
// trigger a deadlock. This test takes 2 minutes to run trying to re-produce issue:
1314-
// https://github.com/elastic/elastic-agent/issues/2691
1315-
1316-
ctx, cancel := context.WithCancel(context.Background())
1317-
defer cancel()
1318-
1319-
ai, _ := info.NewAgentInfo(ctx, true)
1320-
m, err := NewManager(newDebugLogger(t), newDebugLogger(t), "localhost:0", ai, apmtest.DiscardTracer, newTestMonitoringMgr(), configuration.DefaultGRPCConfig())
1321-
require.NoError(t, err)
1322-
errCh := make(chan error)
1323-
go func() {
1324-
err := m.Run(ctx)
1325-
if errors.Is(err, context.Canceled) {
1326-
err = nil
1327-
}
1328-
errCh <- err
1329-
}()
1330-
1331-
waitCtx, waitCancel := context.WithTimeout(ctx, 1*time.Second)
1332-
defer waitCancel()
1333-
if err := waitForReady(waitCtx, m); err != nil {
1334-
require.NoError(t, err)
1335-
}
1336-
1310+
// A component that can be fed to Manager.Update, with an index to allow
1311+
// looping with distinct configurations at each step.
1312+
func noDeadlockTestComponent(t *testing.T, index int) component.Component {
13371313
binaryPath := testBinary(t, "component")
1338-
comp := component.Component{
1314+
return component.Component{
13391315
ID: "fake-default",
13401316
InputSpec: &component.InputRuntimeSpec{
13411317
InputType: "fake",
@@ -1351,99 +1327,92 @@ func (suite *FakeInputSuite) TestManager_NoDeadlock() {
13511327
Config: component.MustExpectedConfig(map[string]interface{}{
13521328
"type": "fake",
13531329
"state": int(client.UnitStateHealthy),
1354-
"message": "Fake Healthy",
1330+
"message": fmt.Sprintf("Fake Healthy %d", index),
13551331
}),
13561332
},
13571333
},
13581334
}
1335+
}
13591336

1360-
updatedCh := make(chan time.Time)
1361-
updatedErr := make(chan error)
1362-
updatedCtx, updatedCancel := context.WithCancel(context.Background())
1363-
defer updatedCancel()
1337+
func (suite *FakeInputSuite) TestManager_NoDeadlock() {
1338+
t := suite.T()
1339+
// NOTE: This is a long-running test that spams the runtime managers `Update` function to try and
1340+
// trigger a deadlock. This test takes 2 minutes to run trying to re-produce issue:
1341+
// https://github.com/elastic/elastic-agent/issues/2691
1342+
1343+
// How long to run the test
1344+
testDuration := 2 * time.Minute
1345+
1346+
// How long without an update before we report it as a deadlock
1347+
maxUpdateInterval := 15 * time.Second
1348+
1349+
// Create the runtime manager
1350+
ai, _ := info.NewAgentInfo(context.Background(), true)
1351+
m, err := NewManager(newDebugLogger(t), newDebugLogger(t), "localhost:0", ai, apmtest.DiscardTracer, newTestMonitoringMgr(), configuration.DefaultGRPCConfig())
1352+
require.NoError(t, err)
1353+
1354+
// Start the runtime manager in a goroutine, passing its termination state
1355+
// to managerResultChan.
1356+
managerCtx, managerCancel := context.WithCancel(context.Background())
1357+
defer managerCancel()
1358+
managerResultChan := make(chan error)
13641359
go func() {
1365-
// spam update on component trying to cause a deadlock
1366-
comp := comp
1367-
i := 0
1368-
for {
1369-
if updatedCtx.Err() != nil {
1370-
return
1371-
}
1372-
updatedComp := comp
1373-
updatedComp.Units = make([]component.Unit, 1)
1374-
updatedComp.Units[0] = component.Unit{
1375-
ID: "fake-input",
1376-
Type: client.UnitTypeInput,
1377-
LogLevel: client.UnitLogLevelError, // test log will get spammed with the constant updates (error to prevent spam)
1378-
Config: component.MustExpectedConfig(map[string]interface{}{
1379-
"type": "fake",
1380-
"state": int(client.UnitStateHealthy),
1381-
"message": fmt.Sprintf("Fake Healthy %d", i),
1382-
}),
1383-
}
1384-
i += 1
1385-
comp = updatedComp
1386-
m.Update(component.Model{Components: []component.Component{updatedComp}})
1387-
err := <-m.errCh
1388-
if err != nil {
1389-
updatedErr <- err
1390-
return
1391-
}
1392-
updatedCh <- time.Now()
1393-
}
1360+
managerResultChan <- m.Run(managerCtx)
13941361
}()
13951362

1396-
deadlockErr := make(chan error)
1363+
// Wait for the manager to become active
1364+
timedWaitForReady(t, m, 1*time.Second)
1365+
1366+
// Start a goroutine to spam the manager update trying to cause
1367+
// a deadlock. When the test context finishes, the update loop
1368+
// closes updateResultChan to signal that it is done.
1369+
updateResultChan := make(chan error)
1370+
updateLoopCtx, updateLoopCancel := context.WithTimeout(context.Background(), testDuration)
1371+
defer updateLoopCancel()
13971372
go func() {
1398-
t := time.NewTimer(15 * time.Second)
1399-
defer t.Stop()
1400-
for {
1401-
select {
1402-
case <-updatedCtx.Done():
1403-
return
1404-
case <-updatedCh:
1405-
// update did occur
1406-
t.Reset(15 * time.Second)
1407-
case <-t.C:
1408-
// timeout hit waiting for another update to work
1409-
deadlockErr <- errors.New("hit deadlock")
1373+
defer close(updateResultChan)
1374+
for i := 0; updateLoopCtx.Err() == nil; i++ {
1375+
comp := noDeadlockTestComponent(t, i)
1376+
m.Update(component.Model{Components: []component.Component{comp}})
1377+
err := <-m.errCh
1378+
updateResultChan <- err
1379+
if err != nil {
1380+
// If the update gave an error, end the test
14101381
return
14111382
}
14121383
}
14131384
}()
14141385

1415-
defer drainErrChan(errCh)
1416-
defer drainErrChan(updatedErr)
1417-
defer drainErrChan(deadlockErr)
1418-
1419-
// wait 2 minutes for a deadlock to occur
1420-
endTimer := time.NewTimer(2 * time.Minute)
1421-
defer endTimer.Stop()
1386+
// The component state is being changed constantly. If updateTimeout
1387+
// triggers without any updates, report it as a deadlock.
1388+
updateTimeout := time.NewTicker(maxUpdateInterval)
1389+
defer updateTimeout.Stop()
14221390
LOOP:
14231391
for {
14241392
select {
1425-
case <-endTimer.C:
1426-
// no deadlock after timeout (all good stop the component)
1427-
updatedCancel()
1428-
m.Update(component.Model{Components: []component.Component{}})
1429-
<-m.errCh // Don't care about the result of Update, just that it runs
1430-
break LOOP
1431-
case err := <-errCh:
1432-
require.NoError(t, err)
1433-
case err := <-updatedErr:
1434-
require.NoError(t, err)
1435-
break LOOP
1436-
case err := <-deadlockErr:
1437-
require.NoError(t, err)
1438-
break LOOP
1393+
case err, ok := <-updateResultChan:
1394+
if ok {
1395+
// update did occur
1396+
require.NoError(t, err)
1397+
updateTimeout.Reset(15 * time.Second)
1398+
} else {
1399+
// Update goroutine is terminating, test is over
1400+
break LOOP
1401+
}
1402+
case <-managerResultChan:
1403+
require.Fail(t, "Runtime manager terminated before test was over")
1404+
case <-updateTimeout.C:
1405+
require.Fail(t, "Timed out waiting for Manager.Update result")
14391406
}
14401407
}
1441-
1442-
updatedCancel()
1443-
cancel()
1444-
1445-
err = <-errCh
1408+
// Finished without a deadlock. Stop the component and shut down the manager.
1409+
m.Update(component.Model{Components: []component.Component{}})
1410+
err = <-m.errCh
14461411
require.NoError(t, err)
1412+
1413+
managerCancel()
1414+
err = <-managerResultChan
1415+
require.Equal(t, err, context.Canceled)
14471416
}
14481417

14491418
func (suite *FakeInputSuite) TestManager_Configure() {

0 commit comments

Comments
 (0)