@@ -1307,35 +1307,11 @@ LOOP:
1307
1307
require .Equal (t , context .Canceled , err , "Run should return with context canceled, got %v" , err .Error ())
1308
1308
}
1309
1309
1310
- func (suite * FakeInputSuite ) TestManager_NoDeadlock () {
1311
- t := suite .T ()
1312
- // NOTE: This is a long-running test that spams the runtime managers `Update` function to try and
1313
- // trigger a deadlock. This test takes 2 minutes to run trying to re-produce issue:
1314
- // https://github.com/elastic/elastic-agent/issues/2691
1315
-
1316
- ctx , cancel := context .WithCancel (context .Background ())
1317
- defer cancel ()
1318
-
1319
- ai , _ := info .NewAgentInfo (ctx , true )
1320
- m , err := NewManager (newDebugLogger (t ), newDebugLogger (t ), "localhost:0" , ai , apmtest .DiscardTracer , newTestMonitoringMgr (), configuration .DefaultGRPCConfig ())
1321
- require .NoError (t , err )
1322
- errCh := make (chan error )
1323
- go func () {
1324
- err := m .Run (ctx )
1325
- if errors .Is (err , context .Canceled ) {
1326
- err = nil
1327
- }
1328
- errCh <- err
1329
- }()
1330
-
1331
- waitCtx , waitCancel := context .WithTimeout (ctx , 1 * time .Second )
1332
- defer waitCancel ()
1333
- if err := waitForReady (waitCtx , m ); err != nil {
1334
- require .NoError (t , err )
1335
- }
1336
-
1310
+ // A component that can be fed to Manager.Update, with an index to allow
1311
+ // looping with distinct configurations at each step.
1312
+ func noDeadlockTestComponent (t * testing.T , index int ) component.Component {
1337
1313
binaryPath := testBinary (t , "component" )
1338
- comp := component.Component {
1314
+ return component.Component {
1339
1315
ID : "fake-default" ,
1340
1316
InputSpec : & component.InputRuntimeSpec {
1341
1317
InputType : "fake" ,
@@ -1351,99 +1327,92 @@ func (suite *FakeInputSuite) TestManager_NoDeadlock() {
1351
1327
Config : component .MustExpectedConfig (map [string ]interface {}{
1352
1328
"type" : "fake" ,
1353
1329
"state" : int (client .UnitStateHealthy ),
1354
- "message" : "Fake Healthy" ,
1330
+ "message" : fmt . Sprintf ( "Fake Healthy %d" , index ) ,
1355
1331
}),
1356
1332
},
1357
1333
},
1358
1334
}
1335
+ }
1359
1336
1360
- updatedCh := make (chan time.Time )
1361
- updatedErr := make (chan error )
1362
- updatedCtx , updatedCancel := context .WithCancel (context .Background ())
1363
- defer updatedCancel ()
1337
+ func (suite * FakeInputSuite ) TestManager_NoDeadlock () {
1338
+ t := suite .T ()
1339
+ // NOTE: This is a long-running test that spams the runtime managers `Update` function to try and
1340
+ // trigger a deadlock. This test takes 2 minutes to run trying to re-produce issue:
1341
+ // https://github.com/elastic/elastic-agent/issues/2691
1342
+
1343
+ // How long to run the test
1344
+ testDuration := 2 * time .Minute
1345
+
1346
+ // How long without an update before we report it as a deadlock
1347
+ maxUpdateInterval := 15 * time .Second
1348
+
1349
+ // Create the runtime manager
1350
+ ai , _ := info .NewAgentInfo (context .Background (), true )
1351
+ m , err := NewManager (newDebugLogger (t ), newDebugLogger (t ), "localhost:0" , ai , apmtest .DiscardTracer , newTestMonitoringMgr (), configuration .DefaultGRPCConfig ())
1352
+ require .NoError (t , err )
1353
+
1354
+ // Start the runtime manager in a goroutine, passing its termination state
1355
+ // to managerResultChan.
1356
+ managerCtx , managerCancel := context .WithCancel (context .Background ())
1357
+ defer managerCancel ()
1358
+ managerResultChan := make (chan error )
1364
1359
go func () {
1365
- // spam update on component trying to cause a deadlock
1366
- comp := comp
1367
- i := 0
1368
- for {
1369
- if updatedCtx .Err () != nil {
1370
- return
1371
- }
1372
- updatedComp := comp
1373
- updatedComp .Units = make ([]component.Unit , 1 )
1374
- updatedComp .Units [0 ] = component.Unit {
1375
- ID : "fake-input" ,
1376
- Type : client .UnitTypeInput ,
1377
- LogLevel : client .UnitLogLevelError , // test log will get spammed with the constant updates (error to prevent spam)
1378
- Config : component .MustExpectedConfig (map [string ]interface {}{
1379
- "type" : "fake" ,
1380
- "state" : int (client .UnitStateHealthy ),
1381
- "message" : fmt .Sprintf ("Fake Healthy %d" , i ),
1382
- }),
1383
- }
1384
- i += 1
1385
- comp = updatedComp
1386
- m .Update (component.Model {Components : []component.Component {updatedComp }})
1387
- err := <- m .errCh
1388
- if err != nil {
1389
- updatedErr <- err
1390
- return
1391
- }
1392
- updatedCh <- time .Now ()
1393
- }
1360
+ managerResultChan <- m .Run (managerCtx )
1394
1361
}()
1395
1362
1396
- deadlockErr := make (chan error )
1363
+ // Wait for the manager to become active
1364
+ timedWaitForReady (t , m , 1 * time .Second )
1365
+
1366
+ // Start a goroutine to spam the manager update trying to cause
1367
+ // a deadlock. When the test context finishes, the update loop
1368
+ // closes updateResultChan to signal that it is done.
1369
+ updateResultChan := make (chan error )
1370
+ updateLoopCtx , updateLoopCancel := context .WithTimeout (context .Background (), testDuration )
1371
+ defer updateLoopCancel ()
1397
1372
go func () {
1398
- t := time .NewTimer (15 * time .Second )
1399
- defer t .Stop ()
1400
- for {
1401
- select {
1402
- case <- updatedCtx .Done ():
1403
- return
1404
- case <- updatedCh :
1405
- // update did occur
1406
- t .Reset (15 * time .Second )
1407
- case <- t .C :
1408
- // timeout hit waiting for another update to work
1409
- deadlockErr <- errors .New ("hit deadlock" )
1373
+ defer close (updateResultChan )
1374
+ for i := 0 ; updateLoopCtx .Err () == nil ; i ++ {
1375
+ comp := noDeadlockTestComponent (t , i )
1376
+ m .Update (component.Model {Components : []component.Component {comp }})
1377
+ err := <- m .errCh
1378
+ updateResultChan <- err
1379
+ if err != nil {
1380
+ // If the update gave an error, end the test
1410
1381
return
1411
1382
}
1412
1383
}
1413
1384
}()
1414
1385
1415
- defer drainErrChan (errCh )
1416
- defer drainErrChan (updatedErr )
1417
- defer drainErrChan (deadlockErr )
1418
-
1419
- // wait 2 minutes for a deadlock to occur
1420
- endTimer := time .NewTimer (2 * time .Minute )
1421
- defer endTimer .Stop ()
1386
+ // The component state is being changed constantly. If updateTimeout
1387
+ // triggers without any updates, report it as a deadlock.
1388
+ updateTimeout := time .NewTicker (maxUpdateInterval )
1389
+ defer updateTimeout .Stop ()
1422
1390
LOOP:
1423
1391
for {
1424
1392
select {
1425
- case <- endTimer .C :
1426
- // no deadlock after timeout (all good stop the component)
1427
- updatedCancel ()
1428
- m .Update (component.Model {Components : []component.Component {}})
1429
- <- m .errCh // Don't care about the result of Update, just that it runs
1430
- break LOOP
1431
- case err := <- errCh :
1432
- require .NoError (t , err )
1433
- case err := <- updatedErr :
1434
- require .NoError (t , err )
1435
- break LOOP
1436
- case err := <- deadlockErr :
1437
- require .NoError (t , err )
1438
- break LOOP
1393
+ case err , ok := <- updateResultChan :
1394
+ if ok {
1395
+ // update did occur
1396
+ require .NoError (t , err )
1397
+ updateTimeout .Reset (15 * time .Second )
1398
+ } else {
1399
+ // Update goroutine is terminating, test is over
1400
+ break LOOP
1401
+ }
1402
+ case <- managerResultChan :
1403
+ require .Fail (t , "Runtime manager terminated before test was over" )
1404
+ case <- updateTimeout .C :
1405
+ require .Fail (t , "Timed out waiting for Manager.Update result" )
1439
1406
}
1440
1407
}
1441
-
1442
- updatedCancel ()
1443
- cancel ()
1444
-
1445
- err = <- errCh
1408
+ // Finished without a deadlock. Stop the component and shut down the manager.
1409
+ m .Update (component.Model {Components : []component.Component {}})
1410
+ err = <- m .errCh
1446
1411
require .NoError (t , err )
1412
+
1413
+ managerCancel ()
1414
+ err = <- managerResultChan
1415
+ require .Equal (t , err , context .Canceled )
1447
1416
}
1448
1417
1449
1418
func (suite * FakeInputSuite ) TestManager_Configure () {
0 commit comments