From dcee968b286d58f39a2caa2f179a9f92086fdeb2 Mon Sep 17 00:00:00 2001 From: Boris Zbarsky Date: Thu, 23 May 2024 23:58:01 -0400 Subject: [PATCH] Improve our various round-trip timeout computations. We had a few issues: 1) Our "round-trip timeout" only accounted for one side of the round-trip needing to do MRP retransmits. So if the sender retried a few times, the last retry finally got through, then the response had to be retried a well, the sender would time out the exchange before the response was received. The fix here is to add the MRP backoff times for both the initial message and the response. 2) ReadClient could end up timing out a subscription before the server had actually given up on receiving a StatusReport in response to its ReportData, in situations where the server ended up doing MRP retries and the last MRP retry took a few seconds to get through the network. The fix here is to just estimate how long the server will be waiting for the StatusReport and not time out the subscription until then; at that point even if the server did in fact send its report on time, it will have dropped the subscription on its end. --- src/app/OperationalSessionSetup.cpp | 5 +++ src/app/ReadClient.cpp | 32 ++++++++++--------- .../Framework/CHIPTests/MTRControllerTests.m | 4 +++ .../CHIPTests/MTRPerControllerStorageTests.m | 4 +++ src/protocols/secure_channel/CASESession.cpp | 31 +++++++++++++----- src/transport/GroupSession.h | 14 ++++++++ src/transport/SecureSession.h | 21 ++++++++++++ src/transport/Session.cpp | 5 ++- src/transport/Session.h | 16 +++++++++- src/transport/UnauthenticatedSessionTable.h | 21 ++++++++++++ 10 files changed, 128 insertions(+), 25 deletions(-) diff --git a/src/app/OperationalSessionSetup.cpp b/src/app/OperationalSessionSetup.cpp index 5733a423702226..9197a2edbddf70 100644 --- a/src/app/OperationalSessionSetup.cpp +++ b/src/app/OperationalSessionSetup.cpp @@ -792,6 +792,11 @@ CHIP_ERROR OperationalSessionSetup::ScheduleSessionSetupReattempt(System::Clock: // but in practice for old devices BUSY often sends some hardcoded value // that tells us nothing about when the other side will decide it has // timed out. + // + // Unfortunately, we do not have the MRP config for the other side here, + // but in practice if the other side is using its local config to + // compute Sigma2 response timeouts, then it's also returning useful + // values with BUSY, so we will wait long enough. auto additionalTimeout = CASESession::ComputeSigma2ResponseTimeout(GetLocalMRPConfig().ValueOr(GetDefaultMRPConfig())); actualTimerDelay += additionalTimeout; } diff --git a/src/app/ReadClient.cpp b/src/app/ReadClient.cpp index e8ada20a428c70..bb058e091858c7 100644 --- a/src/app/ReadClient.cpp +++ b/src/app/ReadClient.cpp @@ -938,24 +938,26 @@ CHIP_ERROR ReadClient::ComputeLivenessCheckTimerTimeout(System::Clock::Timeout * // // To calculate the duration we're willing to wait for a report to come to us, we take into account the maximum interval of - // the subscription AND the time it takes for the report to make it to us in the worst case. This latter bit involves - // computing the Ack timeout from the publisher for the ReportData message being sent to us using our IDLE interval as the - // basis for that computation. + // the subscription AND the time it takes for the report to make it to us in the worst case. // - // Make sure to use the retransmission computation that includes backoff. For purposes of that computation, treat us as - // active now (since we are right now sending/receiving messages), and use the default "how long are we guaranteed to stay - // active" threshold for now. + // We have no way to estimate what the network latency will be, but we do know the other side will time out its ReportData + // after its computed round-trip timeout plus the processing time it gives us (app::kExpectedIMProcessingTime). Once it + // times out, assuming it sent the report at all, there's no point in us thinking we still have a subscription. // - // TODO: We need to find a good home for this logic that will correctly compute this based on transport. For now, this will - // suffice since we don't use TCP as a transport currently and subscriptions over BLE aren't really a thing. + // We can't use ComputeRoundTripTimeout() on the session for two reasons: we want the roundtrip timeout from the point of + // view of the peer, not us, and we want to start off with the assumption the peer will likely have, which is that we are + // idle, whereas ComputeRoundTripTimeout() uses the current activity state of the peer. // - const auto & localMRPConfig = GetLocalMRPConfig(); - const auto & defaultMRPConfig = GetDefaultMRPConfig(); - const auto & ourMrpConfig = localMRPConfig.ValueOr(defaultMRPConfig); - auto publisherTransmissionTimeout = - GetRetransmissionTimeout(ourMrpConfig.mActiveRetransTimeout, ourMrpConfig.mIdleRetransTimeout, - System::SystemClock().GetMonotonicTimestamp(), ourMrpConfig.mActiveThresholdTime); - *aTimeout = System::Clock::Seconds16(mMaxInterval) + publisherTransmissionTimeout; + // So recompute the round-trip timeout directly. Assume MRP, since in practice that is likely what is happening. + auto & peerMRPConfig = mReadPrepareParams.mSessionHolder->GetRemoteMRPConfig(); + // Peer will assume we are idle (hence we pass kZero to GetMessageReceiptTimeout()), but will assume we treat it as active + // for the response, so to match the retransmission timeout computation for the message back to the peeer, we should treat + // it as active. + auto roundTripTimeout = mReadPrepareParams.mSessionHolder->GetMessageReceiptTimeout(System::Clock::kZero) + + kExpectedIMProcessingTime + + GetRetransmissionTimeout(peerMRPConfig.mActiveRetransTimeout, peerMRPConfig.mIdleRetransTimeout, + System::SystemClock().GetMonotonicTimestamp(), peerMRPConfig.mActiveThresholdTime); + *aTimeout = System::Clock::Seconds16(mMaxInterval) + roundTripTimeout; return CHIP_NO_ERROR; } diff --git a/src/darwin/Framework/CHIPTests/MTRControllerTests.m b/src/darwin/Framework/CHIPTests/MTRControllerTests.m index b5ca63135703b3..436a0df230e3d9 100644 --- a/src/darwin/Framework/CHIPTests/MTRControllerTests.m +++ b/src/darwin/Framework/CHIPTests/MTRControllerTests.m @@ -1557,6 +1557,10 @@ - (void)testSetMRPParameters // Can be called before starting the factory XCTAssertFalse(MTRDeviceControllerFactory.sharedInstance.running); MTRSetMessageReliabilityParameters(@2000, @2000, @2000, @2000); + + // Now reset back to the default state, so timings in other tests are not + // affected. + MTRSetMessageReliabilityParameters(nil, nil, nil, nil); } @end diff --git a/src/darwin/Framework/CHIPTests/MTRPerControllerStorageTests.m b/src/darwin/Framework/CHIPTests/MTRPerControllerStorageTests.m index 4a87fd66a7d88d..d4c049f3938800 100644 --- a/src/darwin/Framework/CHIPTests/MTRPerControllerStorageTests.m +++ b/src/darwin/Framework/CHIPTests/MTRPerControllerStorageTests.m @@ -2049,6 +2049,10 @@ - (void)testSetMRPParametersWithRunningController XCTAssertTrue(controller.running); MTRSetMessageReliabilityParameters(@2000, @2000, @2000, @2000); [controller shutdown]; + + // Now reset back to the default state, so timings in other tests are not + // affected. + MTRSetMessageReliabilityParameters(nil, nil, nil, nil); } static NSString * const kLocalTestUserDefaultDomain = @"org.csa-iot.matter.darwintest"; diff --git a/src/protocols/secure_channel/CASESession.cpp b/src/protocols/secure_channel/CASESession.cpp index ccc2bc2188d176..b0642a966890fd 100644 --- a/src/protocols/secure_channel/CASESession.cpp +++ b/src/protocols/secure_channel/CASESession.cpp @@ -2362,21 +2362,36 @@ CHIP_ERROR CASESession::OnMessageReceived(ExchangeContext * ec, const PayloadHea return err; } -System::Clock::Timeout CASESession::ComputeSigma1ResponseTimeout(const ReliableMessageProtocolConfig & remoteMrpConfig) +namespace { +System::Clock::Timeout ComputeRoundTripTimeout(ExchangeContext::Timeout serverProcessingTime, + const ReliableMessageProtocolConfig & remoteMrpConfig) { + // TODO: This is duplicating logic from Session::ComputeRoundTripTimeout. Unfortunately, it's called by + // consumers who do not have a session. + const auto & maybeLocalMRPConfig = GetLocalMRPConfig(); + const auto & defaultMRRPConfig = GetDefaultMRPConfig(); + const auto & localMRPConfig = maybeLocalMRPConfig.ValueOr(defaultMRRPConfig); return GetRetransmissionTimeout(remoteMrpConfig.mActiveRetransTimeout, remoteMrpConfig.mIdleRetransTimeout, - // Assume peer is idle, since that's what we - // will assume for our initial message. + // Assume peer is idle, as a worst-case assumption (probably true for + // Sigma1, since that will be our initial message on the session, but less + // so for Sigma2). System::Clock::kZero, remoteMrpConfig.mActiveThresholdTime) + - kExpectedSigma1ProcessingTime; + serverProcessingTime + + GetRetransmissionTimeout(localMRPConfig.mActiveRetransTimeout, localMRPConfig.mIdleRetransTimeout, + // Peer will assume we are active, since it's + // responding to our message. + System::SystemClock().GetMonotonicTimestamp(), localMRPConfig.mActiveThresholdTime); +} +} // anonymous namespace + +System::Clock::Timeout CASESession::ComputeSigma1ResponseTimeout(const ReliableMessageProtocolConfig & remoteMrpConfig) +{ + return ComputeRoundTripTimeout(kExpectedSigma1ProcessingTime, remoteMrpConfig); } System::Clock::Timeout CASESession::ComputeSigma2ResponseTimeout(const ReliableMessageProtocolConfig & remoteMrpConfig) { - return GetRetransmissionTimeout(remoteMrpConfig.mActiveRetransTimeout, remoteMrpConfig.mIdleRetransTimeout, - // Assume peer is idle, as a worst-case assumption. - System::Clock::kZero, remoteMrpConfig.mActiveThresholdTime) + - kExpectedHighProcessingTime; + return ComputeRoundTripTimeout(kExpectedHighProcessingTime, remoteMrpConfig); } bool CASESession::InvokeBackgroundWorkWatchdog() diff --git a/src/transport/GroupSession.h b/src/transport/GroupSession.h index f20ff069ab029f..1c5ffecd2be3c9 100644 --- a/src/transport/GroupSession.h +++ b/src/transport/GroupSession.h @@ -75,6 +75,13 @@ class IncomingGroupSession : public Session, public ReferenceCounted