Skip to content

Commit a97bad9

Browse files
DeviceCommissioner: Track ExtendArmFailSafe conditionally and manage CASE callbacks accurately (project-chip#32522)
* Handle --faults argument for example apps * Add CASEServerBusy fault injection point * DeviceCommissioner: Manage CASE callbacks accurately - Ensure all callbacks are unregistered once CASE setup is done. - Cancel CASE setup callbacks on StopPairing. * DeviceCommissioner: Track ExtendArmFailSafe conditionally Only pass a context into ExtendArmFailSafe callbacks and track them as the current cancelable invocation when it is actually a part of the sequential commissioning flow. Treat as "fire and forget" when the method is called by an external client, or when we're handling CASE retries. Fixes project-chip#32521 * Address review comments * Make fireAndForget non-optional for ExtendArmFailSafeInternal
1 parent 6a254ac commit a97bad9

File tree

7 files changed

+124
-76
lines changed

7 files changed

+124
-76
lines changed

examples/platform/linux/Options.cpp

+32
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,8 @@
2828
#include <lib/core/CHIPError.h>
2929
#include <lib/support/Base64.h>
3030
#include <lib/support/BytesToHex.h>
31+
#include <lib/support/CHIPMemString.h>
32+
#include <lib/support/CodeUtils.h>
3133
#include <lib/support/SafeInt.h>
3234

3335
#include <credentials/examples/DeviceAttestationCredsExample.h>
@@ -36,6 +38,12 @@
3638
#include <TracingCommandLineArgument.h> // nogncheck
3739
#endif
3840

41+
#if CHIP_WITH_NLFAULTINJECTION
42+
#include <inet/InetFaultInjection.h>
43+
#include <lib/support/CHIPFaultInjection.h>
44+
#include <system/SystemFaultInjection.h>
45+
#endif
46+
3947
using namespace chip;
4048
using namespace chip::ArgParser;
4149

@@ -94,6 +102,9 @@ enum
94102
#if CONFIG_BUILD_FOR_HOST_UNIT_TEST
95103
kDeviceOption_SubscriptionResumptionRetryIntervalSec = 0x1026,
96104
#endif
105+
#if CHIP_WITH_NLFAULTINJECTION
106+
kDeviceOption_FaultInjection = 0x1027,
107+
#endif
97108
};
98109

99110
constexpr unsigned kAppUsageLength = 64;
@@ -155,6 +166,9 @@ OptionDef sDeviceOptionDefs[] = {
155166
#if CONFIG_BUILD_FOR_HOST_UNIT_TEST
156167
{ "subscription-capacity", kArgumentRequired, kDeviceOption_SubscriptionCapacity },
157168
{ "subscription-resumption-retry-interval", kArgumentRequired, kDeviceOption_SubscriptionResumptionRetryIntervalSec },
169+
#endif
170+
#if CHIP_WITH_NLFAULTINJECTION
171+
{ "faults", kArgumentRequired, kDeviceOption_FaultInjection },
158172
#endif
159173
{}
160174
};
@@ -286,6 +300,10 @@ const char * sDeviceOptionHelp =
286300
" Max number of subscriptions the device will allow\n"
287301
" --subscription-resumption-retry-interval\n"
288302
" subscription timeout resumption retry interval in seconds\n"
303+
#endif
304+
#if CHIP_WITH_NLFAULTINJECTION
305+
" --faults <fault-string,...>\n"
306+
" Inject specified fault(s) at runtime.\n"
289307
#endif
290308
"\n";
291309

@@ -556,6 +574,20 @@ bool HandleOption(const char * aProgram, OptionSet * aOptions, int aIdentifier,
556574
case kDeviceOption_SubscriptionResumptionRetryIntervalSec:
557575
LinuxDeviceOptions::GetInstance().subscriptionResumptionRetryIntervalSec = static_cast<int32_t>(atoi(aValue));
558576
break;
577+
#endif
578+
#if CHIP_WITH_NLFAULTINJECTION
579+
case kDeviceOption_FaultInjection: {
580+
constexpr nl::FaultInjection::GetManagerFn faultManagerFns[] = { FaultInjection::GetManager,
581+
Inet::FaultInjection::GetManager,
582+
System::FaultInjection::GetManager };
583+
Platform::ScopedMemoryString mutableArg(aValue, strlen(aValue)); // ParseFaultInjectionStr may mutate
584+
if (!nl::FaultInjection::ParseFaultInjectionStr(mutableArg.Get(), faultManagerFns, ArraySize(faultManagerFns)))
585+
{
586+
PrintArgError("%s: Invalid fault injection specification\n", aProgram);
587+
retval = false;
588+
}
589+
break;
590+
}
559591
#endif
560592
default:
561593
PrintArgError("%s: INTERNAL ERROR: Unhandled option: %s\n", aProgram, aName);

src/controller/CHIPDeviceController.cpp

+64-72
Original file line numberDiff line numberDiff line change
@@ -953,6 +953,20 @@ void DeviceCommissioner::CancelCommissioningInteractions()
953953
mInvokeCancelFn();
954954
mInvokeCancelFn = nullptr;
955955
}
956+
if (mOnDeviceConnectedCallback.IsRegistered())
957+
{
958+
ChipLogDetail(Controller, "Cancelling CASE setup for step '%s'", StageToString(mCommissioningStage));
959+
CancelCASECallbacks();
960+
}
961+
}
962+
963+
void DeviceCommissioner::CancelCASECallbacks()
964+
{
965+
mOnDeviceConnectedCallback.Cancel();
966+
mOnDeviceConnectionFailureCallback.Cancel();
967+
#if CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES
968+
mOnDeviceConnectionRetryCallback.Cancel();
969+
#endif
956970
}
957971

958972
CHIP_ERROR DeviceCommissioner::UnpairDevice(NodeId remoteDeviceId)
@@ -1216,9 +1230,10 @@ void DeviceCommissioner::OnICDManagementRegisterClientResponse(
12161230
commissioner->CommissioningStageComplete(CHIP_NO_ERROR, report);
12171231
}
12181232

1219-
bool DeviceCommissioner::ExtendArmFailSafe(DeviceProxy * proxy, CommissioningStage step, uint16_t armFailSafeTimeout,
1220-
Optional<System::Clock::Timeout> commandTimeout, OnExtendFailsafeSuccess onSuccess,
1221-
OnExtendFailsafeFailure onFailure)
1233+
bool DeviceCommissioner::ExtendArmFailSafeInternal(DeviceProxy * proxy, CommissioningStage step, uint16_t armFailSafeTimeout,
1234+
Optional<System::Clock::Timeout> commandTimeout,
1235+
OnExtendFailsafeSuccess onSuccess, OnExtendFailsafeFailure onFailure,
1236+
bool fireAndForget)
12221237
{
12231238
using namespace System;
12241239
using namespace System::Clock;
@@ -1237,17 +1252,15 @@ bool DeviceCommissioner::ExtendArmFailSafe(DeviceProxy * proxy, CommissioningSta
12371252
request.expiryLengthSeconds = armFailSafeTimeout;
12381253
request.breadcrumb = breadcrumb;
12391254
ChipLogProgress(Controller, "Arming failsafe (%u seconds)", request.expiryLengthSeconds);
1240-
CHIP_ERROR err = SendCommissioningCommand(proxy, request, onSuccess, onFailure, kRootEndpointId, commandTimeout);
1255+
CHIP_ERROR err = SendCommissioningCommand(proxy, request, onSuccess, onFailure, kRootEndpointId, commandTimeout, fireAndForget);
12411256
if (err != CHIP_NO_ERROR)
12421257
{
1243-
onFailure(this, err);
1244-
}
1245-
else
1246-
{
1247-
// TODO: Handle the situation when our command ends up erroring out
1248-
// asynchronously?
1249-
proxy->SetFailSafeExpirationTimestamp(newFailSafeTimeout);
1258+
onFailure((!fireAndForget) ? this : nullptr, err);
1259+
return true; // we have called onFailure already
12501260
}
1261+
1262+
// Note: The stored timestamp may become invalid if we fail asynchronously
1263+
proxy->SetFailSafeExpirationTimestamp(newFailSafeTimeout);
12511264
return true;
12521265
}
12531266

@@ -1269,9 +1282,9 @@ void DeviceCommissioner::ExtendArmFailSafeForDeviceAttestation(const Credentials
12691282
// Per spec, anything we do with the fail-safe armed must not time out
12701283
// in less than kMinimumCommissioningStepTimeout.
12711284
waitForFailsafeExtension =
1272-
ExtendArmFailSafe(mDeviceBeingCommissioned, mCommissioningStage, expiryLengthSeconds.Value(),
1273-
MakeOptional(kMinimumCommissioningStepTimeout), OnArmFailSafeExtendedForDeviceAttestation,
1274-
OnFailedToExtendedArmFailSafeDeviceAttestation);
1285+
ExtendArmFailSafeInternal(mDeviceBeingCommissioned, mCommissioningStage, expiryLengthSeconds.Value(),
1286+
MakeOptional(kMinimumCommissioningStepTimeout), OnArmFailSafeExtendedForDeviceAttestation,
1287+
OnFailedToExtendedArmFailSafeDeviceAttestation, /* fireAndForget = */ false);
12751288
}
12761289
else
12771290
{
@@ -1848,58 +1861,34 @@ void DeviceCommissioner::OnDeviceConnectedFn(void * context, Messaging::Exchange
18481861
{
18491862
// CASE session established.
18501863
DeviceCommissioner * commissioner = static_cast<DeviceCommissioner *>(context);
1851-
VerifyOrReturn(commissioner != nullptr, ChipLogProgress(Controller, "Device connected callback with null context. Ignoring"));
1864+
VerifyOrDie(commissioner->mCommissioningStage == CommissioningStage::kFindOperational);
1865+
VerifyOrDie(commissioner->mDeviceBeingCommissioned->GetDeviceId() == sessionHandle->GetPeer().GetNodeId());
1866+
commissioner->CancelCASECallbacks(); // ensure all CASE callbacks are unregistered
18521867

1853-
if (commissioner->mCommissioningStage != CommissioningStage::kFindOperational)
1854-
{
1855-
// This call is definitely not us finding our commissionee device.
1856-
// This is presumably us trying to re-establish CASE on MRP failure.
1857-
return;
1858-
}
1859-
1860-
if (commissioner->mDeviceBeingCommissioned == nullptr ||
1861-
commissioner->mDeviceBeingCommissioned->GetDeviceId() != sessionHandle->GetPeer().GetNodeId())
1862-
{
1863-
// Not the device we are trying to commission.
1864-
return;
1865-
}
1866-
1867-
if (commissioner->mCommissioningDelegate != nullptr)
1868-
{
1869-
CommissioningDelegate::CommissioningReport report;
1870-
report.Set<OperationalNodeFoundData>(OperationalNodeFoundData(OperationalDeviceProxy(&exchangeMgr, sessionHandle)));
1871-
commissioner->CommissioningStageComplete(CHIP_NO_ERROR, report);
1872-
}
1868+
CommissioningDelegate::CommissioningReport report;
1869+
report.Set<OperationalNodeFoundData>(OperationalNodeFoundData(OperationalDeviceProxy(&exchangeMgr, sessionHandle)));
1870+
commissioner->CommissioningStageComplete(CHIP_NO_ERROR, report);
18731871
}
18741872

18751873
void DeviceCommissioner::OnDeviceConnectionFailureFn(void * context, const ScopedNodeId & peerId, CHIP_ERROR error)
18761874
{
18771875
// CASE session establishment failed.
18781876
DeviceCommissioner * commissioner = static_cast<DeviceCommissioner *>(context);
1877+
VerifyOrDie(commissioner->mCommissioningStage == CommissioningStage::kFindOperational);
1878+
VerifyOrDie(commissioner->mDeviceBeingCommissioned->GetDeviceId() == peerId.GetNodeId());
1879+
commissioner->CancelCASECallbacks(); // ensure all CASE callbacks are unregistered
18791880

1880-
ChipLogProgress(Controller, "Device connection failed. Error %s", ErrorStr(error));
1881-
VerifyOrReturn(commissioner != nullptr,
1882-
ChipLogProgress(Controller, "Device connection failure callback with null context. Ignoring"));
1883-
1884-
// Ensure that commissioning stage advancement is done based on seeing an error.
1885-
if (error == CHIP_NO_ERROR)
1881+
if (error != CHIP_NO_ERROR)
18861882
{
1887-
ChipLogError(Controller, "Device connection failed without a valid error code. Making one up.");
1888-
error = CHIP_ERROR_INTERNAL;
1883+
ChipLogProgress(Controller, "Device connection failed. Error %" CHIP_ERROR_FORMAT, error.Format());
18891884
}
1890-
1891-
if (commissioner->mDeviceBeingCommissioned == nullptr ||
1892-
commissioner->mDeviceBeingCommissioned->GetDeviceId() != peerId.GetNodeId())
1893-
{
1894-
// Not the device we are trying to commission.
1895-
return;
1896-
}
1897-
1898-
if (commissioner->mCommissioningStage == CommissioningStage::kFindOperational &&
1899-
commissioner->mCommissioningDelegate != nullptr)
1885+
else
19001886
{
1901-
commissioner->CommissioningStageComplete(error);
1887+
// Ensure that commissioning stage advancement is done based on seeing an error.
1888+
ChipLogError(Controller, "Device connection failed without a valid error code.");
1889+
error = CHIP_ERROR_INTERNAL;
19021890
}
1891+
commissioner->CommissioningStageComplete(error);
19031892
}
19041893

19051894
#if CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES
@@ -1926,6 +1915,8 @@ void DeviceCommissioner::OnDeviceConnectionRetryFn(void * context, const ScopedN
19261915
ChipLogValueScopedNodeId(peerId), error.Format(), retryTimeout.count());
19271916

19281917
auto self = static_cast<DeviceCommissioner *>(context);
1918+
VerifyOrDie(self->mCommissioningStage == CommissioningStage::kFindOperational);
1919+
VerifyOrDie(self->mDeviceBeingCommissioned->GetDeviceId() == peerId.GetNodeId());
19291920

19301921
// We need to do the fail-safe arming over the PASE session.
19311922
auto * commissioneeDevice = self->FindCommissioneeDevice(peerId.GetNodeId());
@@ -1950,11 +1941,11 @@ void DeviceCommissioner::OnDeviceConnectionRetryFn(void * context, const ScopedN
19501941
{
19511942
failsafeTimeout = static_cast<uint16_t>(retryTimeout.count() + kDefaultFailsafeTimeout);
19521943
}
1953-
// A false return from ExtendArmFailSafe is fine; we don't want to make the
1954-
// fail-safe shorter here.
1955-
self->ExtendArmFailSafe(commissioneeDevice, CommissioningStage::kFindOperational, failsafeTimeout,
1956-
MakeOptional(kMinimumCommissioningStepTimeout), OnExtendFailsafeForCASERetrySuccess,
1957-
OnExtendFailsafeForCASERetryFailure);
1944+
1945+
// A false return is fine; we don't want to make the fail-safe shorter here.
1946+
self->ExtendArmFailSafeInternal(commissioneeDevice, CommissioningStage::kFindOperational, failsafeTimeout,
1947+
MakeOptional(kMinimumCommissioningStepTimeout), OnExtendFailsafeForCASERetrySuccess,
1948+
OnExtendFailsafeForCASERetryFailure, /* fireAndForget = */ true);
19581949
}
19591950
#endif // CHIP_DEVICE_CONFIG_ENABLE_AUTOMATIC_CASE_RETRIES
19601951

@@ -2552,19 +2543,22 @@ CHIP_ERROR
25522543
DeviceCommissioner::SendCommissioningCommand(DeviceProxy * device, const RequestObjectT & request,
25532544
CommandResponseSuccessCallback<typename RequestObjectT::ResponseType> successCb,
25542545
CommandResponseFailureCallback failureCb, EndpointId endpoint,
2555-
Optional<System::Clock::Timeout> timeout)
2546+
Optional<System::Clock::Timeout> timeout, bool fireAndForget)
25562547

25572548
{
2558-
VerifyOrDie(!mInvokeCancelFn); // we don't make parallel calls
2549+
// Default behavior is to make sequential, cancellable calls tracked via mInvokeCancelFn.
2550+
// Fire-and-forget calls are not cancellable and don't receive `this` as context in callbacks.
2551+
VerifyOrDie(fireAndForget || !mInvokeCancelFn); // we don't make parallel (cancellable) calls
25592552

2560-
auto onSuccessCb = [context = this, successCb](const app::ConcreteCommandPath & aPath, const app::StatusIB & aStatus,
2561-
const typename RequestObjectT::ResponseType & responseData) {
2553+
void * context = (!fireAndForget) ? this : nullptr;
2554+
auto onSuccessCb = [context, successCb](const app::ConcreteCommandPath & aPath, const app::StatusIB & aStatus,
2555+
const typename RequestObjectT::ResponseType & responseData) {
25622556
successCb(context, responseData);
25632557
};
2564-
auto onFailureCb = [context = this, failureCb](CHIP_ERROR aError) { failureCb(context, aError); };
2558+
auto onFailureCb = [context, failureCb](CHIP_ERROR aError) { failureCb(context, aError); };
25652559

25662560
return InvokeCommandRequest(device->GetExchangeManager(), device->GetSecureSession().Value(), endpoint, request, onSuccessCb,
2567-
onFailureCb, NullOptional, timeout, &mInvokeCancelFn);
2561+
onFailureCb, NullOptional, timeout, (!fireAndForget) ? &mInvokeCancelFn : nullptr);
25682562
}
25692563

25702564
void DeviceCommissioner::SendCommissioningReadRequest(DeviceProxy * proxy, Optional<System::Clock::Timeout> timeout,
@@ -2626,8 +2620,8 @@ void DeviceCommissioner::PerformCommissioningStep(DeviceProxy * proxy, Commissio
26262620
// Make sure the fail-safe value we set here actually ends up being used
26272621
// no matter what.
26282622
proxy->SetFailSafeExpirationTimestamp(System::Clock::kZero);
2629-
VerifyOrDie(ExtendArmFailSafe(proxy, step, params.GetFailsafeTimerSeconds().ValueOr(kDefaultFailsafeTimeout), timeout,
2630-
OnArmFailSafe, OnBasicFailure));
2623+
VerifyOrDie(ExtendArmFailSafeInternal(proxy, step, params.GetFailsafeTimerSeconds().ValueOr(kDefaultFailsafeTimeout),
2624+
timeout, OnArmFailSafe, OnBasicFailure, /* fireAndForget = */ false));
26312625
}
26322626
break;
26332627
case CommissioningStage::kReadCommissioningInfo: {
@@ -3270,12 +3264,10 @@ void DeviceCommissioner::ExtendFailsafeBeforeNetworkEnable(DeviceProxy * device,
32703264
failSafeTimeoutSecs = static_cast<uint16_t>(failSafeTimeoutSecs + sigma1TimeoutSecs);
32713265
}
32723266

3273-
// A false return from ExtendArmFailSafe is fine; we don't want to make the
3274-
// fail-safe shorter here.
3275-
if (!ExtendArmFailSafe(commissioneeDevice, step, failSafeTimeoutSecs, MakeOptional(kMinimumCommissioningStepTimeout),
3276-
OnArmFailSafe, OnBasicFailure))
3267+
if (!ExtendArmFailSafeInternal(commissioneeDevice, step, failSafeTimeoutSecs, MakeOptional(kMinimumCommissioningStepTimeout),
3268+
OnArmFailSafe, OnBasicFailure, /* fireAndForget = */ false))
32773269
{
3278-
// Just move on to the next step.
3270+
// A false return is fine; we don't want to make the fail-safe shorter here.
32793271
CommissioningStageComplete(CHIP_NO_ERROR, CommissioningDelegate::CommissioningReport());
32803272
}
32813273
}

src/controller/CHIPDeviceController.h

+12-2
Original file line numberDiff line numberDiff line change
@@ -765,7 +765,12 @@ class DLL_EXPORT DeviceCommissioner : public DeviceController,
765765
// onSuccess nor onFailure will be called.
766766
bool ExtendArmFailSafe(DeviceProxy * proxy, CommissioningStage step, uint16_t armFailSafeTimeout,
767767
Optional<System::Clock::Timeout> commandTimeout, OnExtendFailsafeSuccess onSuccess,
768-
OnExtendFailsafeFailure onFailure);
768+
OnExtendFailsafeFailure onFailure)
769+
{
770+
// If this method is called directly by a client, assume it's fire-and-forget (not a commissioning stage)
771+
return ExtendArmFailSafeInternal(proxy, step, armFailSafeTimeout, commandTimeout, onSuccess, onFailure,
772+
/* fireAndForget = */ true);
773+
}
769774

770775
private:
771776
DevicePairingDelegate * mPairingDelegate = nullptr;
@@ -957,14 +962,19 @@ class DLL_EXPORT DeviceCommissioner : public DeviceController,
957962
CommissioneeDeviceProxy * FindCommissioneeDevice(const Transport::PeerAddress & peerAddress);
958963
void ReleaseCommissioneeDevice(CommissioneeDeviceProxy * device);
959964

965+
bool ExtendArmFailSafeInternal(DeviceProxy * proxy, CommissioningStage step, uint16_t armFailSafeTimeout,
966+
Optional<System::Clock::Timeout> commandTimeout, OnExtendFailsafeSuccess onSuccess,
967+
OnExtendFailsafeFailure onFailure, bool fireAndForget);
968+
960969
template <typename RequestObjectT>
961970
CHIP_ERROR SendCommissioningCommand(DeviceProxy * device, const RequestObjectT & request,
962971
CommandResponseSuccessCallback<typename RequestObjectT::ResponseType> successCb,
963972
CommandResponseFailureCallback failureCb, EndpointId endpoint,
964-
Optional<System::Clock::Timeout> timeout = NullOptional);
973+
Optional<System::Clock::Timeout> timeout = NullOptional, bool fireAndForget = false);
965974
void SendCommissioningReadRequest(DeviceProxy * proxy, Optional<System::Clock::Timeout> timeout,
966975
app::AttributePathParams * readPaths, size_t readPathsSize);
967976
void CancelCommissioningInteractions();
977+
void CancelCASECallbacks();
968978

969979
#if CHIP_CONFIG_ENABLE_READ_CLIENT
970980
void ParseCommissioningInfo();

src/darwin/Framework/CHIPTests/MTRPairingTests.m

+8
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
// Fixture: chip-all-clusters-app --KVS "$(mktemp -t chip-test-kvs)" --interface-id -1 \
3030
--dac_provider credentials/development/commissioner_dut/struct_cd_origin_pid_vid_correct/test_case_vector.json \
3131
--product-id 32768 --discriminator 3839
32+
// For manual testing, CASE retry code paths can be tested by adding --faults chip_CASEServerBusy_f1 (or similar)
3233

3334
static const uint16_t kPairingTimeoutInSeconds = 10;
3435
static const uint16_t kTimeoutInSeconds = 3;
@@ -243,6 +244,7 @@ - (void)test004_PairWithAttestationDelegateFailsafeExtensionLong
243244
- (void)doPairingAndWaitForProgress:(NSString *)trigger
244245
{
245246
XCTestExpectation * expectation = [self expectationWithDescription:@"Trigger message seen"];
247+
expectation.assertForOverFulfill = NO;
246248
MTRSetLogCallback(MTRLogTypeDetail, ^(MTRLogType type, NSString * moduleName, NSString * message) {
247249
if ([message containsString:trigger]) {
248250
[expectation fulfill];
@@ -298,4 +300,10 @@ - (void)test006_pairingAfterCancellation_ConfigRegulatoryCommand
298300
[self doPairingTestAfterCancellationAtProgress:@"Performing next commissioning step 'ConfigRegulatory'"];
299301
}
300302

303+
- (void)test007_pairingAfterCancellation_FindOperational
304+
{
305+
// Ensure CASE establishment has started by waiting for 'FindOrEstablishSession'
306+
[self doPairingTestAfterCancellationAtProgress:@"FindOrEstablishSession:"];
307+
}
308+
301309
@end

src/lib/support/CHIPFaultInjection.cpp

+1
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ static const nl::FaultInjection::Name sFaultNames[] = {
5353
#if CONFIG_NETWORK_LAYER_BLE
5454
"CHIPOBLESend",
5555
#endif // CONFIG_NETWORK_LAYER_BLE
56+
"CASEServerBusy",
5657
};
5758

5859
/**

0 commit comments

Comments
 (0)