Skip to content

Commit 4aadee7

Browse files
Stop doing read-throughs on MTRDevice when subscription is being established. (project-chip#32990)
* Stop doing read-throughs on MTRDevice when subscription is being established. The key goal is to stop doing read-throughs when we expect to start getting data soon: between sending the Sigma1 for our initial subscribe and getting the first priming read data packets. Any reads dispatched during that time "because we don't have a subscription yet" are just wasted network traffic, because chances are we're about to have a subscription and get that data as part of the priming read anyway. The main fix is just to stop doing read-throughs altogether for any attributes known to not be C and if we have a subscription delegate. In that state we should be subscribed and should be getting updates for attribute values, and there's no reason to read through. The rest of the changes are related to a side-effect of the stopping of read-throughs: if we had a subscription drop and have backed off quite a bit, a read-through that succeeded could prod us to retry subscribing. We don't want to do a read just to (maybe) trigger that behavior, though. Instead, treat a skipped read-through as a direct signal to retry our subscription attempt. But not too often. This "try to resubscribe" heuristic can probably use some tweaking. * Fix XPC test failure.
1 parent 0c17072 commit 4aadee7

File tree

1 file changed

+120
-10
lines changed

1 file changed

+120
-10
lines changed

src/darwin/Framework/CHIP/MTRDevice.mm

+120-10
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
#import "MTRCommandTimedCheck.h"
2929
#import "MTRConversion.h"
3030
#import "MTRDefines_Internal.h"
31+
#import "MTRDeviceControllerOverXPC.h"
3132
#import "MTRDeviceController_Internal.h"
3233
#import "MTRDevice_Internal.h"
3334
#import "MTRError_Internal.h"
@@ -45,6 +46,7 @@
4546
#include <app/BufferedReadCallback.h>
4647
#include <app/ClusterStateCache.h>
4748
#include <app/InteractionModelEngine.h>
49+
#include <platform/LockTracker.h>
4850
#include <platform/PlatformManager.h>
4951

5052
typedef void (^MTRDeviceAttributeReportHandler)(NSArray * _Nonnull);
@@ -260,6 +262,13 @@ - (BOOL)isEqual:(id)object
260262

261263
@end
262264

265+
// Minimal time to wait since our last resubscribe failure before we will allow
266+
// a read attempt to prod our subscription.
267+
//
268+
// TODO: Figure out a better value for this, but for now don't allow this to
269+
// happen more often than once every 10 minutes.
270+
#define MTRDEVICE_MIN_RESUBSCRIBE_DUE_TO_READ_INTERVAL_SECONDS (10 * 60)
271+
263272
@interface MTRDevice ()
264273
@property (nonatomic, readonly) os_unfair_lock lock; // protects the caches and device state
265274
// protects against concurrent time updates by guarding timeUpdateScheduled flag which manages time updates scheduling,
@@ -340,6 +349,11 @@ @implementation MTRDevice {
340349
// as the read cache, should testing prove attribute storage by cluster is the better solution.
341350
NSMutableDictionary<MTRClusterPath *, MTRDeviceClusterData *> * _clusterData;
342351
NSMutableSet<MTRClusterPath *> * _clustersToPersist;
352+
353+
// When we last failed to subscribe to the device (either via
354+
// _setupSubscription or via the auto-resubscribe behavior of the
355+
// ReadClient). Nil if we have had no such failures.
356+
NSDate * _Nullable _lastSubscriptionFailureTime;
343357
}
344358

345359
- (instancetype)initWithNodeID:(NSNumber *)nodeID controller:(MTRDeviceController *)controller
@@ -659,13 +673,26 @@ - (void)invalidate
659673

660674
- (void)nodeMayBeAdvertisingOperational
661675
{
676+
assertChipStackLockedByCurrentThread();
677+
662678
MTR_LOG_DEFAULT("%@ saw new operational advertisement", self);
663679

680+
[self _triggerResubscribeWithReason:"operational advertisement seen"
681+
nodeLikelyReachable:YES];
682+
}
683+
684+
// Trigger a resubscribe as needed. nodeLikelyReachable should be YES if we
685+
// have reason to suspect the node is now reachable, NO if we have no idea
686+
// whether it might be.
687+
- (void)_triggerResubscribeWithReason:(const char *)reason nodeLikelyReachable:(BOOL)nodeLikelyReachable
688+
{
689+
assertChipStackLockedByCurrentThread();
690+
664691
// We might want to trigger a resubscribe on our existing ReadClient. Do
665692
// that outside the scope of our lock, so we're not calling arbitrary code
666-
// we don't control with the lock held. This is safe, because when
667-
// nodeMayBeAdvertisingOperational is called we are running on the Matter
668-
// queue, and the ReadClient can't get destroyed while we are on that queue.
693+
// we don't control with the lock held. This is safe, because we are
694+
// running on he Matter queue and the ReadClient can't get destroyed while
695+
// we are on that queue.
669696
ReadClient * readClientToResubscribe = nullptr;
670697
SubscriptionCallback * subscriptionCallback = nullptr;
671698

@@ -684,17 +711,85 @@ - (void)nodeMayBeAdvertisingOperational
684711
os_unfair_lock_unlock(&self->_lock);
685712

686713
if (readClientToResubscribe) {
687-
subscriptionCallback->ResetResubscriptionBackoff();
688-
readClientToResubscribe->TriggerResubscribeIfScheduled("operational advertisement seen");
714+
if (nodeLikelyReachable) {
715+
// If we have reason to suspect the node is now reachable, reset the
716+
// backoff timer, so that if this attempt fails we'll try again
717+
// quickly; it's possible we'll just catch the node at a bad time
718+
// here (e.g. still booting up), but should try again reasonably quickly.
719+
subscriptionCallback->ResetResubscriptionBackoff();
720+
}
721+
readClientToResubscribe->TriggerResubscribeIfScheduled(reason);
689722
}
690723
}
691724

692-
// Return YES if there's a valid delegate AND subscription is expected to report value
725+
// Return YES if we are in a state where, apart from communication issues with
726+
// the device, we will be able to get reports via our subscription.
693727
- (BOOL)_subscriptionAbleToReport
694728
{
695729
std::lock_guard lock(_lock);
696730
id<MTRDeviceDelegate> delegate = _weakDelegate.strongObject;
697-
return (delegate != nil) && (_state == MTRDeviceStateReachable);
731+
if (delegate == nil) {
732+
// No delegate definitely means no subscription.
733+
return NO;
734+
}
735+
736+
// For unit testing only, matching logic in setDelegate
737+
#ifdef DEBUG
738+
id testDelegate = delegate;
739+
if ([testDelegate respondsToSelector:@selector(unitTestShouldSetUpSubscriptionForDevice:)]) {
740+
if (![testDelegate unitTestShouldSetUpSubscriptionForDevice:self]) {
741+
return NO;
742+
}
743+
}
744+
#endif
745+
746+
// Unfortunately, we currently have no subscriptions over our hacked-up XPC
747+
// setup. Try to detect that situation.
748+
if ([_deviceController.class respondsToSelector:@selector(sharedControllerWithID:xpcConnectBlock:)]) {
749+
return NO;
750+
}
751+
752+
return YES;
753+
}
754+
755+
// Notification that read-through was skipped for an attribute read.
756+
- (void)_readThroughSkipped
757+
{
758+
std::lock_guard lock(_lock);
759+
if (_state == MTRDeviceStateReachable) {
760+
// We're getting reports from the device, so there's nothing else to be
761+
// done here. We could skip this check, because our "try to
762+
// resubscribe" code would be a no-op in this case, but then we'd have
763+
// an extra dispatch in the common case of read-while-subscribed, which
764+
// is not great for peformance.
765+
return;
766+
}
767+
768+
if (_lastSubscriptionFailureTime == nil) {
769+
// No need to try to do anything here, because we have never failed a
770+
// subscription attempt (so we might be in the middle of one now, and no
771+
// need to prod things along).
772+
return;
773+
}
774+
775+
if ([[NSDate now] timeIntervalSinceDate:_lastSubscriptionFailureTime] < MTRDEVICE_MIN_RESUBSCRIBE_DUE_TO_READ_INTERVAL_SECONDS) {
776+
// Not enough time has passed since we last tried. Don't create extra
777+
// network traffic.
778+
//
779+
// TODO: Do we need to worry about this being too spammy in the log if
780+
// we keep getting reads while not subscribed? We could add another
781+
// backoff timer or counter for the log line...
782+
MTR_LOG_DEBUG("%@ skipping resubscribe from skipped read-through: not enough time has passed since %@", self, _lastSubscriptionFailureTime);
783+
return;
784+
}
785+
786+
// Do the remaining work on the Matter queue, because we may want to touch
787+
// ReadClient in there. If the dispatch fails, that's fine; it means our
788+
// controller has shut down, so nothing to be done.
789+
[_deviceController asyncDispatchToMatterQueue:^{
790+
[self _triggerResubscribeWithReason:"read-through skipped while not subscribed" nodeLikelyReachable:NO];
791+
}
792+
errorHandler:nil];
698793
}
699794

700795
- (BOOL)_callDelegateWithBlock:(void (^)(id<MTRDeviceDelegate>))block
@@ -791,15 +886,29 @@ - (void)_handleResubscriptionNeeded
791886
std::lock_guard lock(_lock);
792887

793888
[self _changeState:MTRDeviceStateUnknown];
889+
890+
// If we are here, then the ReadClient either just detected a subscription
891+
// drop or just tried again and failed. Either way, count it as "tried and
892+
// failed to subscribe": in the latter case it's actually true, and in the
893+
// former case we recently had a subscription and do not want to be forcing
894+
// retries immediately.
895+
_lastSubscriptionFailureTime = [NSDate now];
794896
}
795897

796898
- (void)_handleSubscriptionReset:(NSNumber * _Nullable)retryDelay
797899
{
798900
std::lock_guard lock(_lock);
901+
902+
// If we are here, then either we failed to establish initil CASE, or we
903+
// failed to send the initial SubscribeRequest message, or our ReadClient
904+
// has given up completely. Those all count as "we have tried and failed to
905+
// subscribe".
906+
_lastSubscriptionFailureTime = [NSDate now];
907+
799908
// if there is no delegate then also do not retry
800909
id<MTRDeviceDelegate> delegate = _weakDelegate.strongObject;
801910
if (!delegate) {
802-
// NOTE: Do not log anythig here: we have been invalidated, and the
911+
// NOTE: Do not log anything here: we have been invalidated, and the
803912
// Matter stack might already be torn down.
804913
return;
805914
}
@@ -1222,15 +1331,14 @@ - (void)_setupSubscription
12221331
// Drop our pointer to the ReadClient immediately, since
12231332
// it's about to be destroyed and we don't want to be
12241333
// holding a dangling pointer.
1225-
os_unfair_lock_lock(&self->_lock);
1334+
std::lock_guard lock(self->_lock);
12261335
self->_currentReadClient = nullptr;
12271336
self->_currentSubscriptionCallback = nullptr;
12281337

12291338
dispatch_async(self.queue, ^{
12301339
// OnDone
12311340
[self _handleSubscriptionReset:nil];
12321341
});
1233-
os_unfair_lock_unlock(&self->_lock);
12341342
},
12351343
^(void) {
12361344
MTR_LOG_DEFAULT("%@ got unsolicited message from publisher", self);
@@ -1624,6 +1732,8 @@ static BOOL AttributeHasChangesOmittedQuality(MTRAttributePath * attributePath)
16241732
}];
16251733
}];
16261734
[_asyncWorkQueue enqueueWorkItem:workItem descriptionWithFormat:@"read %@ 0x%llx 0x%llx", endpointID, clusterID.unsignedLongLongValue, attributeID.unsignedLongLongValue];
1735+
} else {
1736+
[self _readThroughSkipped];
16271737
}
16281738

16291739
return attributeValueToReturn;

0 commit comments

Comments
 (0)