Skip to content

Commit 30d598f

Browse files
Stop doing read-throughs on MTRDevice when subscription is being established.
The key goal is to stop doing read-throughs when we expect to start getting data soon: between sending the Sigma1 for our initial subscribe and getting the first priming read data packets. Any reads dispatched during that time "because we don't have a subscription yet" are just wasted network traffic, because chances are we're about to have a subscription and get that data as part of the priming read anyway. The main fix is just to stop doing read-throughs altogether for any attributes known to not be C and if we have a subscription delegate. In that state we should be subscribed and should be getting updates for attribute values, and there's no reason to read through. The rest of the changes are related to a side-effect of the stopping of read-throughs: if we had a subscription drop and have backed off quite a bit, a read-through that succeeded could prod us to retry subscribing. We don't want to do a read just to (maybe) trigger that behavior, though. Instead, treat a skipped read-through as a direct signal to retry our subscription attempt. But not too often. This "try to resubscribe" heuristic can probably use some tweaking.
1 parent fd72eae commit 30d598f

File tree

1 file changed

+99
-10
lines changed

1 file changed

+99
-10
lines changed

src/darwin/Framework/CHIP/MTRDevice.mm

+99-10
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@
4545
#include <app/BufferedReadCallback.h>
4646
#include <app/ClusterStateCache.h>
4747
#include <app/InteractionModelEngine.h>
48+
#include <platform/LockTracker.h>
4849
#include <platform/PlatformManager.h>
4950

5051
typedef void (^MTRDeviceAttributeReportHandler)(NSArray * _Nonnull);
@@ -252,6 +253,13 @@ - (BOOL)isEqual:(id)object
252253

253254
@end
254255

256+
// Minimal time to wait since our last resubscribe failure before we will allow
257+
// a read attempt to prod our subscription.
258+
//
259+
// TODO: Figure out a better value for this, but for now don't allow this to
260+
// happen more often than once every 10 minutes.
261+
#define MTRDEVICE_MIN_RESUBSCRIBE_DUE_TO_READ_INTERVAL_SECONDS (10 * 60)
262+
255263
@interface MTRDevice ()
256264
@property (nonatomic, readonly) os_unfair_lock lock; // protects the caches and device state
257265
// protects against concurrent time updates by guarding timeUpdateScheduled flag which manages time updates scheduling,
@@ -332,6 +340,11 @@ @implementation MTRDevice {
332340
// as the read cache, should testing prove attribute storage by cluster is the better solution.
333341
NSMutableDictionary<MTRClusterPath *, MTRDeviceClusterData *> * _clusterData;
334342
NSMutableSet<MTRClusterPath *> * _clustersToPersist;
343+
344+
// When we last failed to subscribe to the device (either via
345+
// _setupSubscription or via the auto-resubscribe behavior of the
346+
// ReadClient). Nil if we have had no such failures.
347+
NSDate * _Nullable _lastSubscriptionFailureTime;
335348
}
336349

337350
- (instancetype)initWithNodeID:(NSNumber *)nodeID controller:(MTRDeviceController *)controller
@@ -651,13 +664,26 @@ - (void)invalidate
651664

652665
- (void)nodeMayBeAdvertisingOperational
653666
{
667+
assertChipStackLockedByCurrentThread();
668+
654669
MTR_LOG_DEFAULT("%@ saw new operational advertisement", self);
655670

671+
[self _triggerResubscribeWithReason:"operational advertisement seen"
672+
nodeLikelyReachable:YES];
673+
}
674+
675+
// Trigger a resubscribe as needed. nodeLikelyReachable should be YES if we
676+
// have reason to suspect the node is now reachable, NO if we have no idea
677+
// whether it might be.
678+
- (void)_triggerResubscribeWithReason:(const char *)reason nodeLikelyReachable:(BOOL)nodeLikelyReachable
679+
{
680+
assertChipStackLockedByCurrentThread();
681+
656682
// We might want to trigger a resubscribe on our existing ReadClient. Do
657683
// that outside the scope of our lock, so we're not calling arbitrary code
658-
// we don't control with the lock held. This is safe, because when
659-
// nodeMayBeAdvertisingOperational is called we are running on the Matter
660-
// queue, and the ReadClient can't get destroyed while we are on that queue.
684+
// we don't control with the lock held. This is safe, because we are
685+
// running on he Matter queue and the ReadClient can't get destroyed while
686+
// we are on that queue.
661687
ReadClient * readClientToResubscribe = nullptr;
662688
SubscriptionCallback * subscriptionCallback = nullptr;
663689

@@ -676,17 +702,65 @@ - (void)nodeMayBeAdvertisingOperational
676702
os_unfair_lock_unlock(&self->_lock);
677703

678704
if (readClientToResubscribe) {
679-
subscriptionCallback->ResetResubscriptionBackoff();
680-
readClientToResubscribe->TriggerResubscribeIfScheduled("operational advertisement seen");
705+
if (nodeLikelyReachable) {
706+
// If we have reason to suspect the node is now reachable, reset the
707+
// backoff timer, so that if this attempt fails we'll try again
708+
// quickly; it's possible we'll just catch the node at a bad time
709+
// here (e.g. still booting up), but should try again reasonably quickly.
710+
subscriptionCallback->ResetResubscriptionBackoff();
711+
}
712+
readClientToResubscribe->TriggerResubscribeIfScheduled(reason);
681713
}
682714
}
683715

684-
// Return YES if there's a valid delegate AND subscription is expected to report value
716+
// Return YES if there's a valid delegate. As long as we have that, we are
717+
// responsible for dealing with the subscription bits, and doing read-throughs
718+
// when we can't establish a subscription won't help things either.
685719
- (BOOL)_subscriptionAbleToReport
686720
{
687721
std::lock_guard lock(_lock);
688722
id<MTRDeviceDelegate> delegate = _weakDelegate.strongObject;
689-
return (delegate != nil) && (_state == MTRDeviceStateReachable);
723+
return (delegate != nil);
724+
}
725+
726+
// Notification that read-through was skipped for an attribute read.
727+
- (void)_readThroughSkipped
728+
{
729+
std::lock_guard lock(_lock);
730+
if (_state == MTRDeviceStateReachable) {
731+
// We're getting reports from the device, so there's nothing else to be
732+
// done here. We could skip this check, because our "try to
733+
// resubscribe" code would be a no-op in this case, but then we'd have
734+
// an extra dispatch in the common case of read-while-subscribed, which
735+
// is not great for peformance.
736+
return;
737+
}
738+
739+
if (_lastSubscriptionFailureTime == nil) {
740+
// No need to try to do anything here, because we have never failed a
741+
// subscription attempt (so we might be in the middle of one now, and no
742+
// need to prod things along).
743+
return;
744+
}
745+
746+
if ([[NSDate now] timeIntervalSinceDate:_lastSubscriptionFailureTime] < MTRDEVICE_MIN_RESUBSCRIBE_DUE_TO_READ_INTERVAL_SECONDS) {
747+
// Not enough time has passed since we last tried. Don't create extra
748+
// network traffic.
749+
//
750+
// TODO: Do we need to worry about this being too spammy in the log if
751+
// we keep getting reads while not subscribed? We could add another
752+
// backoff timer or counter for the log line...
753+
MTR_LOG_DEBUG("%@ skipping resubscribe from skipped read-through: not enough time has passed since %@", self, _lastSubscriptionFailureTime);
754+
return;
755+
}
756+
757+
// Do the remaining work on the Matter queue, because we may want to touch
758+
// ReadClient in there. If the dispatch fails, that's fine; it means our
759+
// controller has shut down, so nothing to be done.
760+
[_deviceController asyncDispatchToMatterQueue:^{
761+
[self _triggerResubscribeWithReason:"read-through skipped while not subscribed" nodeLikelyReachable:NO];
762+
}
763+
errorHandler:nil];
690764
}
691765

692766
- (BOOL)_callDelegateWithBlock:(void (^)(id<MTRDeviceDelegate>))block
@@ -783,15 +857,29 @@ - (void)_handleResubscriptionNeeded
783857
std::lock_guard lock(_lock);
784858

785859
[self _changeState:MTRDeviceStateUnknown];
860+
861+
// If we are here, then the ReadClient either just detected a subscription
862+
// drop or just tried again and failed. Either way, count it as "tried and
863+
// failed to subscribe": in the latter case it's actually true, and in the
864+
// former case we recently had a subscription and do not want to be forcing
865+
// retries immediately.
866+
_lastSubscriptionFailureTime = [NSDate now];
786867
}
787868

788869
- (void)_handleSubscriptionReset:(NSNumber * _Nullable)retryDelay
789870
{
790871
std::lock_guard lock(_lock);
872+
873+
// If we are here, then either we failed to establish initil CASE, or we
874+
// failed to send the initial SubscribeRequest message, or our ReadClient
875+
// has given up completely. Those all count as "we have tried and failed to
876+
// subscribe".
877+
_lastSubscriptionFailureTime = [NSDate now];
878+
791879
// if there is no delegate then also do not retry
792880
id<MTRDeviceDelegate> delegate = _weakDelegate.strongObject;
793881
if (!delegate) {
794-
// NOTE: Do not log anythig here: we have been invalidated, and the
882+
// NOTE: Do not log anything here: we have been invalidated, and the
795883
// Matter stack might already be torn down.
796884
return;
797885
}
@@ -1214,15 +1302,14 @@ - (void)_setupSubscription
12141302
// Drop our pointer to the ReadClient immediately, since
12151303
// it's about to be destroyed and we don't want to be
12161304
// holding a dangling pointer.
1217-
os_unfair_lock_lock(&self->_lock);
1305+
std::lock_guard lock(self->_lock);
12181306
self->_currentReadClient = nullptr;
12191307
self->_currentSubscriptionCallback = nullptr;
12201308

12211309
dispatch_async(self.queue, ^{
12221310
// OnDone
12231311
[self _handleSubscriptionReset:nil];
12241312
});
1225-
os_unfair_lock_unlock(&self->_lock);
12261313
},
12271314
^(void) {
12281315
MTR_LOG_DEFAULT("%@ got unsolicited message from publisher", self);
@@ -1616,6 +1703,8 @@ static BOOL AttributeHasChangesOmittedQuality(MTRAttributePath * attributePath)
16161703
}];
16171704
}];
16181705
[_asyncWorkQueue enqueueWorkItem:workItem descriptionWithFormat:@"read %@ 0x%llx 0x%llx", endpointID, clusterID.unsignedLongLongValue, attributeID.unsignedLongLongValue];
1706+
} else {
1707+
[self _readThroughSkipped];
16191708
}
16201709

16211710
return attributeValueToReturn;

0 commit comments

Comments
 (0)