Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DNM: Fix Fail-Safe after reboot #559

Draft
wants to merge 7 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
116 changes: 116 additions & 0 deletions src/app/FailSafeContext.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@
#if CHIP_CONFIG_ENABLE_ICD_SERVER
#include <app/icd/server/ICDNotifier.h> // nogncheck
#endif
#include <lib/core/TLV.h>
#include <lib/support/DefaultStorageKeyAllocator.h>
#include <lib/support/SafeInt.h>
#include <platform/CHIPDeviceConfig.h>
#include <platform/ConnectivityManager.h>
Expand All @@ -34,6 +36,66 @@ using namespace chip::DeviceLayer;
namespace chip {
namespace app {

namespace {

// Tags for AddNOCStartedMarker storage
constexpr TLV::Tag kAddNOCStartedMarkerFabricIndexTag = TLV::ContextTag(0);

constexpr size_t AddNOCStartedMarkerContextTLVMaxSize()
{
// Add 2x uncommitted uint64_t to leave space for backwards/forwards
// versioning for this critical feature that runs at boot.
return TLV::EstimateStructOverhead(sizeof(FabricIndex), sizeof(uint64_t), sizeof(uint64_t));
}

} // namespace

CHIP_ERROR FailSafeContext::Init(const InitParams & initParams)
{
VerifyOrReturnError(initParams.storage != nullptr, CHIP_ERROR_INVALID_ARGUMENT);

mStorage = initParams.storage;

return CHIP_NO_ERROR;
}

void FailSafeContext::CheckAddNOCStartedMarker()
{
AddNOCStartedMarker marker;

CHIP_ERROR err = GetAddNOCStartedMarker(marker);

if (err == CHIP_NO_ERROR)
{
// This should not be possible at this point
VerifyOrDie(IsFailSafeArmed() == false);

// Fail-Safe may be busy due to cleanup scheduled by failed commit to FabricTable.
// We can ignore it here, AddNOCStartedMarker will be deleted when Fail-Safe is disarmed.
if (IsFailSafeBusy())
{
return;
}

// Found a marker! We need to trigger a cleanup.
ChipLogError(FailSafe, "Found a Fail-Safe marker for index 0x%x, preparing cleanup!",
static_cast<unsigned>(marker.fabricIndex));

// Fake arm Fail-Safe and trigger timer expiry.
// We handle only the case when new fabric is added. FabricTable CommitMarker
// is responsible for guarding the case of updating the existing fabric.
SetFailSafeArmed(true);
mFabricIndex = marker.fabricIndex;
mAddNocCommandHasBeenInvoked = true;
ForceFailSafeTimerExpiry();
}
else if (err != CHIP_ERROR_PERSISTED_STORAGE_VALUE_NOT_FOUND)
{
// Got an error, but somehow value is not missing altogether: inconsistent state but touch nothing.
ChipLogError(FailSafe, "Error loading Fail-Safe marker: %" CHIP_ERROR_FORMAT ", hope for the best!", err.Format());
}
}

void FailSafeContext::HandleArmFailSafeTimer(System::Layer * layer, void * aAppState)
{
FailSafeContext * failSafeContext = reinterpret_cast<FailSafeContext *>(aAppState);
Expand Down Expand Up @@ -152,5 +214,59 @@ void FailSafeContext::ForceFailSafeTimerExpiry()
FailSafeTimerExpired();
}

CHIP_ERROR FailSafeContext::GetAddNOCStartedMarker(AddNOCStartedMarker & outMarker)
{
VerifyOrReturnError(mStorage != nullptr, CHIP_ERROR_INCORRECT_STATE);

uint8_t tlvBuf[AddNOCStartedMarkerContextTLVMaxSize()];
uint16_t tlvSize = sizeof(tlvBuf);

ReturnErrorOnFailure(
mStorage->SyncGetKeyValue(DefaultStorageKeyAllocator::FailSafeAddNOCStartedMarkerKey().KeyName(), tlvBuf, tlvSize));

// If buffer was too small, we won't reach here.
TLV::ContiguousBufferTLVReader reader;
reader.Init(tlvBuf, tlvSize);
ReturnErrorOnFailure(reader.Next(TLV::kTLVType_Structure, TLV::AnonymousTag()));

TLV::TLVType containerType;
ReturnErrorOnFailure(reader.EnterContainer(containerType));

ReturnErrorOnFailure(reader.Next(kAddNOCStartedMarkerFabricIndexTag));
ReturnErrorOnFailure(reader.Get(outMarker.fabricIndex));

// Don't try to exit container: we got all we needed. This allows us to
// avoid erroring-out on newer versions.

return CHIP_NO_ERROR;
}

CHIP_ERROR FailSafeContext::StoreAddNOCStartedMarker(const AddNOCStartedMarker & marker)
{
VerifyOrReturnError(mStorage != nullptr, CHIP_ERROR_INCORRECT_STATE);

uint8_t tlvBuf[AddNOCStartedMarkerContextTLVMaxSize()];
TLV::TLVWriter writer;
writer.Init(tlvBuf);

TLV::TLVType outerType;
ReturnErrorOnFailure(writer.StartContainer(TLV::AnonymousTag(), TLV::kTLVType_Structure, outerType));
ReturnErrorOnFailure(writer.Put(kAddNOCStartedMarkerFabricIndexTag, marker.fabricIndex));
ReturnErrorOnFailure(writer.EndContainer(outerType));

const auto markerContextTLVLength = writer.GetLengthWritten();
VerifyOrReturnError(CanCastTo<uint16_t>(markerContextTLVLength), CHIP_ERROR_BUFFER_TOO_SMALL);

return mStorage->SyncSetKeyValue(DefaultStorageKeyAllocator::FailSafeAddNOCStartedMarkerKey().KeyName(), tlvBuf,
static_cast<uint16_t>(markerContextTLVLength));
}

void FailSafeContext::ClearAddNOCStartedMarker()
{
VerifyOrDie(mStorage != nullptr);

mStorage->SyncDeleteKeyValue(DefaultStorageKeyAllocator::FailSafeAddNOCStartedMarkerKey().KeyName());
}

} // namespace app
} // namespace chip
38 changes: 36 additions & 2 deletions src/app/FailSafeContext.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,17 +23,27 @@

#pragma once

#include "lib/core/CHIPPersistentStorageDelegate.h"
#include <lib/core/CHIPError.h>
#include <lib/core/DataModelTypes.h>
#include <platform/internal/CHIPDeviceLayerInternal.h>
#include <system/SystemClock.h>

namespace chip {
namespace app {

class FailSafeContext
{
public:
struct InitParams
{
// PersistentStorageDelegate for marker storage (MANDATORY).
PersistentStorageDelegate * storage = nullptr;
};

CHIP_ERROR Init(const InitParams & initParams);

void CheckAddNOCStartedMarker();

// ===== Members for internal use by other Device Layer components.

/**
Expand All @@ -48,11 +58,21 @@ class FailSafeContext
* @brief Cleanly disarm failsafe timer, such as on CommissioningComplete
*/
void DisarmFailSafe();

bool SetAddNocCommandStarted(FabricIndex nocFabricIndex)
{
mFabricIndex = nocFabricIndex;

AddNOCStartedMarker marker{ mFabricIndex };
return StoreAddNOCStartedMarker(marker) == CHIP_NO_ERROR;
}

void SetAddNocCommandInvoked(FabricIndex nocFabricIndex)
{
mAddNocCommandHasBeenInvoked = true;
mFabricIndex = nocFabricIndex;
}

void SetUpdateNocCommandInvoked() { mUpdateNocCommandHasBeenInvoked = true; }
void SetAddTrustedRootCertInvoked() { mAddTrustedRootCertHasBeenInvoked = true; }
void SetCsrRequestForUpdateNoc(bool isForUpdateNoc) { mIsCsrRequestForUpdateNoc = isForUpdateNoc; }
Expand Down Expand Up @@ -103,6 +123,16 @@ class FailSafeContext
void ForceFailSafeTimerExpiry();

private:
// Stored to indicate a Fail-Safe is in armed, so that clean-up cana run on next boot
// if device is reset e.g. during commissioning.
struct AddNOCStartedMarker
{
AddNOCStartedMarker() = default;
AddNOCStartedMarker(FabricIndex fabricIndex_) : fabricIndex{ fabricIndex_ } {}
FabricIndex fabricIndex = kUndefinedFabricIndex;
};

PersistentStorageDelegate * mStorage = nullptr;
bool mFailSafeArmed = false;
bool mFailSafeBusy = false;
bool mAddNocCommandHasBeenInvoked = false;
Expand Down Expand Up @@ -145,10 +175,14 @@ class FailSafeContext
mAddTrustedRootCertHasBeenInvoked = false;
mFailSafeBusy = false;
mIsCsrRequestForUpdateNoc = false;

ClearAddNOCStartedMarker();
}

void FailSafeTimerExpired();
CHIP_ERROR CommitToStorage();
CHIP_ERROR GetAddNOCStartedMarker(AddNOCStartedMarker & outMarker);
CHIP_ERROR StoreAddNOCStartedMarker(const AddNOCStartedMarker & marker);
void ClearAddNOCStartedMarker();
};

} // namespace app
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -650,6 +650,9 @@ bool emberAfOperationalCredentialsClusterAddNOCCallback(app::CommandHandler * co
&newFabricIndex);
VerifyOrExit(err == CHIP_NO_ERROR, nocResponse = ConvertToNOCResponseStatus(err));

// Inform FailSafeContext about starting adding NOC for specified fabric
VerifyOrExit(failSafeContext.SetAddNocCommandStarted(newFabricIndex), nonDefaultStatus = Status::Failure);

// From here if we error-out, we should revert the fabric table pending updates
needRevert = true;

Expand Down
12 changes: 12 additions & 0 deletions src/app/server/Server.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,14 @@ CHIP_ERROR Server::Init(const ServerInitParams & initParams)
SuccessOrExit(err);
}

{
app::FailSafeContext::InitParams failSafeInitParams;
failSafeInitParams.storage = mDeviceStorage;

err = mFailSafeContext.Init(failSafeInitParams);
SuccessOrExit(err);
}

SuccessOrExit(err = mAccessControl.Init(initParams.accessDelegate, sDeviceTypeResolver));
Access::SetAccessControl(mAccessControl);

Expand Down Expand Up @@ -437,6 +445,10 @@ CHIP_ERROR Server::Init(const ServerInitParams & initParams)
}
}

// Run fail-safe check for marker. If marker is present, then device was reset while fail-safe was armed
// and we need to trigger a cleanup.
GetFailSafeContext().CheckAddNOCStartedMarker();

#if CHIP_DEVICE_CONFIG_ENABLE_COMMISSIONER_DISCOVERY_CLIENT // support UDC port for commissioner declaration msgs
mUdcTransportMgr = chip::Platform::New<UdcTransportMgr>();
ReturnErrorOnFailure(mUdcTransportMgr->Init(Transport::UdpListenParameters(DeviceLayer::UDPEndPointManager())
Expand Down
62 changes: 61 additions & 1 deletion src/app/tests/TestFailSafeContext.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@
#include <lib/core/StringBuilderAdapters.h>
#include <lib/support/CHIPMem.h>
#include <lib/support/CodeUtils.h>
#include <lib/support/DefaultStorageKeyAllocator.h>
#include <lib/support/TestPersistentStorageDelegate.h>
#include <platform/CHIPDeviceLayer.h>

using namespace chip;
Expand Down Expand Up @@ -66,35 +68,93 @@ class TestFailSafeContext : public ::testing::Test

TEST_F(TestFailSafeContext, TestFailSafeContext_ArmFailSafe)
{
chip::TestPersistentStorageDelegate testStorage;
chip::app::FailSafeContext failSafeContext;

failSafeContext.Init({ &testStorage });

EXPECT_EQ(failSafeContext.ArmFailSafe(kTestAccessingFabricIndex1, System::Clock::Seconds16(1)), CHIP_NO_ERROR);
EXPECT_TRUE(failSafeContext.IsFailSafeArmed());
EXPECT_EQ(failSafeContext.GetFabricIndex(), kTestAccessingFabricIndex1);
EXPECT_TRUE(failSafeContext.IsFailSafeArmed(kTestAccessingFabricIndex1));
EXPECT_FALSE(failSafeContext.IsFailSafeArmed(kTestAccessingFabricIndex2));
EXPECT_FALSE(testStorage.SyncDoesKeyExist(DefaultStorageKeyAllocator::FailSafeAddNOCStartedMarkerKey().KeyName()));

failSafeContext.DisarmFailSafe();
EXPECT_FALSE(failSafeContext.IsFailSafeArmed());
EXPECT_FALSE(testStorage.SyncDoesKeyExist(DefaultStorageKeyAllocator::FailSafeAddNOCStartedMarkerKey().KeyName()));
}

TEST_F(TestFailSafeContext, TestFailSafeContext_NocCommandInvoked)
TEST_F(TestFailSafeContext, TestFailSafeContext_AddNocCommandInvoked)
{
chip::TestPersistentStorageDelegate testStorage;
chip::app::FailSafeContext failSafeContext;

failSafeContext.Init({ &testStorage });

EXPECT_EQ(failSafeContext.ArmFailSafe(kTestAccessingFabricIndex1, System::Clock::Seconds16(1)), CHIP_NO_ERROR);
EXPECT_EQ(failSafeContext.GetFabricIndex(), kTestAccessingFabricIndex1);
EXPECT_FALSE(testStorage.SyncDoesKeyExist(DefaultStorageKeyAllocator::FailSafeAddNOCStartedMarkerKey().KeyName()));

failSafeContext.SetAddNocCommandStarted(kTestAccessingFabricIndex2);
EXPECT_EQ(failSafeContext.GetFabricIndex(), kTestAccessingFabricIndex2);
EXPECT_TRUE(testStorage.SyncDoesKeyExist(DefaultStorageKeyAllocator::FailSafeAddNOCStartedMarkerKey().KeyName()));

failSafeContext.SetAddNocCommandInvoked(kTestAccessingFabricIndex2);
EXPECT_TRUE(failSafeContext.NocCommandHasBeenInvoked());
EXPECT_TRUE(failSafeContext.AddNocCommandHasBeenInvoked());
EXPECT_EQ(failSafeContext.GetFabricIndex(), kTestAccessingFabricIndex2);

failSafeContext.DisarmFailSafe();
EXPECT_FALSE(testStorage.SyncDoesKeyExist(DefaultStorageKeyAllocator::FailSafeAddNOCStartedMarkerKey().KeyName()));
}

TEST_F(TestFailSafeContext, TestFailSafeContext_UpdateNocCommandInvoked)
{
chip::TestPersistentStorageDelegate testStorage;
chip::app::FailSafeContext failSafeContext;

failSafeContext.Init({ &testStorage });

EXPECT_EQ(failSafeContext.ArmFailSafe(kTestAccessingFabricIndex1, System::Clock::Seconds16(1)), CHIP_NO_ERROR);
EXPECT_EQ(failSafeContext.GetFabricIndex(), kTestAccessingFabricIndex1);
EXPECT_FALSE(testStorage.SyncDoesKeyExist(DefaultStorageKeyAllocator::FailSafeAddNOCStartedMarkerKey().KeyName()));

failSafeContext.SetUpdateNocCommandInvoked();
EXPECT_TRUE(failSafeContext.NocCommandHasBeenInvoked());
EXPECT_TRUE(failSafeContext.UpdateNocCommandHasBeenInvoked());
EXPECT_FALSE(testStorage.SyncDoesKeyExist(DefaultStorageKeyAllocator::FailSafeAddNOCStartedMarkerKey().KeyName()));

failSafeContext.DisarmFailSafe();
EXPECT_FALSE(testStorage.SyncDoesKeyExist(DefaultStorageKeyAllocator::FailSafeAddNOCStartedMarkerKey().KeyName()));
}

TEST_F(TestFailSafeContext, TestFailSafeContext_NocCommandInvokedTimerExpiry)
{
chip::TestPersistentStorageDelegate testStorage;
chip::app::FailSafeContext failSafeContext;

failSafeContext.Init({ &testStorage });

EXPECT_EQ(failSafeContext.ArmFailSafe(kTestAccessingFabricIndex1, System::Clock::Seconds16(1)), CHIP_NO_ERROR);
EXPECT_EQ(failSafeContext.GetFabricIndex(), kTestAccessingFabricIndex1);
EXPECT_FALSE(testStorage.SyncDoesKeyExist(DefaultStorageKeyAllocator::FailSafeAddNOCStartedMarkerKey().KeyName()));

failSafeContext.SetAddNocCommandStarted(kTestAccessingFabricIndex2);
EXPECT_EQ(failSafeContext.GetFabricIndex(), kTestAccessingFabricIndex2);
EXPECT_TRUE(testStorage.SyncDoesKeyExist(DefaultStorageKeyAllocator::FailSafeAddNOCStartedMarkerKey().KeyName()));

failSafeContext.SetAddNocCommandInvoked(kTestAccessingFabricIndex2);
EXPECT_TRUE(failSafeContext.NocCommandHasBeenInvoked());
EXPECT_TRUE(failSafeContext.AddNocCommandHasBeenInvoked());
EXPECT_EQ(failSafeContext.GetFabricIndex(), kTestAccessingFabricIndex2);

failSafeContext.ForceFailSafeTimerExpiry();

PlatformMgr().ScheduleWork([](intptr_t) -> void { PlatformMgr().StopEventLoopTask(); }, (intptr_t) nullptr);
PlatformMgr().RunEventLoop();

EXPECT_FALSE(testStorage.SyncDoesKeyExist(DefaultStorageKeyAllocator::FailSafeAddNOCStartedMarkerKey().KeyName()));
}

} // namespace
3 changes: 3 additions & 0 deletions src/app/tests/TestThreadBorderRouterManagementCluster.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
#include <lib/support/BitFlags.h>
#include <lib/support/CHIPMem.h>
#include <lib/support/Span.h>
#include <lib/support/TestPersistentStorageDelegate.h>
#include <lib/support/ThreadOperationalDataset.h>
#include <lib/support/tests/ExtraPwTestMacros.h>
#include <optional>
Expand Down Expand Up @@ -154,6 +155,7 @@ class TestDelegate : public Delegate

constexpr EndpointId kTestEndpointId = 1;
constexpr FabricIndex kTestAccessingFabricIndex = 1;
static chip::TestPersistentStorageDelegate sTestStorage;
static FailSafeContext sTestFailsafeContext;
static TestDelegate sTestDelegate;
static ServerInstance sTestSeverInstance(kTestEndpointId, &sTestDelegate, sTestFailsafeContext);
Expand Down Expand Up @@ -211,6 +213,7 @@ class TestThreadBorderRouterManagementCluster : public ::testing::Test
{
ASSERT_EQ(Platform::MemoryInit(), CHIP_NO_ERROR);
ASSERT_EQ(DeviceLayer::PlatformMgr().InitChipStack(), CHIP_NO_ERROR);
ASSERT_EQ(sTestFailsafeContext.Init({ &sTestStorage }), CHIP_NO_ERROR);
}

static void TearDownTestSuite()
Expand Down
Loading
Loading