From e72c0162300de5e2ec74cdd1eca76e3bf88e58c4 Mon Sep 17 00:00:00 2001 From: Billy Laws Date: Mon, 18 Nov 2024 22:00:59 +0000 Subject: [PATCH 1/6] Core: Split blocks on invalid instructions --- FEXCore/Source/Interface/Core/Core.cpp | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/FEXCore/Source/Interface/Core/Core.cpp b/FEXCore/Source/Interface/Core/Core.cpp index 9e965546e3..56f3eb953d 100644 --- a/FEXCore/Source/Interface/Core/Core.cpp +++ b/FEXCore/Source/Interface/Core/Core.cpp @@ -555,6 +555,7 @@ ContextImpl::GenerateIR(FEXCore::Core::InternalThreadState* Thread, uint64_t Gue GuestCode = reinterpret_cast(GuestRIP); bool HadDispatchError {false}; + bool HadInvalidInst {false}; Thread->FrontendDecoder->DecodeInstructionsAtEntry(GuestCode, GuestRIP, MaxInst, [Thread](uint64_t BlockEntry, uint64_t Start, uint64_t Length) { @@ -652,16 +653,23 @@ ContextImpl::GenerateIR(FEXCore::Core::InternalThreadState* Thread, uint64_t Gue ++TotalInstructions; } } else { - if (TableInfo) { - LogMan::Msg::EFmt("Invalid or Unknown instruction: {} 0x{:x}", TableInfo->Name ?: "UND", Block.Entry - GuestRIP); - } // Invalid instruction - Thread->OpDispatcher->InvalidOp(DecodedInfo); - Thread->OpDispatcher->ExitFunction(Thread->OpDispatcher->_EntrypointOffset(GPRSize, Block.Entry - GuestRIP)); + if (!BlockInstructionsLength) { + // SMC can modify block contents and patch invalid instructions to valid ones inline. + // End blocks upon encountering them and only emit an invalid opcode exception if there are no prior instructions in the block (that could have modified it to be valid). + + if (TableInfo) { + LogMan::Msg::EFmt("Invalid or Unknown instruction: {} 0x{:x}", TableInfo->Name ?: "UND", Block.Entry - GuestRIP); + } + + Thread->OpDispatcher->InvalidOp(DecodedInfo); + } + + HadInvalidInst = true; } - const bool NeedsBlockEnd = - (HadDispatchError && TotalInstructions > 0) || (Thread->OpDispatcher->NeedsBlockEnder() && i + 1 == InstsInBlock); + const bool NeedsBlockEnd = (HadDispatchError && TotalInstructions > 0) || + (Thread->OpDispatcher->NeedsBlockEnder() && i + 1 == InstsInBlock) || HadInvalidInst; // If we had a dispatch error then leave early if (HadDispatchError && TotalInstructions == 0) { From 5337b9537d47ec23de570f479e92fa52b86b08ea Mon Sep 17 00:00:00 2001 From: Billy Laws Date: Mon, 18 Nov 2024 22:05:49 +0000 Subject: [PATCH 2/6] FEXCore: Expose an API to query intersection with the current block Frontends need to detect this in order to handle SMC within the current block (inline SMC) differently to regular SMC which can just reprotect and continue. 
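For reference, the query this patch adds reduces to a half-open interval overlap test between the faulting write [Address, Address + Size) and the guest range [RIP, RIP + GuestSize) covered by the currently executing block. A minimal standalone sketch of that test (illustrative C++ only; the function name is a placeholder, not part of the FEXCore API):

  #include <cstdint>

  // True when the written byte range [Address, Address + Size) overlaps the guest
  // code range [BlockRIP, BlockRIP + BlockGuestSize) of the currently executing block.
  static bool WriteHitsCurrentBlock(uint64_t Address, uint64_t Size, uint64_t BlockRIP, uint64_t BlockGuestSize) {
    return Address + Size > BlockRIP && Address < BlockRIP + BlockGuestSize;
  }
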
--- FEXCore/Source/Interface/Context/Context.h | 2 ++ FEXCore/Source/Interface/Core/CPUBackend.h | 7 +++++- FEXCore/Source/Interface/Core/Core.cpp | 26 +++++++++++++++++--- FEXCore/Source/Interface/Core/JIT/JIT.cpp | 6 +++-- FEXCore/Source/Interface/Core/JIT/JITClass.h | 2 +- FEXCore/include/FEXCore/Core/Context.h | 2 ++ 6 files changed, 38 insertions(+), 7 deletions(-) diff --git a/FEXCore/Source/Interface/Context/Context.h b/FEXCore/Source/Interface/Context/Context.h index b90375a92a..6cfc80d6a9 100644 --- a/FEXCore/Source/Interface/Context/Context.h +++ b/FEXCore/Source/Interface/Context/Context.h @@ -88,6 +88,8 @@ class ContextImpl final : public FEXCore::Context::Context { void HandleCallback(FEXCore::Core::InternalThreadState* Thread, uint64_t RIP) override; + bool IsAddressInCurrentBlock(FEXCore::Core::InternalThreadState* Thread, uint64_t Address, uint64_t Size) override; + uint64_t RestoreRIPFromHostPC(FEXCore::Core::InternalThreadState* Thread, uint64_t HostPC) override; uint32_t ReconstructCompactedEFLAGS(FEXCore::Core::InternalThreadState* Thread, bool WasInJIT, const uint64_t* HostGPRs, uint64_t PSTATE) override; void SetFlagsFromCompactedEFLAGS(FEXCore::Core::InternalThreadState* Thread, uint32_t EFLAGS) override; diff --git a/FEXCore/Source/Interface/Core/CPUBackend.h b/FEXCore/Source/Interface/Core/CPUBackend.h index 188851cd3c..f8a51008d4 100644 --- a/FEXCore/Source/Interface/Core/CPUBackend.h +++ b/FEXCore/Source/Interface/Core/CPUBackend.h @@ -80,9 +80,13 @@ namespace CPU { struct JITCodeTail { // The total size of the codeblock from [BlockBegin, BlockBegin+Size). size_t Size; + // RIP that the block's entry comes from. uint64_t RIP; + // The length of the guest code for this block. + size_t GuestSize; + // Number of RIP entries for this JIT Code section. uint32_t NumberOfRIPEntries; @@ -119,6 +123,7 @@ namespace CPU { * * This is a thread specific compilation unit since there is one CPUBackend per guest thread * + * @param Size - The byte size of the guest code for this block * @param IR - IR that maps to the IR for this RIP * @param DebugData - Debug data that is available for this IR indirectly * @param CheckTF - If EFLAGS.TF checks should be emitted at the start of the block @@ -126,7 +131,7 @@ namespace CPU { * @return Information about the compiled code block. 
*/ [[nodiscard]] - virtual CompiledCode CompileCode(uint64_t Entry, const FEXCore::IR::IRListView* IR, FEXCore::Core::DebugData* DebugData, + virtual CompiledCode CompileCode(uint64_t Entry, uint64_t Size, const FEXCore::IR::IRListView* IR, FEXCore::Core::DebugData* DebugData, const FEXCore::IR::RegisterAllocationData* RAData, bool CheckTF) = 0; /** diff --git a/FEXCore/Source/Interface/Core/Core.cpp b/FEXCore/Source/Interface/Core/Core.cpp index 56f3eb953d..1305ee7cc7 100644 --- a/FEXCore/Source/Interface/Core/Core.cpp +++ b/FEXCore/Source/Interface/Core/Core.cpp @@ -112,13 +112,33 @@ ContextImpl::~ContextImpl() { } } -uint64_t ContextImpl::RestoreRIPFromHostPC(FEXCore::Core::InternalThreadState* Thread, uint64_t HostPC) { - const auto Frame = Thread->CurrentFrame; +struct GetFrameBlockInfoResult { + const CPU::CPUBackend::JITCodeHeader* InlineHeader; + const CPU::CPUBackend::JITCodeTail* InlineTail; +}; +static GetFrameBlockInfoResult GetFrameBlockInfo(FEXCore::Core::CpuStateFrame* Frame) { const uint64_t BlockBegin = Frame->State.InlineJITBlockHeader; auto InlineHeader = reinterpret_cast(BlockBegin); if (InlineHeader) { auto InlineTail = reinterpret_cast(Frame->State.InlineJITBlockHeader + InlineHeader->OffsetToBlockTail); + return {InlineHeader, InlineTail}; + } + + return {InlineHeader, nullptr}; +} + +bool ContextImpl::IsAddressInCurrentBlock(FEXCore::Core::InternalThreadState* Thread, uint64_t Address, uint64_t Size) { + auto [_, InlineTail] = GetFrameBlockInfo(Thread->CurrentFrame); + return InlineTail && (Address + Size > InlineTail->RIP && Address < InlineTail->RIP + InlineTail->GuestSize); +} + +uint64_t ContextImpl::RestoreRIPFromHostPC(FEXCore::Core::InternalThreadState* Thread, uint64_t HostPC) { + const auto Frame = Thread->CurrentFrame; + const uint64_t BlockBegin = Frame->State.InlineJITBlockHeader; + auto [InlineHeader, InlineTail] = GetFrameBlockInfo(Thread->CurrentFrame); + + if (InlineHeader) { auto RIPEntries = reinterpret_cast( Frame->State.InlineJITBlockHeader + InlineHeader->OffsetToBlockTail + InlineTail->OffsetToRIPEntries); @@ -794,7 +814,7 @@ ContextImpl::CompileCodeResult ContextImpl::CompileCode(FEXCore::Core::InternalT // FEX currently throws away the CPUBackend::CompiledCode object other than the entrypoint // In the future with code caching getting wired up, we will pass the rest of the data forward. // TODO: Pass the data forward when code caching is wired up to this. 
- .CompiledCode = Thread->CPUBackend->CompileCode(GuestRIP, &IRView, DebugData, IR->RAData(), TFSet).BlockEntry, + .CompiledCode = Thread->CPUBackend->CompileCode(GuestRIP, Length, &IRView, DebugData, IR->RAData(), TFSet).BlockEntry, .IR = std::move(IR), .DebugData = DebugData, .GeneratedIR = true, diff --git a/FEXCore/Source/Interface/Core/JIT/JIT.cpp b/FEXCore/Source/Interface/Core/JIT/JIT.cpp index 03fe12c674..7fbee3451d 100644 --- a/FEXCore/Source/Interface/Core/JIT/JIT.cpp +++ b/FEXCore/Source/Interface/Core/JIT/JIT.cpp @@ -720,8 +720,9 @@ void Arm64JITCore::EmitInterruptChecks(bool CheckTF) { #endif } -CPUBackend::CompiledCode Arm64JITCore::CompileCode(uint64_t Entry, const FEXCore::IR::IRListView* IR, FEXCore::Core::DebugData* DebugData, - const FEXCore::IR::RegisterAllocationData* RAData, bool CheckTF) { +CPUBackend::CompiledCode Arm64JITCore::CompileCode(uint64_t Entry, uint64_t Size, const FEXCore::IR::IRListView* IR, + FEXCore::Core::DebugData* DebugData, const FEXCore::IR::RegisterAllocationData* RAData, + bool CheckTF) { FEXCORE_PROFILE_SCOPED("Arm64::CompileCode"); JumpTargets.clear(); @@ -861,6 +862,7 @@ CPUBackend::CompiledCode Arm64JITCore::CompileCode(uint64_t Entry, const FEXCore // TODO: This needs to be a data RIP relocation once code caching works. // Current relocation code doesn't support this feature yet. JITBlockTail->RIP = Entry; + JITBlockTail->GuestSize = Size; JITBlockTail->SpinLockFutex = 0; { diff --git a/FEXCore/Source/Interface/Core/JIT/JITClass.h b/FEXCore/Source/Interface/Core/JIT/JITClass.h index 0c3dbbf8ff..330c14d289 100644 --- a/FEXCore/Source/Interface/Core/JIT/JITClass.h +++ b/FEXCore/Source/Interface/Core/JIT/JITClass.h @@ -38,7 +38,7 @@ class Arm64JITCore final : public CPUBackend, public Arm64Emitter { ~Arm64JITCore() override; [[nodiscard]] - CPUBackend::CompiledCode CompileCode(uint64_t Entry, const FEXCore::IR::IRListView* IR, FEXCore::Core::DebugData* DebugData, + CPUBackend::CompiledCode CompileCode(uint64_t Entry, uint64_t Size, const FEXCore::IR::IRListView* IR, FEXCore::Core::DebugData* DebugData, const FEXCore::IR::RegisterAllocationData* RAData, bool CheckTF) override; void ClearCache() override; diff --git a/FEXCore/include/FEXCore/Core/Context.h b/FEXCore/include/FEXCore/Core/Context.h index c6859a004d..82aa03370a 100644 --- a/FEXCore/include/FEXCore/Core/Context.h +++ b/FEXCore/include/FEXCore/Core/Context.h @@ -104,6 +104,8 @@ class Context { FEX_DEFAULT_VISIBILITY virtual void HandleCallback(FEXCore::Core::InternalThreadState* Thread, uint64_t RIP) = 0; + FEX_DEFAULT_VISIBILITY virtual bool IsAddressInCurrentBlock(FEXCore::Core::InternalThreadState* Thread, uint64_t Address, uint64_t Size) = 0; + ///< State reconstruction helpers ///< Reconstructs the guest RIP from the passed in thread context and related Host PC. FEX_DEFAULT_VISIBILITY virtual uint64_t RestoreRIPFromHostPC(FEXCore::Core::InternalThreadState* Thread, uint64_t HostPC) = 0; From d5d7eec8b00ee240fd7042f65165861f82986bea Mon Sep 17 00:00:00 2001 From: Billy Laws Date: Mon, 18 Nov 2024 22:09:25 +0000 Subject: [PATCH 3/6] FEXCore: Expose an API to check if the current block represents a single guest instruction Single instruction blocks need to be treated specially when inline SMC is detected: the frontend only needs to reprotect RWX, invalidate caches and then continue execution, as side effects from the SMC shouldn't be seen until the instruction executes.
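Taken together with the previous patch, the intended frontend policy can be sketched roughly as follows (illustrative C++ only; HandleSMCWriteFault and the two helpers it calls are hypothetical stand-ins for frontend-specific code, not functions introduced by this series):

  // Hypothetical decision tree for an SMC write fault, built on the two new Context queries.
  void HandleSMCWriteFault(FEXCore::Context::Context* CTX, FEXCore::Core::InternalThreadState* Thread,
                           uint64_t FaultAddress, uint64_t MaxWriteSize) {
    if (CTX->IsAddressInCurrentBlock(Thread, FaultAddress, MaxWriteSize) && !CTX->IsCurrentBlockSingleInst(Thread)) {
      // Inline SMC: the write lands inside the multi-instruction block that is currently
      // executing, so it first has to be reduced to a single-instruction block.
      ForceSingleInstructionStep(Thread);           // hypothetical frontend-specific helper
    } else {
      // Regular SMC, or a single-instruction block patching itself: reprotect the page,
      // invalidate the stale code and continue execution.
      ReprotectAndInvalidate(Thread, FaultAddress); // hypothetical frontend-specific helper
    }
  }
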
--- FEXCore/Source/Interface/Context/Context.h | 1 + FEXCore/Source/Interface/Core/CPUBackend.h | 8 ++++++-- FEXCore/Source/Interface/Core/Core.cpp | 11 +++++++++-- FEXCore/Source/Interface/Core/JIT/JIT.cpp | 3 ++- FEXCore/Source/Interface/Core/JIT/JITClass.h | 5 +++-- FEXCore/include/FEXCore/Core/Context.h | 1 + 6 files changed, 22 insertions(+), 7 deletions(-) diff --git a/FEXCore/Source/Interface/Context/Context.h b/FEXCore/Source/Interface/Context/Context.h index 6cfc80d6a9..0402b3642c 100644 --- a/FEXCore/Source/Interface/Context/Context.h +++ b/FEXCore/Source/Interface/Context/Context.h @@ -89,6 +89,7 @@ class ContextImpl final : public FEXCore::Context::Context { void HandleCallback(FEXCore::Core::InternalThreadState* Thread, uint64_t RIP) override; bool IsAddressInCurrentBlock(FEXCore::Core::InternalThreadState* Thread, uint64_t Address, uint64_t Size) override; + bool IsCurrentBlockSingleInst(FEXCore::Core::InternalThreadState* Thread) override; uint64_t RestoreRIPFromHostPC(FEXCore::Core::InternalThreadState* Thread, uint64_t HostPC) override; uint32_t ReconstructCompactedEFLAGS(FEXCore::Core::InternalThreadState* Thread, bool WasInJIT, const uint64_t* HostGPRs, uint64_t PSTATE) override; diff --git a/FEXCore/Source/Interface/Core/CPUBackend.h b/FEXCore/Source/Interface/Core/CPUBackend.h index f8a51008d4..c7265dc5a4 100644 --- a/FEXCore/Source/Interface/Core/CPUBackend.h +++ b/FEXCore/Source/Interface/Core/CPUBackend.h @@ -87,6 +87,9 @@ namespace CPU { // The length of the guest code for this block. size_t GuestSize; + // If this block represents a single guest instruction. + bool SingleInst; + // Number of RIP entries for this JIT Code section. uint32_t NumberOfRIPEntries; @@ -124,6 +127,7 @@ namespace CPU { * This is a thread specific compilation unit since there is one CPUBackend per guest thread * * @param Size - The byte size of the guest code for this block + * @param SingleInst - If this block represents a single guest instruction * @param IR - IR that maps to the IR for this RIP * @param DebugData - Debug data that is available for this IR indirectly * @param CheckTF - If EFLAGS.TF checks should be emitted at the start of the block @@ -131,8 +135,8 @@ namespace CPU { * @return Information about the compiled code block. 
*/ [[nodiscard]] - virtual CompiledCode CompileCode(uint64_t Entry, uint64_t Size, const FEXCore::IR::IRListView* IR, FEXCore::Core::DebugData* DebugData, - const FEXCore::IR::RegisterAllocationData* RAData, bool CheckTF) = 0; + virtual CompiledCode CompileCode(uint64_t Entry, uint64_t Size, bool SingleInst, const FEXCore::IR::IRListView* IR, + FEXCore::Core::DebugData* DebugData, const FEXCore::IR::RegisterAllocationData* RAData, bool CheckTF) = 0; /** * @brief Relocates a block of code from the JIT code object cache diff --git a/FEXCore/Source/Interface/Core/Core.cpp b/FEXCore/Source/Interface/Core/Core.cpp index 1305ee7cc7..9277a766c7 100644 --- a/FEXCore/Source/Interface/Core/Core.cpp +++ b/FEXCore/Source/Interface/Core/Core.cpp @@ -133,6 +133,11 @@ bool ContextImpl::IsAddressInCurrentBlock(FEXCore::Core::InternalThreadState* Th return InlineTail && (Address + Size > InlineTail->RIP && Address < InlineTail->RIP + InlineTail->GuestSize); } +bool ContextImpl::IsCurrentBlockSingleInst(FEXCore::Core::InternalThreadState* Thread) { + auto [_, InlineTail] = GetFrameBlockInfo(Thread->CurrentFrame); + return InlineTail && InlineTail->SingleInst; +} + uint64_t ContextImpl::RestoreRIPFromHostPC(FEXCore::Core::InternalThreadState* Thread, uint64_t HostPC) { const auto Frame = Thread->CurrentFrame; const uint64_t BlockBegin = Frame->State.InlineJITBlockHeader; @@ -775,6 +780,7 @@ ContextImpl::CompileCodeResult ContextImpl::CompileCode(FEXCore::Core::InternalT fextl::unique_ptr IR; FEXCore::Core::DebugData* DebugData {}; + uint64_t TotalInstructions {}; uint64_t StartAddr {}; uint64_t Length {}; @@ -792,11 +798,12 @@ ContextImpl::CompileCodeResult ContextImpl::CompileCode(FEXCore::Core::InternalT if (!IR) { // Generate IR + Meta Info - auto [IRCopy, TotalInstructions, TotalInstructionsLength, _StartAddr, _Length] = GenerateIR(Thread, GuestRIP, Config.GDBSymbols(), MaxInst); + auto [IRCopy, _TotalInstructions, TotalInstructionsLength, _StartAddr, _Length] = GenerateIR(Thread, GuestRIP, Config.GDBSymbols(), MaxInst); // Setup pointers to internal structures IR = std::move(IRCopy); DebugData = new FEXCore::Core::DebugData(); + TotalInstructions = _TotalInstructions; StartAddr = _StartAddr; Length = _Length; } @@ -814,7 +821,7 @@ ContextImpl::CompileCodeResult ContextImpl::CompileCode(FEXCore::Core::InternalT // FEX currently throws away the CPUBackend::CompiledCode object other than the entrypoint // In the future with code caching getting wired up, we will pass the rest of the data forward. // TODO: Pass the data forward when code caching is wired up to this. 
- .CompiledCode = Thread->CPUBackend->CompileCode(GuestRIP, Length, &IRView, DebugData, IR->RAData(), TFSet).BlockEntry, + .CompiledCode = Thread->CPUBackend->CompileCode(GuestRIP, Length, TotalInstructions == 1, &IRView, DebugData, IR->RAData(), TFSet).BlockEntry, .IR = std::move(IR), .DebugData = DebugData, .GeneratedIR = true, diff --git a/FEXCore/Source/Interface/Core/JIT/JIT.cpp b/FEXCore/Source/Interface/Core/JIT/JIT.cpp index 7fbee3451d..5f5d7580e2 100644 --- a/FEXCore/Source/Interface/Core/JIT/JIT.cpp +++ b/FEXCore/Source/Interface/Core/JIT/JIT.cpp @@ -720,7 +720,7 @@ void Arm64JITCore::EmitInterruptChecks(bool CheckTF) { #endif } -CPUBackend::CompiledCode Arm64JITCore::CompileCode(uint64_t Entry, uint64_t Size, const FEXCore::IR::IRListView* IR, +CPUBackend::CompiledCode Arm64JITCore::CompileCode(uint64_t Entry, uint64_t Size, bool SingleInst, const FEXCore::IR::IRListView* IR, FEXCore::Core::DebugData* DebugData, const FEXCore::IR::RegisterAllocationData* RAData, bool CheckTF) { FEXCORE_PROFILE_SCOPED("Arm64::CompileCode"); @@ -863,6 +863,7 @@ CPUBackend::CompiledCode Arm64JITCore::CompileCode(uint64_t Entry, uint64_t Size // Current relocation code doesn't support this feature yet. JITBlockTail->RIP = Entry; JITBlockTail->GuestSize = Size; + JITBlockTail->SingleInst = SingleInst; JITBlockTail->SpinLockFutex = 0; { diff --git a/FEXCore/Source/Interface/Core/JIT/JITClass.h b/FEXCore/Source/Interface/Core/JIT/JITClass.h index 330c14d289..f1f4477aa9 100644 --- a/FEXCore/Source/Interface/Core/JIT/JITClass.h +++ b/FEXCore/Source/Interface/Core/JIT/JITClass.h @@ -38,8 +38,9 @@ class Arm64JITCore final : public CPUBackend, public Arm64Emitter { ~Arm64JITCore() override; [[nodiscard]] - CPUBackend::CompiledCode CompileCode(uint64_t Entry, uint64_t Size, const FEXCore::IR::IRListView* IR, FEXCore::Core::DebugData* DebugData, - const FEXCore::IR::RegisterAllocationData* RAData, bool CheckTF) override; + CPUBackend::CompiledCode + CompileCode(uint64_t Entry, uint64_t Size, bool SingleInst, const FEXCore::IR::IRListView* IR, FEXCore::Core::DebugData* DebugData, + const FEXCore::IR::RegisterAllocationData* RAData, bool CheckTF) override; void ClearCache() override; diff --git a/FEXCore/include/FEXCore/Core/Context.h b/FEXCore/include/FEXCore/Core/Context.h index 82aa03370a..e807107018 100644 --- a/FEXCore/include/FEXCore/Core/Context.h +++ b/FEXCore/include/FEXCore/Core/Context.h @@ -105,6 +105,7 @@ class Context { FEX_DEFAULT_VISIBILITY virtual void HandleCallback(FEXCore::Core::InternalThreadState* Thread, uint64_t RIP) = 0; FEX_DEFAULT_VISIBILITY virtual bool IsAddressInCurrentBlock(FEXCore::Core::InternalThreadState* Thread, uint64_t Address, uint64_t Size) = 0; + FEX_DEFAULT_VISIBILITY virtual bool IsCurrentBlockSingleInst(FEXCore::Core::InternalThreadState* Thread) = 0; ///< State reconstruction helpers ///< Reconstructs the guest RIP from the passed in thread context and related Host PC. 
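Both new queries go through the same lookup: CpuStateFrame::State.InlineJITBlockHeader points at the JITCodeHeader of the block currently being executed (or is zero outside JIT code), and the JITCodeTail that now carries RIP, GuestSize and SingleInst sits at a fixed offset from it. Condensed into one expression, the pointer walk used in the Core.cpp hunks above looks like this (a sketch assuming the same field layout):

  // Resolve the tail of the currently executing block, or nullptr when not inside JIT code.
  const auto* Header = reinterpret_cast<const CPU::CPUBackend::JITCodeHeader*>(Frame->State.InlineJITBlockHeader);
  const auto* Tail = Header ? reinterpret_cast<const CPU::CPUBackend::JITCodeTail*>(
                                Frame->State.InlineJITBlockHeader + Header->OffsetToBlockTail) : nullptr;
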
From 90c1282f3a188d03ee7a43eb90f0430b58401af5 Mon Sep 17 00:00:00 2001 From: Billy Laws Date: Mon, 18 Nov 2024 22:31:50 +0000 Subject: [PATCH 4/6] Dispatcher: Support forcing a temp single instr block on ARM64EC JIT entry --- FEXCore/Source/Interface/Core/ArchHelpers/Arm64Emitter.h | 3 +++ FEXCore/Source/Interface/Core/Dispatcher/Dispatcher.cpp | 4 ++++ 2 files changed, 7 insertions(+) diff --git a/FEXCore/Source/Interface/Core/ArchHelpers/Arm64Emitter.h b/FEXCore/Source/Interface/Core/ArchHelpers/Arm64Emitter.h index 2348d9cd6a..b38df6ed0d 100644 --- a/FEXCore/Source/Interface/Core/ArchHelpers/Arm64Emitter.h +++ b/FEXCore/Source/Interface/Core/ArchHelpers/Arm64Emitter.h @@ -76,6 +76,9 @@ constexpr size_t CPU_AREA_EMULATOR_STACK_BASE_OFFSET = 0x8; constexpr size_t CPU_AREA_EMULATOR_DATA_OFFSET = 0x30; #endif +// Will force one single instruction block to be generated first if set when entering the JIT filling SRA. +constexpr auto ENTRY_FILL_SRA_SINGLE_INST_REG = TMP1; + // Predicate register temporaries (used when AVX support is enabled) // PRED_TMP_16B indicates a predicate register that indicates the first 16 bytes set to 1. // PRED_TMP_32B indicates a predicate register that indicates the first 32 bytes set to 1. diff --git a/FEXCore/Source/Interface/Core/Dispatcher/Dispatcher.cpp b/FEXCore/Source/Interface/Core/Dispatcher/Dispatcher.cpp index 4157d20a25..2e0d2bfb2a 100644 --- a/FEXCore/Source/Interface/Core/Dispatcher/Dispatcher.cpp +++ b/FEXCore/Source/Interface/Core/Dispatcher/Dispatcher.cpp @@ -93,6 +93,10 @@ void Dispatcher::EmitDispatcher() { ldr(STATE, EC_ENTRY_CPUAREA_REG, CPU_AREA_EMULATOR_DATA_OFFSET); FillStaticRegs(); + ldr(RipReg, STATE_PTR(CpuStateFrame, State.rip)); + // Force a single instruction block if ENTRY_FILL_SRA_SINGLE_INST_REG is nonzero entering the JIT, used for inline SMC handling. + cbnz(ARMEmitter::Size::i32Bit, ENTRY_FILL_SRA_SINGLE_INST_REG, &CompileSingleStep); + // Enter JIT b(&LoopTop); From af1d2d600546e45b529ebb0e2cb37d5d663baa9b Mon Sep 17 00:00:00 2001 From: Billy Laws Date: Mon, 18 Nov 2024 22:32:35 +0000 Subject: [PATCH 5/6] ARM64EC: Implement inline SMC support using context reconstruction When an SMC trap happens: reconstruct the context before the SMC write, then compile the write as a single instruction block to reduce it to regular SMC. SMC where the writing instruction is itself the instruction being patched will hit the signal handler at most twice: the first hit triggers the write to be compiled as a single instruction block, and the second detects inline SMC of a single instruction block and just takes the usual invalidate+reprotect+continue step, avoiding a potential infinite loop of recompilation.
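In handler terms, the reduction step looks roughly like the following (a condensed view of the Module.cpp hunk below with comments added, not a drop-in replacement for it):

  // On an RWX fault whose target lies inside the currently executing multi-instruction block:
  Exception::ReconstructThreadState(CPUArea.ThreadState(), *NativeContext); // roll guest state back to the RIP of the writing instruction
  NativeContext->Pc = CPUArea.DispatcherLoopTopEnterECFillSRA();            // re-enter the JIT dispatcher
  NativeContext->Sp = CPUArea.EmulatorStackBase();
  NativeContext->X10 = 1; // ENTRY_FILL_SRA_SINGLE_INST_REG: compile and run exactly one guest instruction

On the second trap, the executing block is that single-instruction block, so the single-inst check fails the inline-SMC condition and the handler falls through to the ordinary invalidate-and-reprotect path.
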
--- Source/Windows/ARM64EC/Module.S | 3 ++- Source/Windows/ARM64EC/Module.cpp | 14 +++++++++++++- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/Source/Windows/ARM64EC/Module.S b/Source/Windows/ARM64EC/Module.S index 46c18d884d..dcb01943cc 100644 --- a/Source/Windows/ARM64EC/Module.S +++ b/Source/Windows/ARM64EC/Module.S @@ -49,7 +49,8 @@ BeginSimulation: bl "#SyncThreadContext" ldr x17, [x18, #0x1788] // TEB->ChpeV2CpuAreaInfo ldr x16, [x17, #0x48] // ChpeV2CpuAreaInfo->EmulatorData[3] - DispatcherLoopTopEnterECFillSRA - br x16 // DispatcherLoopTopEnterECFillSRA(CPUArea:x17) + mov x10, #0 // Zero ENTRY_FILL_SRA_SINGLE_INST_REG to avoid single step + br x16 // DispatcherLoopTopEnterECFillSRA(SingleInst:x10, CPUArea:x17) // Called into by FEXCore // Expects the target code address in x9 diff --git a/Source/Windows/ARM64EC/Module.cpp b/Source/Windows/ARM64EC/Module.cpp index 02ae43046c..c4d0b8ba92 100644 --- a/Source/Windows/ARM64EC/Module.cpp +++ b/Source/Windows/ARM64EC/Module.cpp @@ -591,7 +591,19 @@ bool ResetToConsistentStateImpl(EXCEPTION_RECORD* Exception, CONTEXT* GuestConte std::scoped_lock Lock(ThreadCreationMutex); if (InvalidationTracker->HandleRWXAccessViolation(FaultAddress)) { - LogMan::Msg::DFmt("Handled self-modifying code: pc: {:X} fault: {:X}", NativeContext->Pc, FaultAddress); + if (CTX->IsAddressInCodeBuffer(CPUArea.ThreadState(), NativeContext->Pc) && !CTX->IsCurrentBlockSingleInst(CPUArea.ThreadState()) && + CTX->IsAddressInCurrentBlock(CPUArea.ThreadState(), FaultAddress, 8)) { + // If we are not patching ourself (single inst block case) and patching the current block, this is inline SMC. Reconstruct the current context (before the SMC write) then single step the write to reduce it to regular SMC. + Exception::ReconstructThreadState(CPUArea.ThreadState(), *NativeContext); + LogMan::Msg::DFmt("Handled inline self-modifying code: pc: {:X} rip: {:X} fault: {:X}", NativeContext->Pc, + CPUArea.ThreadState()->CurrentFrame->State.rip, FaultAddress); + NativeContext->Pc = CPUArea.DispatcherLoopTopEnterECFillSRA(); + NativeContext->Sp = CPUArea.EmulatorStackBase(); + NativeContext->X10 = 1; // Set ENTRY_FILL_SRA_SINGLE_INST_REG to force a single step + } else { + LogMan::Msg::DFmt("Handled self-modifying code: pc: {:X} fault: {:X}", NativeContext->Pc, FaultAddress); + } + return true; } } From d080180e858789ec0eefcc15bfe159fa594057c5 Mon Sep 17 00:00:00 2001 From: Billy Laws Date: Tue, 23 Jul 2024 20:35:17 +0000 Subject: [PATCH 6/6] ARM64EC: Process pending cross-process work on syscalls and exceptions This is used to notify the JIT of e.g. memory writes by a debugger. --- Source/Windows/ARM64EC/Module.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Source/Windows/ARM64EC/Module.cpp b/Source/Windows/ARM64EC/Module.cpp index c4d0b8ba92..a24963412a 100644 --- a/Source/Windows/ARM64EC/Module.cpp +++ b/Source/Windows/ARM64EC/Module.cpp @@ -467,6 +467,8 @@ class ECSyscallHandler : public FEXCore::HLE::SyscallHandler, public FEXCore::Al } uint64_t HandleSyscall(FEXCore::Core::CpuStateFrame* Frame, FEXCore::HLE::SyscallArguments* Args) override { + ProcessPendingCrossProcessEmulatorWork(); + // Manually raise an exeption with the current JIT state packed into a native context, ntdll handles this and // reenters the JIT (see dlls/ntdll/signal_arm64ec.c in wine). 
uint64_t FPCR, FPSR; @@ -505,6 +507,7 @@ class ECSyscallHandler : public FEXCore::HLE::SyscallHandler, public FEXCore::Al } // namespace Exception extern "C" void SyncThreadContext(CONTEXT* Context) { + ProcessPendingCrossProcessEmulatorWork(); auto* Thread = GetCPUArea().ThreadState(); // All other EFlags bits are lost when converting to/from an ARM64EC context, so merge them in from the current JIT state. // This is advisable over dropping their values as thread suspend/resume uses this function, and that can happen at any point in guest code.
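ProcessPendingCrossProcessEmulatorWork itself is defined elsewhere in the frontend and is not part of this series; conceptually it drains work that another process (for example, a debugger that has just written guest code) has queued for this emulator so that stale JIT blocks are flushed before emulation proceeds. A purely hypothetical sketch of that shape, only to make the two call sites above concrete (none of these names are FEX APIs):

  // Illustrative only: drain cross-process requests before continuing emulation.
  void ProcessPendingCrossProcessEmulatorWork() {
    CrossProcessWorkItem Item;                           // hypothetical request type
    while (PendingCrossProcessWork.TryPop(Item)) {       // hypothetical shared queue
      InvalidateGuestCodeRange(Item.Address, Item.Size); // hypothetical invalidation helper
    }
  }
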