Skip to content

Commit

Permalink
Merge pull request #4165 from bylaws/denuvo
Browse files Browse the repository at this point in the history
Support inline self modifying code
  • Loading branch information
Sonicadvance1 authored Dec 12, 2024
2 parents 072cf4c + d080180 commit 656477e
Show file tree
Hide file tree
Showing 10 changed files with 96 additions and 19 deletions.
3 changes: 3 additions & 0 deletions FEXCore/Source/Interface/Context/Context.h
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,9 @@ class ContextImpl final : public FEXCore::Context::Context {

void HandleCallback(FEXCore::Core::InternalThreadState* Thread, uint64_t RIP) override;

bool IsAddressInCurrentBlock(FEXCore::Core::InternalThreadState* Thread, uint64_t Address, uint64_t Size) override;
bool IsCurrentBlockSingleInst(FEXCore::Core::InternalThreadState* Thread) override;

uint64_t RestoreRIPFromHostPC(FEXCore::Core::InternalThreadState* Thread, uint64_t HostPC) override;
uint32_t ReconstructCompactedEFLAGS(FEXCore::Core::InternalThreadState* Thread, bool WasInJIT, const uint64_t* HostGPRs, uint64_t PSTATE) override;
void SetFlagsFromCompactedEFLAGS(FEXCore::Core::InternalThreadState* Thread, uint32_t EFLAGS) override;
Expand Down
3 changes: 3 additions & 0 deletions FEXCore/Source/Interface/Core/ArchHelpers/Arm64Emitter.h
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,9 @@ constexpr size_t CPU_AREA_EMULATOR_STACK_BASE_OFFSET = 0x8;
constexpr size_t CPU_AREA_EMULATOR_DATA_OFFSET = 0x30;
#endif

// Register checked by the dispatcher on the "enter JIT filling SRA" entry path.
// If it is nonzero on entry, one single instruction block is forced to be generated first (used for inline SMC handling);
// if it is zero, the dispatcher enters the JIT loop normally. Callers of that entry point must therefore set it explicitly.
constexpr auto ENTRY_FILL_SRA_SINGLE_INST_REG = TMP1;

// Predicate register temporaries (used when AVX support is enabled)
// PRED_TMP_16B indicates a predicate register that indicates the first 16 bytes set to 1.
// PRED_TMP_32B indicates a predicate register that indicates the first 32 bytes set to 1.
Expand Down
13 changes: 11 additions & 2 deletions FEXCore/Source/Interface/Core/CPUBackend.h
Original file line number Diff line number Diff line change
Expand Up @@ -80,9 +80,16 @@ namespace CPU {
struct JITCodeTail {
// The total size of the codeblock from [BlockBegin, BlockBegin+Size).
size_t Size;

// RIP that the block's entry comes from.
uint64_t RIP;

// The length of the guest code for this block.
size_t GuestSize;

// If this block represents a single guest instruction.
bool SingleInst;

// Number of RIP entries for this JIT Code section.
uint32_t NumberOfRIPEntries;

Expand Down Expand Up @@ -119,15 +126,17 @@ namespace CPU {
*
* This is a thread specific compilation unit since there is one CPUBackend per guest thread
*
* @param Size - The byte size of the guest code for this block
* @param SingleInst - If this block represents a single guest instruction
* @param IR - IR that maps to the IR for this RIP
* @param DebugData - Debug data that is available for this IR indirectly
* @param CheckTF - If EFLAGS.TF checks should be emitted at the start of the block
*
* @return Information about the compiled code block.
*/
[[nodiscard]]
virtual CompiledCode CompileCode(uint64_t Entry, const FEXCore::IR::IRListView* IR, FEXCore::Core::DebugData* DebugData,
const FEXCore::IR::RegisterAllocationData* RAData, bool CheckTF) = 0;
virtual CompiledCode CompileCode(uint64_t Entry, uint64_t Size, bool SingleInst, const FEXCore::IR::IRListView* IR,
FEXCore::Core::DebugData* DebugData, const FEXCore::IR::RegisterAllocationData* RAData, bool CheckTF) = 0;

/**
* @brief Relocates a block of code from the JIT code object cache
Expand Down
57 changes: 46 additions & 11 deletions FEXCore/Source/Interface/Core/Core.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -112,13 +112,38 @@ ContextImpl::~ContextImpl() {
}
}

uint64_t ContextImpl::RestoreRIPFromHostPC(FEXCore::Core::InternalThreadState* Thread, uint64_t HostPC) {
const auto Frame = Thread->CurrentFrame;
// Header/tail pointer pair for the JIT block recorded in a CPU state frame.
// Both members are null when the frame has no inline block header recorded.
struct GetFrameBlockInfoResult {
  const CPU::CPUBackend::JITCodeHeader* InlineHeader;
  const CPU::CPUBackend::JITCodeTail* InlineTail;
};

// Resolves the JIT code header stored in Frame->State.InlineJITBlockHeader and, when it is non-null,
// the block tail located OffsetToBlockTail bytes past the header.
static GetFrameBlockInfoResult GetFrameBlockInfo(FEXCore::Core::CpuStateFrame* Frame) {
  const uint64_t HeaderAddress = Frame->State.InlineJITBlockHeader;
  const auto Header = reinterpret_cast<const CPU::CPUBackend::JITCodeHeader*>(HeaderAddress);

  // No header recorded: there is no tail to locate either.
  if (!Header) {
    return {nullptr, nullptr};
  }

  const auto Tail = reinterpret_cast<const CPU::CPUBackend::JITCodeTail*>(HeaderAddress + Header->OffsetToBlockTail);
  return {Header, Tail};
}

// Reports whether the guest range [Address, Address + Size) overlaps the guest code range
// [RIP, RIP + GuestSize) of the block recorded in the thread's current frame.
// Returns false when the frame has no block recorded.
bool ContextImpl::IsAddressInCurrentBlock(FEXCore::Core::InternalThreadState* Thread, uint64_t Address, uint64_t Size) {
  const auto* Tail = GetFrameBlockInfo(Thread->CurrentFrame).InlineTail;
  if (!Tail) {
    return false;
  }

  // Two half-open ranges overlap when each one starts before the other ends.
  const uint64_t BlockStart = Tail->RIP;
  const uint64_t BlockEnd = Tail->RIP + Tail->GuestSize;
  const bool EndsAfterBlockStart = Address + Size > BlockStart;
  const bool StartsBeforeBlockEnd = Address < BlockEnd;
  return EndsAfterBlockStart && StartsBeforeBlockEnd;
}

// Reports whether the block recorded in the thread's current frame is flagged as representing a single guest instruction.
// Returns false when the frame has no block recorded.
bool ContextImpl::IsCurrentBlockSingleInst(FEXCore::Core::InternalThreadState* Thread) {
  const auto* Tail = GetFrameBlockInfo(Thread->CurrentFrame).InlineTail;
  if (Tail == nullptr) {
    return false;
  }
  return Tail->SingleInst;
}

uint64_t ContextImpl::RestoreRIPFromHostPC(FEXCore::Core::InternalThreadState* Thread, uint64_t HostPC) {
const auto Frame = Thread->CurrentFrame;
const uint64_t BlockBegin = Frame->State.InlineJITBlockHeader;
auto [InlineHeader, InlineTail] = GetFrameBlockInfo(Thread->CurrentFrame);

if (InlineHeader) {
auto RIPEntries = reinterpret_cast<const CPU::CPUBackend::JITRIPReconstructEntries*>(
Frame->State.InlineJITBlockHeader + InlineHeader->OffsetToBlockTail + InlineTail->OffsetToRIPEntries);

Expand Down Expand Up @@ -555,6 +580,7 @@ ContextImpl::GenerateIR(FEXCore::Core::InternalThreadState* Thread, uint64_t Gue
GuestCode = reinterpret_cast<const uint8_t*>(GuestRIP);

bool HadDispatchError {false};
bool HadInvalidInst {false};

Thread->FrontendDecoder->DecodeInstructionsAtEntry(GuestCode, GuestRIP, MaxInst,
[Thread](uint64_t BlockEntry, uint64_t Start, uint64_t Length) {
Expand Down Expand Up @@ -652,16 +678,23 @@ ContextImpl::GenerateIR(FEXCore::Core::InternalThreadState* Thread, uint64_t Gue
++TotalInstructions;
}
} else {
if (TableInfo) {
LogMan::Msg::EFmt("Invalid or Unknown instruction: {} 0x{:x}", TableInfo->Name ?: "UND", Block.Entry - GuestRIP);
}
// Invalid instruction
Thread->OpDispatcher->InvalidOp(DecodedInfo);
Thread->OpDispatcher->ExitFunction(Thread->OpDispatcher->_EntrypointOffset(GPRSize, Block.Entry - GuestRIP));
if (!BlockInstructionsLength) {
// SMC can modify block contents and patch invalid instructions to valid ones inline.
// End blocks upon encountering them and only emit an invalid opcode exception if there are no prior instructions in the block (that could have modified it to be valid).

if (TableInfo) {
LogMan::Msg::EFmt("Invalid or Unknown instruction: {} 0x{:x}", TableInfo->Name ?: "UND", Block.Entry - GuestRIP);
}

Thread->OpDispatcher->InvalidOp(DecodedInfo);
}

HadInvalidInst = true;
}

const bool NeedsBlockEnd =
(HadDispatchError && TotalInstructions > 0) || (Thread->OpDispatcher->NeedsBlockEnder() && i + 1 == InstsInBlock);
const bool NeedsBlockEnd = (HadDispatchError && TotalInstructions > 0) ||
(Thread->OpDispatcher->NeedsBlockEnder() && i + 1 == InstsInBlock) || HadInvalidInst;

// If we had a dispatch error then leave early
if (HadDispatchError && TotalInstructions == 0) {
Expand Down Expand Up @@ -747,6 +780,7 @@ ContextImpl::CompileCodeResult ContextImpl::CompileCode(FEXCore::Core::InternalT

fextl::unique_ptr<FEXCore::IR::IRStorageBase> IR;
FEXCore::Core::DebugData* DebugData {};
uint64_t TotalInstructions {};
uint64_t StartAddr {};
uint64_t Length {};

Expand All @@ -764,11 +798,12 @@ ContextImpl::CompileCodeResult ContextImpl::CompileCode(FEXCore::Core::InternalT

if (!IR) {
// Generate IR + Meta Info
auto [IRCopy, TotalInstructions, TotalInstructionsLength, _StartAddr, _Length] = GenerateIR(Thread, GuestRIP, Config.GDBSymbols(), MaxInst);
auto [IRCopy, _TotalInstructions, TotalInstructionsLength, _StartAddr, _Length] = GenerateIR(Thread, GuestRIP, Config.GDBSymbols(), MaxInst);

// Setup pointers to internal structures
IR = std::move(IRCopy);
DebugData = new FEXCore::Core::DebugData();
TotalInstructions = _TotalInstructions;
StartAddr = _StartAddr;
Length = _Length;
}
Expand All @@ -786,7 +821,7 @@ ContextImpl::CompileCodeResult ContextImpl::CompileCode(FEXCore::Core::InternalT
// FEX currently throws away the CPUBackend::CompiledCode object other than the entrypoint
// In the future with code caching getting wired up, we will pass the rest of the data forward.
// TODO: Pass the data forward when code caching is wired up to this.
.CompiledCode = Thread->CPUBackend->CompileCode(GuestRIP, &IRView, DebugData, IR->RAData(), TFSet).BlockEntry,
.CompiledCode = Thread->CPUBackend->CompileCode(GuestRIP, Length, TotalInstructions == 1, &IRView, DebugData, IR->RAData(), TFSet).BlockEntry,
.IR = std::move(IR),
.DebugData = DebugData,
.GeneratedIR = true,
Expand Down
4 changes: 4 additions & 0 deletions FEXCore/Source/Interface/Core/Dispatcher/Dispatcher.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,10 @@ void Dispatcher::EmitDispatcher() {
ldr(STATE, EC_ENTRY_CPUAREA_REG, CPU_AREA_EMULATOR_DATA_OFFSET);
FillStaticRegs();

ldr(RipReg, STATE_PTR(CpuStateFrame, State.rip));
// Force a single instruction block if ENTRY_FILL_SRA_SINGLE_INST_REG is nonzero entering the JIT, used for inline SMC handling.
cbnz(ARMEmitter::Size::i32Bit, ENTRY_FILL_SRA_SINGLE_INST_REG, &CompileSingleStep);

// Enter JIT
b(&LoopTop);

Expand Down
7 changes: 5 additions & 2 deletions FEXCore/Source/Interface/Core/JIT/JIT.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -720,8 +720,9 @@ void Arm64JITCore::EmitInterruptChecks(bool CheckTF) {
#endif
}

CPUBackend::CompiledCode Arm64JITCore::CompileCode(uint64_t Entry, const FEXCore::IR::IRListView* IR, FEXCore::Core::DebugData* DebugData,
const FEXCore::IR::RegisterAllocationData* RAData, bool CheckTF) {
CPUBackend::CompiledCode Arm64JITCore::CompileCode(uint64_t Entry, uint64_t Size, bool SingleInst, const FEXCore::IR::IRListView* IR,
FEXCore::Core::DebugData* DebugData, const FEXCore::IR::RegisterAllocationData* RAData,
bool CheckTF) {
FEXCORE_PROFILE_SCOPED("Arm64::CompileCode");

JumpTargets.clear();
Expand Down Expand Up @@ -861,6 +862,8 @@ CPUBackend::CompiledCode Arm64JITCore::CompileCode(uint64_t Entry, const FEXCore
// TODO: This needs to be a data RIP relocation once code caching works.
// Current relocation code doesn't support this feature yet.
JITBlockTail->RIP = Entry;
JITBlockTail->GuestSize = Size;
JITBlockTail->SingleInst = SingleInst;
JITBlockTail->SpinLockFutex = 0;

{
Expand Down
5 changes: 3 additions & 2 deletions FEXCore/Source/Interface/Core/JIT/JITClass.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,9 @@ class Arm64JITCore final : public CPUBackend, public Arm64Emitter {
~Arm64JITCore() override;

[[nodiscard]]
CPUBackend::CompiledCode CompileCode(uint64_t Entry, const FEXCore::IR::IRListView* IR, FEXCore::Core::DebugData* DebugData,
const FEXCore::IR::RegisterAllocationData* RAData, bool CheckTF) override;
CPUBackend::CompiledCode
CompileCode(uint64_t Entry, uint64_t Size, bool SingleInst, const FEXCore::IR::IRListView* IR, FEXCore::Core::DebugData* DebugData,
const FEXCore::IR::RegisterAllocationData* RAData, bool CheckTF) override;

void ClearCache() override;

Expand Down
3 changes: 3 additions & 0 deletions FEXCore/include/FEXCore/Core/Context.h
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,9 @@ class Context {

FEX_DEFAULT_VISIBILITY virtual void HandleCallback(FEXCore::Core::InternalThreadState* Thread, uint64_t RIP) = 0;

///< Queries whether the guest range [Address, Address + Size) overlaps the guest code of the JIT block the thread is currently in.
///< In the ContextImpl implementation this returns false when the thread's current frame has no block recorded.
FEX_DEFAULT_VISIBILITY virtual bool IsAddressInCurrentBlock(FEXCore::Core::InternalThreadState* Thread, uint64_t Address, uint64_t Size) = 0;
///< Queries whether the JIT block the thread is currently in represents a single guest instruction.
///< In the ContextImpl implementation this returns false when the thread's current frame has no block recorded.
FEX_DEFAULT_VISIBILITY virtual bool IsCurrentBlockSingleInst(FEXCore::Core::InternalThreadState* Thread) = 0;

///< State reconstruction helpers
///< Reconstructs the guest RIP from the passed in thread context and related Host PC.
FEX_DEFAULT_VISIBILITY virtual uint64_t RestoreRIPFromHostPC(FEXCore::Core::InternalThreadState* Thread, uint64_t HostPC) = 0;
Expand Down
3 changes: 2 additions & 1 deletion Source/Windows/ARM64EC/Module.S
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,8 @@ BeginSimulation:
bl "#SyncThreadContext"
ldr x17, [x18, #0x1788] // TEB->ChpeV2CpuAreaInfo
ldr x16, [x17, #0x48] // ChpeV2CpuAreaInfo->EmulatorData[3] - DispatcherLoopTopEnterECFillSRA
br x16 // DispatcherLoopTopEnterECFillSRA(CPUArea:x17)
mov x10, #0 // Zero ENTRY_FILL_SRA_SINGLE_INST_REG to avoid single step
br x16 // DispatcherLoopTopEnterECFillSRA(SingleInst:x10, CPUArea:x17)

// Called into by FEXCore
// Expects the target code address in x9
Expand Down
17 changes: 16 additions & 1 deletion Source/Windows/ARM64EC/Module.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -467,6 +467,8 @@ class ECSyscallHandler : public FEXCore::HLE::SyscallHandler, public FEXCore::Al
}

uint64_t HandleSyscall(FEXCore::Core::CpuStateFrame* Frame, FEXCore::HLE::SyscallArguments* Args) override {
ProcessPendingCrossProcessEmulatorWork();

// Manually raise an exception with the current JIT state packed into a native context, ntdll handles this and
// reenters the JIT (see dlls/ntdll/signal_arm64ec.c in wine).
uint64_t FPCR, FPSR;
Expand Down Expand Up @@ -505,6 +507,7 @@ class ECSyscallHandler : public FEXCore::HLE::SyscallHandler, public FEXCore::Al
} // namespace Exception

extern "C" void SyncThreadContext(CONTEXT* Context) {
ProcessPendingCrossProcessEmulatorWork();
auto* Thread = GetCPUArea().ThreadState();
// All other EFlags bits are lost when converting to/from an ARM64EC context, so merge them in from the current JIT state.
// This is advisable over dropping their values as thread suspend/resume uses this function, and that can happen at any point in guest code.
Expand Down Expand Up @@ -591,7 +594,19 @@ bool ResetToConsistentStateImpl(EXCEPTION_RECORD* Exception, CONTEXT* GuestConte

std::scoped_lock Lock(ThreadCreationMutex);
if (InvalidationTracker->HandleRWXAccessViolation(FaultAddress)) {
LogMan::Msg::DFmt("Handled self-modifying code: pc: {:X} fault: {:X}", NativeContext->Pc, FaultAddress);
if (CTX->IsAddressInCodeBuffer(CPUArea.ThreadState(), NativeContext->Pc) && !CTX->IsCurrentBlockSingleInst(CPUArea.ThreadState()) &&
CTX->IsAddressInCurrentBlock(CPUArea.ThreadState(), FaultAddress, 8)) {
// If the write targets the current block and that block is not already a single instruction block (i.e. the instruction is not patching itself), this is inline SMC. Reconstruct the current context (before the SMC write), then single step the write to reduce it to regular SMC.
Exception::ReconstructThreadState(CPUArea.ThreadState(), *NativeContext);
LogMan::Msg::DFmt("Handled inline self-modifying code: pc: {:X} rip: {:X} fault: {:X}", NativeContext->Pc,
CPUArea.ThreadState()->CurrentFrame->State.rip, FaultAddress);
NativeContext->Pc = CPUArea.DispatcherLoopTopEnterECFillSRA();
NativeContext->Sp = CPUArea.EmulatorStackBase();
NativeContext->X10 = 1; // Set ENTRY_FILL_SRA_SINGLE_INST_REG to force a single step
} else {
LogMan::Msg::DFmt("Handled self-modifying code: pc: {:X} fault: {:X}", NativeContext->Pc, FaultAddress);
}

return true;
}
}
Expand Down

0 comments on commit 656477e

Please sign in to comment.