Skip to content

Commit

Permalink
Merge pull request #4266 from pmatos/FSTOpt
Browse files Browse the repository at this point in the history
x87 fst/fld optimization for different addrmodes
  • Loading branch information
Sonicadvance1 authored Jan 17, 2025
2 parents d01db8f + 8191c49 commit 3f788eb
Show file tree
Hide file tree
Showing 19 changed files with 29,927 additions and 2,643 deletions.
20 changes: 18 additions & 2 deletions FEXCore/Source/Interface/Core/OpcodeDispatcher/X87.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ desc: Handles x86/64 x87 to IR
#include <FEXCore/Utils/LogManager.h>
#include <FEXCore/Utils/FPState.h>

#include <cmath>
#include <stddef.h>
#include <stdint.h>

Expand Down Expand Up @@ -129,8 +130,23 @@ void OpDispatchBuilder::FILD(OpcodeArgs) {
}

void OpDispatchBuilder::FST(OpcodeArgs, IR::OpSize Width) {
Ref Mem = LoadSource(GPRClass, Op, Op->Dest, Op->Flags, {.LoadData = false});
_StoreStackMemory(Mem, OpSize::i128Bit, true, Width);
// Ref Mem = LoadSource(GPRClass, Op, Op->Dest, Op->Flags, {.LoadData = false});
// FIXME: Is TSO relevant for x87?
AddressMode A = DecodeAddress(Op, Op->Dest, MemoryAccessType::DEFAULT, false);

// The index scale must be a non-zero power of 2, because it is applied below as a left shift by log2(scale).
LOGMAN_THROW_A_FMT(A.IndexScale > 0 && (A.IndexScale & (A.IndexScale - 1)) == 0, "Invalid index scale");

Ref Addr = A.Base ? A.Base : _Constant(0);
if (A.Index) {
Ref ScaledIndex = A.Index;
if (A.IndexScale > 1) {
ScaledIndex = _Lshl(A.AddrSize, ScaledIndex, _Constant(std::log2(A.IndexScale)));
}
Addr = _Add(A.AddrSize, Addr, ScaledIndex);
}

_StoreStackMem(OpSize::i128Bit, Width, Addr, _Constant(A.Offset), /*Float=*/true);
if (Op->TableInfo->Flags & X86Tables::InstFlags::FLAGS_POP) {
_PopStackDestroy();
}
Expand Down
16 changes: 14 additions & 2 deletions FEXCore/Source/Interface/Core/OpcodeDispatcher/X87F64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -104,9 +104,21 @@ void OpDispatchBuilder::FILDF64(OpcodeArgs) {
}

void OpDispatchBuilder::FSTF64(OpcodeArgs, IR::OpSize Width) {
Ref Mem = LoadSource(GPRClass, Op, Op->Dest, Op->Flags, {.LoadData = false});
_StoreStackMemory(Mem, OpSize::i64Bit, true, Width);
AddressMode A = DecodeAddress(Op, Op->Dest, MemoryAccessType::DEFAULT, false);

// The index scale must be a non-zero power of 2, because it is applied below as a left shift by log2(scale).
LOGMAN_THROW_A_FMT(A.IndexScale > 0 && (A.IndexScale & (A.IndexScale - 1)) == 0, "Invalid index scale");

Ref Addr = A.Base ? A.Base : _Constant(0);
if (A.Index) {
Ref ScaledIndex = A.Index;
if (A.IndexScale > 1) {
ScaledIndex = _Lshl(A.AddrSize, ScaledIndex, _Constant(std::log2(A.IndexScale)));
}
Addr = _Add(A.AddrSize, Addr, ScaledIndex);
}

_StoreStackMem(OpSize::i64Bit, Width, Addr, _Constant(A.Offset), /*Float=*/true);
if (Op->TableInfo->Flags & X86Tables::InstFlags::FLAGS_POP) {
_PopStackDestroy();
}
Expand Down
2 changes: 1 addition & 1 deletion FEXCore/Source/Interface/IR/IR.json
Original file line number Diff line number Diff line change
Expand Up @@ -2788,7 +2788,7 @@
"HasSideEffects": true,
"X87": true
},
"StoreStackMemory GPR:$Addr, OpSize:$SourceSize, i1:$Float, OpSize:$StoreSize": {
"StoreStackMem OpSize:$SourceSize, OpSize:$StoreSize, GPR:$Addr, GPR:$Offset, i1:$Float": {
"Desc": [
"Takes the top value off the x87 stack and stores it to memory.",
"SourceSize is 128bit for F80 values, 64-bit for low precision.",
Expand Down
1 change: 0 additions & 1 deletion FEXCore/Source/Interface/IR/IREmitter.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
#include <FEXCore/fextl/vector.h>

#include <algorithm>
#include <new>
#include <stdint.h>
#include <string.h>

Expand Down
51 changes: 35 additions & 16 deletions FEXCore/Source/Interface/IR/Passes/x87StackOptimizationPass.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,18 @@ class X87StackOptimization final : public Pass {
// Helpers
Ref RotateRight8(uint32_t V, Ref Amount);

// Reports whether Node is a constant op whose value is zero.
bool IsZero(Ref Node) {
  auto NodeHeader = IR->GetOp<IR::IROp_Header>(Node);
  // Only OP_CONSTANT nodes qualify; the short-circuit keeps the IROp_Constant
  // view from being taken on any other kind of op.
  const bool IsConstantOp = NodeHeader->Op == OP_CONSTANT;
  return IsConstantOp && NodeHeader->C<IROp_Constant>()->Constant == 0;
}


// Handles a Unary operation.
// Takes the op we are handling, the Node for the reduced precision case and the node for the normal case.
// Depending on the type of Op64, we might need to pass a couple of extra constant arguments, this happens
Expand Down Expand Up @@ -245,6 +257,7 @@ class X87StackOptimization final : public Pass {
bool SlowPath = false;
// Keeping IREmitter not to pass arguments around
IREmitter* IREmit = nullptr;
IRListView* IR;
};

inline void X87StackOptimization::InvalidateCaches() {
Expand Down Expand Up @@ -537,6 +550,7 @@ void X87StackOptimization::Run(IREmitter* Emit) {

// Initialize IREmit member
IREmit = Emit;
IR = &CurrentIR;

// Run optimization proper
for (auto [BlockNode, BlockHeader] : CurrentIR.GetBlocks()) {
Expand Down Expand Up @@ -780,11 +794,12 @@ void X87StackOptimization::Run(IREmitter* Emit) {
break;
}

case OP_STORESTACKMEMORY: {
const auto* Op = IROp->C<IROp_StoreStackMemory>();
case OP_STORESTACKMEM: {
const auto* Op = IROp->C<IROp_StoreStackMem>();
const auto& Value = MigrateToSlowPath_IfInvalid();
Ref StackNode = SlowPath ? LoadStackValueAtOffset_Slow() : Value->StackDataNode;
Ref AddrNode = CurrentIR.GetNode(Op->Addr);
Ref Offset = CurrentIR.GetNode(Op->Offset);

// On the fast path we can optimize memory copies.
// If we are doing:
Expand All @@ -796,45 +811,49 @@ void X87StackOptimization::Run(IREmitter* Emit) {
// or similar. As long as the source size and dest size are one and the same.
// This will avoid any conversions between source and stack element size and conversion back.
if (!SlowPath && Value->Source && Value->Source->first == Op->StoreSize && Value->InterpretAsFloat) {
IREmit->_StoreMem(Value->InterpretAsFloat ? FPRClass : GPRClass, Op->StoreSize, AddrNode, Value->Source->second);
IREmit->_StoreMem(Value->InterpretAsFloat ? FPRClass : GPRClass, Op->StoreSize, Value->Source->second, AddrNode, Offset,
OpSize::iInvalid, MEM_OFFSET_SXTX, 1);
} else {
if (ReducedPrecisionMode) {
switch (Op->StoreSize) {
case OpSize::i32Bit: {
StackNode = IREmit->_Float_FToF(OpSize::i32Bit, OpSize::i64Bit, StackNode);
IREmit->_StoreMem(FPRClass, OpSize::i32Bit, AddrNode, StackNode);
break;
}
case OpSize::i32Bit:
case OpSize::i64Bit: {
IREmit->_StoreMem(FPRClass, OpSize::i64Bit, AddrNode, StackNode);
if (Op->StoreSize == OpSize::i32Bit) {
StackNode = IREmit->_Float_FToF(OpSize::i32Bit, OpSize::i64Bit, StackNode);
}
IREmit->_StoreMem(FPRClass, Op->StoreSize, StackNode, AddrNode, Offset, OpSize::iInvalid, MEM_OFFSET_SXTX, 1);
break;
}
case OpSize::f80Bit: {
StackNode = IREmit->_F80CVTTo(StackNode, OpSize::i64Bit);
IREmit->_StoreMem(FPRClass, OpSize::i64Bit, AddrNode, StackNode);
IREmit->_StoreMem(FPRClass, OpSize::i64Bit, StackNode, AddrNode, Offset, OpSize::iInvalid, MEM_OFFSET_SXTX, 1);
auto Upper = IREmit->_VExtractToGPR(OpSize::i128Bit, OpSize::i64Bit, StackNode, 1);
IREmit->_StoreMem(GPRClass, OpSize::i16Bit, Upper, AddrNode, GetConstant(8), OpSize::i64Bit, MEM_OFFSET_SXTX, 1);
auto NewOffset = IREmit->_Add(OpSize::i64Bit, Offset, GetConstant(8));
IREmit->_StoreMem(GPRClass, OpSize::i16Bit, Upper, AddrNode, NewOffset, OpSize::i64Bit, MEM_OFFSET_SXTX, 1);
break;
}
default: ERROR_AND_DIE_FMT("Unsupported x87 size");
}
} else {
} else { // !ReducedPrecisionMode
if (Op->StoreSize != OpSize::f80Bit) { // if it's not 80bits then convert
StackNode = IREmit->_F80CVT(Op->StoreSize, StackNode);
}
if (Op->StoreSize == OpSize::f80Bit) { // Part of code from StoreResult_WithOpSize()
if (Features.SupportsSVE128 || Features.SupportsSVE256) {
auto PReg = IREmit->_InitPredicate(OpSize::i16Bit, FEXCore::ToUnderlying(ARMEmitter::PredicatePattern::SVE_VL5));
if (!IsZero(Offset)) {
AddrNode = IREmit->_Add(OpSize::i64Bit, AddrNode, Offset);
}
IREmit->_StoreMemPredicate(OpSize::i128Bit, OpSize::i16Bit, StackNode, PReg, AddrNode);
} else {
// For X87 extended doubles, split before storing
IREmit->_StoreMem(FPRClass, OpSize::i64Bit, AddrNode, StackNode);
IREmit->_StoreMem(FPRClass, OpSize::i64Bit, StackNode, AddrNode, Offset, OpSize::iInvalid, MEM_OFFSET_SXTX, 1);
auto Upper = IREmit->_VExtractToGPR(OpSize::i128Bit, OpSize::i64Bit, StackNode, 1);
auto DestAddr = IREmit->_Add(OpSize::i64Bit, AddrNode, GetConstant(8));
IREmit->_StoreMem(GPRClass, OpSize::i16Bit, DestAddr, Upper, OpSize::i64Bit);
auto NewOffset = IREmit->_Add(OpSize::i64Bit, Offset, GetConstant(8));
IREmit->_StoreMem(GPRClass, OpSize::i16Bit, Upper, AddrNode, NewOffset, OpSize::i64Bit, MEM_OFFSET_SXTX, 1);
}
} else {
IREmit->_StoreMem(FPRClass, Op->StoreSize, AddrNode, StackNode);
IREmit->_StoreMem(FPRClass, Op->StoreSize, StackNode, AddrNode, Offset, OpSize::iInvalid, MEM_OFFSET_SXTX, 1);
}
}
}
Expand Down
44 changes: 44 additions & 0 deletions unittests/32Bit_ASM/X87/FST_AddrModes.asm
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
%ifdef CONFIG
{
"RegData": {
"RAX": "0x3f800000",
"RBX": "0x3f800000",
"RCX": "0x3f800000",
"RBP": "0x3f800000",
"RDI": "0x3f800000",
"RSP": "0x3f800000"
},
"MemoryRegions": {
"0xf0000000": "4096"
},
"Mode": "32BIT"
}
%endif

; Stores 1.0 (pushed by fld1) as a 32-bit float through each addressing form:
; base, base+disp, base+index, base+index*scale, base+index+disp and
; base+index*scale+disp. Every location is then read back into a register that
; the CONFIG block expects to hold 0x3f800000, the single-precision encoding of 1.0.
section .bss
base resb 4096

section .text

; Setup: st0 = 1.0, edx = buffer base, esi = index (0x64)
fld1
lea edx, [rel base]
mov esi, 0x64

; Test fst (non-popping, so st0 stays 1.0 for every store)
fst dword [edx]
fst dword [edx + 0xa]
fst dword [edx + esi]
fst dword [edx + esi * 4]
fst dword [edx + esi + 0xa]
fst dword [edx + esi * 4 + 0xa]

; Result check: reload each stored dword using the same addressing form.
; The last load targets esp, which the CONFIG block checks as RSP.
mov eax, dword [edx]
mov ebx, dword [edx + 0xa]
mov ecx, dword [edx + esi]
mov ebp, dword [edx + esi * 4]
mov edi, dword [edx + esi + 0xa]
mov esp, dword [edx + esi * 4 + 0xa]

hlt
25 changes: 25 additions & 0 deletions unittests/ASM/X87/DB_07_2.asm
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
%ifdef CONFIG
{
"RegData": {
"MM7": ["0x8000000000000000", "0x4000"]
}
}
%endif

; Round-trips an 80-bit value through memory: load 2.0 from `data`, store it to
; `data2` with fstp, then load it back from `data2`.
; The expected MM7 value (mantissa 0x8000000000000000, sign/exponent word 0x4000)
; is the x87 extended-precision encoding of 2.0.
lea rdx, [rel data]
fld tword [rdx + 8 * 0]

; Store through a bare register address (rax), then reload via rdx.
; `data2` initially holds 0.0, so the reload only yields 2.0 if the store landed.
lea rdx, [rel data2]
lea rax, [rdx + 8 * 0]
fstp tword [rax]
fld tword [rdx + 8 * 0]

hlt

; Each entry is a 10-byte tword followed by an 8-byte zero qword.
align 8
data:
dt 2.0
dq 0
data2:
dt 0.0
dq 0
43 changes: 43 additions & 0 deletions unittests/ASM/X87/FST_AddrModes.asm
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
%ifdef CONFIG
{
"RegData": {
"RAX": "0x3f800000",
"RBX": "0x3f800000",
"RCX": "0x3f800000",
"R8": "0x3f800000",
"R9": "0x3f800000",
"R10": "0x3f800000"
},
"MemoryRegions": {
"0x100000000": "4096"
}
}
%endif

; Stores 1.0 (pushed by fld1) as a 32-bit float through each addressing form:
; base, base+disp, base+index, base+index*scale, base+index+disp and
; base+index*scale+disp. Every location is then read back into a register that
; the CONFIG block expects to hold 0x3f800000, the single-precision encoding of 1.0.
section .bss
base resb 4096

section .text

; Setup: st0 = 1.0, rdx = buffer base, rsi = index (0x64)
fld1
lea rdx, [rel base]
mov rsi, 0x64

; Test fst (non-popping, so st0 stays 1.0 for every store)
fst dword [rdx]
fst dword [rdx + 0xa]
fst dword [rdx + rsi]
fst dword [rdx + rsi * 4]
fst dword [rdx + rsi + 0xa]
fst dword [rdx + rsi * 4 + 0xa]

; Result check: reload each stored dword using the same addressing form
mov eax, dword [rdx]
mov ebx, dword [rdx + 0xa]
mov ecx, dword [rdx + rsi]
mov r8d, dword [rdx + rsi * 4]
mov r9d, dword [rdx + rsi + 0xa]
mov r10d, dword [rdx + rsi * 4 + 0xa]

hlt
Loading

0 comments on commit 3f788eb

Please sign in to comment.