Skip to content

Commit 3f788eb

Browse files
Merge pull request #4266 from pmatos/FSTOpt
x87 fst/fld optimization for different addrmodes
2 parents d01db8f + 8191c49 commit 3f788eb

19 files changed

+29927
-2643
lines changed

FEXCore/Source/Interface/Core/OpcodeDispatcher/X87.cpp

+18-2
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ desc: Handles x86/64 x87 to IR
1616
#include <FEXCore/Utils/LogManager.h>
1717
#include <FEXCore/Utils/FPState.h>
1818

19+
#include <cmath>
1920
#include <stddef.h>
2021
#include <stdint.h>
2122

@@ -129,8 +130,23 @@ void OpDispatchBuilder::FILD(OpcodeArgs) {
129130
}
130131

131132
void OpDispatchBuilder::FST(OpcodeArgs, IR::OpSize Width) {
132-
Ref Mem = LoadSource(GPRClass, Op, Op->Dest, Op->Flags, {.LoadData = false});
133-
_StoreStackMemory(Mem, OpSize::i128Bit, true, Width);
133+
// Ref Mem = LoadSource(GPRClass, Op, Op->Dest, Op->Flags, {.LoadData = false});
134+
// FIXME: Is TSO relevant for x87?
135+
AddressMode A = DecodeAddress(Op, Op->Dest, MemoryAccessType::DEFAULT, false);
136+
137+
// The index scale must be a non-zero power of two.
138+
LOGMAN_THROW_A_FMT(A.IndexScale > 0 && (A.IndexScale & (A.IndexScale - 1)) == 0, "Invalid index scale");
139+
140+
Ref Addr = A.Base ? A.Base : _Constant(0);
141+
if (A.Index) {
142+
Ref ScaledIndex = A.Index;
143+
if (A.IndexScale > 1) {
144+
ScaledIndex = _Lshl(A.AddrSize, ScaledIndex, _Constant(std::log2(A.IndexScale)));
145+
}
146+
Addr = _Add(A.AddrSize, Addr, ScaledIndex);
147+
}
148+
149+
_StoreStackMem(OpSize::i128Bit, Width, Addr, _Constant(A.Offset), /*Float=*/true);
134150
if (Op->TableInfo->Flags & X86Tables::InstFlags::FLAGS_POP) {
135151
_PopStackDestroy();
136152
}

FEXCore/Source/Interface/Core/OpcodeDispatcher/X87F64.cpp

+14-2
Original file line numberDiff line numberDiff line change
@@ -104,9 +104,21 @@ void OpDispatchBuilder::FILDF64(OpcodeArgs) {
104104
}
105105

106106
void OpDispatchBuilder::FSTF64(OpcodeArgs, IR::OpSize Width) {
107-
Ref Mem = LoadSource(GPRClass, Op, Op->Dest, Op->Flags, {.LoadData = false});
108-
_StoreStackMemory(Mem, OpSize::i64Bit, true, Width);
107+
AddressMode A = DecodeAddress(Op, Op->Dest, MemoryAccessType::DEFAULT, false);
109108

109+
// The index scale must be a non-zero power of two.
110+
LOGMAN_THROW_A_FMT(A.IndexScale > 0 && (A.IndexScale & (A.IndexScale - 1)) == 0, "Invalid index scale");
111+
112+
Ref Addr = A.Base ? A.Base : _Constant(0);
113+
if (A.Index) {
114+
Ref ScaledIndex = A.Index;
115+
if (A.IndexScale > 1) {
116+
ScaledIndex = _Lshl(A.AddrSize, ScaledIndex, _Constant(std::log2(A.IndexScale)));
117+
}
118+
Addr = _Add(A.AddrSize, Addr, ScaledIndex);
119+
}
120+
121+
_StoreStackMem(OpSize::i64Bit, Width, Addr, _Constant(A.Offset), /*Float=*/true);
110122
if (Op->TableInfo->Flags & X86Tables::InstFlags::FLAGS_POP) {
111123
_PopStackDestroy();
112124
}

FEXCore/Source/Interface/IR/IR.json

+1-1
Original file line numberDiff line numberDiff line change
@@ -2788,7 +2788,7 @@
27882788
"HasSideEffects": true,
27892789
"X87": true
27902790
},
2791-
"StoreStackMemory GPR:$Addr, OpSize:$SourceSize, i1:$Float, OpSize:$StoreSize": {
2791+
"StoreStackMem OpSize:$SourceSize, OpSize:$StoreSize, GPR:$Addr, GPR:$Offset, i1:$Float": {
27922792
"Desc": [
27932793
"Takes the top value off the x87 stack and stores it to memory.",
27942794
"SourceSize is 128bit for F80 values, 64-bit for low precision.",

FEXCore/Source/Interface/IR/IREmitter.h

-1
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@
1111
#include <FEXCore/fextl/vector.h>
1212

1313
#include <algorithm>
14-
#include <new>
1514
#include <stdint.h>
1615
#include <string.h>
1716

FEXCore/Source/Interface/IR/Passes/x87StackOptimizationPass.cpp

+35-16
Original file line numberDiff line numberDiff line change
@@ -161,6 +161,18 @@ class X87StackOptimization final : public Pass {
161161
// Helpers
162162
Ref RotateRight8(uint32_t V, Ref Amount);
163163

164+
// Returns true iff Node is an OP_CONSTANT op whose Constant field equals 0; any other op kind yields false.
165+
bool IsZero(Ref Node) {
166+
auto Header = IR->GetOp<IR::IROp_Header>(Node); // NOTE(review): dereferences the IR member; presumably only valid once Run() has set it - confirm.
167+
if (Header->Op != OP_CONSTANT) {
168+
return false; // Not a constant op, so it cannot be a zero constant.
169+
}
170+
171+
auto Const = Header->C<IROp_Constant>(); // Op kind was checked above; view the header as a constant op.
172+
return Const->Constant == 0;
173+
}
174+
175+
164176
// Handles a Unary operation.
165177
// Takes the op we are handling, the Node for the reduced precision case and the node for the normal case.
166178
// Depending on the type of Op64, we might need to pass a couple of extra constant arguments, this happens
@@ -245,6 +257,7 @@ class X87StackOptimization final : public Pass {
245257
bool SlowPath = false;
246258
// Keeping IREmitter not to pass arguments around
247259
IREmitter* IREmit = nullptr;
260+
// Non-owning pointer to the IR view being processed; assigned in Run() and read by IsZero().
// Initialized to nullptr (like IREmit) so it never holds an indeterminate value before Run() sets it.
IRListView* IR = nullptr;
248261
};
249262

250263
inline void X87StackOptimization::InvalidateCaches() {
@@ -537,6 +550,7 @@ void X87StackOptimization::Run(IREmitter* Emit) {
537550

538551
// Initialize IREmit member
539552
IREmit = Emit;
553+
IR = &CurrentIR;
540554

541555
// Run optimization proper
542556
for (auto [BlockNode, BlockHeader] : CurrentIR.GetBlocks()) {
@@ -780,11 +794,12 @@ void X87StackOptimization::Run(IREmitter* Emit) {
780794
break;
781795
}
782796

783-
case OP_STORESTACKMEMORY: {
784-
const auto* Op = IROp->C<IROp_StoreStackMemory>();
797+
case OP_STORESTACKMEM: {
798+
const auto* Op = IROp->C<IROp_StoreStackMem>();
785799
const auto& Value = MigrateToSlowPath_IfInvalid();
786800
Ref StackNode = SlowPath ? LoadStackValueAtOffset_Slow() : Value->StackDataNode;
787801
Ref AddrNode = CurrentIR.GetNode(Op->Addr);
802+
Ref Offset = CurrentIR.GetNode(Op->Offset);
788803

789804
// On the fast path we can optimize memory copies.
790805
// If we are doing:
@@ -796,45 +811,49 @@ void X87StackOptimization::Run(IREmitter* Emit) {
796811
// or similar. As long as the source size and dest size are one and the same.
797812
// This will avoid any conversions between source and stack element size and conversion back.
798813
if (!SlowPath && Value->Source && Value->Source->first == Op->StoreSize && Value->InterpretAsFloat) {
799-
IREmit->_StoreMem(Value->InterpretAsFloat ? FPRClass : GPRClass, Op->StoreSize, AddrNode, Value->Source->second);
814+
IREmit->_StoreMem(Value->InterpretAsFloat ? FPRClass : GPRClass, Op->StoreSize, Value->Source->second, AddrNode, Offset,
815+
OpSize::iInvalid, MEM_OFFSET_SXTX, 1);
800816
} else {
801817
if (ReducedPrecisionMode) {
802818
switch (Op->StoreSize) {
803-
case OpSize::i32Bit: {
804-
StackNode = IREmit->_Float_FToF(OpSize::i32Bit, OpSize::i64Bit, StackNode);
805-
IREmit->_StoreMem(FPRClass, OpSize::i32Bit, AddrNode, StackNode);
806-
break;
807-
}
819+
case OpSize::i32Bit:
808820
case OpSize::i64Bit: {
809-
IREmit->_StoreMem(FPRClass, OpSize::i64Bit, AddrNode, StackNode);
821+
if (Op->StoreSize == OpSize::i32Bit) {
822+
StackNode = IREmit->_Float_FToF(OpSize::i32Bit, OpSize::i64Bit, StackNode);
823+
}
824+
IREmit->_StoreMem(FPRClass, Op->StoreSize, StackNode, AddrNode, Offset, OpSize::iInvalid, MEM_OFFSET_SXTX, 1);
810825
break;
811826
}
812827
case OpSize::f80Bit: {
813828
StackNode = IREmit->_F80CVTTo(StackNode, OpSize::i64Bit);
814-
IREmit->_StoreMem(FPRClass, OpSize::i64Bit, AddrNode, StackNode);
829+
IREmit->_StoreMem(FPRClass, OpSize::i64Bit, StackNode, AddrNode, Offset, OpSize::iInvalid, MEM_OFFSET_SXTX, 1);
815830
auto Upper = IREmit->_VExtractToGPR(OpSize::i128Bit, OpSize::i64Bit, StackNode, 1);
816-
IREmit->_StoreMem(GPRClass, OpSize::i16Bit, Upper, AddrNode, GetConstant(8), OpSize::i64Bit, MEM_OFFSET_SXTX, 1);
831+
auto NewOffset = IREmit->_Add(OpSize::i64Bit, Offset, GetConstant(8));
832+
IREmit->_StoreMem(GPRClass, OpSize::i16Bit, Upper, AddrNode, NewOffset, OpSize::i64Bit, MEM_OFFSET_SXTX, 1);
817833
break;
818834
}
819835
default: ERROR_AND_DIE_FMT("Unsupported x87 size");
820836
}
821-
} else {
837+
} else { // !ReducedPrecisionMode
822838
if (Op->StoreSize != OpSize::f80Bit) { // if it's not 80bits then convert
823839
StackNode = IREmit->_F80CVT(Op->StoreSize, StackNode);
824840
}
825841
if (Op->StoreSize == OpSize::f80Bit) { // Part of code from StoreResult_WithOpSize()
826842
if (Features.SupportsSVE128 || Features.SupportsSVE256) {
827843
auto PReg = IREmit->_InitPredicate(OpSize::i16Bit, FEXCore::ToUnderlying(ARMEmitter::PredicatePattern::SVE_VL5));
844+
if (!IsZero(Offset)) {
845+
AddrNode = IREmit->_Add(OpSize::i64Bit, AddrNode, Offset);
846+
}
828847
IREmit->_StoreMemPredicate(OpSize::i128Bit, OpSize::i16Bit, StackNode, PReg, AddrNode);
829848
} else {
830849
// For X87 extended doubles, split before storing
831-
IREmit->_StoreMem(FPRClass, OpSize::i64Bit, AddrNode, StackNode);
850+
IREmit->_StoreMem(FPRClass, OpSize::i64Bit, StackNode, AddrNode, Offset, OpSize::iInvalid, MEM_OFFSET_SXTX, 1);
832851
auto Upper = IREmit->_VExtractToGPR(OpSize::i128Bit, OpSize::i64Bit, StackNode, 1);
833-
auto DestAddr = IREmit->_Add(OpSize::i64Bit, AddrNode, GetConstant(8));
834-
IREmit->_StoreMem(GPRClass, OpSize::i16Bit, DestAddr, Upper, OpSize::i64Bit);
852+
auto NewOffset = IREmit->_Add(OpSize::i64Bit, Offset, GetConstant(8));
853+
IREmit->_StoreMem(GPRClass, OpSize::i16Bit, Upper, AddrNode, NewOffset, OpSize::i64Bit, MEM_OFFSET_SXTX, 1);
835854
}
836855
} else {
837-
IREmit->_StoreMem(FPRClass, Op->StoreSize, AddrNode, StackNode);
856+
IREmit->_StoreMem(FPRClass, Op->StoreSize, StackNode, AddrNode, Offset, OpSize::iInvalid, MEM_OFFSET_SXTX, 1);
838857
}
839858
}
840859
}
+44
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
%ifdef CONFIG
2+
{
3+
"RegData": {
4+
"RAX": "0x3f800000",
5+
"RBX": "0x3f800000",
6+
"RCX": "0x3f800000",
7+
"RBP": "0x3f800000",
8+
"RDI": "0x3f800000",
9+
"RSP": "0x3f800000"
10+
},
11+
"MemoryRegions": {
12+
"0xf0000000": "4096"
13+
},
14+
"Mode": "32BIT"
15+
}
16+
%endif
17+
18+
section .bss
19+
base resb 4096 ; 4096-byte scratch buffer that the stores below write into
20+
21+
section .text
22+
23+
; Setup: push 1.0 onto the x87 stack, point edx at the buffer, and load the index register.
24+
fld1
25+
lea edx, [rel base]
26+
mov esi, 0x64
27+
28+
; Test fst: store ST(0) as a 32-bit float through each addressing form.
29+
fst dword [edx] ; base
30+
fst dword [edx + 0xa] ; base + displacement
31+
fst dword [edx + esi] ; base + index
32+
fst dword [edx + esi * 4] ; base + index * scale
33+
fst dword [edx + esi + 0xa] ; base + index + displacement
34+
fst dword [edx + esi * 4 + 0xa] ; base + index * scale + displacement
35+
36+
; Result check: reload each stored dword into a register listed in RegData (each expected to be 0x3f800000).
37+
mov eax, dword [edx]
38+
mov ebx, dword [edx + 0xa]
39+
mov ecx, dword [edx + esi]
40+
mov ebp, dword [edx + esi * 4]
41+
mov edi, dword [edx + esi + 0xa]
42+
mov esp, dword [edx + esi * 4 + 0xa] ; esp is overwritten last; only hlt follows
43+
44+
hlt

unittests/ASM/X87/DB_07_2.asm

+25
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
%ifdef CONFIG
2+
{
3+
"RegData": {
4+
"MM7": ["0x8000000000000000", "0x4000"]
5+
}
6+
}
7+
%endif
8+
9+
lea rdx, [rel data]
10+
fld tword [rdx + 8 * 0] ; load the 80-bit value at data (2.0) onto the x87 stack
11+
12+
lea rdx, [rel data2]
13+
lea rax, [rdx + 8 * 0] ; rax = address of data2
14+
fstp tword [rax] ; store the 80-bit value to data2 and pop the stack
15+
fld tword [rdx + 8 * 0] ; reload it from data2; RegData expects MM7 = 0x8000000000000000 / 0x4000
16+
17+
hlt
18+
19+
align 8
20+
data:
21+
dt 2.0 ; ten-byte (80-bit) constant 2.0
22+
dq 0 ; eight zero bytes after the ten-byte value - presumably padding, confirm
23+
data2:
24+
dt 0.0 ; initial contents, overwritten by the fstp above
25+
dq 0

unittests/ASM/X87/FST_AddrModes.asm

+43
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
%ifdef CONFIG
2+
{
3+
"RegData": {
4+
"RAX": "0x3f800000",
5+
"RBX": "0x3f800000",
6+
"RCX": "0x3f800000",
7+
"R8": "0x3f800000",
8+
"R9": "0x3f800000",
9+
"R10": "0x3f800000"
10+
},
11+
"MemoryRegions": {
12+
"0x100000000": "4096"
13+
}
14+
}
15+
%endif
16+
17+
section .bss
18+
base resb 4096 ; 4096-byte scratch buffer that the stores below write into
19+
20+
section .text
21+
22+
; Setup: push 1.0 onto the x87 stack, point rdx at the buffer, and load the index register.
23+
fld1
24+
lea rdx, [rel base]
25+
mov rsi, 0x64
26+
27+
; Test fst: store ST(0) as a 32-bit float through each addressing form.
28+
fst dword [rdx] ; base
29+
fst dword [rdx + 0xa] ; base + displacement
30+
fst dword [rdx + rsi] ; base + index
31+
fst dword [rdx + rsi * 4] ; base + index * scale
32+
fst dword [rdx + rsi + 0xa] ; base + index + displacement
33+
fst dword [rdx + rsi * 4 + 0xa] ; base + index * scale + displacement
34+
35+
; Result check: reload each stored dword into a register listed in RegData (each expected to be 0x3f800000).
36+
mov eax, dword [rdx]
37+
mov ebx, dword [rdx + 0xa]
38+
mov ecx, dword [rdx + rsi]
39+
mov r8d, dword [rdx + rsi * 4]
40+
mov r9d, dword [rdx + rsi + 0xa]
41+
mov r10d, dword [rdx + rsi * 4 + 0xa]
42+
43+
hlt

0 commit comments

Comments
 (0)