From be3458a6bd7f81bdd98fc2d5cc7ac3f930ff3d53 Mon Sep 17 00:00:00 2001 From: Denis Kotov Date: Thu, 13 Oct 2022 13:16:14 +0300 Subject: [PATCH 1/2] Added push(...) and pop(...) for SIMD registers --- src/cpu/x64/jit_generator.hpp | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/src/cpu/x64/jit_generator.hpp b/src/cpu/x64/jit_generator.hpp index 9c930a7fab9..db5d65bb39d 100644 --- a/src/cpu/x64/jit_generator.hpp +++ b/src/cpu/x64/jit_generator.hpp @@ -147,6 +147,8 @@ class jit_generator : public Xbyak::CodeGenerator, public c_compatible { private: const size_t xmm_len = 16; + const size_t ymm_len = 32; /* bytes push()/pop() move rsp by for a YMM register */ + const size_t zmm_len = 64; /* bytes push()/pop() move rsp by for a ZMM register */ #ifdef _WIN32 const size_t xmm_to_preserve_start = 6; const size_t xmm_to_preserve = 10; @@ -182,6 +184,35 @@ class jit_generator : public Xbyak::CodeGenerator, public c_compatible { inline size_t get_size_of_abi_save_regs() { return size_of_abi_save_regs; } + using Xbyak::CodeGenerator::push; /* keep the inherited push overloads visible next to the SIMD overload below */ + using Xbyak::CodeGenerator::pop; /* keep the inherited pop overloads visible next to the SIMD overload below */ + + inline void push(const Xbyak::Xmm &xmm) { /* Saves a SIMD register on the stack: lowers rsp by xmm_len, ymm_len or zmm_len according to isXMM/isYMM/isZMM, then stores the register at [rsp]. Does nothing if none of the three checks holds. */ + if (xmm.isXMM()) { + sub(rsp, xmm_len); + uni_vmovdqu(ptr[rsp], xmm); + } else if (xmm.isYMM()) { + sub(rsp, ymm_len); + uni_vmovdqu(ptr[rsp], Xbyak::Ymm{xmm.getIdx()}); /* a Ymm handle is rebuilt from the register index; presumably to select the Ymm overload of uni_vmovdqu - TODO confirm */ + } else if (xmm.isZMM()) { + sub(rsp, zmm_len); + uni_vmovdqu(ptr[rsp], Xbyak::Zmm{xmm.getIdx()}); + } + } + + inline void pop(const Xbyak::Xmm &xmm) { /* Reloads a SIMD register from [rsp], then raises rsp by the same width push() subtracts for that register kind. Does nothing if none of the three checks holds. */ + if (xmm.isXMM()) { + uni_vmovdqu(xmm, ptr[rsp]); + add(rsp, xmm_len); + } else if (xmm.isYMM()) { + uni_vmovdqu(Xbyak::Ymm{xmm.getIdx()}, ptr[rsp]); + add(rsp, ymm_len); + } else if (xmm.isZMM()) { + uni_vmovdqu(Xbyak::Zmm{xmm.getIdx()}, ptr[rsp]); + add(rsp, zmm_len); + } + } + void preamble() { if (xmm_to_preserve) { sub(rsp, xmm_to_preserve * xmm_len); From b5a919f0bf3bbea0f353f751dfcf427c4cb23bcc Mon Sep 17 00:00:00 2001 From: Denis Kotov Date: Tue, 27 Sep 2022 14:21:22 +0300 Subject: [PATCH 2/2] Fixes for uni_vaddps, uni_vsubps, uni_vpmulld, uni_vmulps, uni_vmaxps, uni_vminps and uni_vcmpps operations --- 
src/cpu/x64/jit_generator.hpp | 108 ++++++++++++++++++++++++---------- 1 file changed, 78 insertions(+), 30 deletions(-) diff --git a/src/cpu/x64/jit_generator.hpp b/src/cpu/x64/jit_generator.hpp index db5d65bb39d..c3e74e6204a 100644 --- a/src/cpu/x64/jit_generator.hpp +++ b/src/cpu/x64/jit_generator.hpp @@ -657,16 +657,22 @@ class jit_generator : public Xbyak::CodeGenerator, public c_compatible { vdivps(x, op1, op2); } - void uni_vaddps(const Xbyak::Xmm &x, const Xbyak::Operand &op1, + void uni_vaddps(const Xbyak::Xmm &x, const Xbyak::Xmm &op1, const Xbyak::Operand &op2) { if (is_valid_isa(avx)) vaddps(x, op1, op2); else { - if (!x.isEqualIfNotInherited(op1)) movups(x, op1); - addps(x, op2); + if (x.getIdx() == op1.getIdx()) { /* non-AVX path: x and op1 share a register index, so op2 is added in place */ + addps(x, op2); + } else if (x.isEqualIfNotInherited(op2)) { /* x is op2: add op1 into x (operand order swapped) instead of first copying op1 over op2 */ + addps(x, op1); + } else { /* x differs from both sources: copy op1 into x, then add op2 */ + movups(x, op1); + addps(x, op2); + } } } - void uni_vaddps(const Xbyak::Ymm &x, const Xbyak::Operand &op1, + void uni_vaddps(const Xbyak::Ymm &x, const Xbyak::Ymm &op1, const Xbyak::Operand &op2) { vaddps(x, op1, op2); } @@ -775,16 +781,25 @@ class jit_generator : public Xbyak::CodeGenerator, public c_compatible { vsubss(x, Xbyak::Xmm(op1.getIdx()), Xbyak::Xmm(op2.getIdx())); } - void uni_vsubps(const Xbyak::Xmm &x, const Xbyak::Operand &op1, + void uni_vsubps(const Xbyak::Xmm &x, const Xbyak::Xmm &op1, const Xbyak::Operand &op2) { if (is_valid_isa(avx)) vsubps(x, op1, op2); else { - if (!x.isEqualIfNotInherited(op1)) movups(x, op1); - subps(x, op2); + if (x.getIdx() == op1.getIdx()) { /* non-AVX path: x and op1 share a register index, so op2 is subtracted in place */ + subps(x, op2); + } else if (x.isEqualIfNotInherited(op2)) { /* x is op2: the operands are not swapped here; op1 is saved with push(), used as the working register, its result copied to x, and then restored by pop() */ + push(op1); + subps(op1, op2); + movups(x, op1); + pop(op1); + } else { /* x differs from both sources: copy op1 into x, then subtract op2 */ + movups(x, op1); + subps(x, op2); + } } } - void uni_vsubps(const Xbyak::Ymm &x, const Xbyak::Operand &op1, + void uni_vsubps(const Xbyak::Ymm &x, const Xbyak::Ymm &op1, const Xbyak::Operand &op2) { vsubps(x, op1, op2); } @@ -818,13 +833,19 @@ class jit_generator : public Xbyak::CodeGenerator, public c_compatible { } } - void 
uni_vpmulld(const Xbyak::Xmm &x1, const Xbyak::Xmm &x2, - const Xbyak::Operand &op) { + void uni_vpmulld(const Xbyak::Xmm &x, const Xbyak::Xmm &op1, + const Xbyak::Operand &op2) { if (is_valid_isa(avx)) { - vpmulld(x1, x2, op); + vpmulld(x, op1, op2); } else { - if (x1.getIdx() != x2.getIdx()) movdqa(x1, x2); - pmulld(x1, op); + if (x.getIdx() == op1.getIdx()) { /* non-AVX path: x and op1 share a register index, so x is multiplied by op2 in place */ + pmulld(x, op2); + } else if (x.isEqualIfNotInherited(op2)) { /* x is op2: multiply x by op1 (operand order swapped) instead of first copying op1 over op2 */ + pmulld(x, op1); + } else { /* x differs from both sources: copy op1 into x, then multiply by op2 */ + movdqa(x, op1); + pmulld(x, op2); + } } } void uni_vpmulld(const Xbyak::Ymm &x1, const Xbyak::Ymm &x2, @@ -832,16 +853,22 @@ class jit_generator : public Xbyak::CodeGenerator, public c_compatible { vpmulld(x1, x2, op); } - void uni_vmulps(const Xbyak::Xmm &x, const Xbyak::Operand &op1, + void uni_vmulps(const Xbyak::Xmm &x, const Xbyak::Xmm &op1, const Xbyak::Operand &op2) { if (is_valid_isa(avx)) vmulps(x, op1, op2); else { - if (!x.isEqualIfNotInherited(op1)) movups(x, op1); - mulps(x, op2); + if (x.getIdx() == op1.getIdx()) { /* non-AVX path: x and op1 share a register index, so x is multiplied by op2 in place */ + mulps(x, op2); + } else if (x.isEqualIfNotInherited(op2)) { /* x is op2: multiply x by op1 (operand order swapped) instead of first copying op1 over op2 */ + mulps(x, op1); + } else { /* x differs from both sources: copy op1 into x, then multiply by op2 */ + movups(x, op1); + mulps(x, op2); + } } } - void uni_vmulps(const Xbyak::Ymm &x, const Xbyak::Operand &op1, + void uni_vmulps(const Xbyak::Ymm &x, const Xbyak::Ymm &op1, const Xbyak::Operand &op2) { vmulps(x, op1, op2); } @@ -1243,16 +1270,22 @@ class jit_generator : public Xbyak::CodeGenerator, public c_compatible { vpsrld(x, op, imm); } - void uni_vmaxps(const Xbyak::Xmm &x, const Xbyak::Operand &op1, + void uni_vmaxps(const Xbyak::Xmm &x, const Xbyak::Xmm &op1, const Xbyak::Operand &op2) { if (is_valid_isa(avx)) vmaxps(x, op1, op2); else { - if (!x.isEqualIfNotInherited(op1)) movups(x, op1); - maxps(x, op2); + if (x.getIdx() == op1.getIdx()) { /* non-AVX path: x and op1 share a register index, so the maximum with op2 is taken in place */ + maxps(x, op2); + } else if (x.isEqualIfNotInherited(op2)) { /* x is op2: operand order is swapped relative to vmaxps(x, op1, op2). NOTE(review): MAXPS is documented as order-sensitive for NaN and signed-zero inputs - confirm the swap is acceptable here */ + maxps(x, op1); + } else { /* x differs from both sources: copy op1 into x, then take the maximum with op2 */ + movups(x, op1); + maxps(x, op2); + } } } - void uni_vmaxps(const Xbyak::Ymm &x, const Xbyak::Operand &op1, + void uni_vmaxps(const Xbyak::Ymm &x, const Xbyak::Ymm 
&op1, const Xbyak::Operand &op2) { vmaxps(x, op1, op2); } @@ -1267,17 +1300,23 @@ class jit_generator : public Xbyak::CodeGenerator, public c_compatible { } } - void uni_vminps(const Xbyak::Xmm &x, const Xbyak::Operand &op1, + void uni_vminps(const Xbyak::Xmm &x, const Xbyak::Xmm &op1, const Xbyak::Operand &op2) { if (is_valid_isa(avx)) vminps(x, op1, op2); else { - if (!x.isEqualIfNotInherited(op1)) movups(x, op1); - minps(x, op2); + if (x.getIdx() == op1.getIdx()) { /* non-AVX path: x and op1 share a register index, so the minimum with op2 is taken in place */ + minps(x, op2); + } else if (x.isEqualIfNotInherited(op2)) { /* x is op2: operand order is swapped relative to vminps(x, op1, op2). NOTE(review): MINPS is documented as order-sensitive for NaN and signed-zero inputs - confirm the swap is acceptable here */ + minps(x, op1); + } else { /* x differs from both sources: copy op1 into x, then take the minimum with op2 */ + movups(x, op1); + minps(x, op2); + } } } - void uni_vminps(const Xbyak::Ymm &x, const Xbyak::Operand &op1, + void uni_vminps(const Xbyak::Ymm &x, const Xbyak::Ymm &op1, const Xbyak::Operand &op2) { vminps(x, op1, op2); } @@ -1314,13 +1353,22 @@ class jit_generator : public Xbyak::CodeGenerator, public c_compatible { vpmovzxbd(y, op); } - void uni_vcmpps(const Xbyak::Xmm &x1, const Xbyak::Xmm &x2, - const Xbyak::Operand &op, int cmp_predicate) { + void uni_vcmpps(const Xbyak::Xmm &x, const Xbyak::Xmm &op1, + const Xbyak::Operand &op2, int cmp_predicate) { if (is_valid_isa(avx)) - vcmpps(x1, x2, op, cmp_predicate); + vcmpps(x, op1, op2, cmp_predicate); else { - if (x1.getIdx() != x2.getIdx()) uni_vmovups(x1, x2); - cmpps(x1, op, cmp_predicate); + if (x.getIdx() == op1.getIdx()) { /* non-AVX path: x and op1 share a register index, so the comparison with op2 is done in place */ + cmpps(x, op2, cmp_predicate); + } else if (x.isEqualIfNotInherited(op2)) { /* x is op2: the operands are not swapped here; op1 is saved with push(), used as the working register, its result copied to x, and then restored by pop() */ + push(op1); + cmpps(op1, op2, cmp_predicate); + movups(x, op1); + pop(op1); + } else { /* x differs from both sources: copy op1 into x, then compare with op2 */ + movups(x, op1); + cmpps(x, op2, cmp_predicate); + } } } void uni_vcmpps(const Xbyak::Ymm &x1, const Xbyak::Ymm &x2,