From defa057c11c0138f6ed8924587cf10fcd2ff3091 Mon Sep 17 00:00:00 2001 From: Roy Oursler Date: Fri, 21 Mar 2025 14:01:54 -0700 Subject: [PATCH] ngen: downstream nGEN --- src/gpu/intel/ocl/hw_info.cpp | 7 +- src/gpu/intel/sycl/l0/utils.cpp | 8 +- third_party/ngen/ngen.hpp | 18 +- third_party/ngen/ngen_asm.hpp | 46 +- third_party/ngen/ngen_auto_swsb.hpp | 11 +- third_party/ngen/ngen_config_internal.hpp | 44 + third_party/ngen/ngen_core.hpp | 37 +- third_party/ngen/ngen_decoder.hpp | 11 +- third_party/ngen/ngen_elf.hpp | 35 +- third_party/ngen/ngen_emulation.hpp | 796 +++++++++++++++++++ third_party/ngen/ngen_gen12.hpp | 4 +- third_party/ngen/ngen_gen8.hpp | 2 +- third_party/ngen/ngen_interface.hpp | 9 +- third_party/ngen/ngen_level_zero.hpp | 93 +-- third_party/ngen/ngen_opencl.hpp | 51 +- third_party/ngen/ngen_register_allocator.hpp | 4 +- third_party/ngen/ngen_sycl.hpp | 163 ++++ third_party/ngen/ngen_utils.hpp | 3 +- third_party/ngen/npack/neo_packager.hpp | 45 +- 19 files changed, 1200 insertions(+), 187 deletions(-) create mode 100644 third_party/ngen/ngen_config_internal.hpp create mode 100644 third_party/ngen/ngen_emulation.hpp create mode 100644 third_party/ngen/ngen_sycl.hpp diff --git a/src/gpu/intel/ocl/hw_info.cpp b/src/gpu/intel/ocl/hw_info.cpp index aeb1b4a0329..b92fec7b838 100644 --- a/src/gpu/intel/ocl/hw_info.cpp +++ b/src/gpu/intel/ocl/hw_info.cpp @@ -60,13 +60,12 @@ status_t init_gpu_hw_info(impl::engine_t *engine, cl_device_id device, int &gpu_product_family, int &stepping_id, uint64_t &native_extensions, bool &mayiuse_systolic, bool &mayiuse_ngen_kernels) { using namespace ngen; - HW hw = HW::Unknown; - Product product = {ProductFamily::Unknown, 0}; - jit::generator_t::detectHWInfo(context, device, hw, product); + Product product = ngen::OpenCLCodeGenerator::detectHWInfo( + context, device); bool is_xelpg = (product.family == ngen::ProductFamily::ARL || product.family == ngen::ProductFamily::MTL); - gpu_arch = jit::convert_ngen_arch_to_dnnl(hw); + gpu_arch = jit::convert_ngen_arch_to_dnnl(ngen::getCore(product.family)); gpu_product_family = static_cast(product.family); stepping_id = product.stepping; diff --git a/src/gpu/intel/sycl/l0/utils.cpp b/src/gpu/intel/sycl/l0/utils.cpp index e22d34e141b..4984a3c5cc1 100644 --- a/src/gpu/intel/sycl/l0/utils.cpp +++ b/src/gpu/intel/sycl/l0/utils.cpp @@ -386,12 +386,10 @@ status_t init_gpu_hw_info(impl::engine_t *engine, ze_device_handle_t device, int &stepping_id, uint64_t &native_extensions, bool &mayiuse_systolic, bool &mayiuse_ngen_kernels) { using namespace ngen; - HW hw = HW::Unknown; - Product product = {ProductFamily::Unknown, 0}; - LevelZeroCodeGenerator::detectHWInfo( - context, device, hw, product); + Product product = LevelZeroCodeGenerator::detectHWInfo( + context, device); - gpu_arch = jit::convert_ngen_arch_to_dnnl(hw); + gpu_arch = jit::convert_ngen_arch_to_dnnl(ngen::getCore(product.family)); gpu_product_family = static_cast(product.family); stepping_id = product.stepping; diff --git a/third_party/ngen/ngen.hpp b/third_party/ngen/ngen.hpp index 5580a324a10..63a07eb8472 100644 --- a/third_party/ngen/ngen.hpp +++ b/third_party/ngen/ngen.hpp @@ -27,12 +27,12 @@ #ifndef NGEN_HPP #define NGEN_HPP -#ifdef ENABLE_LLVM_WCONVERSION +#if defined(__clang__) #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wimplicit-int-conversion" #endif -#include "ngen_config.hpp" +#include "ngen_config_internal.hpp" #include #include @@ -42,16 +42,13 @@ #include "ngen_core.hpp" #include "ngen_auto_swsb.hpp" #include "ngen_debuginfo.hpp" - // ----------------------------------------------------------------------- // Binary formats, split between pre-Gen12 and post-Gen12. #include "ngen_gen8.hpp" #include "ngen_gen12.hpp" // ----------------------------------------------------------------------- -#ifdef NGEN_ASM #include "ngen_asm.hpp" -#endif namespace NGEN_NAMESPACE { @@ -299,7 +296,7 @@ class BinaryCodeGenerator pushStream(rootStream); } - explicit BinaryCodeGenerator(int stepping_ = 0, DebugConfig debugConfig = {}) : BinaryCodeGenerator({genericProductFamily(hw), stepping_}, debugConfig) {} + explicit BinaryCodeGenerator(int stepping_ = 0, DebugConfig debugConfig = {}) : BinaryCodeGenerator({genericProductFamily(hw), stepping_, PlatformType::Unknown}, debugConfig) {} ~BinaryCodeGenerator() { for (size_t sn = 1; sn < streamStack.size(); sn++) @@ -684,7 +681,7 @@ class BinaryCodeGenerator void halt(const InstructionModifier &mod, Label &jip, SourceLocation loc = {}) { halt(mod, jip, jip, loc); } - void if_(InstructionModifier mod, Label &jip, Label &uip, bool branchCtrl = false, SourceLocation loc = {}) { + void if_(InstructionModifier mod, Label &jip, Label &uip, bool branchCtrl, SourceLocation loc = {}) { mod.setBranchCtrl(branchCtrl); opBranch(Opcode::if_, mod, null, jip, uip, loc); } @@ -1506,10 +1503,15 @@ int getStepping() const { return scope::getStepping(); } \ void setProduct(NGEN_NAMESPACE::Product product_) { scope::setProduct(product_); } \ void setProductFamily(NGEN_NAMESPACE::ProductFamily family_) { scope::setProductFamily(family_); } \ void setStepping(int stepping_) { scope::setStepping(stepping_); } \ +NGEN_FORWARD_SCOPE_EXTRA(scope) \ NGEN_FORWARD_SCOPE_OP_NAMES(scope) \ NGEN_FORWARD_SCOPE_MIN_MAX(scope) \ NGEN_FORWARD_SCOPE_REGISTERS(scope) +#define NGEN_FORWARD_SCOPE_EXTRA(scope) +#define NGEN_FORWARD_SCOPE_EXTRA_ELF_OVERRIDES(hw) + + #ifdef NGEN_NO_OP_NAMES #define NGEN_FORWARD_SCOPE_OP_NAMES(scope) #else @@ -2798,7 +2800,7 @@ void BinaryCodeGenerator::opNop(Opcode op, SourceLocation loc) } /* namespace NGEN_NAMESPACE */ -#ifdef ENABLE_LLVM_WCONVERSION +#if defined(__clang__) #pragma clang diagnostic pop #endif diff --git a/third_party/ngen/ngen_asm.hpp b/third_party/ngen/ngen_asm.hpp index 8f1190898e3..438bff51302 100644 --- a/third_party/ngen/ngen_asm.hpp +++ b/third_party/ngen/ngen_asm.hpp @@ -16,15 +16,18 @@ #ifndef NGEN_ASM_HPP #define NGEN_ASM_HPP -#ifdef NGEN_ASM -#include "ngen_config.hpp" +#include "ngen_config_internal.hpp" + +#ifdef NGEN_ASM #include #include #include #include +#include "ngen_core.hpp" +#include "ngen_debuginfo.hpp" #include "ngen_gen12.hpp" namespace NGEN_NAMESPACE { @@ -210,6 +213,7 @@ struct AsmInstruction { explicit AsmInstruction(uint32_t inum_, const std::string &comment_) : op(Opcode::illegal), ext(0), inum(inum_), mod{}, dst{}, src{}, labelManager{nullptr}, comment{comment_} {} inline AsmInstruction(const autoswsb::SyncInsertion &si); + inline AsmInstruction(const autoswsb::DummyMovInsertion &mi); bool isLabel() const { return (op == Opcode::illegal) && (dst.type == AsmOperand::Type::label); } bool isComment() const { return (op == Opcode::illegal) && !comment.empty(); } @@ -278,6 +282,21 @@ AsmInstruction::AsmInstruction(const autoswsb::SyncInsertion &si) src[0] = NullRegister(); } +AsmInstruction::AsmInstruction(const autoswsb::DummyMovInsertion &mi) +{ + op = Opcode::mov_gen12; + ext = 0; + mod = 1 | InstructionModifier::createMaskCtrl(true); + mod.setSWSB(mi.swsb); + dst = NullRegister().retype(mi.dt); + for (auto n = 1; n < 4; n++) + src[n] = NoOperand(); + if (mi.constant) { + src[0] = Immediate::zero(mi.dt); + } else + src[0] = GRF(mi.grf).sub(0, mi.dt); +} + unsigned AsmInstruction::getTypecode(const AsmOperand &op) { DataType dt = DataType::invalid; @@ -416,7 +435,7 @@ class AsmCodeGenerator { streamStack.push_back(new InstructionStream()); } - explicit AsmCodeGenerator(HW hardware_, int stepping_ = 0) : AsmCodeGenerator({genericProductFamily(hardware_), 0}) {} + explicit AsmCodeGenerator(HW hardware_, int stepping_ = 0) : AsmCodeGenerator({genericProductFamily(hardware_), 0, PlatformType::Unknown}) {} AsmCodeGenerator(HW hardware_, std::ostream &defaultOutput_, int stepping_ = 0) : AsmCodeGenerator(hardware_, stepping_) { defaultOutput = &defaultOutput_; @@ -488,7 +507,6 @@ class AsmCodeGenerator { LabelManager labelManager; std::vector streamStack; - inline void unsupported(); // Output functions. @@ -555,7 +573,7 @@ class AsmCodeGenerator { src0.fixup(hardware, 1, 0, defaultType, 0, 3); src1.fixup(hardware, 1, 0, defaultType, 1, 3); src2.fixup(hardware, 1, 0, defaultType, 2, 3); - (void) streamStack.back()->append(op, static_cast((sdepth << 8) | rcount), mod | defaultModifier, &labelManager, dst, src0, src1, src2); + (void) streamStack.back()->append(op, (sdepth << 8) | rcount, mod | defaultModifier, &labelManager, dst, src0, src1, src2); } template void opCall(Opcode op, const InstructionModifier &mod, D dst, S0 src0) { (void) streamStack.back()->append(op, 0, mod | defaultModifier | NoMask, &labelManager, dst, src0); @@ -582,7 +600,6 @@ class AsmCodeGenerator { bool getDefaultNoMask() const { return defaultModifier.isWrEn(); } bool getDefaultAutoSWSB() const { return defaultModifier.isAutoSWSB(); } - // Stream handling. void pushStream() { pushStream(new InstructionStream()); } void pushStream(InstructionStream &s) { pushStream(&s); } @@ -917,7 +934,7 @@ class AsmCodeGenerator { void halt(const InstructionModifier &mod, Label &jip, SourceLocation loc = {}) { halt(mod, jip, jip); } - void if_(InstructionModifier mod, Label &jip, Label &uip, bool branchCtrl = false, SourceLocation loc = {}) { + void if_(InstructionModifier mod, Label &jip, Label &uip, bool branchCtrl, SourceLocation loc = {}) { (void) jip.getID(labelManager); (void) uip.getID(labelManager); opX(Opcode::if_, DataType::invalid, mod, NoOperand(), jip, uip, NoOperand(), branchCtrl); @@ -1073,6 +1090,9 @@ class AsmCodeGenerator { } template void movi(const InstructionModifier &mod, const RegData &dst, const RegData &src0, SourceLocation loc = {}) { +#ifdef NGEN_SAFE + if (!src0.isIndirect()) throw invalid_address_mode_exception(); +#endif if (hardware >= HW::Gen10) movi
(mod, dst, src0, null); else @@ -1082,6 +1102,7 @@ class AsmCodeGenerator { void movi(const InstructionModifier &mod, const RegData &dst, const RegData &src0, const Immediate &src1, SourceLocation loc = {}) { #ifdef NGEN_SAFE if (hardware < HW::Gen10) throw unsupported_instruction(); + if (!src0.isIndirect()) throw invalid_address_mode_exception(); #endif opX(isGen12 ? Opcode::movi_gen12 : Opcode::movi, getDataType
(), mod, dst, src0, src1); } @@ -1585,17 +1606,24 @@ void AsmCodeGenerator::getCode(std::ostream &out) autoswsb::BasicBlockList analysis = autoswsb::autoSWSB(hardware, declaredGRFs, streamStack.back()->buffer); std::multimap syncs; // Syncs inserted by auto-SWSB. + std::multimap movs; // Dummy moves inserted by auto-SWSB. - for (auto &bb : analysis) - for (auto &sync : bb.syncs) + for (auto &bb : analysis) { + for (auto &sync: bb.syncs) syncs.insert(std::make_pair(sync.inum, &sync)); + for (auto &mov: bb.movs) + movs.insert(std::make_pair(mov.inum, &mov)); + } auto nextSync = syncs.begin(); + auto nextMov = movs.begin(); int lineNo = 0; for (auto &i : streamStack.back()->buffer) { while ((nextSync != syncs.end()) && (nextSync->second->inum == i.inum)) outX(out, *(nextSync++)->second, lineNo++); + while ((nextMov != movs.end()) && (nextMov->second->inum == i.inum)) + outX(out, *(nextMov++)->second, lineNo++); if (i.isLabel()) { i.dst.label.outputText(out, PrintDetail::full, labelManager); diff --git a/third_party/ngen/ngen_auto_swsb.hpp b/third_party/ngen/ngen_auto_swsb.hpp index 998942a5e40..7974cb87265 100644 --- a/third_party/ngen/ngen_auto_swsb.hpp +++ b/third_party/ngen/ngen_auto_swsb.hpp @@ -21,11 +21,6 @@ #ifndef NGEN_AUTO_SWSB_HPP #define NGEN_AUTO_SWSB_HPP -#ifdef ENABLE_LLVM_WCONVERSION -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wimplicit-int-conversion" -#endif - #if defined(NGEN_DEBUG) || defined(NGEN_DEBUG_PROPAGATE) || defined(NGEN_DEBUG_BB) #include #include @@ -35,6 +30,8 @@ #include #include +#include "ngen_core.hpp" + namespace NGEN_NAMESPACE { namespace autoswsb { @@ -2630,8 +2627,4 @@ inline BasicBlockList autoSWSB(HW hw, int grfCount, Program &program) // Instruction operator[](int inum); // size_t size() const; -#ifdef ENABLE_LLVM_WCONVERSION -#pragma clang diagnostic pop -#endif - #endif /* NGEN_AUTOSWSB_HPP */ diff --git a/third_party/ngen/ngen_config_internal.hpp b/third_party/ngen/ngen_config_internal.hpp new file mode 100644 index 00000000000..acad524b957 --- /dev/null +++ b/third_party/ngen/ngen_config_internal.hpp @@ -0,0 +1,44 @@ +/******************************************************************************* +* Copyright 2025 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#ifndef NGEN_CONFIG_INTERNAL_HPP +#define NGEN_CONFIG_INTERNAL_HPP + +// Drop NGEN_CONFIG define once C++11/14 support dropped +#if (defined(__has_include) && __has_include("ngen_config.hpp")) || defined(NGEN_CONFIG) +#include "ngen_config.hpp" +#else +// Default config settings + +#ifndef NGEN_NAMESPACE +#define NGEN_NAMESPACE ngen +#endif + +#ifndef NGEN_ASM +#define NGEN_ASM +#endif + +#if (__cplusplus >= 202002L || _MSVC_LANG >= 202002L) +#if __has_include() +#include +#if __cpp_lib_source_location >= 201907L +#define NGEN_ENABLE_SOURCE_LOCATION true +#endif +#endif +#endif + +#endif +#endif /* header guard */ diff --git a/third_party/ngen/ngen_core.hpp b/third_party/ngen/ngen_core.hpp index e374227eabc..3112c79196d 100644 --- a/third_party/ngen/ngen_core.hpp +++ b/third_party/ngen/ngen_core.hpp @@ -17,17 +17,13 @@ #ifndef NGEN_CORE_HPP #define NGEN_CORE_HPP -#ifdef ENABLE_LLVM_WCONVERSION -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wimplicit-int-conversion" -#endif - - +#include #include #include -#include -#include #include +#include + +#include "ngen_config_internal.hpp" #include "ngen_utils.hpp" @@ -214,14 +210,14 @@ class invalid_execution_size_exception : public std::runtime_error { public: invalid_execution_size_exception() : std::runtime_error("Invalid execution size") {} }; -class invalid_address_modifier_exception : public std::runtime_error { -public: - invalid_address_modifier_exception() : std::runtime_error("Invalid address offset") {} -}; class invalid_address_mode_exception : public std::runtime_error { public: invalid_address_mode_exception() : std::runtime_error("Invalid address mode") {} }; +class invalid_address_modifier_exception : public std::runtime_error { +public: + invalid_address_modifier_exception() : std::runtime_error("Invalid address offset") {} +}; #endif // Graphics core generations. @@ -268,14 +264,15 @@ enum class ProductFamily : int { GenericXe3, }; +enum class PlatformType {Unknown, Integrated, Discrete}; + struct Product { ProductFamily family; int stepping; + PlatformType type; }; -enum class PlatformType {Unknown, Integrated, Discrete}; - -static inline bool operator==(const Product &p1, const Product &p2) { return p1.family == p2.family && p1.stepping == p2.stepping; } +static inline bool operator==(const Product &p1, const Product &p2) { return p1.family == p2.family && p1.stepping == p2.stepping && p1.type == p2.type; } static inline bool operator!=(const Product &p1, const Product &p2) { return !(p1 == p2); } static inline bool operator<(const Product &p1, const Product &p2) { return (p1.family < p2.family) || (p1.family == p2.family && p1.stepping < p2.stepping); } static inline bool operator>(const Product &p1, const Product &p2) { return p2 < p1; } @@ -750,6 +747,7 @@ class RegData { friend inline bool operator!=(const RegData &r1, const RegData &r2); friend inline RegData abs(const RegData &r); + #ifdef NGEN_ASM inline void outputText(std::ostream &str, PrintDetail detail, LabelManager &man) const; #endif @@ -1221,6 +1219,7 @@ class ExtendedReg { constexpr14 RegData &getBase() { return base; } constexpr RegData getBase() const { return base; } constexpr uint8_t getMMENum() const { return mmeNum; } + #ifdef NGEN_ASM inline void outputText(std::ostream &str, PrintDetail detail, LabelManager &man) const; static const bool emptyOp = false; @@ -1719,6 +1718,7 @@ static inline bool trackedByToken(HW hw, Opcode op, unsigned dstTypecode) switch (op) { case Opcode::math: if (hw >= HW::XeHPC) return false; + /* fall through */ case Opcode::dpas: case Opcode::dpasw: return true; @@ -2222,6 +2222,7 @@ class Immediate { result.set(int32_t(int16_t(payload))); return result; } + #ifdef NGEN_ASM inline void outputText(std::ostream &str, PrintDetail detail, LabelManager &man) const; #endif @@ -3157,11 +3158,9 @@ static inline void encodeAtomicDescriptors(HW hw, MessageDescriptor &desc, Exten if (dst.isNull()) desc.parts.responseLen = 0; } -} /* namespace NGEN_NAMESPACE */ -#ifdef ENABLE_LLVM_WCONVERSION -#pragma clang diagnostic pop -#endif +} /* namespace NGEN_NAMESPACE */ + #endif /* header guard */ diff --git a/third_party/ngen/ngen_decoder.hpp b/third_party/ngen/ngen_decoder.hpp index 318753765f0..e3cdc0054cd 100644 --- a/third_party/ngen/ngen_decoder.hpp +++ b/third_party/ngen/ngen_decoder.hpp @@ -17,17 +17,17 @@ #ifndef NGEN_DECODER_HPP #define NGEN_DECODER_HPP -#ifdef ENABLE_LLVM_WCONVERSION +#if defined(__clang__) #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wimplicit-int-conversion" #endif -#include "ngen_config.hpp" +#include "ngen_config_internal.hpp" -#include "ngen_core.hpp" #include "ngen_auto_swsb.hpp" -#include "ngen_gen8.hpp" +#include "ngen_core.hpp" #include "ngen_gen12.hpp" +#include "ngen_gen8.hpp" namespace NGEN_NAMESPACE { @@ -93,8 +93,7 @@ bool Decoder::getOperandRegion(autoswsb::DependencyRegion ®ion, int opNum) co } /* namespace NGEN_NAMESPACE */ -#ifdef ENABLE_LLVM_WCONVERSION +#if defined(__clang__) #pragma clang diagnostic pop #endif - #endif /* header guard */ diff --git a/third_party/ngen/ngen_elf.hpp b/third_party/ngen/ngen_elf.hpp index 7be23f68d9e..6359af0855f 100644 --- a/third_party/ngen/ngen_elf.hpp +++ b/third_party/ngen/ngen_elf.hpp @@ -31,10 +31,11 @@ class ELFCodeGenerator : public BinaryCodeGenerator public: inline std::vector getBinary(); static inline HW getBinaryArch(const std::vector &binary); - static inline void getBinaryHWInfo(const std::vector &binary, HW &outHW, Product &outProduct); + static inline Product getBinaryHWInfo(const std::vector &binary); explicit ELFCodeGenerator(Product product_, DebugConfig debugConfig = {}) : BinaryCodeGenerator(product_, debugConfig) {} explicit ELFCodeGenerator(int stepping_ = 0, DebugConfig debugConfig = {}) : BinaryCodeGenerator(stepping_, debugConfig) {} + explicit ELFCodeGenerator(DebugConfig debugConfig) : ELFCodeGenerator(0, debugConfig) {} protected: NEOInterfaceHandler interface_{hw}; @@ -489,7 +490,8 @@ class ELFCodeGenerator : public BinaryCodeGenerator }; #define NGEN_FORWARD_ELF(hw) \ - NGEN_FORWARD_SCOPE_NO_ELF_OVERRIDES(NGEN_NAMESPACE::BinaryCodeGenerator) \ +NGEN_FORWARD_SCOPE_NO_ELF_OVERRIDES(NGEN_NAMESPACE::ELFCodeGenerator) \ +NGEN_FORWARD_SCOPE_ELF_EXTRA(NGEN_NAMESPACE::ELFCodeGenerator) \ template void externalName(Targs&&... args) { NGEN_NAMESPACE::ELFCodeGenerator::externalName(std::forward(args)...); } \ const std::string &getExternalName() const { return NGEN_NAMESPACE::ELFCodeGenerator::getExternalName(); } \ int getSIMD() const { return NGEN_NAMESPACE::ELFCodeGenerator::getSIMD(); } \ @@ -527,6 +529,8 @@ template NGEN_NAMESPACE::Subregister getLocalSize(Targs&&... void prologue() { NGEN_NAMESPACE::ELFCodeGenerator::prologue(); } \ void epilogue(const NGEN_NAMESPACE::RegData &r0_info = NGEN_NAMESPACE::RegData()) { NGEN_NAMESPACE::ELFCodeGenerator::epilogue(r0_info); } +#define NGEN_FORWARD_SCOPE_ELF_EXTRA(scope) + template std::vector ELFCodeGenerator::getBinary() { @@ -602,20 +606,16 @@ std::vector ELFCodeGenerator::getBinary(const std::vector template inline HW ELFCodeGenerator::getBinaryArch(const std::vector &binary) { - HW outHW; - Product outProduct; - - getBinaryHWInfo(binary, outHW, outProduct); - - return outHW; + return getCore(getBinaryHWInfo(binary).family); } template -inline void ELFCodeGenerator::getBinaryHWInfo(const std::vector &binary, HW &outHW, Product &outProduct) +inline Product ELFCodeGenerator::getBinaryHWInfo(const std::vector &binary) { using Note = typename ZebinELF::Note; - outHW = HW::Unknown; + Product outProduct; + HW hw_ = HW::Unknown; outProduct.family = ProductFamily::Unknown; outProduct.stepping = 0; @@ -639,8 +639,8 @@ inline void ELFCodeGenerator::getBinaryHWInfo(const std::vector &bi break; } case Note::Type::GfxCoreFamily: - if (outHW == HW::Unknown) - outHW = npack::decodeGfxCoreFamily(static_cast(*actualPayload)); + if (hw_ == HW::Unknown) + hw_ = npack::decodeGfxCoreFamily(static_cast(*actualPayload)); break; case Note::Type::TargetMetadata: { typename ZebinELF::TargetMetadata metadata; @@ -658,18 +658,17 @@ inline void ELFCodeGenerator::getBinaryHWInfo(const std::vector &bi } } else { if (zebinELF->fileHeader.flags.parts.useGfxCoreFamily) - outHW = npack::decodeGfxCoreFamily(static_cast(zebinELF->fileHeader.machine)); + hw_ = npack::decodeGfxCoreFamily(static_cast(zebinELF->fileHeader.machine)); else outProduct.family = npack::decodeProductFamily(static_cast(zebinELF->fileHeader.machine)); outProduct.stepping = zebinELF->fileHeader.flags.parts.minHWRevision; } } else - npack::getBinaryHWInfo(binary, outHW, outProduct); + return npack::getBinaryHWInfo(binary); - if (outHW != HW::Unknown && outProduct.family == ProductFamily::Unknown) - outProduct.family = genericProductFamily(outHW); - else if (outHW == HW::Unknown && outProduct.family != ProductFamily::Unknown) - outHW = getCore(outProduct.family); + if (hw_ != HW::Unknown && outProduct.family == ProductFamily::Unknown) + outProduct.family = genericProductFamily(hw_); + return outProduct; } } /* namespace NGEN_NAMESPACE */ diff --git a/third_party/ngen/ngen_emulation.hpp b/third_party/ngen/ngen_emulation.hpp new file mode 100644 index 00000000000..be1f19c458d --- /dev/null +++ b/third_party/ngen/ngen_emulation.hpp @@ -0,0 +1,796 @@ +/******************************************************************************* +* Copyright 2020-2025 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#ifndef NGEN_EMULATION_HPP +#define NGEN_EMULATION_HPP + +#include "ngen_config_internal.hpp" + +#ifdef NGEN_ENABLE_SOURCE_LOCATION +#include +#endif + +#include + +namespace NGEN_NAMESPACE { + +struct EmulationStrategy { + // Emulate 64-bit arithmetic (required for GenXLP) + bool emulate64 = false; + // Emulate DW x DW -> DW multiplication (required for Gen12) + bool emulateDWxDW = false; + // Use 32-bit adds for 64-bit arithmetic, assuming no 2^32 boundaries crossed. + bool emulate64_add32 = false; + // Emulate DW x DW -> QW multiplication (XeHPC) + bool emulate64_mul = false; + // Emulate QW and/or/xor operations (XeHPC) + bool emulate64_logic = false; + // Don't emulate QW shl/shr (XeHPC) + bool noemulate64_shift = false; + + EmulationStrategy() = default; + EmulationStrategy(HW hw_, int stepping = 0) { + if (hw_ == HW::Gen11) emulate64 = true; + if (hw_ >= HW::Gen11) emulateDWxDW = true; + if (hw_ == HW::Gen12LP) emulate64 = true; + if (hw_ == HW::XeHPG) emulate64 = true; + if (hw_ >= HW::XeHPC) { + if (hw_ == HW::XeHPC && stepping < SteppingPVCXTB0) + emulate64 = noemulate64_shift = true; + else + emulate64_mul = emulate64_logic = true; + } + emulate64_mul |= emulate64; + } +}; + +struct EmulationState { + GRF temp[2]; // Temporary GRFs for use in emulation sequences + FlagRegister flag; // Flag register for use in emulating 64-bit adds (optional, avoids temporary registers/acc) + int flagOffset = 0; // Channel offset to use with flag register. +}; + +// Implementation wrapped as static methods in non-instantiated class. +// Clients should declare EmulationImplementation as a friend. +struct EmulationImplementation { +#ifdef NGEN_ENABLE_SOURCE_LOCATION + [[noreturn]] static void stub(std::source_location where = std::source_location::current()) { + throw std::runtime_error(std::string("Unimplemented (at ") + + std::string(where.file_name()) + ":" + + std::to_string(where.line()) + ")"); + } +#else + [[noreturn]] static void stub() { + throw std::runtime_error("Unimplemented"); + } +#endif + + template + static void applyDefaultType(O &op) + { + if (op.getType() == DataType::invalid) + op.setType(getDataType
()); + } + + template + static bool isQW(const O &op) + { + return op.getType()== DataType::q || op.getType() == DataType::uq; + } + + template + static bool isDW(const O &op) + { + return op.getType() == DataType::d || op.getType() == DataType::ud; + } + + template + static bool isW(const O &op) { + return op.getType() == DataType::w || op.getType() == DataType::uw; + } + + static bool isDW(const Immediate &op) + { + if (op.getType() == DataType::w) + return int16_t(static_cast(op)) < 0; + else + return op.getType() == DataType::d || op.getType() == DataType::ud; + } + + template static O expandDW(const O &op) { return op; } + static Immediate expandDW(const Immediate &op) { return op.forceInt32(); } + + template static bool equal(const T1 &o1, const T2 &o2) { return o1 == o2; } + static bool equal(const RegData &o1, const Immediate &o2) { return false; } + + static void downgradeToDW(RegData &op) + { + if (isQW(op)) { + op.setType((op.getType() == DataType::q) ? DataType::d : DataType::ud); + op.setOffset(op.getOffset() * 2); + } + } + + static void downgradeToDW(Immediate &op) + { + if (isQW(op)) + op.setType((op.getType() == DataType::q) ? DataType::d : DataType::ud); + } + + // Get the DW equivalent of a QW region. + static void makeDWPair(RegData &op, int esize) + { + if (isQW(op)) { + downgradeToDW(op); + if (op.getHS() > 1) { + if (op.getVS() != op.getHS() * op.getWidth()) stub(); + op.setRegion(op.getHS() * 2, 2, 1); + } else { + auto newVS = op.getVS() * 2; + if (esize == op.getWidth()) + newVS = esize * 2; + op.setRegion(newVS, op.getWidth() * 2, 1); + } + } + } + + // Split a register into DW pairs. + static void splitToDW(RegData in, RegData &outLo, RegData &outHi) + { + bool isQ = (in.getType() == DataType::q); + bool isUQ = (in.getType() == DataType::uq); + + if (isQ || isUQ) { + outLo = in; + outLo.setRegion(in.getVS() * 2, in.getWidth(), in.getHS() * 2); + outLo.setOffset(in.getOffset() * 2); + outLo.setType(DataType::ud); + + outHi = outLo; + outHi.setOffset(in.getOffset() * 2 + 1); + outHi.setType(isQ ? DataType::d : DataType::ud); + } else { + outLo = in; + outHi = Subregister{}; // invalid + } + } + + // Split an Immediate into DW pairs. + static void splitToDW(const Immediate &in, Immediate &outLo, Immediate &outHi) + { + bool isQ = (in.getType() == DataType::q); + bool isUQ = (in.getType() == DataType::uq); + + if (isQ || isUQ) { + outLo = uint32_t(static_cast(in)); + outLo = outLo.forceInt32(); + outLo.setType(DataType::ud); + + outHi = uint32_t(static_cast(in) >> 32); + outHi = outHi.forceInt32(); + outHi.setType(isQ ? DataType::d : DataType::ud); + } else { + outLo = in; + outHi = uint16_t(0); + } + } + + static RegData lowWord(RegData in) + { + if (isW(in)) return in; + + auto outLo = in; + outLo.setRegion(in.getVS() * 2, in.getWidth(), in.getHS() * 2); + outLo.setOffset(in.getOffset() * 2); + outLo.setType(DataType::uw); + + return outLo; + } + + static Immediate lowWord(const Immediate &in) + { + return uint16_t(static_cast(in) & 0xffff); + } + + static RegData highWord(RegData in) + { + auto out = lowWord(in); + out.setOffset(out.getOffset() + 1); + return out; + } + + static Immediate highWord(const Immediate &in) + { + return uint16_t(static_cast(in) >> 16); + } + + static bool isUnitStride(const RegData &rd) + { + return (rd.getHS() == 1 && rd.getVS() == rd.getWidth()); + } + + static void regionVSAdvance(HW hw, RegData &rd, int i) + { + int ne = GRF::bytes(hw) / rd.getBytes(); + int advance = rd.getWidth() > 0 ? (i / rd.getWidth()) * rd.getVS() + : i * rd.getHS(); + int noffset = rd.getOffset() + advance; + if (noffset >= ne) { + noffset--; + rd.setBase(rd.getBase() + 1); + } + rd.setOffset(noffset); + } + + static void regionVSAdvance(HW hw, Immediate &imm, int i) {} + + // Move, emulating 64-bit moves with 32-bit (generally a good idea). + template + static void emov(Generator &g, const InstructionModifier &mod, RegData dst, RegData src0, const EmulationStrategy &strategy, SourceLocation loc = {}) + { + applyDefaultType
(dst); + applyDefaultType
(src0); + + bool dstQ = isQW(dst); + bool s0Q = isQW(src0); + bool s0D = isDW(src0); + bool isDF = (src0.getType() == DataType::df && dst.getType() == DataType::df); + bool unaligned = (mod.getExecSize() > 1 && src0.getHS() != 0 && src0.getOffset() != dst.getOffset()); + + if ((dstQ && s0D) && strategy.emulate64) { + if (src0.getNeg()) stub(); + bool s0Signed = isSigned(src0.getType()); + RegData dstHi, dstLo; + splitToDW(dst, dstLo, dstHi); + g.mov(mod, dstLo, src0, loc); + if (!s0Signed) + g.mov(mod, dstHi, 0, loc); + else + g.asr(mod, dstHi, dstLo, uint16_t(31), loc); + } else if (((dstQ || s0Q) && strategy.emulate64) + || (isDF && unaligned && g.hardware >= HW::XeHP)) { + if (dstQ != s0Q) stub(); + + auto mod2x = mod; + mod2x.setExecSize(mod.getExecSize() * 2); + + makeDWPair(dst, mod.getExecSize()); + makeDWPair(src0, mod.getExecSize()); + g.mov(mod2x, dst, src0, loc); + } else if (dst.getType() == DataType::f && src0.getType() == DataType::bf && (src0.getHS() != 1 || mod.getExecSize() == 1)) { + // Emulate bf16->f32 upconversion + dst.setType(DataType::ud); + src0.setType(DataType::uw); + g.shl(mod, dst, src0, 16, loc); + } else + g.mov(mod, dst, src0, loc); + } + + template + static void emov(Generator &g, const InstructionModifier &mod, RegData dst, Immediate src0, const EmulationStrategy &strategy, SourceLocation loc = {}) + { + applyDefaultType
(dst); + applyDefaultType
(src0); + + bool dstQ = isQW(dst); + bool s0Q = isQW(src0); + + if ((dstQ || s0Q) && strategy.emulate64) { + if (!dstQ) stub(); + + RegData dstHi, dstLo; + Immediate s0Hi = 0, s0Lo = 0; + + splitToDW(src0, s0Lo, s0Hi); + + if (static_cast(s0Lo) == static_cast(s0Hi) && dst.getHS() <= 1) { + auto mod2x = mod; + mod2x.setExecSize(mod.getExecSize() * 2); + + downgradeToDW(dst); + dst.setRegion(0, 0, 1); + g.mov(mod2x, dst, s0Lo, loc); + } else { + splitToDW(dst, dstLo, dstHi); + g.mov(mod, dstLo, s0Lo, loc); + g.mov(mod, dstHi, s0Hi, loc); + } + } else + g.mov(mod, dst, src0, loc); + } + + template + static void eaddSignExtend1(Generator &g, const InstructionModifier &mod, bool &doSub, const Immediate &src1, Immediate &s1LoPos, const Immediate &s1Lo, const Immediate &s1Hi, bool &s1Q, const GRF (&temp)[2], const SourceLocation & loc) + { + uint64_t raw = static_cast(src1); + if (src1.getType() == DataType::d) { + auto val = int32_t(raw); + s1LoPos = uint32_t(std::abs(val)); + doSub = (val < 0); + } else if (src1.getType() == DataType::w) { + auto val = int16_t(raw); + s1LoPos = uint16_t(std::abs(val)); + doSub = (val < 0); + } + } + + template + static void eaddSignExtend1(Generator &g, const InstructionModifier &mod, bool &doSub, const RegData &src1, RegData &s1LoPos, RegData &s1Lo, RegData &s1Hi, bool &s1Q, const GRF (&temp)[2], const SourceLocation &loc) + { + s1Q = true; + s1Hi = temp[0].d(); + if (s1Lo.getNeg()) { + g.asr(mod, s1Hi, -s1Lo, uint16_t(31), loc); + s1Hi = -s1Hi; + } else + g.asr(mod, s1Hi, s1Lo, uint16_t(31), loc); + s1Lo.setType(DataType::ud); + } + + static void eaddHandleS1Neg(bool &doSub, RegData &s1LoPos, const RegData &s1Lo) + { + if (isSigned(s1Lo.getType())) stub(); + doSub = s1Lo.getNeg(); + s1LoPos = -s1Lo; + } + + static void eaddHandleS1Neg(bool &doSub, const Immediate &s1LoPos, const Immediate &s1Lo) + { + /* no-op */ + } + + template + static void eaddFixupQD(Generator &g, const InstructionModifier &mod, const FlagRegister &flag, const RegData &dstHi, const RegData &src1, const SourceLocation &loc) + { + if ((src1.getBytes() < 8) && isSigned(src1.getType())) { + // Add sign extension of src1 to high 32 bits of dst (inefficient but rarely used path). + g.cmp(mod | (src1.getNeg() ? g.le : g.lt) | flag, src1, 0, loc); + g.add(mod | flag, dstHi, dstHi, -1, loc); + } + } + + template + static void eaddFixupQD(Generator &g, const InstructionModifier &mod, const FlagRegister &flag, const RegData &dstHi, const Immediate &src1, const SourceLocation &loc) { + /* no-op */ + } + + static bool eaddIsNegative(const RegData &r) + { + return r.getNeg(); + } + + static bool eaddIsNegative(const Immediate &i) + { + return int32_t(uint64_t(i)) < 0; + } + + // Integer addition, emulating 64-bit arithmetic if configured. + template + static void eaddInternal(Generator &g, const InstructionModifier &mod, RegData dst, RegData src0, S1 src1, const EmulationStrategy &strategy, const EmulationState &state, const SourceLocation &loc) + { + const auto &temp = state.temp; + + applyDefaultType
(dst); + applyDefaultType
(src0); + applyDefaultType
(src1); + + bool dstQ = isQW(dst); + bool s0Q = isQW(src0); + bool s1Q = isQW(src1); + + if (dstQ && strategy.emulate64_add32) { + RegData dstHi, dstLo, s0Hi, s0Lo; + S1 s1Hi, s1Lo; + + splitToDW(dst, dstLo, dstHi); + splitToDW(src0, s0Lo, s0Hi); + splitToDW(src1, s1Lo, s1Hi); + g.add(mod, dstLo, s0Lo, s1Lo, loc); + + if (s0Q && s1Q) { + if (!equal(dstHi, s0Hi) && !equal(dstHi, s1Hi)) + g.add(mod, dstHi, s0Hi, s1Hi, loc); + } else if (s0Q) { + if (!equal(dstHi, s0Hi)) g.mov(mod, dstHi, s0Hi, loc); + } else if (s1Q) { + if (!equal(dstHi, s1Hi)) g.mov(mod, dstHi, s1Hi, loc); + } else + g.mov(mod, dstHi, uint16_t(0), loc); + } else if (!strategy.emulate64) + g.add(mod, dst, src0, src1, loc); + else { + if (!dstQ) { + downgradeToDW(src0); + downgradeToDW(src1); + g.add(mod, dst, src0, src1, loc); + } else { + RegData dstHi, dstLo, s0Hi, s0Lo; + S1 s1Hi, s1Lo, s1LoPos; + FlagRegister flag = state.flag; + + splitToDW(dst, dstLo, dstHi); + splitToDW(src0, s0Lo, s0Hi); + splitToDW(src1, s1Lo, s1Hi); + s1LoPos = s1Lo; + + bool s0Signed = isSigned(s0Lo.getType()); + bool s1Signed = isSigned(s1Lo.getType()); + + if (flag.isValid() && !eaddIsNegative(s0Lo)) { + // Use flag register + ov. + auto Mx = g.ExecutionOffset(state.flagOffset); + bool neg = eaddIsNegative(s1Lo); + bool revFlag = false; + + auto s0LoUD = s0Lo; + auto s1LoMod = s1Lo; + s0LoUD.setType(DataType::ud); + if (s1Signed && !std::is_base_of::value) { + s1LoMod.setType(DataType::ud); + revFlag = neg; + neg = false; + } + + g.add(mod | Mx | g.ov | flag, dstLo, s0LoUD, s1LoMod, loc); + if (s0Q && s1Q) + g.add(mod, dstHi, s0Hi, s1Hi, loc); + else if (s0Q && !equal(dstHi, s0Hi)) + g.mov(mod, dstHi, s0Hi, loc); + else if (s1Q && !equal(dstHi, s1Hi)) + g.mov(mod, dstHi, s1Hi, loc); + else if (!s0Q && !s1Q) + g.mov(mod, dstHi, 0, loc); + g.add(mod | Mx | (revFlag ? ~flag : flag), dstHi, dstHi, neg ? -1 : +1, loc); + eaddFixupQD(g, mod | Mx, flag, dstHi, src0, loc); + eaddFixupQD(g, mod | Mx, flag, dstHi, src1, loc); + } else { + // Slow path: addc/subb + acc. + RegData carry = temp[0].ud(); + bool lateCarry = false; + RegData subDstLo; + bool doSub = false; + + // For :uq + :d or :q + :ud, sign extend 32-bit input to 64 bits. + if (s0Signed != s1Signed) { + if (s0Signed) { + s0Q = true; + s0Hi = temp[0].d(); + g.asr(mod, s0Hi, s0Lo, uint16_t(31), loc); + s0Lo.setType(DataType::ud); + if (s0Lo.getNeg()) + s0Hi = -s0Hi; + } else + eaddSignExtend1(g, mod, doSub, src1, s1LoPos, s1Lo, s1Hi, s1Q, temp, loc); + carry = temp[1].ud(); + lateCarry = true; + } + + // Handle modifiers. + if (s0Lo.getNeg()) stub(); + eaddHandleS1Neg(doSub, s1LoPos, s1Lo); + + // Compute low 32 bits, saving carry/borrow. + if (dstLo.getOffset() != 0) { + doSub ? g.subb(mod, g.null.retype(s0Lo.getType()), s0Lo, s1LoPos, loc) + : g.addc(mod, g.null.retype(s0Lo.getType()), s0Lo, s1Lo, loc); + g.add(mod, dstLo, s0Lo, s1Lo, loc); + } else if ((mod.getExecSize() > 1) && !isUnitStride(dstLo)) { + subDstLo = temp[1].ud(); + doSub ? g.subb(mod, subDstLo, s0Lo, s1LoPos, loc) + : g.addc(mod, subDstLo, s0Lo, s1Lo, loc); + } else { + doSub ? g.subb(mod, dstLo, s0Lo, s1LoPos, loc) + : g.addc(mod, dstLo, s0Lo, s1Lo, loc); + } + + // Retrieve carry from accumulator, unless it conflicts with subDstLo. + if (!lateCarry) g.mov(mod, carry, g.acc0.ud(), loc); + + // Move low 32-bits to final resting place, if needed. + if (subDstLo.isValid()) g.mov(mod, dstLo, subDstLo, loc); + + // Retrieve carry from accumulator once subDstLo isn't needed. + if (lateCarry) g.mov(mod, carry, g.acc0.ud(), loc); + + if (doSub) + carry = -carry; + + // Compute high 32 bits of sum. + if (s0Q && s1Q) { + g.add(mod, dstHi, s0Hi, s1Hi, loc); + g.add(mod, dstHi, carry, dstHi, loc); + } else if (s0Q) + g.add(mod, dstHi, carry, s0Hi, loc); + else if (s1Q) + g.add(mod, dstHi, carry, s1Hi, loc); + else + g.mov(mod, dstHi, carry, loc); + } + } + } + } + + template + static void eadd(Generator &g, const InstructionModifier &mod, const RegData &dst, const RegData &src0, const RegData &src1, const EmulationStrategy &strategy, const EmulationState &state, SourceLocation loc = {}) { + if (src0.getNeg() && !src1.getNeg() && strategy.emulate64 && !strategy.emulate64_add32) + eaddInternal
(g, mod, dst, src1, src0, strategy, state, loc); + else + eaddInternal
(g, mod, dst, src0, src1, strategy, state, loc); + } + + template + static void eadd(Generator &g, const InstructionModifier &mod, const RegData &dst, const RegData &src0, Immediate src1, const EmulationStrategy &strategy, const EmulationState &state, SourceLocation loc = {}) { + eaddInternal
(g, mod, dst, src0, src1, strategy, state, loc); + } + + // Integer multiplication, emulating 32x32 multiplication as configured. + template + static void emulInternal(Generator &g, const InstructionModifier &mod, RegData dst, RegData src0, S1 src1, const EmulationStrategy &strategy, const EmulationState &state, const SourceLocation &loc) { + applyDefaultType
(dst); + applyDefaultType
(src0); + applyDefaultType
(src1); + + bool dstD = isDW(dst); + bool dstQ = isQW(dst); + bool s0W = isW(src0); + bool s0D = isDW(src0); + bool s0Q = isQW(src0); + bool s1W = isW(src1); + bool s1D = isDW(src1); + bool s1Q = isQW(src1); + bool s1Immed = std::is_base_of::value; + + bool s0Signed = isSigned(src0.getType()); + bool s1Signed = isSigned(src1.getType()); + auto mulHiType = (s0Signed || s1Signed) ? DataType::d : DataType::ud; + + bool emulate64 = strategy.emulate64_mul; + + if (s0Q) { + if (!s1D || !dstQ) stub(); + auto temp = s1Signed ? state.temp[0].d() : state.temp[0].ud(); + auto &src1Reg = [&]() -> RegData & { + if (s1Immed || s1W) { + g.mov(mod, temp, src1, loc); + return temp; + } else { + return *reinterpret_cast(&src1); + } + }(); + return emulInternal(g, mod, dst, src1Reg, src0, strategy, state, loc); + } else if (s1Q) { + if (!s0D || !dstQ) stub(); + auto s0Type = src0.getType(); + RegData dstLo, dstHi; + S1 s1Hi, s1Lo; + splitToDW(dst, dstLo, dstHi); + splitToDW(src1, s1Lo, s1Hi); + s1Hi = expandDW(s1Hi); + s1Lo = expandDW(s1Lo); + dstLo.setType(src0.getType()); + dstHi.setType(src0.getType()); + auto s1W0 = lowWord(s1Lo); + auto s1W2 = lowWord(s1Hi); + auto accLo + = g.acc0.retype(s0Type)[dstLo.getOffset()](dstLo.getHS()); + auto accHi + = g.acc0.retype(s0Type)[dstHi.getOffset()](dstHi.getHS()); + g.mul(mod, accHi, src0, s1W2, loc); + g.macl(mod, dstHi, src0, s1Hi, loc); + g.mul(mod, accLo, src0, s1W0, loc); + g.mach(mod, dstLo, src0, s1Lo, loc); + g.add(mod, dstHi, dstHi, dstLo, loc); + g.mov(mod, dstLo, accLo, loc); + } else if (dstQ && s0W && s1W) { + RegData dstLo, dstHi; + splitToDW(dst, dstLo, dstHi); + + g.mul(mod, dstLo, src0, src1, loc); + + dstHi.setType(mulHiType); + dstLo.setType(mulHiType); + + if (s0Signed || s1Signed) + g.asr(mod, dstHi, dstLo, 31, loc); + else + g.mov(mod, dstHi, 0, loc); + } else if (dstQ && s0W && s1D) { + stub(); + } else if (dstQ && s0D && s1W && !s1Immed && !emulate64 && !strategy.emulateDWxDW) { + auto acc = g.acc0.d(); + g.mov(mod, acc, src1, loc); + g.mul(mod, dst, acc, src0, loc); + } else if (dstQ && s0D && ((s1W && !s1Immed) || ((s1W || s1D) && emulate64))) { + RegData dstLo, dstHi; + splitToDW(dst, dstLo, dstHi); + + auto acc = g.acc0.retype(mulHiType)[dstLo.getOffset()](dstLo.getHS()); + + g.mul(mod, acc, src0, lowWord(src1), loc); + if (s1D) + g.mach(mod, dstLo, src0, expandDW(src1), loc); + else + g.mach(mod, dstLo, src0, int32_t(0), loc); + g.mov(mod, dstHi, dstLo, loc); + g.mov(mod, dstLo, acc, loc); + } else if (dstD && s0D && s1D && strategy.emulateDWxDW) { + int ne1 = GRF::bytes(g.hardware) >> 2; + + for (int r = 0; r < mod.getExecSize(); r += ne1) { + auto mmod = mod; + mmod.setExecSize(std::min(mod.getExecSize() - r, ne1)); + + auto acc = g.acc0.retype(mulHiType)[dst.getOffset()](dst.getHS()); + auto dummy = g.null.retype(mulHiType)[dst.getOffset()](dst.getHS()); + + g.mul(mmod, acc, src0, lowWord(src1), loc); + + if (g.getHardware() < HW::Gen10) { + g.mach(mmod, dummy, src0, expandDW(src1), loc); + g.mov(mmod, dst, acc, loc); + } else { + g.macl(mmod, dst, src0, expandDW(src1), loc); + } + + regionVSAdvance(g.hardware, dst, ne1); + regionVSAdvance(g.hardware, src0, ne1); + regionVSAdvance(g.hardware, src1, ne1); + } + } else + g.mul(mod, dst, src0, src1, loc); + } + + template + static void emul(Generator &g, const InstructionModifier &mod, const RegData &dst, const RegData &src0, const RegData &src1, const EmulationStrategy &strategy, const EmulationState &state, SourceLocation loc = {}) + { + emulInternal
(g, mod, dst, src0, src1, strategy, state, loc); + } + + template + static void emul(Generator &g, const InstructionModifier &mod, const RegData &dst, const RegData &src0, Immediate src1, const EmulationStrategy &strategy, const EmulationState &state, SourceLocation loc = {}) + { + emulInternal
(g, mod, dst, src0, src1, strategy, state, loc); + } + + template + static void emul32High(Generator &g, const InstructionModifier &mod, const RegData &dstHi, const RegData &src0, const S1 &src1, SourceLocation loc = {}) + { + g.mul(mod, g.acc0.ud(dstHi.getOffset()), src0, lowWord(src1), loc); + g.mach(mod, dstHi, src0, src1, loc); + } + + // Shift left, emulating 64-bit arithmetic if configured. + template + static void eshl(Generator &g, const InstructionModifier &mod, RegData dst, RegData src0, uint16_t src1, const EmulationStrategy &strategy, const EmulationState &state, SourceLocation loc = {}) + { + const auto &temp = state.temp; + + applyDefaultType
(dst); + applyDefaultType
(src0); + + bool dstQ = isQW(dst); + bool s0Q = isQW(src0); + + if (src1 == 0) { + emov(g, mod, dst, src0, strategy, loc); + return; + } + + if (dstQ && strategy.emulate64 && !strategy.noemulate64_shift) { + if (src1 >= 32) stub(); + + RegData dstHi, dstLo, s0Hi, s0Lo; + + auto acc = temp[0].ud(); + + splitToDW(dst, dstLo, dstHi); + + if (s0Q) { + splitToDW(src0, s0Lo, s0Hi); + + g.shr(mod, acc, s0Lo, uint16_t(32 - src1), loc); + g.shl(mod, dstHi, s0Hi, src1, loc); + g.shl(mod, dstLo, s0Lo, src1, loc); + g.or_(mod, dstHi, acc, dstHi, loc); + } else { + dstHi.setType(DataType::ud); + g.shl(mod, dstLo, src0, src1, loc); + g.shr(mod, dstHi, src0, uint16_t(32 - src1), loc); + } + } else { + if (s0Q && !dstQ) downgradeToDW(src0); + g.shl(mod, dst, src0, src1, loc); + } + } + + // Shift right, emulating 64-bit arithmetic if configured. + template + static void eshr(Generator &g, const InstructionModifier &mod, RegData dst, RegData src0, uint16_t src1, const EmulationStrategy &strategy, const EmulationState &state, SourceLocation loc = {}) + { + const auto &temp = state.temp; + + applyDefaultType
(dst); + applyDefaultType
(src0); + + bool dstQ = isQW(dst); + bool s0Q = isQW(src0); + + if (src1 == 0) { + emov(g, mod, dst, src0, strategy, loc); + return; + } + + if (dstQ && strategy.emulate64 && !strategy.noemulate64_shift) { + if (src1 >= 32) stub(); + + RegData dstHi, dstLo, s0Hi, s0Lo; + + auto acc = temp[0].ud(); + + splitToDW(dst, dstLo, dstHi); + + if (s0Q) { + splitToDW(src0, s0Lo, s0Hi); + + g.shl(mod, acc, s0Lo, uint16_t(32 - src1), loc); + g.shr(mod, dstLo, s0Lo, src1, loc); + isSigned(src0.getType()) ? g.asr(mod, dstHi, s0Hi, src1, loc) + : g.shr(mod, dstHi, s0Hi, src1, loc); + g.or_(mod, dstLo, acc, dstLo, loc); + } else { + dstLo.setType(dstHi.getType()); + isSigned(src0.getType()) ? g.asr(mod, dstLo, src0, src1, loc) + : g.shr(mod, dstLo, src0, src1, loc); + g.mov(mod, dstHi, uint16_t(0), loc); + } + } else { + if (s0Q && !dstQ) downgradeToDW(src0); + isSigned(src0.getType()) ? g.asr(mod, dst, src0, src1, loc) + : g.shr(mod, dst, src0, src1, loc); + } + } + + // Multiply by a constant, optimizing for power-of-2 constants and emulating 64-bit arithmetic if configured. + template + static void emulConstant(Generator &g, const InstructionModifier &mod, const RegData &dst, const RegData &src0, int32_t src1, const EmulationStrategy &strategy, const EmulationState &state, SourceLocation loc = {}) + { + if (src1 == 0) + emov
(g, mod, dst, uint16_t(0), strategy, loc); + else if (src1 == 1) { + if (dst != src0) emov
(g, mod, dst, src0, strategy, loc); + } else if (utils::is_zero_or_pow2(src1)) + eshl
(g, mod, dst, src0, uint16_t(utils::log2(src1)), strategy, state, loc); + else if (src1 > 0) + emul
(g, mod, dst, src0, uint32_t(src1), strategy, state, loc); + else + emul
(g, mod, dst, src0, int32_t(src1), strategy, state, loc); + } +}; // struct EmulationHelper + +} /* namespace NGEN_NAMESPACE */ + +#define NGEN_EMULATION_FORWARD \ +template void emov(const NGEN_NAMESPACE::InstructionModifier &mod, NGEN_NAMESPACE::RegData dst, NGEN_NAMESPACE::RegData src0, const EmulationStrategy &strategy) { EmulationImplementation::emov
(*this, mod, dst, src0, strategy); } \ +template void emov(const NGEN_NAMESPACE::InstructionModifier &mod, NGEN_NAMESPACE::RegData dst, NGEN_NAMESPACE::Immediate src0, const EmulationStrategy &strategy) { EmulationImplementation::emov
(*this, mod, dst, src0, strategy); } \ +template void eadd(const NGEN_NAMESPACE::InstructionModifier &mod, const NGEN_NAMESPACE::RegData &dst, const NGEN_NAMESPACE::RegData &src0, const NGEN_NAMESPACE::RegData &src1, const EmulationStrategy &strategy, const EmulationState &state) { EmulationImplementation::eadd
(*this, mod, dst, src0, src1, strategy, state); } \ +template void eadd(const NGEN_NAMESPACE::InstructionModifier &mod, const NGEN_NAMESPACE::RegData &dst, const NGEN_NAMESPACE::RegData &src0, NGEN_NAMESPACE::Immediate src1, const EmulationStrategy &strategy, const EmulationState &state) { EmulationImplementation::eadd
(*this, mod, dst, src0, src1, strategy, state); } \ +template void emul(const NGEN_NAMESPACE::InstructionModifier &mod, const NGEN_NAMESPACE::RegData &dst, const NGEN_NAMESPACE::RegData &src0, const NGEN_NAMESPACE::RegData &src1, const EmulationStrategy &strategy, const EmulationState &state) { EmulationImplementation::emul
(*this, mod, dst, src0, src1, strategy, state); } \ +template void emul(const NGEN_NAMESPACE::InstructionModifier &mod, const NGEN_NAMESPACE::RegData &dst, const NGEN_NAMESPACE::RegData &src0, NGEN_NAMESPACE::Immediate src1, const EmulationStrategy &strategy, const EmulationState &state) { EmulationImplementation::emul
(*this, mod, dst, src0, src1, strategy, state); } \ +template void eshl(const NGEN_NAMESPACE::InstructionModifier &mod, NGEN_NAMESPACE::RegData dst, NGEN_NAMESPACE::RegData src0, uint16_t src1, const EmulationStrategy &strategy, const EmulationState &state) { EmulationImplementation::eshl
(*this, mod, dst, src0, src1, strategy, state); } \ +template void eshr(const NGEN_NAMESPACE::InstructionModifier &mod, NGEN_NAMESPACE::RegData dst, NGEN_NAMESPACE::RegData src0, uint16_t src1, const EmulationStrategy &strategy, const EmulationState &state) { EmulationImplementation::eshr
(*this, mod, dst, src0, src1, strategy, state); } \ +template void emulConstant(const NGEN_NAMESPACE::InstructionModifier &mod, const NGEN_NAMESPACE::RegData &dst, const NGEN_NAMESPACE::RegData &src0, int32_t src1, const EmulationStrategy &strategy, const EmulationState &state) { EmulationImplementation::emulConstant
(*this, mod, dst, src0, src1, strategy, state); } \ +template void emul32High(const NGEN_NAMESPACE::InstructionModifier &mod, const NGEN_NAMESPACE::RegData &dstHi, const NGEN_NAMESPACE::RegData &src0, const S1 &src1) { EmulationImplementation::emul32High(*this, mod, dstHi, src0, src1); } +#endif diff --git a/third_party/ngen/ngen_gen12.hpp b/third_party/ngen/ngen_gen12.hpp index e7d873bdcf9..e32aa422bcc 100644 --- a/third_party/ngen/ngen_gen12.hpp +++ b/third_party/ngen/ngen_gen12.hpp @@ -17,10 +17,12 @@ /* * Do not #include this file directly; ngen uses it internally. */ - #ifndef NGEN_GEN12_HPP #define NGEN_GEN12_HPP +#include "ngen_auto_swsb.hpp" +#include "ngen_gen8.hpp" + namespace NGEN_NAMESPACE { // Gen12 binary encoding. diff --git a/third_party/ngen/ngen_gen8.hpp b/third_party/ngen/ngen_gen8.hpp index f46adccaaa8..0a763126f87 100644 --- a/third_party/ngen/ngen_gen8.hpp +++ b/third_party/ngen/ngen_gen8.hpp @@ -610,5 +610,5 @@ static inline constexpr14 Align16Operand extToAlign16(const ExtendedReg ®) return Align16Operand::createWithMME(reg.getBase(), reg.getMMENum()); } -} +} // namespace NGEN_NAMESPACE #endif diff --git a/third_party/ngen/ngen_interface.hpp b/third_party/ngen/ngen_interface.hpp index d4233954cda..a9f2fc5260a 100644 --- a/third_party/ngen/ngen_interface.hpp +++ b/third_party/ngen/ngen_interface.hpp @@ -19,6 +19,8 @@ #include "ngen_core.hpp" +#include "ngen_asm.hpp" + #include @@ -136,13 +138,14 @@ class InterfaceHandler template inline void generatePrologue(CodeGenerator &generator, const GRF &temp = GRF(127)) const; -#ifdef NGEN_ASM - inline void dumpAssignments(std::ostream &stream) const; -#endif inline void generateDummyCL(std::ostream &stream) const; inline std::string generateZeInfo() const; +#ifdef NGEN_ASM + inline void dumpAssignments(std::ostream &stream) const; +#endif + static constexpr int noSurface = 0x80; // Returned by getArgumentSurfaceIfExists in case of no surface assignment protected: diff --git a/third_party/ngen/ngen_level_zero.hpp b/third_party/ngen/ngen_level_zero.hpp index c18479d0576..feb1b1a6a77 100644 --- a/third_party/ngen/ngen_level_zero.hpp +++ b/third_party/ngen/ngen_level_zero.hpp @@ -17,7 +17,7 @@ #ifndef NGEN_LEVEL_ZERO_HPP #define NGEN_LEVEL_ZERO_HPP -#include "ngen_config.hpp" +#include "ngen_config_internal.hpp" #include "level_zero/ze_api.h" @@ -94,16 +94,13 @@ class LevelZeroCodeGenerator : public ELFCodeGenerator this->interface_.setInlineGRFCount(0); } - explicit LevelZeroCodeGenerator(int stepping_ = 0, DebugConfig debugConfig = {}) : LevelZeroCodeGenerator({genericProductFamily(hw), stepping_}, debugConfig) {} + explicit LevelZeroCodeGenerator(int stepping_ = 0, DebugConfig debugConfig = {}) : LevelZeroCodeGenerator({genericProductFamily(hw), stepping_, PlatformType::Unknown}, debugConfig) {} explicit LevelZeroCodeGenerator(DebugConfig debugConfig) : LevelZeroCodeGenerator({genericProductFamily(hw), 0}, debugConfig) {} inline ze_module_handle_t getModule(ze_context_handle_t context, ze_device_handle_t device, const std::string &options = ""); static inline HW detectHW(ze_context_handle_t context, ze_device_handle_t device); - static void detectHWInfo(ze_context_handle_t context, ze_device_handle_t device, HW &outHW, Product &outProduct); - - /* Deprecated. Use the Product-based API instead. */ - static void detectHWInfo(ze_context_handle_t context, ze_device_handle_t device, HW &outHW, int &outStepping); + static inline Product detectHWInfo(ze_context_handle_t context, ze_device_handle_t device); }; #define NGEN_FORWARD_LEVEL_ZERO(hw) NGEN_FORWARD_ELF(hw) @@ -147,66 +144,60 @@ ze_module_handle_t LevelZeroCodeGenerator::getModule(ze_context_handle_t con template HW LevelZeroCodeGenerator::detectHW(ze_context_handle_t context, ze_device_handle_t device) { - HW outHW; - int outStepping; - - detectHWInfo(context, device, outHW, outStepping); - - return outHW; + return getCore(detectHWInfo(context, device).family); } template -void LevelZeroCodeGenerator::detectHWInfo(ze_context_handle_t context, ze_device_handle_t device, HW &outHW, int &outStepping) +Product LevelZeroCodeGenerator::detectHWInfo(ze_context_handle_t context, ze_device_handle_t device) { - Product outProduct; - detectHWInfo(context, device, outHW, outProduct); - outStepping = outProduct.stepping; -} + Product product; + + ze_device_properties_t dprop = {ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES, nullptr}; -template -void LevelZeroCodeGenerator::detectHWInfo(ze_context_handle_t context, ze_device_handle_t device, HW &outHW, Product &outProduct) -{ #ifdef ZE_DEVICE_IP_VERSION_EXT_NAME // Try ZE_extension_device_ip_version first if available. ze_device_ip_version_ext_t vprop = {ZE_STRUCTURE_TYPE_DEVICE_IP_VERSION_EXT, nullptr, 0}; - auto dprop = ze_device_properties_t(); + dprop.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES; dprop.pNext = ∝ if (call_zeDeviceGetProperties(device, &dprop) == ZE_RESULT_SUCCESS) { - outProduct = npack::decodeHWIPVersion(vprop.ipVersion); - outHW = getCore(outProduct.family); - if (outProduct.family != ProductFamily::Unknown) - return; - } + return npack::decodeHWIPVersion(vprop.ipVersion); + } else #endif + { + static const uint8_t dummySPV[] = {0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0E, 0x00, 0x06, 0x00, 0x07, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x04, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x06, 0x00, 0x00, 0x00, 0x0B, 0x00, 0x05, 0x00, 0x01, 0x00, 0x00, 0x00, 0x4F, 0x70, 0x65, 0x6E, 0x43, 0x4C, 0x2E, 0x73, 0x74, 0x64, 0x00, 0x00, 0x0E, 0x00, 0x03, 0x00, 0x02, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x5F, 0x00, 0x00, 0x00, 0x07, 0x00, 0x07, 0x00, 0x06, 0x00, 0x00, 0x00, 0x6B, 0x65, 0x72, 0x6E, 0x65, 0x6C, 0x5F, 0x61, 0x72, 0x67, 0x5F, 0x74, 0x79, 0x70, 0x65, 0x2E, 0x5F, 0x2E, 0x00, 0x00, 0x03, 0x00, 0x03, 0x00, 0x03, 0x00, 0x00, 0x00, 0x70, 0x8E, 0x01, 0x00, 0x05, 0x00, 0x04, 0x00, 0x05, 0x00, 0x00, 0x00, 0x65, 0x6E, 0x74, 0x72, 0x79, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x36, 0x00, 0x05, 0x00, 0x02, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0xF8, 0x00, 0x02, 0x00, 0x05, 0x00, 0x00, 0x00, 0xFD, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00}; + ze_module_desc_t moduleDesc = { + ZE_STRUCTURE_TYPE_MODULE_DESC, + nullptr, + ZE_MODULE_FORMAT_IL_SPIRV, + sizeof(dummySPV), + dummySPV, + nullptr, + nullptr + }; + + ze_module_handle_t module; + detail::handleL0(call_zeModuleCreate(context, device, &moduleDesc, &module, nullptr)); + + if (module == nullptr) + throw level_zero_error{}; + + std::vector binary; + size_t binarySize; + + detail::handleL0(call_zeModuleGetNativeBinary(module, &binarySize, nullptr)); + binary.resize(binarySize); + detail::handleL0(call_zeModuleGetNativeBinary(module, &binarySize, binary.data())); + detail::handleL0(call_zeModuleDestroy(module)); + product = ELFCodeGenerator::getBinaryHWInfo(binary); + dprop.pNext = nullptr; + detail::handleL0(call_zeDeviceGetProperties(device, &dprop)); + } - static const uint8_t dummySPV[] = {0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0E, 0x00, 0x06, 0x00, 0x07, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x04, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x06, 0x00, 0x00, 0x00, 0x0B, 0x00, 0x05, 0x00, 0x01, 0x00, 0x00, 0x00, 0x4F, 0x70, 0x65, 0x6E, 0x43, 0x4C, 0x2E, 0x73, 0x74, 0x64, 0x00, 0x00, 0x0E, 0x00, 0x03, 0x00, 0x02, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x5F, 0x00, 0x00, 0x00, 0x07, 0x00, 0x07, 0x00, 0x06, 0x00, 0x00, 0x00, 0x6B, 0x65, 0x72, 0x6E, 0x65, 0x6C, 0x5F, 0x61, 0x72, 0x67, 0x5F, 0x74, 0x79, 0x70, 0x65, 0x2E, 0x5F, 0x2E, 0x00, 0x00, 0x03, 0x00, 0x03, 0x00, 0x03, 0x00, 0x00, 0x00, 0x70, 0x8E, 0x01, 0x00, 0x05, 0x00, 0x04, 0x00, 0x05, 0x00, 0x00, 0x00, 0x65, 0x6E, 0x74, 0x72, 0x79, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x36, 0x00, 0x05, 0x00, 0x02, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0xF8, 0x00, 0x02, 0x00, 0x05, 0x00, 0x00, 0x00, 0xFD, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00}; - ze_module_desc_t moduleDesc = { - ZE_STRUCTURE_TYPE_MODULE_DESC, - nullptr, - ZE_MODULE_FORMAT_IL_SPIRV, - sizeof(dummySPV), - dummySPV, - nullptr, - nullptr - }; - - ze_module_handle_t module; - detail::handleL0(call_zeModuleCreate(context, device, &moduleDesc, &module, nullptr)); - - if (module == nullptr) - throw level_zero_error{}; - - std::vector binary; - size_t binarySize; - - detail::handleL0(call_zeModuleGetNativeBinary(module, &binarySize, nullptr)); - binary.resize(binarySize); - detail::handleL0(call_zeModuleGetNativeBinary(module, &binarySize, binary.data())); - detail::handleL0(call_zeModuleDestroy(module)); + product.type = (dprop.flags & ZE_DEVICE_PROPERTY_FLAG_INTEGRATED) ? PlatformType::Integrated : PlatformType::Discrete; - ELFCodeGenerator::getBinaryHWInfo(binary, outHW, outProduct); + return product; } } /* namespace NGEN_NAMESPACE */ diff --git a/third_party/ngen/ngen_opencl.hpp b/third_party/ngen/ngen_opencl.hpp index 7ee331e0472..6279e08724d 100644 --- a/third_party/ngen/ngen_opencl.hpp +++ b/third_party/ngen/ngen_opencl.hpp @@ -17,9 +17,11 @@ #ifndef NGEN_OPENCL_HPP #define NGEN_OPENCL_HPP -#include "ngen_config.hpp" +#include "ngen_config_internal.hpp" +#ifndef __OPENCL_CL_H #include +#endif #include #include @@ -55,17 +57,14 @@ class OpenCLCodeGenerator : public ELFCodeGenerator public: explicit OpenCLCodeGenerator(Product product_, DebugConfig debugConfig = {}) : ELFCodeGenerator(product_, debugConfig) {} explicit OpenCLCodeGenerator(int stepping_ = 0, DebugConfig debugConfig = {}) : ELFCodeGenerator(stepping_, debugConfig) {} - explicit OpenCLCodeGenerator(DebugConfig debugConfig) : ELFCodeGenerator(0, debugConfig) {} + explicit OpenCLCodeGenerator(DebugConfig debugConfig) : ELFCodeGenerator(debugConfig) {} inline std::vector getBinary(cl_context context, cl_device_id device, const std::string &options = "-cl-std=CL2.0"); inline cl_kernel getKernel(cl_context context, cl_device_id device, const std::string &options = "-cl-std=CL2.0"); bool binaryIsZebin() { return isZebin; } static inline HW detectHW(cl_context context, cl_device_id device); - static inline void detectHWInfo(cl_context context, cl_device_id device, HW &outHW, Product &outProduct); - - /* Deprecated. Use the Product-based API instead. */ - static inline void detectHWInfo(cl_context context, cl_device_id device, HW &outHW, int &outStepping); + static inline Product detectHWInfo(cl_context context, cl_device_id device); private: bool isZebin = false; @@ -259,41 +258,31 @@ cl_kernel OpenCLCodeGenerator::getKernel(cl_context context, cl_device_id de template HW OpenCLCodeGenerator::detectHW(cl_context context, cl_device_id device) { - HW outHW; - Product outProduct; - - detectHWInfo(context, device, outHW, outProduct); - - return outHW; -} - -template -void OpenCLCodeGenerator::detectHWInfo(cl_context context, cl_device_id device, HW &outHW, int &outStepping) -{ - Product outProduct; - detectHWInfo(context, device, outHW, outProduct); - outStepping = outProduct.stepping; + return getCore(detectHWInfo(context, device).family); } template -void OpenCLCodeGenerator::detectHWInfo(cl_context context, cl_device_id device, HW &outHW, Product &outProduct) +Product OpenCLCodeGenerator::detectHWInfo(cl_context context, cl_device_id device) { - const char *dummyCL = "kernel void _ngen_hw_detect(){}"; - const char *dummyOptions = ""; + Product product; // Try CL_DEVICE_IP_VERSION_INTEL query first. cl_uint ipVersion = 0; /* should be cl_version, but older CL/cl.h may not define cl_version */ - if (clGetDeviceInfo(device, CL_DEVICE_IP_VERSION_INTEL, sizeof(ipVersion), &ipVersion, nullptr) == CL_SUCCESS) { - outProduct = npack::decodeHWIPVersion(ipVersion); - outHW = getCore(outProduct.family); - if (outProduct.family != ProductFamily::Unknown) - return; + if (clGetDeviceInfo(device, CL_DEVICE_IP_VERSION_INTEL, sizeof(ipVersion), &ipVersion, nullptr) == CL_SUCCESS) + product = npack::decodeHWIPVersion(ipVersion); + else { + // If it fails, compile a test program and extract the HW information from it. + const char *dummyCL = "kernel void _ngen_hw_detect(){}"; + const char *dummyOptions = ""; + auto binary = detail::getOpenCLCProgramBinary(context, device, dummyCL, dummyOptions); + product = ELFCodeGenerator::getBinaryHWInfo(binary); } - // If it fails, compile a test program and extract the HW information from it. - auto binary = detail::getOpenCLCProgramBinary(context, device, dummyCL, dummyOptions); + cl_bool integrated; + if (clGetDeviceInfo(device, CL_DEVICE_HOST_UNIFIED_MEMORY, sizeof(integrated), &integrated, nullptr) == CL_SUCCESS) + product.type = integrated ? PlatformType::Integrated : PlatformType::Discrete; - ELFCodeGenerator::getBinaryHWInfo(binary, outHW, outProduct); + return product; } } /* namespace NGEN_NAMESPACE */ diff --git a/third_party/ngen/ngen_register_allocator.hpp b/third_party/ngen/ngen_register_allocator.hpp index d77acf71494..57d25b1336f 100644 --- a/third_party/ngen/ngen_register_allocator.hpp +++ b/third_party/ngen/ngen_register_allocator.hpp @@ -17,7 +17,7 @@ #ifndef NGEN_REGISTER_ALLOCATOR_HPP #define NGEN_REGISTER_ALLOCATOR_HPP -#ifdef ENABLE_LLVM_WCONVERSION +#if defined(__clang__) #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wimplicit-int-conversion" #endif @@ -700,7 +700,7 @@ void RegisterAllocator::dump(std::ostream &str) } /* namespace NGEN_NAMESPACE */ -#ifdef ENABLE_LLVM_WCONVERSION +#if defined(__clang__) #pragma clang diagnostic pop #endif diff --git a/third_party/ngen/ngen_sycl.hpp b/third_party/ngen/ngen_sycl.hpp new file mode 100644 index 00000000000..ed4d8388ce1 --- /dev/null +++ b/third_party/ngen/ngen_sycl.hpp @@ -0,0 +1,163 @@ +/******************************************************************************* +* Copyright 2025 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#ifndef NGEN_SYCL_HPP +#define NGEN_SYCL_HPP + +#include "ngen_config_internal.hpp" +#include "ngen_opencl.hpp" +#include "ngen_level_zero.hpp" +#include "ngen_interface.hpp" + +#include +#include +#include + + +namespace NGEN_NAMESPACE { + + +// Exceptions. +class unsupported_sycl_device : public std::runtime_error { +public: + unsupported_sycl_device() : std::runtime_error("Unsupported SYCL device.") {} +}; + +// SYCL program generator class. +template +class SYCLCodeGenerator : public ELFCodeGenerator +{ +public: + explicit SYCLCodeGenerator(Product product_) : ELFCodeGenerator(product_) {} + explicit SYCLCodeGenerator(int stepping_ = 0) : ELFCodeGenerator(stepping_) {} + + inline sycl::kernel getKernel(const sycl::context &context, const sycl::device &device); + + static inline HW detectHW(const sycl::context &context, const sycl::device &device); + static inline Product detectHWInfo(const sycl::context &context, const sycl::device &device); + + // Queue-based convenience APIs. + sycl::kernel getKernel(sycl::queue &queue) { + return getKernel(queue.get_context(), queue.get_device()); + } + static HW detectHW(sycl::queue &queue) { + return detectHW(queue.get_context(), queue.get_device()); + } + static Product detectHWInfo(sycl::queue &queue) { + return detectHWInfo(queue.get_context(), queue.get_device()); + } +}; + +#define NGEN_FORWARD_SYCL(hw) NGEN_FORWARD_ELF(hw) + +template +sycl::kernel SYCLCodeGenerator::getKernel(const sycl::context &context, const sycl::device &device) +{ + using namespace sycl; + using super = ELFCodeGenerator; + + auto kernelName = super::interface_.getExternalName().c_str(); + auto binary = super::getBinary(); + + const auto *binaryPtr = binary.data(); + size_t binarySize = binary.size(); + + std::optional outKernel; + + switch (device.get_backend()) { + case backend::opencl: { + auto contextCL = get_native(context); + auto deviceCL = get_native(device); + + cl_int status = CL_SUCCESS; + auto programCL = clCreateProgramWithBinary(contextCL, 1, &deviceCL, &binarySize, &binaryPtr, nullptr, &status); + + detail::handleCL(status); + if (programCL == nullptr) + detail::handleCL(CL_OUT_OF_HOST_MEMORY); /* a tried and true "default" error */ + + detail::handleCL(clBuildProgram(programCL, 1, &deviceCL, "-cl-std=CL2.0", nullptr, nullptr)); + + auto kernelCL = clCreateKernel(programCL, kernelName, &status); + detail::handleCL(status); + + outKernel = make_kernel(kernelCL, context); + + detail::handleCL(clReleaseKernel(kernelCL)); + detail::handleCL(clReleaseProgram(programCL)); + detail::handleCL(clReleaseContext(contextCL)); + break; + } + case backend::ext_oneapi_level_zero: { + auto contextL0 = get_native(context); + auto deviceL0 = get_native(device); + + ze_module_desc_t moduleDesc = { + ZE_STRUCTURE_TYPE_MODULE_DESC, + nullptr, + ZE_MODULE_FORMAT_NATIVE, + binarySize, + binaryPtr, + "", + nullptr + }; + + ze_module_handle_t moduleL0; + detail::handleL0(zeModuleCreate(contextL0, deviceL0, &moduleDesc, &moduleL0, nullptr)); + + ze_kernel_handle_t kernelL0; + ze_kernel_desc_t kernelDesc{ZE_STRUCTURE_TYPE_KERNEL_DESC, nullptr, 0, kernelName}; + detail::handleL0(zeKernelCreate(moduleL0, &kernelDesc, &kernelL0)); + + auto bundle = make_kernel_bundle({moduleL0}, context); + outKernel = make_kernel({bundle, kernelL0}, context); + break; + } + default: throw unsupported_sycl_device(); + } + + return outKernel.value(); +} + +template +HW SYCLCodeGenerator::detectHW(const sycl::context &context, const sycl::device &device) +{ + return getCore(detectHWInfo(context, device).family); +} + +template +Product SYCLCodeGenerator::detectHWInfo(const sycl::context &context, const sycl::device &device) +{ + using namespace sycl; + switch (device.get_backend()) { + case backend::opencl: { + auto contextCL = get_native(context); + auto deviceCL = get_native(device); + auto ret = OpenCLCodeGenerator::detectHWInfo(contextCL, deviceCL); + detail::handleCL(clReleaseContext(contextCL)); + return ret; + } + case backend::ext_oneapi_level_zero: + return LevelZeroCodeGenerator::detectHWInfo(get_native(context), + get_native(device)); + default: throw unsupported_sycl_device(); + } + return Product{}; +} + +} /* namespace NGEN_NAMESPACE */ + +#endif diff --git a/third_party/ngen/ngen_utils.hpp b/third_party/ngen/ngen_utils.hpp index 02ef7355728..926f239eebc 100644 --- a/third_party/ngen/ngen_utils.hpp +++ b/third_party/ngen/ngen_utils.hpp @@ -18,13 +18,12 @@ #define NGEN_UTILS_HPP #include +#include #ifdef _MSC_VER #include #endif -#include - #ifdef NGEN_CPP11 #define constexpr14 #else diff --git a/third_party/ngen/npack/neo_packager.hpp b/third_party/ngen/npack/neo_packager.hpp index 8779e07d007..5041e912ef3 100644 --- a/third_party/ngen/npack/neo_packager.hpp +++ b/third_party/ngen/npack/neo_packager.hpp @@ -250,18 +250,23 @@ inline bool hasGatewayEOTSend(const std::vector &binary) return false; } -inline void getBinaryHWInfo(const std::vector &binary, HW &outHW, Product &outProduct) +inline Product getBinaryHWInfo(const std::vector &binary) { const SProgramBinaryHeader *pheader = nullptr; findDeviceBinary(binary, nullptr, &pheader, nullptr); - outHW = decodeGfxCoreFamily(pheader->Device); - outProduct.family = NGEN_NAMESPACE::ProductFamily::Unknown; - outProduct.stepping = pheader->SteppingId; + HW hw = decodeGfxCoreFamily(pheader->Device); // XeHPG identifies with older runtimes as XeHP. Check whether EOT goes to TS (XeHP) or gateway (XeHPG). - if (outHW == HW::XeHP && hasGatewayEOTSend(binary)) - outHW = HW::XeHPG; + if (hw == HW::XeHP && hasGatewayEOTSend(binary)) + hw = HW::XeHPG; + + Product ret; + ret.family = genericProductFamily(hw); + ret.stepping = pheader->SteppingId; + ret.type = PlatformType::Unknown; + + return ret; } inline NGEN_NAMESPACE::Product decodeHWIPVersion(uint32_t rawVersion) @@ -278,25 +283,27 @@ inline NGEN_NAMESPACE::Product decodeHWIPVersion(uint32_t rawVersion) }; } version; - ngen::Product outProduct = {ngen::ProductFamily::Unknown, 0}; + NGEN_NAMESPACE::Product outProduct = {NGEN_NAMESPACE::ProductFamily::Unknown, 0, NGEN_NAMESPACE::PlatformType::Unknown}; version.raw = rawVersion; switch (version.architecture) { - case 9: outProduct.family = ngen::ProductFamily::GenericGen9; break; - case 11: outProduct.family = ngen::ProductFamily::GenericGen11; break; + case 9: outProduct.family = NGEN_NAMESPACE::ProductFamily::GenericGen9; break; + case 11: outProduct.family = NGEN_NAMESPACE::ProductFamily::GenericGen11; break; case 12: if (version.release <= 10) - outProduct.family = ngen::ProductFamily::GenericGen12LP; + outProduct.family = NGEN_NAMESPACE::ProductFamily::GenericGen12LP; else if (version.release == 50) - outProduct.family = ngen::ProductFamily::GenericXeHP; + outProduct.family = NGEN_NAMESPACE::ProductFamily::GenericXeHP; else if (version.release > 50 && version.release <= 59) - outProduct.family = ngen::ProductFamily::DG2; - else if (version.release >= 60 && version.release <= 61) - outProduct.family = ngen::ProductFamily::PVC; + outProduct.family = NGEN_NAMESPACE::ProductFamily::DG2; + else if (version.release == 60) + outProduct.family = NGEN_NAMESPACE::ProductFamily::PVC; + else if (version.release == 61) + outProduct.family = NGEN_NAMESPACE::ProductFamily::PVCVG; else if (version.release >= 70 && version.release <= 71) - outProduct.family = ngen::ProductFamily::MTL; + outProduct.family = NGEN_NAMESPACE::ProductFamily::MTL; else if (version.release >= 73 && version.release <= 74) - outProduct.family = ngen::ProductFamily::ARL; + outProduct.family = NGEN_NAMESPACE::ProductFamily::ARL; break; case 20: if (version.release <= 2) @@ -310,13 +317,15 @@ inline NGEN_NAMESPACE::Product decodeHWIPVersion(uint32_t rawVersion) default: outProduct.family = ngen::ProductFamily::Unknown; break; } - if (outProduct.family != ngen::ProductFamily::Unknown) + if (outProduct.family != NGEN_NAMESPACE::ProductFamily::Unknown) outProduct.stepping = version.revision; + outProduct.type = getPlatformType(outProduct.family); + return outProduct; } } /* namespace npack */ -} /* namespace ngen */ +} /* namespace NGEN_NAMESPACE */ #endif /* header guard */