diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index e8280bf0f..9759e74ce 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -85,7 +85,7 @@ if (ROCM_CCACHE_BUILD) endif() # if (ROCM_CCACHE_BUILD) ## Get version strings -get_version ( "1.9.0" ) +get_version ( "1.11.0" ) if ( ${ROCM_PATCH_VERSION} ) set ( VERSION_PATCH ${ROCM_PATCH_VERSION}) endif() @@ -125,11 +125,14 @@ target_include_directories( ${CORE_RUNTIME_TARGET} ${CMAKE_CURRENT_SOURCE_DIR}/libamdhsacode ${CMAKE_CURRENT_BINARY_DIR}/core/runtime/trap_handler) -## Set RUNPATH - ../../lib covers use of the legacy symlink in /hsa/lib/ -set_property(TARGET ${CORE_RUNTIME_TARGET} PROPERTY INSTALL_RPATH "$ORIGIN;$ORIGIN/../../lib;$ORIGIN/../../lib64;$ORIGIN/../lib64" ) ## ------------------------- Linux Compiler and Linker options ------------------------- -set ( HSA_CXX_FLAGS ${HSA_COMMON_CXX_FLAGS} -fexceptions -fno-rtti -fvisibility=hidden -Wno-error=missing-braces -Wno-error=sign-compare -Wno-sign-compare -Wno-write-strings -Wno-conversion-null -fno-math-errno -fno-threadsafe-statics -fmerge-all-constants -fms-extensions -Wno-error=comment -Wno-comment -Wno-error=pointer-arith -Wno-pointer-arith -Wno-error=unused-variable -Wno-error=unused-function -mmwaitx ) +set ( HSA_CXX_FLAGS ${HSA_COMMON_CXX_FLAGS} -fexceptions -fno-rtti -fvisibility=hidden -Wno-error=missing-braces -Wno-error=sign-compare -Wno-sign-compare -Wno-write-strings -Wno-conversion-null -fno-math-errno -fno-threadsafe-statics -fmerge-all-constants -fms-extensions -Wno-error=comment -Wno-comment -Wno-error=pointer-arith -Wno-pointer-arith -Wno-error=unused-variable -Wno-error=unused-function ) + +## Extra x86 specific settings +if ( CMAKE_SYSTEM_PROCESSOR MATCHES "i?86|x86_64|amd64|AMD64" ) + set ( HSA_CXX_FLAGS ${HSA_CXX_FLAGS} -mmwaitx ) +endif() ## Extra image settings - audit! set ( HSA_CXX_FLAGS ${HSA_CXX_FLAGS} -Wno-deprecated-declarations ) @@ -306,7 +309,9 @@ install ( TARGETS ${CORE_RUNTIME_TARGET} LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} COMPONENT asan ) # Install license -install ( FILES ${CMAKE_CURRENT_SOURCE_DIR}/LICENSE.md DESTINATION ${CMAKE_INSTALL_DOCDIR}-asan COMPONENT asan ) +if(ENABLE_ASAN_PACKAGING) + install ( FILES ${CMAKE_CURRENT_SOURCE_DIR}/LICENSE.md DESTINATION ${CMAKE_INSTALL_DOCDIR}-asan COMPONENT asan ) +endif() install ( FILES ${CMAKE_CURRENT_SOURCE_DIR}/LICENSE.md DESTINATION ${CMAKE_INSTALL_DOCDIR} COMPONENT binary ) # Install public headers diff --git a/src/core/common/hsa_table_interface.cpp b/src/core/common/hsa_table_interface.cpp index 3802ae9b4..593ccb389 100644 --- a/src/core/common/hsa_table_interface.cpp +++ b/src/core/common/hsa_table_interface.cpp @@ -58,7 +58,15 @@ const HsaApiTable* hsa_table_interface_get_table() { } // Pass through stub functions -hsa_status_t HSA_API hsa_init() { return coreApiTable->hsa_init_fn(); } +hsa_status_t HSA_API hsa_init() { + // We initialize the api tables here once more since the code above is prone to a + // link-time ordering condition: This compilation unit here may get its global + // variables initialized earlier than the global objects in other compilation units. + // In particular Init::Init may get called earlier than that the underlying hsa_api_table_ + // object in hsa_api_trace.cpp has been initialized. 
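// (Illustrative aside, not part of the patch: the hazard described in the comment above is
//  the classic C++ static-initialization-order problem -- globals in different translation
//  units are constructed in an unspecified order. A minimal sketch with hypothetical names:
//
//      // tu_a.cpp                          // tu_b.cpp
//      Table table;                         struct Init { Init() { use(table); } } init;
//
//  init's constructor may legally run before table's constructor. Re-loading the table in
//  hsa_init() below makes the call independent of that ordering.)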
+ rocr::core::LoadInitialHsaApiTable(); + return coreApiTable->hsa_init_fn(); +} hsa_status_t HSA_API hsa_shut_down() { return coreApiTable->hsa_shut_down_fn(); } diff --git a/src/core/inc/amd_blit_shaders.h b/src/core/inc/amd_blit_shaders.h index 583d0bdd0..12c52cc4b 100644 --- a/src/core/inc/amd_blit_shaders.h +++ b/src/core/inc/amd_blit_shaders.h @@ -156,6 +156,62 @@ static const unsigned int kCodeFill8[] = { 0x00001902, 0xD11C6A03, 0x01A90103, 0xBF82FFF5, 0xBF810000, }; +static const unsigned int kCodeCopyAligned940[] = { + 0xc00a0100, 0x00000000, 0xc00a0200, 0x00000010, 0xc00a0300, 0x00000020, + 0xc00a0400, 0x00000030, 0xc00a0500, 0x00000040, 0xc0020600, 0x00000050, + 0xbf8cc07f, 0x8e028602, 0x32000002, 0x7e060205, 0xd1196a02, 0x00000900, + 0xd11c6a03, 0x01a90103, 0x7e0a0207, 0xd1196a04, 0x00000d00, 0xd11c6a05, + 0x01a90105, 0xd0e9006a, 0x00001102, 0xbf86000f, 0x86fe6a7e, 0xde410000, + 0x017f0002, 0xbf8c0f70, 0xd1196a02, 0x00003102, 0xd11c6a03, 0x01a90103, + 0xde610000, 0x007f0104, 0xd1196a04, 0x00003104, 0xd11c6a05, 0x01a90105, + 0xbf82ffee, 0xbefe01c1, 0x8e198418, 0x24020084, 0x7e060209, 0xd1196a02, + 0x00001101, 0xd11c6a03, 0x01a90103, 0x7e0a020b, 0xd1196a04, 0x00001501, + 0xd11c6a05, 0x01a90105, 0xd0e9006a, 0x00001902, 0xbf86000e, 0xde5d0000, + 0x087f0002, 0xd1196a02, 0x00003302, 0xd11c6a03, 0x01a90103, 0xbf8c0f70, + 0xde7d0000, 0x007f0804, 0xd1196a04, 0x00003304, 0xd11c6a05, 0x01a90105, + 0xbf82ffef, 0x8e198218, 0x24020082, 0x7e06020d, 0xd1196a02, 0x00001901, + 0xd11c6a03, 0x01a90103, 0x7e0a020f, 0xd1196a04, 0x00001d01, 0xd11c6a05, + 0x01a90105, 0xd0e9006a, 0x00002102, 0xbf86000f, 0x86fe6a7e, 0xde510000, + 0x017f0002, 0xd1196a02, 0x00003302, 0xd11c6a03, 0x01a90103, 0xbf8c0f70, + 0xde710000, 0x007f0104, 0xd1196a04, 0x00003304, 0xd11c6a05, 0x01a90105, + 0xbf82ffee, 0xbefe01c1, 0x7e060211, 0xd1196a02, 0x00002100, 0xd11c6a03, + 0x01a90103, 0x7e0a0213, 0xd1196a04, 0x00002500, 0xd11c6a05, 0x01a90105, + 0xd0e9006a, 0x00002902, 0xbf860006, 0x86fe6a7e, 0xde410000, 0x017f0002, + 0xbf8c0f70, 0xde610000, 0x007f0104, 0xbf810000, +}; + +static const unsigned int kCodeCopyMisaligned940[] = { + 0xc00a0100, 0x00000000, 0xc00a0200, 0x00000010, 0xc00a0300, 0x00000020, + 0xc0020400, 0x00000030, 0xbf8cc07f, 0x8e028602, 0x32000002, 0x7e060205, + 0xd1196a02, 0x00000900, 0xd11c6a03, 0x01a90103, 0x7e0a0207, 0xd1196a04, + 0x00000d00, 0xd11c6a05, 0x01a90105, 0xd0e9006a, 0x00001102, 0xbf860032, + 0xde410000, 0x067f0002, 0xd1196a02, 0x00002102, 0xd11c6a03, 0x01a90103, + 0xde410000, 0x077f0002, 0xd1196a02, 0x00002102, 0xd11c6a03, 0x01a90103, + 0xde410000, 0x087f0002, 0xd1196a02, 0x00002102, 0xd11c6a03, 0x01a90103, + 0xde410000, 0x097f0002, 0xd1196a02, 0x00002102, 0xd11c6a03, 0x01a90103, + 0xbf8c0f70, 0xde610000, 0x007f0604, 0xd1196a04, 0x00002104, 0xd11c6a05, + 0x01a90105, 0xde610000, 0x007f0704, 0xd1196a04, 0x00002104, 0xd11c6a05, + 0x01a90105, 0xde610000, 0x007f0804, 0xd1196a04, 0x00002104, 0xd11c6a05, + 0x01a90105, 0xde610000, 0x007f0904, 0xd1196a04, 0x00002104, 0xd11c6a05, + 0x01a90105, 0xbf82ffcb, 0x7e060209, 0xd1196a02, 0x00001100, 0xd11c6a03, + 0x01a90103, 0x7e0a020b, 0xd1196a04, 0x00001500, 0xd11c6a05, 0x01a90105, + 0xd0e9006a, 0x00001902, 0xbf86000f, 0x86fe6a7e, 0xde410000, 0x017f0002, + 0xd1196a02, 0x00002102, 0xd11c6a03, 0x01a90103, 0xbf8c0f70, 0xde610000, + 0x007f0104, 0xd1196a04, 0x00002104, 0xd11c6a05, 0x01a90105, 0xbf82ffee, + 0xbf810000, 0x00000000, +}; + +static const unsigned int kCodeFill940[] = { + 0xc00a0100, 0x00000000, 0xc00a0200, 0x00000010, 0xbf8cc07f, 0x8e028602, + 0x32000002, 
0x7e08020a, 0x7e0a020a, 0x7e0c020a, 0x7e0e020a, 0x8e0c840b, + 0x24020084, 0x7e060205, 0xd1196a02, 0x00000901, 0xd11c6a03, 0x01a90103, + 0xd0e9006a, 0x00000d02, 0xbf860007, 0xde7d0000, 0x007f0402, 0xd1196a02, + 0x00001902, 0xd11c6a03, 0x01a90103, 0xbf82fff6, 0x8e0c820b, 0x24020082, + 0x7e060207, 0xd1196a02, 0x00000d01, 0xd11c6a03, 0x01a90103, 0xd0e9006a, + 0x00001102, 0xbf860008, 0x86fe6a7e, 0xde710000, 0x007f0402, 0xd1196a02, + 0x00001902, 0xd11c6a03, 0x01a90103, 0xbf82fff5, 0xbf810000, 0x00000000, +}; + static const unsigned int kCodeCopyAligned10[] = { 0xF4080100, 0xFA000000, 0xF4080200, 0xFA000010, 0xF4080300, 0xFA000020, 0xF4080400, 0xFA000030, 0xF4080500, 0xFA000040, 0xF4000600, 0xFA000050, diff --git a/src/core/inc/amd_elf_image.hpp b/src/core/inc/amd_elf_image.hpp index 0da61f5cf..177465c05 100644 --- a/src/core/inc/amd_elf_image.hpp +++ b/src/core/inc/amd_elf_image.hpp @@ -183,6 +183,10 @@ namespace elf { virtual StringTable* strtab() = 0; virtual SymbolTable* symtab() = 0; virtual SymbolTable* getSymtab(uint16_t index) = 0; + virtual SymbolTable* dynsym() = 0; + virtual SymbolTable* getDynsym(uint16_t index) = 0; + virtual SymbolTable* getSymbolTable() = 0; + virtual SymbolTable* getSymbolTable(uint16_t index) = 0; virtual StringTable* addStringTable(const std::string& name) = 0; virtual StringTable* getStringTable(uint16_t index) = 0; diff --git a/src/core/inc/amd_gpu_agent.h b/src/core/inc/amd_gpu_agent.h index 359080c06..905472af4 100644 --- a/src/core/inc/amd_gpu_agent.h +++ b/src/core/inc/amd_gpu_agent.h @@ -550,6 +550,10 @@ class GpuAgent : public GpuAgentInt { // Bind the Blit object that will drive the copy operation lazy_ptr& GetBlitObject(const core::Agent& dst_agent, const core::Agent& src_agent, const size_t size); + + // Bind the Blit object that will drive the copy operation by engine ID + lazy_ptr& GetBlitObject(uint32_t engine_id); + // @brief Alternative aperture base address. Only on KV. uintptr_t ape1_base_; @@ -563,6 +567,15 @@ class GpuAgent : public GpuAgentInt { KernelMutex lock_; } gws_queue_; + // Sets and Tracks pending SDMA status check or request counts + void SetCopyRequestRefCount(bool set); + void SetCopyStatusCheckRefCount(bool set); + int pending_copy_req_ref_; + int pending_copy_stat_check_ref_; + + // Tracks what SDMA blits have been used since initialization. + uint32_t sdma_blit_used_mask_; + ScratchCache scratch_cache_; // System memory allocator in the nearest NUMA node. @@ -572,6 +585,9 @@ class GpuAgent : public GpuAgentInt { std::function system_deallocator_; DISALLOW_COPY_AND_ASSIGN(GpuAgent); + + // Check if SDMA engine by ID is free + bool DmaEngineIsFree(uint32_t engine_id); }; } // namespace amd diff --git a/src/core/inc/amd_memory_region.h b/src/core/inc/amd_memory_region.h index e12f0d760..cb5d17e2b 100644 --- a/src/core/inc/amd_memory_region.h +++ b/src/core/inc/amd_memory_region.h @@ -95,8 +95,8 @@ class MemoryRegion : public core::MemoryRegion { /// @brief Unpin memory. 
static void MakeKfdMemoryUnresident(const void* ptr); - MemoryRegion(bool fine_grain, bool kernarg, bool full_profile, core::Agent* owner, - const HsaMemoryProperties& mem_props); + MemoryRegion(bool fine_grain, bool kernarg, bool full_profile, bool extended_scope_fine_grain, + core::Agent* owner, const HsaMemoryProperties& mem_props); ~MemoryRegion(); @@ -173,6 +173,8 @@ class MemoryRegion : public core::MemoryRegion { return static_cast(mem_props_.MemoryClockMax); } + __forceinline bool extended_scope_fine_grain() const { return extended_scope_fine_grain_; } + private: const HsaMemoryProperties mem_props_; @@ -182,6 +184,9 @@ class MemoryRegion : public core::MemoryRegion { size_t max_single_alloc_size_; + // Enables creating an extended scope fine grained memory pool region + const bool extended_scope_fine_grain_; + // Used to collect total system memory static size_t max_sysmem_alloc_size_; diff --git a/src/core/inc/memory_region.h b/src/core/inc/memory_region.h index 48ecb3dcd..feba80695 100644 --- a/src/core/inc/memory_region.h +++ b/src/core/inc/memory_region.h @@ -91,6 +91,7 @@ class MemoryRegion : public Checked<0x9C961F19EE175BB3> { AllocateIPC = (1 << 4), // System memory that can be IPC-shared AllocateNonPaged = (1 << 4), // Non-paged system memory (AllocateIPC alias) AllocatePCIeRW = (1 << 5), // Enforce pseudo fine grain/RW memory + AllocateAsan = (1 << 6), // ASAN - First page of allocation remapped to system memory }; typedef uint32_t AllocateFlags; diff --git a/src/core/inc/runtime.h b/src/core/inc/runtime.h index 11e8ce82b..8f4db0783 100644 --- a/src/core/inc/runtime.h +++ b/src/core/inc/runtime.h @@ -70,6 +70,12 @@ #include "core/inc/amd_loader_context.hpp" #include "core/inc/amd_hsa_code.hpp" +#if defined(__clang__) +#if __has_feature(address_sanitizer) +#define SANITIZER_AMDGPU 1 +#endif +#endif + //---------------------------------------------------------------------------// // Constants // //---------------------------------------------------------------------------// @@ -112,6 +118,7 @@ class Runtime { struct KfdVersion_t { HsaVersionInfo version; bool supports_exception_debugging; + bool supports_event_age; }; /// @brief Open connection to kernel driver and increment reference count. 
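For reference, the SANITIZER_AMDGPU detection added to runtime.h above only covers clang's __has_feature check; a compiler-portable variant (a sketch, not what this patch does) would also honor GCC's __SANITIZE_ADDRESS__:

#if defined(__has_feature)
#  if __has_feature(address_sanitizer)      // clang
#    define SANITIZER_AMDGPU 1
#  endif
#endif
#if !defined(SANITIZER_AMDGPU) && defined(__SANITIZE_ADDRESS__)   // gcc
#  define SANITIZER_AMDGPU 1
#endif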
@@ -395,7 +402,12 @@ class Runtime { uint64_t sys_clock_freq() const { return sys_clock_freq_; } - void KfdVersion(const HsaVersionInfo& version) { kfd_version.version = version; } + void KfdVersion(const HsaVersionInfo& version) { + kfd_version.version = version; + if (version.KernelInterfaceMajorVersion == 1 && + version.KernelInterfaceMinorVersion >= 14) + kfd_version.supports_event_age = true; + } void KfdVersion(bool exception_debugging) { kfd_version.supports_exception_debugging = exception_debugging; @@ -407,9 +419,19 @@ class Runtime { static void AsyncEventsLoop(void*); struct AllocationRegion { - AllocationRegion() : region(NULL), size(0), size_requested(0), user_ptr(nullptr) {} - AllocationRegion(const MemoryRegion* region_arg, size_t size_arg, size_t size_requested) - : region(region_arg), size(size_arg), size_requested(size_requested), user_ptr(nullptr) {} + AllocationRegion() + : region(NULL), + size(0), + size_requested(0), + alloc_flags(core::MemoryRegion::AllocateNoFlags), + user_ptr(nullptr) {} + AllocationRegion(const MemoryRegion* region_arg, size_t size_arg, size_t size_requested, + MemoryRegion::AllocateFlags alloc_flags) + : region(region_arg), + size(size_arg), + size_requested(size_requested), + alloc_flags(alloc_flags), + user_ptr(nullptr) {} struct notifier_t { void* ptr; @@ -420,6 +442,7 @@ class Runtime { const MemoryRegion* region; size_t size; /* actual size = align_up(size_requested, granularity) */ size_t size_requested; /* size requested by user */ + MemoryRegion::AllocateFlags alloc_flags; void* user_ptr; std::unique_ptr> notifiers; }; diff --git a/src/core/runtime/amd_aql_queue.cpp b/src/core/runtime/amd_aql_queue.cpp index f2aedcd8f..b09ea82a7 100644 --- a/src/core/runtime/amd_aql_queue.cpp +++ b/src/core/runtime/amd_aql_queue.cpp @@ -1332,7 +1332,11 @@ void AqlQueue::FillBufRsrcWord1_Gfx11() { void AqlQueue::FillBufRsrcWord2() { SQ_BUF_RSRC_WORD2 srd2; - srd2.bits.NUM_RECORDS = uint32_t(queue_scratch_.size); + const auto& agent_props = agent_->properties(); + const uint32_t num_xcc = agent_props.NumXcc; + + // report size per XCC + srd2.bits.NUM_RECORDS = uint32_t(queue_scratch_.size / num_xcc); amd_queue_.scratch_resource_descriptor[2] = srd2.u32All; } @@ -1403,8 +1407,10 @@ void AqlQueue::FillComputeTmpRingSize() { return; } - // Determine the maximum number of waves device can support const auto& agent_props = agent_->properties(); + const uint32_t num_xcc = agent_props.NumXcc; + + // Determine the maximum number of waves device can support uint32_t num_cus = agent_props.NumFComputeCores / agent_props.NumSIMDPerCU; uint32_t max_scratch_waves = num_cus * agent_props.MaxSlotsScratchCU; @@ -1416,10 +1422,11 @@ void AqlQueue::FillComputeTmpRingSize() { tmpring_size.bits.WAVESIZE = wave_scratch; assert(wave_scratch == tmpring_size.bits.WAVESIZE && "WAVESIZE Overflow."); uint32_t num_waves = - queue_scratch_.size / (tmpring_size.bits.WAVESIZE * queue_scratch_.mem_alignment_size); + (queue_scratch_.size / num_xcc) / (tmpring_size.bits.WAVESIZE * queue_scratch_.mem_alignment_size); + tmpring_size.bits.WAVES = std::min(num_waves, max_scratch_waves); amd_queue_.compute_tmpring_size = tmpring_size.u32All; - assert((tmpring_size.bits.WAVES % agent_props.NumShaderBanks == 0) && + assert((tmpring_size.bits.WAVES % (agent_props.NumShaderBanks / num_xcc) == 0) && "Invalid scratch wave count. 
Must be divisible by #SEs."); } @@ -1431,9 +1438,11 @@ void AqlQueue::FillComputeTmpRingSize_Gfx11() { return; } - // Determine the maximum number of waves device can support const auto& agent_props = agent_->properties(); - uint32_t num_cus = agent_props.NumFComputeCores / agent_props.NumSIMDPerCU; + const uint32_t num_xcc = agent_props.NumXcc; + + // Determine the maximum number of waves device can support + uint32_t num_cus = agent_props.NumFComputeCores / (agent_props.NumSIMDPerCU * num_xcc); uint32_t max_scratch_waves = num_cus * agent_props.MaxSlotsScratchCU; // Scratch is allocated program COMPUTE_TMPRING_SIZE register @@ -1483,7 +1492,11 @@ void AqlQueue::InitScratchSRD() { // Populate flat scratch parameters in amd_queue_. amd_queue_.scratch_backing_memory_location = queue_scratch_.queue_process_offset; - amd_queue_.scratch_backing_memory_byte_size = queue_scratch_.size; + + const auto& agent_props = agent_->properties(); + const uint32_t num_xcc = agent_props.NumXcc; + // report size per XCC + amd_queue_.scratch_backing_memory_byte_size = queue_scratch_.size / num_xcc; // For backwards compatibility this field records the per-lane scratch // for a 64 lane wavefront. If scratch was allocated for 32 lane waves diff --git a/src/core/runtime/amd_blit_kernel.cpp b/src/core/runtime/amd_blit_kernel.cpp index 681b310a1..661e3be4d 100644 --- a/src/core/runtime/amd_blit_kernel.cpp +++ b/src/core/runtime/amd_blit_kernel.cpp @@ -491,6 +491,388 @@ static std::string kBlitKernelSource(R"( L_FILL_PHASE_2_DONE: s_endpgm end + +shader CopyAligned_940 + type(CS) + user_sgpr_count(2) + sgpr_count(32) + vgpr_count(8 + (kCopyAlignedUnroll * kCopyAlignedVecWidth)) + + // Retrieve kernel arguments. + s_load_dwordx4 s[4:7], s[0:1], s_load_dword_offset(0x0) + s_load_dwordx4 s[8:11], s[0:1], s_load_dword_offset(0x10) + s_load_dwordx4 s[12:15], s[0:1], s_load_dword_offset(0x20) + s_load_dwordx4 s[16:19], s[0:1], s_load_dword_offset(0x30) + s_load_dwordx4 s[20:23], s[0:1], s_load_dword_offset(0x40) + s_load_dword s24, s[0:1], s_load_dword_offset(0x50) + s_waitcnt lgkmcnt(0) + + // Compute workitem id. + s_lshl_b32 s2, s2, 0x6 + v_add_co_u32 v0, vcc, s2, v0 + + // ===================================================== + // Phase 1: Byte copy up to 0x100 destination alignment. + // ===================================================== + + // Compute phase source address. + v_mov_b32 v3, s5 + v_add_co_u32 v2, vcc, v0, s4 + v_addc_co_u32 v3, vcc, v3, 0x0, vcc + + // Compute phase destination address. + v_mov_b32 v5, s7 + v_add_co_u32 v4, vcc, v0, s6 + v_addc_co_u32 v5, vcc, v5, 0x0, vcc + +L_COPY_ALIGNED_PHASE_1_LOOP: + // Mask off lanes (or branch out) after phase end. + v_cmp_lt_u64 vcc, v[2:3], s[8:9] + s_cbranch_vccz L_COPY_ALIGNED_PHASE_1_DONE + s_and_b64 exec, exec, vcc + + // Load from/advance the source address. + flat_load_ubyte v1, v[2:3] sc0:1 sc1:1 + s_waitcnt vmcnt(0) + v_add_co_u32 v2, vcc, v2, s24 + v_addc_co_u32 v3, vcc, v3, 0x0, vcc + + // Write to/advance the destination address. + flat_store_byte v[4:5], v1 sc0:1 sc1:1 + v_add_co_u32 v4, vcc, v4, s24 + v_addc_co_u32 v5, vcc, v5, 0x0, vcc + + // Repeat until branched out. + s_branch L_COPY_ALIGNED_PHASE_1_LOOP + +L_COPY_ALIGNED_PHASE_1_DONE: + // Restore EXEC mask for all lanes. + s_mov_b64 exec, 0xFFFFFFFFFFFFFFFF + + // ======================================================== + // Phase 2: Unrolled dword[x4] copy up to last whole block. 
+ // ======================================================== + + // Compute unrolled dword[x4] stride across all threads. + if kCopyAlignedVecWidth == 4 + s_lshl_b32 s25, s24, 0x4 + else + s_lshl_b32 s25, s24, 0x2 + end + + // Compute phase source address. + if kCopyAlignedVecWidth == 4 + v_lshlrev_b32 v1, 0x4, v0 + else + v_lshlrev_b32 v1, 0x2, v0 + end + + v_mov_b32 v3, s9 + v_add_co_u32 v2, vcc, v1, s8 + v_addc_co_u32 v3, vcc, v3, 0x0, vcc + + // Compute phase destination address. + v_mov_b32 v5, s11 + v_add_co_u32 v4, vcc, v1, s10 + v_addc_co_u32 v5, vcc, v5, 0x0, vcc + +L_COPY_ALIGNED_PHASE_2_LOOP: + // Branch out after phase end. + v_cmp_lt_u64 vcc, v[2:3], s[12:13] + s_cbranch_vccz L_COPY_ALIGNED_PHASE_2_DONE + + // Load from/advance the source address. + for var i = 0; i < kCopyAlignedUnroll; i ++ + if kCopyAlignedVecWidth == 4 + flat_load_dwordx4 v[8 + (i * 4)], v[2:3] sc0:1 sc1:1 + else + flat_load_dword v[8 + i], v[2:3] sc0:1 sc1:1 + end + + v_add_co_u32 v2, vcc, v2, s25 + v_addc_co_u32 v3, vcc, v3, 0x0, vcc + end + + // Write to/advance the destination address. + s_waitcnt vmcnt(0) + + for var i = 0; i < kCopyAlignedUnroll; i ++ + if kCopyAlignedVecWidth == 4 + flat_store_dwordx4 v[4:5], v[8 + (i * 4)] sc0:1 sc1:1 + else + flat_store_dword v[4:5], v[8 + i] sc0:1 sc1:1 + end + + v_add_co_u32 v4, vcc, v4, s25 + v_addc_co_u32 v5, vcc, v5, 0x0, vcc + end + + // Repeat until branched out. + s_branch L_COPY_ALIGNED_PHASE_2_LOOP + +L_COPY_ALIGNED_PHASE_2_DONE: + + // =========================================== + // Phase 3: Dword copy up to last whole dword. + // =========================================== + + // Compute dword stride across all threads. + s_lshl_b32 s25, s24, 0x2 + + // Compute phase source address. + v_lshlrev_b32 v1, 0x2, v0 + v_mov_b32 v3, s13 + v_add_co_u32 v2, vcc, v1, s12 + v_addc_co_u32 v3, vcc, v3, 0x0, vcc + + // Compute phase destination address. + v_mov_b32 v5, s15 + v_add_co_u32 v4, vcc, v1, s14 + v_addc_co_u32 v5, vcc, v5, 0x0, vcc + +L_COPY_ALIGNED_PHASE_3_LOOP: + // Mask off lanes (or branch out) after phase end. + v_cmp_lt_u64 vcc, v[2:3], s[16:17] + s_cbranch_vccz L_COPY_ALIGNED_PHASE_3_DONE + s_and_b64 exec, exec, vcc + + // Load from/advance the source address. + flat_load_dword v1, v[2:3] sc0:1 sc1:1 + v_add_co_u32 v2, vcc, v2, s25 + v_addc_co_u32 v3, vcc, v3, 0x0, vcc + s_waitcnt vmcnt(0) + + // Write to/advance the destination address. + flat_store_dword v[4:5], v1 sc0:1 sc1:1 + v_add_co_u32 v4, vcc, v4, s25 + v_addc_co_u32 v5, vcc, v5, 0x0, vcc + + // Repeat until branched out. + s_branch L_COPY_ALIGNED_PHASE_3_LOOP + +L_COPY_ALIGNED_PHASE_3_DONE: + // Restore EXEC mask for all lanes. + s_mov_b64 exec, 0xFFFFFFFFFFFFFFFF + + // ============================= + // Phase 4: Byte copy up to end. + // ============================= + + // Compute phase source address. + v_mov_b32 v3, s17 + v_add_co_u32 v2, vcc, v0, s16 + v_addc_co_u32 v3, vcc, v3, 0x0, vcc + + // Compute phase destination address. + v_mov_b32 v5, s19 + v_add_co_u32 v4, vcc, v0, s18 + v_addc_co_u32 v5, vcc, v5, 0x0, vcc + + // Mask off lanes (or branch out) after phase end. + v_cmp_lt_u64 vcc, v[2:3], s[20:21] + s_cbranch_vccz L_COPY_ALIGNED_PHASE_4_DONE + s_and_b64 exec, exec, vcc + + // Load from the source address. + flat_load_ubyte v1, v[2:3] sc0:1 sc1:1 + s_waitcnt vmcnt(0) + + // Write to the destination address. 
+ flat_store_byte v[4:5], v1 sc0:1 sc1:1 + +L_COPY_ALIGNED_PHASE_4_DONE: + s_endpgm +end + +shader CopyMisaligned_940 + type(CS) + user_sgpr_count(2) + sgpr_count(23) + vgpr_count(6 + kCopyMisalignedUnroll) + + // Retrieve kernel arguments. + s_load_dwordx4 s[4:7], s[0:1], s_load_dword_offset(0x0) + s_load_dwordx4 s[8:11], s[0:1], s_load_dword_offset(0x10) + s_load_dwordx4 s[12:15], s[0:1], s_load_dword_offset(0x20) + s_load_dword s16, s[0:1], s_load_dword_offset(0x30) + s_waitcnt lgkmcnt(0) + + // Compute workitem id. + s_lshl_b32 s2, s2, 0x6 + v_add_co_u32 v0, vcc, s2, v0 + + // =================================================== + // Phase 1: Unrolled byte copy up to last whole block. + // =================================================== + + // Compute phase source address. + v_mov_b32 v3, s5 + v_add_co_u32 v2, vcc, v0, s4 + v_addc_co_u32 v3, vcc, v3, 0x0, vcc + + // Compute phase destination address. + v_mov_b32 v5, s7 + v_add_co_u32 v4, vcc, v0, s6 + v_addc_co_u32 v5, vcc, v5, 0x0, vcc + +L_COPY_MISALIGNED_PHASE_1_LOOP: + // Branch out after phase end. + v_cmp_lt_u64 vcc, v[2:3], s[8:9] + s_cbranch_vccz L_COPY_MISALIGNED_PHASE_1_DONE + + // Load from/advance the source address. + for var i = 0; i < kCopyMisalignedUnroll; i ++ + flat_load_ubyte v[6 + i], v[2:3] sc0:1 sc1:1 + v_add_co_u32 v2, vcc, v2, s16 + v_addc_co_u32 v3, vcc, v3, 0x0, vcc + end + + // Write to/advance the destination address. + s_waitcnt vmcnt(0) + + for var i = 0; i < kCopyMisalignedUnroll; i ++ + flat_store_byte v[4:5], v[6 + i] sc0:1 sc1:1 + v_add_co_u32 v4, vcc, v4, s16 + v_addc_co_u32 v5, vcc, v5, 0x0, vcc + end + + // Repeat until branched out. + s_branch L_COPY_MISALIGNED_PHASE_1_LOOP + +L_COPY_MISALIGNED_PHASE_1_DONE: + + // ============================= + // Phase 2: Byte copy up to end. + // ============================= + + // Compute phase source address. + v_mov_b32 v3, s9 + v_add_co_u32 v2, vcc, v0, s8 + v_addc_co_u32 v3, vcc, v3, 0x0, vcc + + // Compute phase destination address. + v_mov_b32 v5, s11 + v_add_co_u32 v4, vcc, v0, s10 + v_addc_co_u32 v5, vcc, v5, 0x0, vcc + +L_COPY_MISALIGNED_PHASE_2_LOOP: + // Mask off lanes (or branch out) after phase end. + v_cmp_lt_u64 vcc, v[2:3], s[12:13] + s_cbranch_vccz L_COPY_MISALIGNED_PHASE_2_DONE + s_and_b64 exec, exec, vcc + + // Load from/advance the source address. + flat_load_ubyte v1, v[2:3] sc0:1 sc1:1 + v_add_co_u32 v2, vcc, v2, s16 + v_addc_co_u32 v3, vcc, v3, 0x0, vcc + s_waitcnt vmcnt(0) + + // Write to/advance the destination address. + flat_store_byte v[4:5], v1 sc0:1 sc1:1 + v_add_co_u32 v4, vcc, v4, s16 + v_addc_co_u32 v5, vcc, v5, 0x0, vcc + + // Repeat until branched out. + s_branch L_COPY_MISALIGNED_PHASE_2_LOOP + +L_COPY_MISALIGNED_PHASE_2_DONE: + s_endpgm +end + +shader Fill_940 + type(CS) + user_sgpr_count(2) + sgpr_count(19) + vgpr_count(8) + + // Retrieve kernel arguments. + s_load_dwordx4 s[4:7], s[0:1], s_load_dword_offset(0x0) + s_load_dwordx4 s[8:11], s[0:1], s_load_dword_offset(0x10) + s_waitcnt lgkmcnt(0) + + // Compute workitem id. + s_lshl_b32 s2, s2, 0x6 + v_add_co_u32 v0, vcc, s2, v0 + + // Copy fill pattern into VGPRs. + for var i = 0; i < kFillVecWidth; i ++ + v_mov_b32 v[4 + i], s10 + end + + // ======================================================== + // Phase 1: Unrolled dword[x4] fill up to last whole block. + // ======================================================== + + // Compute unrolled dword[x4] stride across all threads. 
+ if kFillVecWidth == 4 + s_lshl_b32 s12, s11, 0x4 + else + s_lshl_b32 s12, s11, 0x2 + end + + // Compute phase destination address. + if kFillVecWidth == 4 + v_lshlrev_b32 v1, 0x4, v0 + else + v_lshlrev_b32 v1, 0x2, v0 + end + + v_mov_b32 v3, s5 + v_add_co_u32 v2, vcc, v1, s4 + v_addc_co_u32 v3, vcc, v3, 0x0, vcc + +L_FILL_PHASE_1_LOOP: + // Branch out after phase end. + v_cmp_lt_u64 vcc, v[2:3], s[6:7] + s_cbranch_vccz L_FILL_PHASE_1_DONE + + // Write to/advance the destination address. + for var i = 0; i < kFillUnroll; i ++ + if kFillVecWidth == 4 + flat_store_dwordx4 v[2:3], v[4:7] sc0:1 sc1:1 + else + flat_store_dword v[2:3], v4 sc0:1 sc1:1 + end + + v_add_co_u32 v2, vcc, v2, s12 + v_addc_co_u32 v3, vcc, v3, 0x0, vcc + end + + // Repeat until branched out. + s_branch L_FILL_PHASE_1_LOOP + +L_FILL_PHASE_1_DONE: + + // ============================== + // Phase 2: Dword fill up to end. + // ============================== + + // Compute dword stride across all threads. + s_lshl_b32 s12, s11, 0x2 + + // Compute phase destination address. + v_lshlrev_b32 v1, 0x2, v0 + v_mov_b32 v3, s7 + v_add_co_u32 v2, vcc, v1, s6 + v_addc_co_u32 v3, vcc, v3, 0x0, vcc + +L_FILL_PHASE_2_LOOP: + // Mask off lanes (or branch out) after phase end. + v_cmp_lt_u64 vcc, v[2:3], s[8:9] + s_cbranch_vccz L_FILL_PHASE_2_DONE + s_and_b64 exec, exec, vcc + + // Write to/advance the destination address. + flat_store_dword v[2:3], v4 sc0:1 sc1:1 + v_add_co_u32 v2, vcc, v2, s12 + v_addc_co_u32 v3, vcc, v3, 0x0, vcc + + // Repeat until branched out. + s_branch L_FILL_PHASE_2_LOOP + +L_FILL_PHASE_2_DONE: + s_endpgm +end )"); // Search kernel source for variable definition and return value. diff --git a/src/core/runtime/amd_cpu_agent.cpp b/src/core/runtime/amd_cpu_agent.cpp index 8b35d9955..bdb6070f8 100644 --- a/src/core/runtime/amd_cpu_agent.cpp +++ b/src/core/runtime/amd_cpu_agent.cpp @@ -85,15 +85,15 @@ void CpuAgent::InitRegionList() { if (system_prop != mem_props.end()) system_props = *system_prop; MemoryRegion* system_region_fine = - new MemoryRegion(true, false, is_apu_node, this, system_props); + new MemoryRegion(true, false, is_apu_node, false, this, system_props); regions_.push_back(system_region_fine); MemoryRegion* system_region_kernarg = - new MemoryRegion(true, true, is_apu_node, this, system_props); + new MemoryRegion(true, true, is_apu_node, false, this, system_props); regions_.push_back(system_region_kernarg); if (!is_apu_node) { MemoryRegion* system_region_coarse = - new MemoryRegion(false, false, is_apu_node, this, system_props); + new MemoryRegion(false, false, is_apu_node, false, this, system_props); regions_.push_back(system_region_coarse); } } @@ -385,6 +385,12 @@ hsa_status_t CpuAgent::GetInfo(hsa_agent_info_t attribute, void* value) const { case HSA_AMD_AGENT_INFO_IOMMU_SUPPORT: *((hsa_amd_iommu_version_t*)value) = HSA_IOMMU_SUPPORT_NONE; break; + case HSA_AMD_AGENT_INFO_NUM_XCC: + *((uint32_t*)value) = 0; + break; + case HSA_AMD_AGENT_INFO_DRIVER_UID: + *((uint32_t*)value) = 0; + break; default: return HSA_STATUS_ERROR_INVALID_ARGUMENT; break; diff --git a/src/core/runtime/amd_gpu_agent.cpp b/src/core/runtime/amd_gpu_agent.cpp index 7acbd8a74..88b270848 100644 --- a/src/core/runtime/amd_gpu_agent.cpp +++ b/src/core/runtime/amd_gpu_agent.cpp @@ -3,7 +3,7 @@ // The University of Illinois/NCSA // Open Source License (NCSA) // -// Copyright (c) 2014-2022, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2014-2023, Advanced Micro Devices, Inc. All rights reserved. 
// // Developed by: // @@ -104,6 +104,9 @@ GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props, bool xna enum_index_(index), ape1_base_(0), ape1_size_(0), + pending_copy_req_ref_(0), + pending_copy_stat_check_ref_(0), + sdma_blit_used_mask_(0), scratch_cache_( [this](void* base, size_t size, bool large) { ReleaseScratch(base, size, large); }) { const bool is_apu_node = (properties_.NumCPUCores > 0); @@ -124,8 +127,18 @@ GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props, bool xna rocr::core::IsaFeature sramecc = rocr::core::IsaFeature::Unsupported; if (isa_base->IsSrameccSupported()) { - sramecc = node_props.Capability.ui32.SRAM_EDCSupport == 1 ? core::IsaFeature::Enabled - : core::IsaFeature::Disabled; + switch (core::Runtime::runtime_singleton_->flag().sramecc_enable()) { + case Flag::SRAMECC_DISABLED: + sramecc = core::IsaFeature::Disabled; + break; + case Flag::SRAMECC_ENABLED: + sramecc = core::IsaFeature::Enabled; + break; + case Flag::SRAMECC_DEFAULT: + sramecc = node_props.Capability.ui32.SRAM_EDCSupport == 1 ? core::IsaFeature::Enabled + : core::IsaFeature::Disabled; + break; + } } rocr::core::IsaFeature xnack = rocr::core::IsaFeature::Unsupported; @@ -243,6 +256,8 @@ void GpuAgent::AssembleShader(const char* func_name, AssembleTarget assemble_tar ASICShader compute_8; ASICShader compute_9; ASICShader compute_90a; + ASICShader compute_940; + ASICShader compute_942; ASICShader compute_1010; ASICShader compute_10; ASICShader compute_11; @@ -251,53 +266,63 @@ void GpuAgent::AssembleShader(const char* func_name, AssembleTarget assemble_tar std::map compiled_shaders = { {"TrapHandler", { - {NULL, 0, 0, 0}, - {kCodeTrapHandler8, sizeof(kCodeTrapHandler8), 2, 4}, - {kCodeTrapHandler9, sizeof(kCodeTrapHandler9), 2, 4}, - {kCodeTrapHandler90a, sizeof(kCodeTrapHandler90a), 2, 4}, - {kCodeTrapHandler1010, sizeof(kCodeTrapHandler1010), 2, 4}, - {kCodeTrapHandler10, sizeof(kCodeTrapHandler10), 2, 4}, - {NULL, 0, 0, 0}, + {NULL, 0, 0, 0}, // gfx7 + {kCodeTrapHandler8, sizeof(kCodeTrapHandler8), 2, 4}, // gfx8 + {kCodeTrapHandler9, sizeof(kCodeTrapHandler9), 2, 4}, // gfx9 + {kCodeTrapHandler90a, sizeof(kCodeTrapHandler90a), 2, 4}, // gfx90a + {NULL, 0, 0, 0}, // gfx940 + {NULL, 0, 0, 0}, // gfx942 + {kCodeTrapHandler1010, sizeof(kCodeTrapHandler1010), 2, 4}, // gfx1010 + {kCodeTrapHandler10, sizeof(kCodeTrapHandler10), 2, 4}, // gfx10 + {NULL, 0, 0, 0}, // gfx11 }}, {"TrapHandlerKfdExceptions", { - {NULL, 0, 0, 0}, - {kCodeTrapHandler8, sizeof(kCodeTrapHandler8), 2, 4}, - {kCodeTrapHandlerV2_9, sizeof(kCodeTrapHandlerV2_9), 2, 4}, - {kCodeTrapHandlerV2_9, sizeof(kCodeTrapHandlerV2_9), 2, 4}, - {kCodeTrapHandlerV2_1010, sizeof(kCodeTrapHandlerV2_1010), 2, 4}, - {kCodeTrapHandlerV2_10, sizeof(kCodeTrapHandlerV2_10), 2, 4}, - {kCodeTrapHandlerV2_11, sizeof(kCodeTrapHandlerV2_11), 2, 4}, + {NULL, 0, 0, 0}, // gfx7 + {kCodeTrapHandler8, sizeof(kCodeTrapHandler8), 2, 4}, // gfx8 + {kCodeTrapHandlerV2_9, sizeof(kCodeTrapHandlerV2_9), 2, 4}, // gfx9 + {kCodeTrapHandlerV2_9, sizeof(kCodeTrapHandlerV2_9), 2, 4}, // gfx90a + {kCodeTrapHandlerV2_940, sizeof(kCodeTrapHandlerV2_940), 2, 4}, // gfx940 + {kCodeTrapHandlerV2_940, sizeof(kCodeTrapHandlerV2_940), 2, 4}, // gfx942 + {kCodeTrapHandlerV2_1010, sizeof(kCodeTrapHandlerV2_1010), 2, 4},// gfx1010 + {kCodeTrapHandlerV2_10, sizeof(kCodeTrapHandlerV2_10), 2, 4}, // gfx10 + {kCodeTrapHandlerV2_11, sizeof(kCodeTrapHandlerV2_11), 2, 4}, // gfx11 }}, {"CopyAligned", { - {kCodeCopyAligned7, sizeof(kCodeCopyAligned7), 32, 
12}, - {kCodeCopyAligned8, sizeof(kCodeCopyAligned8), 32, 12}, - {kCodeCopyAligned8, sizeof(kCodeCopyAligned8), 32, 12}, - {kCodeCopyAligned8, sizeof(kCodeCopyAligned8), 32, 12}, - {kCodeCopyAligned10, sizeof(kCodeCopyAligned10), 32, 12}, - {kCodeCopyAligned10, sizeof(kCodeCopyAligned10), 32, 12}, - {kCodeCopyAligned11, sizeof(kCodeCopyAligned11), 32, 12}, + {kCodeCopyAligned7, sizeof(kCodeCopyAligned7), 32, 12}, // gfx7 + {kCodeCopyAligned8, sizeof(kCodeCopyAligned8), 32, 12}, // gfx8 + {kCodeCopyAligned8, sizeof(kCodeCopyAligned8), 32, 12}, // gfx9 + {kCodeCopyAligned8, sizeof(kCodeCopyAligned8), 32, 12}, // gfx90a + {kCodeCopyAligned940, sizeof(kCodeCopyAligned940), 32, 12}, // gfx940 + {kCodeCopyAligned8, sizeof(kCodeCopyAligned8), 32, 12}, // gfx942 + {kCodeCopyAligned10, sizeof(kCodeCopyAligned10), 32, 12}, // gfx1010 + {kCodeCopyAligned10, sizeof(kCodeCopyAligned10), 32, 12}, // gfx10 + {kCodeCopyAligned11, sizeof(kCodeCopyAligned11), 32, 12}, // gfx11 }}, {"CopyMisaligned", { - {kCodeCopyMisaligned7, sizeof(kCodeCopyMisaligned7), 23, 10}, - {kCodeCopyMisaligned8, sizeof(kCodeCopyMisaligned8), 23, 10}, - {kCodeCopyMisaligned8, sizeof(kCodeCopyMisaligned8), 23, 10}, - {kCodeCopyMisaligned8, sizeof(kCodeCopyMisaligned8), 23, 10}, - {kCodeCopyMisaligned10, sizeof(kCodeCopyMisaligned10), 23, 10}, - {kCodeCopyMisaligned10, sizeof(kCodeCopyMisaligned10), 23, 10}, - {kCodeCopyMisaligned11, sizeof(kCodeCopyMisaligned11), 23, 10}, + {kCodeCopyMisaligned7, sizeof(kCodeCopyMisaligned7), 23, 10}, // gfx7 + {kCodeCopyMisaligned8, sizeof(kCodeCopyMisaligned8), 23, 10}, // gfx8 + {kCodeCopyMisaligned8, sizeof(kCodeCopyMisaligned8), 23, 10}, // gfx9 + {kCodeCopyMisaligned8, sizeof(kCodeCopyMisaligned8), 23, 10}, // gfx90a + {kCodeCopyMisaligned940, sizeof(kCodeCopyMisaligned940), 23, 10},// gfx940 + {kCodeCopyMisaligned8, sizeof(kCodeCopyMisaligned8), 23, 10}, // gfx942 + {kCodeCopyMisaligned10, sizeof(kCodeCopyMisaligned10), 23, 10}, // gfx1010 + {kCodeCopyMisaligned10, sizeof(kCodeCopyMisaligned10), 23, 10}, // gfx10 + {kCodeCopyMisaligned11, sizeof(kCodeCopyMisaligned11), 23, 10}, // gfx11 }}, {"Fill", { - {kCodeFill7, sizeof(kCodeFill7), 19, 8}, - {kCodeFill8, sizeof(kCodeFill8), 19, 8}, - {kCodeFill8, sizeof(kCodeFill8), 19, 8}, - {kCodeFill8, sizeof(kCodeFill8), 19, 8}, - {kCodeFill10, sizeof(kCodeFill10), 19, 8}, - {kCodeFill10, sizeof(kCodeFill10), 19, 8}, - {kCodeFill11, sizeof(kCodeFill11), 19, 8}, + {kCodeFill7, sizeof(kCodeFill7), 19, 8}, // gfx7 + {kCodeFill8, sizeof(kCodeFill8), 19, 8}, // gfx8 + {kCodeFill8, sizeof(kCodeFill8), 19, 8}, // gfx9 + {kCodeFill8, sizeof(kCodeFill8), 19, 8}, // gfx90a + {kCodeFill940, sizeof(kCodeFill940), 19, 8}, // gfx940 + {kCodeFill8, sizeof(kCodeFill8), 19, 8}, // gfx942 + {kCodeFill10, sizeof(kCodeFill10), 19, 8}, // gfx1010 + {kCodeFill10, sizeof(kCodeFill10), 19, 8}, // gfx10 + {kCodeFill11, sizeof(kCodeFill11), 19, 8}, // gfx11 }}}; auto compiled_shader_it = compiled_shaders.find(func_name); @@ -314,10 +339,22 @@ void GpuAgent::AssembleShader(const char* func_name, AssembleTarget assemble_tar asic_shader = &compiled_shader_it->second.compute_8; break; case 9: - if((isa_->GetMinorVersion() == 0) && (isa_->GetStepping() == 10)) + if((isa_->GetMinorVersion() == 0) && (isa_->GetStepping() == 10)) { asic_shader = &compiled_shader_it->second.compute_90a; - else + } else if(isa_->GetMinorVersion() == 4) { + switch(isa_->GetStepping()) { + case 0: + case 1: + asic_shader = &compiled_shader_it->second.compute_940; + break; + case 2: + default: + 
asic_shader = &compiled_shader_it->second.compute_942; + break; + } + } else { asic_shader = &compiled_shader_it->second.compute_9; + } break; case 10: if(isa_->GetMinorVersion() == 1) @@ -368,8 +405,10 @@ void GpuAgent::AssembleShader(const char* func_name, AssembleTarget assemble_tar AMD_HSA_BITS_SET(header->compute_pgm_rsrc2, AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_X, 1); - if ((isa_->GetMajorVersion() == 9) && (isa_->GetMinorVersion() == 0) && - (isa_->GetStepping() == 10)) { + // gfx90a, gfx940, gfx941, gfx942 + if ((isa_->GetMajorVersion() == 9) && + (((isa_->GetMinorVersion() == 0) && (isa_->GetStepping() == 10)) || + (isa_->GetMinorVersion() == 4))) { // Program COMPUTE_PGM_RSRC3.ACCUM_OFFSET for 0 ACC VGPRs on gfx90a. // FIXME: Assemble code objects from source at build time int gran_accvgprs = ((gran_vgprs + 1) * 8) / 4 - 1; @@ -411,15 +450,19 @@ void GpuAgent::InitRegionList() { memory_max_frequency_ = mem_props[mem_idx].MemoryClockMax; case HSA_HEAPTYPE_GPU_LDS: case HSA_HEAPTYPE_GPU_SCRATCH: { - MemoryRegion* region = new MemoryRegion(false, false, false, this, mem_props[mem_idx]); + MemoryRegion* region = + new MemoryRegion(false, false, false, false, this, mem_props[mem_idx]); regions_.push_back(region); if (region->IsLocalMemory()) { + regions_.push_back( + new MemoryRegion(false, false, false, true, this, mem_props[mem_idx])); // Expose VRAM as uncached/fine grain over PCIe (if enabled) or XGMI. if ((properties_.HiveID != 0) || (core::Runtime::runtime_singleton_->flag().fine_grain_pcie())) { - regions_.push_back(new MemoryRegion(true, false, false, this, mem_props[mem_idx])); + regions_.push_back( + new MemoryRegion(true, false, false, false, this, mem_props[mem_idx])); } } break; @@ -485,8 +528,11 @@ void GpuAgent::InitScratchPool() { void GpuAgent::ReserveScratch() { size_t reserved_sz = core::Runtime::runtime_singleton_->flag().scratch_single_limit(); + size_t available; + HSAKMT_STATUS err = hsaKmtAvailableMemory(node_id(), &available); + assert(err == HSAKMT_STATUS_SUCCESS && "hsaKmtAvailableMemory failed"); ScopedAcquire lock(&scratch_lock_); - if (!scratch_cache_.reserved_bytes() && reserved_sz) { + if (!scratch_cache_.reserved_bytes() && reserved_sz && available > 8 * reserved_sz) { HSAuint64 alt_va; void* reserved_base = scratch_pool_.alloc(reserved_sz); assert(reserved_base && "Could not allocate reserved memory"); @@ -679,6 +725,7 @@ void GpuAgent::InitDma() { // On gfx90a ensure that HostToDevice queue is created first and so is placed on SDMA0. if ((!use_xgmi) && (!isHostToDev) && (isa_->GetMajorVersion() == 9) && (isa_->GetMinorVersion() == 0) && (isa_->GetStepping() == 10)) { + GetBlitObject(BlitHostToDev); *blits_[BlitHostToDev]; } @@ -686,7 +733,12 @@ void GpuAgent::InitDma() { if (ret != nullptr) return ret; } - auto ret = CreateBlitKernel((*queue).get()); + // pending_copy_stat_check_ref_ will prevent unnecessary compute queue creation + // since there is no graceful way to handle lazy loading when the caller needs to know + // the status of available SDMA HW resources without a fallback. + // Call to isSDMA should be used as a proxy error check if !blit_copy_fallback. + auto ret = pending_copy_stat_check_ref_ ? 
new AMD::BlitKernel(NULL) : + CreateBlitKernel((*queue).get()); if (ret == nullptr) throw AMD::hsa_exception(HSA_STATUS_ERROR_OUT_OF_RESOURCES, "Blit creation failed."); return ret; @@ -774,11 +826,34 @@ hsa_status_t GpuAgent::DmaCopy(void* dst, const void* src, size_t size) { return blits_[BlitDevToDev]->SubmitLinearCopyCommand(dst, src, size); } +void GpuAgent::SetCopyRequestRefCount(bool set) { + ScopedAcquire lock(&blit_lock_); + while (pending_copy_stat_check_ref_) { + blit_lock_.Release(); + os::YieldThread(); + blit_lock_.Acquire(); + } + if (!set && pending_copy_req_ref_) pending_copy_req_ref_--; + else pending_copy_req_ref_++; +} + +void GpuAgent::SetCopyStatusCheckRefCount(bool set) { + ScopedAcquire lock(&blit_lock_); + while (pending_copy_req_ref_) { + blit_lock_.Release(); + os::YieldThread(); + blit_lock_.Acquire(); + } + if (!set && pending_copy_stat_check_ref_) pending_copy_stat_check_ref_--; + else pending_copy_stat_check_ref_++; +} + hsa_status_t GpuAgent::DmaCopy(void* dst, core::Agent& dst_agent, const void* src, core::Agent& src_agent, size_t size, std::vector& dep_signals, core::Signal& out_signal) { + SetCopyRequestRefCount(true); // Bind the Blit object that will drive this copy operation lazy_ptr& blit = GetBlitObject(dst_agent, src_agent, size); @@ -789,6 +864,7 @@ hsa_status_t GpuAgent::DmaCopy(void* dst, core::Agent& dst_agent, } hsa_status_t stat = blit->SubmitLinearCopyCommand(dst, src, size, dep_signals, out_signal); + SetCopyRequestRefCount(false); return stat; } @@ -812,28 +888,43 @@ hsa_status_t GpuAgent::DmaCopyOnEngine(void* dst, core::Agent& dst_agent, // check if dst and src are the same gpu or over xGMI. bool is_same_gpu = (src_agent.public_handle().handle == dst_agent.public_handle().handle) && - (dst_agent.public_handle().handle == public_handle_.handle); - bool is_xgmi = !is_same_gpu && - src_agent.device_type() == core::Agent::kAmdGpuDevice && - dst_agent.device_type() == core::Agent::kAmdGpuDevice && - dst_agent.HiveId() && src_agent.HiveId() == dst_agent.HiveId() && + (dst_agent.public_handle().handle == public_handle_.handle); + + bool is_p2p = !is_same_gpu && src_agent.device_type() == core::Agent::kAmdGpuDevice && + dst_agent.device_type() == core::Agent::kAmdGpuDevice; + + if ((is_p2p && + core::Runtime::runtime_singleton_->flag().enable_peer_sdma() == Flag::SDMA_DISABLE) || + core::Runtime::runtime_singleton_->flag().enable_sdma() == Flag::SDMA_DISABLE) { + // Note that VDI/HIP will call DmaCopy instead of DmaCopyOnEngine for P2P copies, but + // we still want to handle force Blit Kernels in this function in case other libraries + // decide to use DmaCopyOnEngine for P2P copies + + engine_offset = BlitDevToDev; + } else { + bool is_xgmi = is_p2p && dst_agent.HiveId() && src_agent.HiveId() == dst_agent.HiveId() && properties_.NumSdmaXgmiEngines; - // Due to a RAS issue, GFX90a can only support H2D copies on SDMA0 - bool is_h2d_blit = (src_agent.device_type() == core::Agent::kAmdCpuDevice && - dst_agent.device_type() == core::Agent::kAmdGpuDevice); - bool limit_h2d_blit = isa_->GetVersion() == core::Isa::Version(9, 0, 10); + // Due to a RAS issue, GFX90a can only support H2D copies on SDMA0 + bool is_h2d_blit = (src_agent.device_type() == core::Agent::kAmdCpuDevice && + dst_agent.device_type() == core::Agent::kAmdGpuDevice); + bool limit_h2d_blit = isa_->GetVersion() == core::Isa::Version(9, 0, 10); + + // Ensure engine selection is within proper range based on transfer type + if ((is_xgmi && engine_offset <= properties_.NumSdmaEngines) || + 
(!is_xgmi && engine_offset > (properties_.NumSdmaEngines + + properties_.NumSdmaXgmiEngines)) || + (!is_h2d_blit && !is_same_gpu && limit_h2d_blit && + engine_offset == BlitHostToDev)) { + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } - // Ensure engine selection is within proper range based on transfer type - if ((is_xgmi && engine_offset <= properties_.NumSdmaEngines) || - (!is_xgmi && engine_offset > properties_.NumSdmaEngines) || - (!is_h2d_blit && !is_same_gpu && limit_h2d_blit && engine_offset == BlitHostToDev)) { - return HSA_STATUS_ERROR_INVALID_ARGUMENT; + engine_offset = is_same_gpu ?(force_copy_on_sdma ? BlitDevToHost : + BlitDevToDev) : engine_offset; } - lazy_ptr& blit = is_same_gpu ? - (force_copy_on_sdma ? blits_[BlitDevToHost] : - blits_[BlitDevToDev]) : blits_[engine_offset]; + SetCopyRequestRefCount(true); + lazy_ptr& blit = GetBlitObject(engine_offset); if (profiling_enabled()) { // Track the agent so we could translate the resulting timestamp to system @@ -842,10 +933,20 @@ hsa_status_t GpuAgent::DmaCopyOnEngine(void* dst, core::Agent& dst_agent, } hsa_status_t stat = blit->SubmitLinearCopyCommand(dst, src, size, dep_signals, out_signal); + SetCopyRequestRefCount(false); return stat; } +bool GpuAgent::DmaEngineIsFree(uint32_t engine_offset) { + SetCopyStatusCheckRefCount(true); + bool is_free = !!!(sdma_blit_used_mask_ & (1 << engine_offset)) || + (blits_[engine_offset]->isSDMA() && + !!!blits_[engine_offset]->PendingBytes()); + SetCopyStatusCheckRefCount(false); + return is_free; +} + hsa_status_t GpuAgent::DmaCopyStatus(core::Agent& dst_agent, core::Agent& src_agent, uint32_t *engine_ids_mask) { assert(((src_agent.device_type() == core::Agent::kAmdGpuDevice) || @@ -857,10 +958,10 @@ hsa_status_t GpuAgent::DmaCopyStatus(core::Agent& dst_agent, core::Agent& src_ag dst_agent.device_type() == core::Agent::kAmdGpuDevice && dst_agent.HiveId() && src_agent.HiveId() == dst_agent.HiveId() && properties_.NumSdmaXgmiEngines) { - // Find a free xGMI SDMA engine + //Find a free xGMI SDMA engine for (int i = 0; i < properties_.NumSdmaXgmiEngines; i++) { - if (!!!blits_[DefaultBlitCount + i]->PendingBytes()) { - *engine_ids_mask |= (HSA_AMD_SDMA_ENGINE_2 << i); + if (DmaEngineIsFree(DefaultBlitCount + i)) { + *engine_ids_mask |= (HSA_AMD_SDMA_ENGINE_2 << i); } } } else { @@ -869,14 +970,24 @@ hsa_status_t GpuAgent::DmaCopyStatus(core::Agent& dst_agent, core::Agent& src_ag // Due to a RAS issue, GFX90a can only support H2D copies on SDMA0 bool limit_h2d_blit = isa_->GetVersion() == core::Isa::Version(9, 0, 10); - if (!!!blits_[BlitHostToDev]->PendingBytes()) { + // Check if H2D is free + if (DmaEngineIsFree(BlitHostToDev)) { if (is_h2d_blit || !limit_h2d_blit) { *engine_ids_mask |= HSA_AMD_SDMA_ENGINE_0; } } - if (!!!blits_[BlitDevToHost]->PendingBytes()) { - *engine_ids_mask |= HSA_AMD_SDMA_ENGINE_1; + // Check is D2H is free + if (DmaEngineIsFree(BlitDevToHost)) { + *engine_ids_mask |= properties_.NumSdmaEngines > 1 ? + HSA_AMD_SDMA_ENGINE_1 : + HSA_AMD_SDMA_ENGINE_0; + } + // Find a free xGMI SDMA engine for H2D/D2H though it may be lower bandwidth + for (int i = 0; i < properties_.NumSdmaXgmiEngines; i++) { + if (DmaEngineIsFree(DefaultBlitCount + i)) { + *engine_ids_mask |= (HSA_AMD_SDMA_ENGINE_2 << i); + } } } @@ -890,10 +1001,14 @@ hsa_status_t GpuAgent::DmaCopyRect(const hsa_pitched_ptr_t* dst, const hsa_dim3_ core::Signal& out_signal) { if (isa_->GetMajorVersion() < 9) return HSA_STATUS_ERROR_INVALID_AGENT; - lazy_ptr& blit = - (dir == hsaHostToDevice) ? 
blits_[BlitHostToDev] : blits_[BlitDevToHost]; + SetCopyRequestRefCount(true); + lazy_ptr& blit = GetBlitObject((dir == hsaHostToDevice) ? BlitHostToDev : + BlitDevToHost); - if (!blit->isSDMA()) return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + if (!blit->isSDMA()) { + SetCopyRequestRefCount(false); + return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + } if (profiling_enabled()) { // Track the agent so we could translate the resulting timestamp to system @@ -904,6 +1019,7 @@ hsa_status_t GpuAgent::DmaCopyRect(const hsa_pitched_ptr_t* dst, const hsa_dim3_ BlitSdmaBase* sdmaBlit = static_cast((*blit).get()); hsa_status_t stat = sdmaBlit->SubmitCopyRectCommand(dst, dst_offset, src, src_offset, range, dep_signals, out_signal); + SetCopyRequestRefCount(false); return stat; } @@ -922,6 +1038,12 @@ hsa_status_t GpuAgent::EnableDmaProfiling(bool enable) { } } + // If we did not update t1 since agent initialization, force a SyncClock. Otherwise computing + // the SystemClockCounter to GPUClockCounter ratio in TranslateTime(tick) results to a division + // by 0. We perform the check here because we do not want to check everytime there is a call to + // TranslateTime(tick) + if (enable && t0_.GPUClockCounter == t1_.GPUClockCounter) SyncClocks(); + return HSA_STATUS_SUCCESS; } @@ -931,6 +1053,9 @@ hsa_status_t GpuAgent::GetInfo(hsa_agent_info_t attribute, void* value) const { // agent, and vendor name length limit excluding terminating nul character. constexpr size_t hsa_name_size = 63; + const bool isa_has_image_support = + (isa_->GetMajorVersion() == 9 && isa_->GetMinorVersion() == 4) ? false : true; + switch (attribute_u) { case HSA_AGENT_INFO_NAME: { std::string name = isa_->GetProcessorName(); @@ -1066,18 +1191,21 @@ hsa_status_t GpuAgent::GetInfo(hsa_agent_info_t attribute, void* value) const { case HSA_EXT_AGENT_INFO_IMAGE_2DADEPTH_MAX_ELEMENTS: case HSA_EXT_AGENT_INFO_IMAGE_3D_MAX_ELEMENTS: case HSA_EXT_AGENT_INFO_IMAGE_ARRAY_MAX_LAYERS: - return hsa_amd_image_get_info_max_dim(public_handle(), attribute, value); + if (!isa_has_image_support) + *((uint32_t*)value) = 0; + else + return hsa_amd_image_get_info_max_dim(public_handle(), attribute, value); + break; case HSA_EXT_AGENT_INFO_MAX_IMAGE_RD_HANDLES: // TODO: hardcode based on OCL constants. - *((uint32_t*)value) = 128; + *((uint32_t*)value) = isa_has_image_support ? 128 : 0; break; case HSA_EXT_AGENT_INFO_MAX_IMAGE_RORW_HANDLES: - // TODO: hardcode based on OCL constants. - *((uint32_t*)value) = 64; + *((uint32_t*)value) = isa_has_image_support ? 64 : 0; break; case HSA_EXT_AGENT_INFO_MAX_SAMPLER_HANDLERS: - // TODO: hardcode based on OCL constants. - *((uint32_t*)value) = 16; + *((uint32_t*)value) = isa_has_image_support ? 
16 : 0; + break; case HSA_AMD_AGENT_INFO_CHIP_ID: *((uint32_t*)value) = properties_.DeviceId; break; @@ -1231,6 +1359,12 @@ hsa_status_t GpuAgent::GetInfo(hsa_agent_info_t attribute, void* value) const { else *((hsa_amd_iommu_version_t*)value) = HSA_IOMMU_SUPPORT_NONE; break; + case HSA_AMD_AGENT_INFO_NUM_XCC: + *((uint32_t*)value) = static_cast(properties_.NumXcc); + break; + case HSA_AMD_AGENT_INFO_DRIVER_UID: + *((uint32_t*)value) = KfdGpuID(); + break; default: return HSA_STATUS_ERROR_INVALID_ARGUMENT; break; @@ -1454,8 +1588,11 @@ void GpuAgent::AcquireQueueScratch(ScratchInfo& scratch) { debug_print("Failed to map requested scratch (%ld) - reducing queue occupancy.\n", scratch.size); const uint64_t num_cus = properties_.NumFComputeCores / properties_.NumSIMDPerCU; + const uint64_t se_per_xcc = properties_.NumShaderBanks / properties_.NumXcc; + const uint64_t total_waves = scratch.size / size_per_wave; - uint64_t waves_per_cu = total_waves / num_cus; + uint64_t waves_per_cu = AlignUp(total_waves / num_cus, scratch.waves_per_group); + while (waves_per_cu != 0) { size_t size = waves_per_cu * num_cus * size_per_wave; void* base = scratch_pool_.alloc_high(size); @@ -1475,7 +1612,14 @@ void GpuAgent::AcquireQueueScratch(ScratchInfo& scratch) { return; } scratch_pool_.free(base); - waves_per_cu = waves_per_cu - scratch.waves_per_group; + + // Wave count must be divisible by #SEs in an XCC. If occupancy must be reduced + // such that waves_per_cu < waves_per_group, continue reducing by #SEs per XCC + // (only allowed if waves_per_group is a multiple #SEs per XCC). + waves_per_cu -= (waves_per_cu <= scratch.waves_per_group && + se_per_xcc < scratch.waves_per_group && + scratch.waves_per_group % se_per_xcc == 0) ? + se_per_xcc : scratch.waves_per_group; } // Failed to allocate minimal scratch @@ -1658,6 +1802,12 @@ void GpuAgent::BindTrapHandler() { AssembleShader("TrapHandlerKfdExceptions", AssembleTarget::ISA, trap_code_buf_, trap_code_buf_size_); } else { + if (isa_->GetMajorVersion() >= 11 || + (isa_->GetMajorVersion() == 9 && isa_->GetMinorVersion() == 4)) { + // No trap handler support without exception handling, soft error. + return; + } + AssembleShader("TrapHandler", AssembleTarget::ISA, trap_code_buf_, trap_code_buf_size_); // Make an empty map from doorbell index to queue. 
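Usage note for the new GetInfo cases added earlier in this file: both attributes are plain uint32_t queries on the caller side. A minimal sketch (hypothetical agent handle, error handling omitted):

uint32_t num_xcc = 0, driver_uid = 0;
hsa_agent_get_info(agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_NUM_XCC, &num_xcc);
hsa_agent_get_info(agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_DRIVER_UID, &driver_uid);
// GPU agents report their XCC count and KFD gpu_id; CPU agents report 0 for both.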
@@ -1728,6 +1878,11 @@ void GpuAgent::InvalidateCodeCaches() { queues_[QueueUtility]->ExecutePM4(cache_inv, cache_inv_size_dw * sizeof(uint32_t)); } +lazy_ptr& GpuAgent::GetBlitObject(uint32_t engine_offset) { + sdma_blit_used_mask_ |= 1 << engine_offset; + return blits_[engine_offset]; +} + lazy_ptr& GpuAgent::GetXgmiBlit(const core::Agent& dst_agent) { // Determine if destination is a member xgmi peers list uint32_t xgmi_engine_cnt = properties_.NumSdmaXgmiEngines; @@ -1739,25 +1894,21 @@ lazy_ptr& GpuAgent::GetXgmiBlit(const core::Agent& dst_agent) { uint64_t dst_handle = dst_agent.public_handle().handle; uint64_t peer_handle = xgmi_peer_list_[idx]->public_handle().handle; if (peer_handle == dst_handle) { - return blits_[(idx % xgmi_engine_cnt) + DefaultBlitCount]; + return GetBlitObject((idx % xgmi_engine_cnt) + DefaultBlitCount); } } // Add agent to the xGMI neighbours list xgmi_peer_list_.push_back(&dst_agent); - return blits_[((xgmi_peer_list_.size() - 1) % xgmi_engine_cnt) + DefaultBlitCount]; + return GetBlitObject(((xgmi_peer_list_.size() - 1) % xgmi_engine_cnt) + DefaultBlitCount); } lazy_ptr& GpuAgent::GetPcieBlit(const core::Agent& dst_agent, const core::Agent& src_agent) { - lazy_ptr& blit = - (src_agent.device_type() == core::Agent::kAmdCpuDevice && - dst_agent.device_type() == core::Agent::kAmdGpuDevice) - ? blits_[BlitHostToDev] // CPU->GPU transfer. - : (src_agent.device_type() == core::Agent::kAmdGpuDevice && - dst_agent.device_type() == core::Agent::kAmdCpuDevice) - ? blits_[BlitDevToHost] // GPU->CPU transfer. - : blits_[BlitDevToHost]; // GPU->GPU transfer. + bool is_h2d = (src_agent.device_type() == core::Agent::kAmdCpuDevice && + dst_agent.device_type() == core::Agent::kAmdGpuDevice); + + lazy_ptr& blit = GetBlitObject(is_h2d ? BlitHostToDev : BlitDevToHost); return blit; } @@ -1776,11 +1927,17 @@ lazy_ptr& GpuAgent::GetBlitObject(const core::Agent& dst_agent, // If the copy is very small then cache flush overheads can dominate. // Choose a (potentially) SDMA enabled engine to avoid cache flushing. if (size < core::Runtime::runtime_singleton_->flag().force_sdma_size()) { - return blits_[BlitDevToHost]; + return GetBlitObject(BlitDevToHost); } return blits_[BlitDevToDev]; } + if (core::Runtime::runtime_singleton_->flag().enable_peer_sdma() == Flag::SDMA_DISABLE + && src_agent.device_type() == core::Agent::kAmdGpuDevice + && dst_agent.device_type() == core::Agent::kAmdGpuDevice) { + return blits_[BlitDevToDev]; + } + // Acquire Hive Id of Src and Dst devices - ignore hive id for CPU devices. // CPU-GPU connections should always use the host (aka pcie) facing SDMA engines, even if the // connection is XGMI. 
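The DmaCopyStatus / DmaCopyOnEngine changes above feed the hsa_amd_memory_copy_engine_status and hsa_amd_memory_async_copy_on_engine entry points. A rough caller-side sketch of how the returned engine mask is meant to be consumed (assumes those public APIs as declared in hsa_ext_amd.h; dst, src, size, the agents, and completion_signal are placeholders set up elsewhere):

uint32_t engine_mask = 0;
if (hsa_amd_memory_copy_engine_status(dst_agent, src_agent, &engine_mask) ==
        HSA_STATUS_SUCCESS &&
    (engine_mask & HSA_AMD_SDMA_ENGINE_2)) {
  // Engine 2 reported free: pin this transfer to it instead of the default H2D/D2H queues.
  hsa_amd_memory_async_copy_on_engine(dst, dst_agent, src, src_agent, size,
                                      0, NULL, completion_signal,
                                      HSA_AMD_SDMA_ENGINE_2,
                                      /*force_copy_on_sdma=*/true);
}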
diff --git a/src/core/runtime/amd_memory_region.cpp b/src/core/runtime/amd_memory_region.cpp index b6f91efa7..2218dfc9f 100644 --- a/src/core/runtime/amd_memory_region.cpp +++ b/src/core/runtime/amd_memory_region.cpp @@ -100,20 +100,28 @@ void MemoryRegion::MakeKfdMemoryUnresident(const void* ptr) { hsaKmtUnmapMemoryToGPU(const_cast(ptr)); } -MemoryRegion::MemoryRegion(bool fine_grain, bool kernarg, bool full_profile, core::Agent* owner, +MemoryRegion::MemoryRegion(bool fine_grain, bool kernarg, bool full_profile, + bool extended_scope_fine_grain, core::Agent* owner, const HsaMemoryProperties& mem_props) : core::MemoryRegion(fine_grain, kernarg, full_profile, owner), mem_props_(mem_props), + extended_scope_fine_grain_(extended_scope_fine_grain), max_single_alloc_size_(0), virtual_size_(0), fragment_allocator_(BlockAllocator(*this)) { virtual_size_ = GetPhysicalSize(); + // extended_scope_fine_grain and fine_grain memory regions are mutually exclusive + assert(!(fine_grain && extended_scope_fine_grain)); + mem_flag_.Value = 0; map_flag_.Value = 0; - static const HSAuint64 kGpuVmSize = (1ULL << 40); + // Bind the memory region based on whether it is + // coarse or fine grain or extended scope fine grain. + mem_flag_.ui32.CoarseGrain = (fine_grain || extended_scope_fine_grain) ? 0 : 1; + if (IsLocalMemory()) { mem_flag_.ui32.PageSize = HSA_PAGE_SIZE_4KB; mem_flag_.ui32.NoSubstitute = 1; @@ -122,6 +130,20 @@ MemoryRegion::MemoryRegion(bool fine_grain, bool kernarg, bool full_profile, cor mem_flag_.ui32.NonPaged = 1; virtual_size_ = kGpuVmSize; + + // If memory region is extended scope fine grained + // mark the page table entries for this memory region + // as MTYPE_UC. Full read and write ordering are guaranteed + // to this address. + if (extended_scope_fine_grain) { + AMD::GpuAgent* agent_ = + const_cast(reinterpret_cast(owner)); + if (agent_->isa()->GetVersion() == core::Isa::Version(9, 4, 0) || + agent_->isa()->GetVersion() == core::Isa::Version(9, 4, 1) || + agent_->isa()->GetVersion() == core::Isa::Version(9, 4, 2)) + mem_flag_.ui32.Uncached = 1; + } + } else if (IsSystem()) { mem_flag_.ui32.PageSize = HSA_PAGE_SIZE_4KB; mem_flag_.ui32.NoSubstitute = 0; @@ -134,8 +156,6 @@ MemoryRegion::MemoryRegion(bool fine_grain, bool kernarg, bool full_profile, cor (full_profile) ? os::GetUserModeVirtualMemorySize() : kGpuVmSize; } - // Bind if memory region is coarse or fine grain - mem_flag_.ui32.CoarseGrain = (fine_grain) ? 
0 : 1; // Adjust allocatable size per page align max_single_alloc_size_ = AlignDown(static_cast(GetPhysicalSize()), kPageSize_); @@ -199,6 +219,13 @@ hsa_status_t MemoryRegion::AllocateImpl(size_t& size, AllocateFlags alloc_flags, useSubAlloc &= ((alloc_flags & (~AllocateRestrict)) == 0); if (useSubAlloc) { *address = fragment_allocator_.alloc(size); + + if ((alloc_flags & AllocateAsan) && + hsaKmtReplaceAsanHeaderPage(*address) != HSAKMT_STATUS_SUCCESS) { + fragment_allocator_.free(*address); + *address = NULL; + return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + } return HSA_STATUS_SUCCESS; } } @@ -253,6 +280,12 @@ hsa_status_t MemoryRegion::AllocateImpl(size_t& size, AllocateFlags alloc_flags, return HSA_STATUS_ERROR_OUT_OF_RESOURCES; } + if ((alloc_flags & AllocateAsan) && + hsaKmtReplaceAsanHeaderPage(*address) != HSAKMT_STATUS_SUCCESS) { + FreeKfdMemory(*address, size); + *address = NULL; + return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + } return HSA_STATUS_SUCCESS; } @@ -304,8 +337,12 @@ hsa_status_t MemoryRegion::GetInfo(hsa_region_info_t attribute, case HSA_HEAPTYPE_SYSTEM: case HSA_HEAPTYPE_FRAME_BUFFER_PUBLIC: case HSA_HEAPTYPE_FRAME_BUFFER_PRIVATE: { - uint32_t ret = fine_grain() ? HSA_REGION_GLOBAL_FLAG_FINE_GRAINED - : HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED; + uint32_t ret = 0; + + ret = fine_grain() ? HSA_REGION_GLOBAL_FLAG_FINE_GRAINED + : extended_scope_fine_grain() ? HSA_REGION_GLOBAL_FLAG_EXTENDED_SCOPE_FINE_GRAINED + : HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED; + if (kernarg()) ret |= HSA_REGION_GLOBAL_FLAG_KERNARG; *((uint32_t*)value) = ret; break; @@ -458,21 +495,16 @@ hsa_amd_memory_pool_access_t MemoryRegion::GetAccessInfo( // Determine access type for device local memory which is // guaranteed to be HSA_HEAPTYPE_FRAME_BUFFER_PUBLIC - // Return disallowed by default if framebuffer is coarse grained - // without regard to type of requesting device (CPU / GPU) - // Return disallowed by default if framebuffer is fine grained - // and requesting device is connected via xGMI link if (IsLocalMemory()) { - // Return disallowed by default if memory is coarse - // grained without regard to link type - if (fine_grain() == false) { + // grained or extended scope fine grained without regard to link type + if (fine_grain() == false) { return HSA_AMD_MEMORY_POOL_ACCESS_DISALLOWED_BY_DEFAULT; } // Return disallowed by default if memory is fine - // grained and link type is xGMI. 
+ // grained and requesting device is connected via xGMI link if (agent.HiveId() == owner()->HiveId()) { return HSA_AMD_MEMORY_POOL_ACCESS_DISALLOWED_BY_DEFAULT; } diff --git a/src/core/runtime/default_signal.cpp b/src/core/runtime/default_signal.cpp index bd2f7cf1f..820fc75ca 100644 --- a/src/core/runtime/default_signal.cpp +++ b/src/core/runtime/default_signal.cpp @@ -42,9 +42,11 @@ #include "core/inc/default_signal.h" #include "core/util/timer.h" -#include +#if defined(__i386__) || defined(__x86_64__) +#include #define MWAITX_ECX_TIMER_ENABLE 0x2 // BIT(1) +#endif namespace rocr { namespace core { @@ -103,7 +105,9 @@ hsa_signal_value_t BusyWaitSignal::WaitRelaxed(hsa_signal_condition_t condition, timer::duration_from_seconds( double(timeout) / double(hsa_freq)); +#if defined(__i386__) || defined(__x86_64__) if (g_use_mwaitx) _mm_monitorx(const_cast(&signal_.value), 0, 0); +#endif while (true) { if (!IsValid()) return 0; @@ -138,11 +142,13 @@ hsa_signal_value_t BusyWaitSignal::WaitRelaxed(hsa_signal_condition_t condition, return hsa_signal_value_t(value); } - if (time - start_time > kMaxElapsed) + if (time - start_time > kMaxElapsed) { os::uSleep(20); - else if (g_use_mwaitx) { +#if defined(__i386__) || defined(__x86_64__) + } else if (g_use_mwaitx) { _mm_mwaitx(0, 60000, MWAITX_ECX_TIMER_ENABLE); // 60000 ~20us on a 1.5Ghz CPU _mm_monitorx(const_cast(&signal_.value), 0, 0); +#endif } } } diff --git a/src/core/runtime/hsa.cpp b/src/core/runtime/hsa.cpp index bcc141457..c509fda5e 100644 --- a/src/core/runtime/hsa.cpp +++ b/src/core/runtime/hsa.cpp @@ -1084,6 +1084,8 @@ hsa_status_t TRY; IS_OPEN(); + core::MemoryRegion::AllocateFlags alloc_flag = core::MemoryRegion::AllocateNoFlags; + if (size == 0 || ptr == NULL) { return HSA_STATUS_ERROR_INVALID_ARGUMENT; } @@ -1091,8 +1093,7 @@ hsa_status_t const core::MemoryRegion* mem_region = core::MemoryRegion::Convert(region); IS_VALID(mem_region); - return core::Runtime::runtime_singleton_->AllocateMemory( - mem_region, size, core::MemoryRegion::AllocateNoFlags, ptr); + return core::Runtime::runtime_singleton_->AllocateMemory(mem_region, size, alloc_flag, ptr); CATCH; } diff --git a/src/core/runtime/hsa_ext_amd.cpp b/src/core/runtime/hsa_ext_amd.cpp index a4feac989..09c5976e4 100644 --- a/src/core/runtime/hsa_ext_amd.cpp +++ b/src/core/runtime/hsa_ext_amd.cpp @@ -761,6 +761,10 @@ hsa_status_t hsa_amd_memory_pool_allocate(hsa_amd_memory_pool_t memory_pool, siz if (flags == HSA_AMD_MEMORY_POOL_PCIE_FLAG) alloc_flag |= core::MemoryRegion::AllocatePCIeRW; +#ifdef SANITIZER_AMDGPU + alloc_flag |= core::MemoryRegion::AllocateAsan; +#endif + return core::Runtime::runtime_singleton_->AllocateMemory(mem_region, size, alloc_flag, ptr); CATCH; } diff --git a/src/core/runtime/interrupt_signal.cpp b/src/core/runtime/interrupt_signal.cpp index 773bbffa1..9d7691aa2 100644 --- a/src/core/runtime/interrupt_signal.cpp +++ b/src/core/runtime/interrupt_signal.cpp @@ -44,9 +44,11 @@ #include "core/inc/runtime.h" #include "core/util/timer.h" #include "core/util/locks.h" -#include +#if defined(__i386__) || defined(__x86_64__) +#include #define MWAITX_ECX_TIMER_ENABLE 0x2 // BIT(1) +#endif namespace rocr { namespace core { @@ -147,8 +149,15 @@ hsa_signal_value_t InterruptSignal::WaitRelaxed( uint32_t prior = waiting_++; MAKE_SCOPE_GUARD([&]() { waiting_--; }); - // Allow only the first waiter to sleep (temporary, known to be bad). 
- if (prior != 0) wait_hint = HSA_WAIT_STATE_ACTIVE; + + uint64_t event_age = 1; + + if (!core::Runtime::runtime_singleton_->KfdVersion().supports_event_age) { + event_age = 0; + // Allow only the first waiter to sleep. Without event age tracking, + // race condition can cause some threads to sleep without wakeup since missing interrupt. + if (prior != 0) wait_hint = HSA_WAIT_STATE_ACTIVE; + } int64_t value; @@ -165,7 +174,10 @@ hsa_signal_value_t InterruptSignal::WaitRelaxed( double(timeout) / double(hsa_freq)); bool condition_met = false; + +#if defined(__i386__) || defined(__x86_64__) if (g_use_mwaitx) _mm_monitorx(const_cast(&signal_.value), 0, 0); +#endif while (true) { if (!IsValid()) return 0; @@ -201,19 +213,23 @@ hsa_signal_value_t InterruptSignal::WaitRelaxed( } if (wait_hint == HSA_WAIT_STATE_ACTIVE) { +#if defined(__i386__) || defined(__x86_64__) if (g_use_mwaitx) { _mm_mwaitx(0, 0, 0); _mm_monitorx(const_cast(&signal_.value), 0, 0); } +#endif continue; } if (time - start_time < kMaxElapsed) { // os::uSleep(20); +#if defined(__i386__) || defined(__x86_64__) if (g_use_mwaitx) { _mm_mwaitx(0, 60000, MWAITX_ECX_TIMER_ENABLE); _mm_monitorx(const_cast(&signal_.value), 0, 0); } +#endif continue; } @@ -222,7 +238,7 @@ hsa_signal_value_t InterruptSignal::WaitRelaxed( uint64_t ct=timer::duration_cast( time_remaining).count(); wait_ms = (ct>0xFFFFFFFEu) ? 0xFFFFFFFEu : ct; - hsaKmtWaitOnEvent(event_, wait_ms); + hsaKmtWaitOnEvent_Ext(event_, wait_ms, &event_age); } } diff --git a/src/core/runtime/isa.cpp b/src/core/runtime/isa.cpp index 7d3cf906e..3674848a7 100755 --- a/src/core/runtime/isa.cpp +++ b/src/core/runtime/isa.cpp @@ -297,6 +297,33 @@ constexpr size_t hsa_name_size = 63; ISAREG_ENTRY_GEN("gfx90c", 9, 0, 12, unsupported, any, 64) ISAREG_ENTRY_GEN("gfx90c:xnack-", 9, 0, 12, unsupported, disabled, 64) ISAREG_ENTRY_GEN("gfx90c:xnack+", 9, 0, 12, unsupported, enabled, 64) + ISAREG_ENTRY_GEN("gfx940", 9, 4, 0, any, any, 64) + ISAREG_ENTRY_GEN("gfx940:xnack-", 9, 4, 0, any, disabled, 64) + ISAREG_ENTRY_GEN("gfx940:xnack+", 9, 4, 0, any, enabled, 64) + ISAREG_ENTRY_GEN("gfx940:sramecc-", 9, 4, 0, disabled, any, 64) + ISAREG_ENTRY_GEN("gfx940:sramecc+", 9, 4, 0, enabled, any, 64) + ISAREG_ENTRY_GEN("gfx940:sramecc-:xnack-", 9, 4, 0, disabled, disabled, 64) + ISAREG_ENTRY_GEN("gfx940:sramecc-:xnack+", 9, 4, 0, disabled, enabled, 64) + ISAREG_ENTRY_GEN("gfx940:sramecc+:xnack-", 9, 4, 0, enabled, disabled, 64) + ISAREG_ENTRY_GEN("gfx940:sramecc+:xnack+", 9, 4, 0, enabled, enabled, 64) + ISAREG_ENTRY_GEN("gfx941", 9, 4, 1, any, any, 64) + ISAREG_ENTRY_GEN("gfx941:xnack-", 9, 4, 1, any, disabled, 64) + ISAREG_ENTRY_GEN("gfx941:xnack+", 9, 4, 1, any, enabled, 64) + ISAREG_ENTRY_GEN("gfx941:sramecc-", 9, 4, 1, disabled, any, 64) + ISAREG_ENTRY_GEN("gfx941:sramecc+", 9, 4, 1, enabled, any, 64) + ISAREG_ENTRY_GEN("gfx941:sramecc-:xnack-", 9, 4, 1, disabled, disabled, 64) + ISAREG_ENTRY_GEN("gfx941:sramecc-:xnack+", 9, 4, 1, disabled, enabled, 64) + ISAREG_ENTRY_GEN("gfx941:sramecc+:xnack-", 9, 4, 1, enabled, disabled, 64) + ISAREG_ENTRY_GEN("gfx941:sramecc+:xnack+", 9, 4, 1, enabled, enabled, 64) + ISAREG_ENTRY_GEN("gfx942", 9, 4, 2, any, any, 64) + ISAREG_ENTRY_GEN("gfx942:xnack-", 9, 4, 2, any, disabled, 64) + ISAREG_ENTRY_GEN("gfx942:xnack+", 9, 4, 2, any, enabled, 64) + ISAREG_ENTRY_GEN("gfx942:sramecc-", 9, 4, 2, disabled, any, 64) + ISAREG_ENTRY_GEN("gfx942:sramecc+", 9, 4, 2, enabled, any, 64) + ISAREG_ENTRY_GEN("gfx942:sramecc-:xnack-", 9, 4, 2, disabled, disabled, 64) + 
ISAREG_ENTRY_GEN("gfx942:sramecc-:xnack+", 9, 4, 2, disabled, enabled, 64) + ISAREG_ENTRY_GEN("gfx942:sramecc+:xnack-", 9, 4, 2, enabled, disabled, 64) + ISAREG_ENTRY_GEN("gfx942:sramecc+:xnack+", 9, 4, 2, enabled, enabled, 64) ISAREG_ENTRY_GEN("gfx1010", 10, 1, 0, unsupported, any, 32) ISAREG_ENTRY_GEN("gfx1010:xnack-", 10, 1, 0, unsupported, disabled, 32) ISAREG_ENTRY_GEN("gfx1010:xnack+", 10, 1, 0, unsupported, enabled, 32) diff --git a/src/core/runtime/runtime.cpp b/src/core/runtime/runtime.cpp index 74d061749..9647c3690 100644 --- a/src/core/runtime/runtime.cpp +++ b/src/core/runtime/runtime.cpp @@ -291,7 +291,7 @@ hsa_status_t Runtime::AllocateMemory(const MemoryRegion* region, size_t size, // Track the allocation result so that it could be freed properly. if (status == HSA_STATUS_SUCCESS) { ScopedAcquire lock(&memory_lock_); - allocation_map_[*address] = AllocationRegion(region, size, size_requested); + allocation_map_[*address] = AllocationRegion(region, size, size_requested, alloc_flags); } return status; @@ -305,6 +305,7 @@ hsa_status_t Runtime::FreeMemory(void* ptr) { const MemoryRegion* region = nullptr; size_t size = 0; std::unique_ptr> notifiers; + MemoryRegion::AllocateFlags alloc_flags = core::MemoryRegion::AllocateNoFlags; { ScopedAcquire lock(&memory_lock_); @@ -317,6 +318,7 @@ hsa_status_t Runtime::FreeMemory(void* ptr) { } region = it->second.region; size = it->second.size; + alloc_flags = it->second.alloc_flags; // Imported fragments can't be released with FreeMemory. if (region == nullptr) { @@ -338,6 +340,9 @@ hsa_status_t Runtime::FreeMemory(void* ptr) { } } + if (alloc_flags & core::MemoryRegion::AllocateAsan) + assert(hsaKmtReturnAsanHeaderPage(ptr) == HSAKMT_STATUS_SUCCESS); + return region->Free(ptr, size); } @@ -690,8 +695,8 @@ hsa_status_t Runtime::GetSystemInfo(hsa_system_info_t attribute, void* value) { // Implemented in KFD in 1.12 if (kfd_version.KernelInterfaceMajorVersion > 1 || - kfd_version.KernelInterfaceMajorVersion == 1 && - kfd_version.KernelInterfaceMinorVersion >= 12) + (kfd_version.KernelInterfaceMajorVersion == 1 && + kfd_version.KernelInterfaceMinorVersion >= 12)) *(reinterpret_cast(value)) = true; else *(reinterpret_cast(value)) = false; @@ -1025,7 +1030,8 @@ hsa_status_t Runtime::IPCAttach(const hsa_amd_ipc_memory_t* handle, size_t len, len = Min(len, importSize - fragOffset); } ScopedAcquire lock(&memory_lock_); - allocation_map_[importAddress] = AllocationRegion(nullptr, len, len); + allocation_map_[importAddress] = + AllocationRegion(nullptr, len, len, core::MemoryRegion::AllocateNoFlags); }; if ((importHandle.handle[6] & 0x80000000) != 0) { @@ -1415,7 +1421,11 @@ hsa_status_t Runtime::Load() { // Assume features are not supported if parse CPUID fails if (!os::ParseCpuID(&cpuinfo)) { - fprintf(stderr, "Failed to parse CPUID\n"); + /* + * This is not a failure, in some environments such as SRIOV, not all CPUID info is + * exposed inside the guest + */ + debug_warning("Parsing CPUID failed."); } flag_.Refresh(); diff --git a/src/core/runtime/signal.cpp b/src/core/runtime/signal.cpp index 486a2a305..3d554a03b 100644 --- a/src/core/runtime/signal.cpp +++ b/src/core/runtime/signal.cpp @@ -197,8 +197,10 @@ uint32_t Signal::WaitAny(uint32_t signal_count, const hsa_signal_t* hsa_signals, for (uint32_t i = 0; i < signal_count; i++) signals[i]->waiting_--; }); - // Allow only the first waiter to sleep (temporary, known to be bad). 
- if (prior != 0) wait_hint = HSA_WAIT_STATE_ACTIVE; + if (!core::Runtime::runtime_singleton_->KfdVersion().supports_event_age) + // Allow only the first waiter to sleep. Without event age tracking, + // race condition can cause some threads to sleep without wakeup since missing interrupt. + if (prior != 0) wait_hint = HSA_WAIT_STATE_ACTIVE; // Ensure that all signals in the list can be slept on. if (wait_hint != HSA_WAIT_STATE_ACTIVE) { @@ -229,6 +231,12 @@ uint32_t Signal::WaitAny(uint32_t signal_count, const hsa_signal_t* hsa_signals, if (signal_count > small_size) delete[] evts; }); + uint64_t event_age[unique_evts]; + memset(event_age, 0, unique_evts * sizeof(uint64_t)); + if (core::Runtime::runtime_singleton_->KfdVersion().supports_event_age) + for (uint32_t i = 0; i < unique_evts; i++) + event_age[i] = 1; + int64_t value; timer::fast_clock::time_point start_time = timer::fast_clock::now(); @@ -310,7 +318,7 @@ uint32_t Signal::WaitAny(uint32_t signal_count, const hsa_signal_t* hsa_signals, uint64_t ct=timer::duration_cast( time_remaining).count(); wait_ms = (ct>0xFFFFFFFEu) ? 0xFFFFFFFEu : ct; - hsaKmtWaitOnMultipleEvents(evts, unique_evts, false, wait_ms); + hsaKmtWaitOnMultipleEvents_Ext(evts, unique_evts, false, wait_ms, event_age); } } diff --git a/src/core/runtime/trap_handler/CMakeLists.txt b/src/core/runtime/trap_handler/CMakeLists.txt index c98e10486..2196cb0e9 100644 --- a/src/core/runtime/trap_handler/CMakeLists.txt +++ b/src/core/runtime/trap_handler/CMakeLists.txt @@ -46,8 +46,8 @@ cmake_minimum_required ( VERSION 3.7 ) find_package(Clang REQUIRED HINTS ${CMAKE_PREFIX_PATH}/llvm PATHS /opt/rocm/llvm ) find_package(LLVM REQUIRED HINTS ${CMAKE_PREFIX_PATH}/llvm PATHS /opt/rocm/llvm ) -set (TARGET_DEVS "gfx900;gfx1010;gfx1030;gfx1100") -set (POSTFIX "9;1010;10;11") +set (TARGET_DEVS "gfx900;gfx940;gfx941;gfx942;gfx1010;gfx1030;gfx1100") +set (POSTFIX "9;940;941;942;1010;10;11") if(${CMAKE_VERBOSE_MAKEFILE}) get_property(clang_path TARGET clang PROPERTY LOCATION) diff --git a/src/core/runtime/trap_handler/trap_handler.s b/src/core/runtime/trap_handler/trap_handler.s index 9e1d56bd4..1803f4fa1 100644 --- a/src/core/runtime/trap_handler/trap_handler.s +++ b/src/core/runtime/trap_handler/trap_handler.s @@ -71,22 +71,27 @@ .set TTMP6_SAVED_TRAP_ID_SIZE , 4 .set TTMP6_SAVED_TRAP_ID_MASK , (((1 << TTMP6_SAVED_TRAP_ID_SIZE) - 1) << TTMP6_SAVED_TRAP_ID_SHIFT) .set TTMP6_SAVED_TRAP_ID_BFE , (TTMP6_SAVED_TRAP_ID_SHIFT | (TTMP6_SAVED_TRAP_ID_SIZE << 16)) -.set TTMP11_PC_HI_SHIFT , 7 -.set TTMP11_DEBUG_ENABLED_SHIFT , 23 + +.set TTMP_PC_HI_SHIFT , 7 +.set TTMP_DEBUG_ENABLED_SHIFT , 23 .if .amdgcn.gfx_generation_number == 9 - .set TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT , 26 + .set TTMP_SAVE_RCNT_FIRST_REPLAY_SHIFT , 26 .set SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT , 15 .set SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK , 0x1F8000 .elseif .amdgcn.gfx_generation_number == 10 && .amdgcn.gfx_generation_minor < 3 - .set TTMP11_SAVE_REPLAY_W64H_SHIFT , 31 - .set TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT , 24 + .set TTMP_SAVE_REPLAY_W64H_SHIFT , 31 + .set TTMP_SAVE_RCNT_FIRST_REPLAY_SHIFT , 24 .set SQ_WAVE_IB_STS_REPLAY_W64H_SHIFT , 25 .set SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT , 15 .set SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK , 0x3F8000 .set SQ_WAVE_IB_STS_REPLAY_W64H_MASK , 0x2000000 .endif +.if .amdgcn.gfx_generation_number == 9 && .amdgcn.gfx_generation_minor >= 4 + .set TTMP11_TTMPS_SETUP_SHIFT , 31 +.endif + // ABI between first and second level trap handler: // ttmp0 = PC[31:0] // ttmp12 = SQ_WAVE_STATUS @@ -94,7 
+99,10 @@ // ttmp15 = TMA[63:32] // gfx9: // ttmp1 = 0[2:0], PCRewind[3:0], HostTrap[0], TrapId[7:0], PC[47:32] +// gfx906/gfx908/gfx90a: // ttmp11 = SQ_WAVE_IB_STS[20:15], 0[1:0], DebugEnabled[0], 0[15:0], NoScratch[0], WaveIdInWG[5:0] +// gfx940/gfx941/gfx942: +// ttmp13 = SQ_WAVE_IB_STS[20:15], 0[1:0], DebugEnabled[0], 0[22:0] // gfx10: // ttmp1 = 0[0], PCRewind[5:0], HostTrap[0], TrapId[7:0], PC[47:32] // gfx1010: @@ -117,7 +125,11 @@ trap_entry: // If llvm.debugtrap and debugger is not attached. s_cmp_eq_u32 ttmp2, TRAP_ID_DEBUGTRAP s_cbranch_scc0 .no_skip_debugtrap - s_bitcmp0_b32 ttmp11, TTMP11_DEBUG_ENABLED_SHIFT +.if (.amdgcn.gfx_generation_number == 9 && .amdgcn.gfx_generation_minor < 4) || .amdgcn.gfx_generation_number == 10 + s_bitcmp0_b32 ttmp11, TTMP_DEBUG_ENABLED_SHIFT +.else + s_bitcmp0_b32 ttmp13, TTMP_DEBUG_ENABLED_SHIFT +.endif s_cbranch_scc0 .no_skip_debugtrap // Ignore llvm.debugtrap. @@ -200,6 +212,7 @@ trap_entry: s_mov_b32 m0, ttmp3 s_nop 0x0 // Manually inserted wait states s_sendmsg sendmsg(MSG_INTERRUPT) + s_waitcnt lgkmcnt(0) // Wait for the message to go out. s_mov_b32 m0, ttmp2 // Parking the wave requires saving the original pc in the preserved ttmps. @@ -212,26 +225,18 @@ trap_entry: // // ttmp7: pc_lo[31:0] // ttmp11: 1st_level_ttmp11[31:23] pc_hi[15:0] 1st_level_ttmp11[6:0] - -.if ((.amdgcn.gfx_generation_number == 10 && .amdgcn.gfx_generation_minor >= 3) || .amdgcn.gfx_generation_number > 10) - s_branch .halt_wave -.else +.if (.amdgcn.gfx_generation_number == 9 && .amdgcn.gfx_generation_minor < 4) || (.amdgcn.gfx_generation_number == 10 && .amdgcn.gfx_generation_minor < 3) || (.amdgcn.gfx_generation_number == 11) // Save the PC s_mov_b32 ttmp7, ttmp0 s_and_b32 ttmp1, ttmp1, SQ_WAVE_PC_HI_ADDRESS_MASK - s_lshl_b32 ttmp1, ttmp1, TTMP11_PC_HI_SHIFT - s_andn2_b32 ttmp11, ttmp11, (SQ_WAVE_PC_HI_ADDRESS_MASK << TTMP11_PC_HI_SHIFT) + s_lshl_b32 ttmp1, ttmp1, TTMP_PC_HI_SHIFT + s_andn2_b32 ttmp11, ttmp11, (SQ_WAVE_PC_HI_ADDRESS_MASK << TTMP_PC_HI_SHIFT) s_or_b32 ttmp11, ttmp11, ttmp1 // Park the wave s_getpc_b64 [ttmp0, ttmp1] s_add_u32 ttmp0, ttmp0, .parked - . s_addc_u32 ttmp1, ttmp1, 0x0 - s_branch .halt_wave - -.parked: - s_trap 0x2 - s_branch .parked .endif .halt_wave: @@ -239,17 +244,29 @@ trap_entry: s_bitset1_b32 ttmp6, TTMP6_WAVE_STOPPED_SHIFT s_bitset1_b32 ttmp12, SQ_WAVE_STATUS_HALT_SHIFT +.if (.amdgcn.gfx_generation_number == 9 && .amdgcn.gfx_generation_minor >= 4) + s_bitcmp1_b32 ttmp11, TTMP11_TTMPS_SETUP_SHIFT + s_cbranch_scc1 .ttmps_initialized + s_mov_b32 ttmp4, 0 + s_mov_b32 ttmp5, 0 + s_bitset1_b32 ttmp11, TTMP11_TTMPS_SETUP_SHIFT +.ttmps_initialized: +.endif + .exit_trap: // Restore SQ_WAVE_IB_STS. 
.if .amdgcn.gfx_generation_number == 9 - s_lshr_b32 ttmp2, ttmp11, (TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT - SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT) +.if .amdgcn.gfx_generation_minor < 4 + s_lshr_b32 ttmp2, ttmp11, (TTMP_SAVE_RCNT_FIRST_REPLAY_SHIFT - SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT) +.else + s_lshr_b32 ttmp2, ttmp13, (TTMP_SAVE_RCNT_FIRST_REPLAY_SHIFT - SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT) +.endif s_and_b32 ttmp2, ttmp2, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK s_setreg_b32 hwreg(HW_REG_IB_STS), ttmp2 -.endif -.if .amdgcn.gfx_generation_number == 10 && .amdgcn.gfx_generation_minor < 3 - s_lshr_b32 ttmp2, ttmp11, (TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT - SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT) +.elseif .amdgcn.gfx_generation_number == 10 && .amdgcn.gfx_generation_minor < 3 + s_lshr_b32 ttmp2, ttmp11, (TTMP_SAVE_RCNT_FIRST_REPLAY_SHIFT - SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT) s_and_b32 ttmp3, ttmp2, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK - s_lshr_b32 ttmp2, ttmp11, (TTMP11_SAVE_REPLAY_W64H_SHIFT - SQ_WAVE_IB_STS_REPLAY_W64H_SHIFT) + s_lshr_b32 ttmp2, ttmp11, (TTMP_SAVE_REPLAY_W64H_SHIFT - SQ_WAVE_IB_STS_REPLAY_W64H_SHIFT) s_and_b32 ttmp2, ttmp2, SQ_WAVE_IB_STS_REPLAY_W64H_MASK s_or_b32 ttmp2, ttmp2, ttmp3 s_setreg_b32 hwreg(HW_REG_IB_STS), ttmp2 @@ -262,3 +279,15 @@ trap_entry: // Return to original (possibly modified) PC. s_rfe_b64 [ttmp0, ttmp1] + +.parked: + s_trap 0x2 + s_branch .parked + +// For gfx11, add padding instructions so we can ensure instruction cache +// prefetch always has something to load. +.if .amdgcn.gfx_generation_number == 11 +.rept (256 - ((. - trap_entry) % 64)) / 4 + s_code_end +.endr +.endif diff --git a/src/core/util/flag.h b/src/core/util/flag.h index 5c5e8f310..507b29901 100644 --- a/src/core/util/flag.h +++ b/src/core/util/flag.h @@ -57,6 +57,7 @@ namespace rocr { class Flag { public: enum SDMA_OVERRIDE { SDMA_DISABLE, SDMA_ENABLE, SDMA_DEFAULT }; + enum SRAMECC_ENABLE { SRAMECC_DISABLED, SRAMECC_ENABLED, SRAMECC_DEFAULT }; // The values are meaningful and chosen to satisfy the thunk API. enum XNACK_REQUEST { XNACK_DISABLE = 0, XNACK_ENABLE = 1, XNACK_UNCHANGED = 2 }; @@ -86,6 +87,9 @@ class Flag { var = os::GetEnvVar("HSA_ENABLE_SDMA"); enable_sdma_ = (var == "0") ? SDMA_DISABLE : ((var == "1") ? SDMA_ENABLE : SDMA_DEFAULT); + var = os::GetEnvVar("HSA_ENABLE_PEER_SDMA"); + enable_peer_sdma_ = (var == "0") ? SDMA_DISABLE : ((var == "1") ? SDMA_ENABLE : SDMA_DEFAULT); + visible_gpus_ = os::GetEnvVar("ROCR_VISIBLE_DEVICES"); filter_visible_gpus_ = os::IsEnvVarSet("ROCR_VISIBLE_DEVICES"); @@ -175,6 +179,10 @@ class Flag { var = os::GetEnvVar("HSA_SVM_PROFILE"); svm_profile_ = var; + var = os::GetEnvVar("HSA_ENABLE_SRAMECC"); + sramecc_enable_ = + (var == "0") ? SRAMECC_DISABLED : ((var == "1") ? SRAMECC_ENABLED : SRAMECC_DEFAULT); + var = os::GetEnvVar("HSA_IMAGE_PRINT_SRD"); image_print_srd_ = (var == "1") ? 
true : false; @@ -220,6 +228,8 @@ class Flag { SDMA_OVERRIDE enable_sdma() const { return enable_sdma_; } + SDMA_OVERRIDE enable_peer_sdma() const { return enable_peer_sdma_; } + std::string visible_gpus() const { return visible_gpus_; } bool filter_visible_gpus() const { return filter_visible_gpus_; } @@ -269,6 +279,8 @@ class Flag { const std::string& svm_profile() const { return svm_profile_; } + SRAMECC_ENABLE sramecc_enable() const { return sramecc_enable_; } + private: bool check_flat_scratch_; bool enable_vm_fault_message_; @@ -295,6 +307,7 @@ class Flag { bool enable_mwaitx_; SDMA_OVERRIDE enable_sdma_; + SDMA_OVERRIDE enable_peer_sdma_; bool filter_visible_gpus_; std::string visible_gpus_; @@ -312,6 +325,8 @@ class Flag { // Indicates user preference for Xnack state. XNACK_REQUEST xnack_; + SRAMECC_ENABLE sramecc_enable_; + // Map GPU index post RVD to its default cu mask. std::map> cu_mask_; diff --git a/src/core/util/lnx/os_linux.cpp b/src/core/util/lnx/os_linux.cpp index 838b61983..f9f842cb5 100644 --- a/src/core/util/lnx/os_linux.cpp +++ b/src/core/util/lnx/os_linux.cpp @@ -60,7 +60,9 @@ #include #include #include "core/inc/runtime.h" +#if defined(__i386__) || defined(__x86_64__) #include +#endif namespace rocr { namespace os { @@ -84,6 +86,7 @@ class os_thread { public: explicit os_thread(ThreadEntry function, void* threadArgument, uint stackSize) : thread(0), lock(nullptr), state(RUNNING) { + int err; std::unique_ptr args(new ThreadArgs); lock = CreateMutex(); if (lock == nullptr) return; @@ -92,45 +95,68 @@ class os_thread { args->entry_function = function; pthread_attr_t attrib; - pthread_attr_init(&attrib); + err = pthread_attr_init(&attrib); + if (err != 0) { + fprintf(stderr, "pthread_attr_init failed: %s\n", strerror(err)); + return; + } if (stackSize != 0) { stackSize = Max(uint(PTHREAD_STACK_MIN), stackSize); stackSize = AlignUp(stackSize, 4096); - int err = pthread_attr_setstacksize(&attrib, stackSize); - assert(err == 0 && "pthread_attr_setstacksize failed."); + err = pthread_attr_setstacksize(&attrib, stackSize); + if (err != 0) { + fprintf(stderr, "pthread_attr_setstacksize failed: %s\n", strerror(err)); + return; + } } if (core::Runtime::runtime_singleton_->flag().override_cpu_affinity()) { int cores = get_nprocs_conf(); cpu_set_t* cpuset = CPU_ALLOC(cores); + if (cpuset == nullptr) { + fprintf(stderr, "CPU_ALLOC failed: %s\n", strerror(errno)); + return; + } CPU_ZERO_S(CPU_ALLOC_SIZE(cores), cpuset); for (int i = 0; i < cores; i++) { CPU_SET(i, cpuset); } - int err = pthread_attr_setaffinity_np(&attrib, CPU_ALLOC_SIZE(cores), cpuset); - assert(err == 0 && "pthread_attr_setaffinity_np failed."); + err = pthread_attr_setaffinity_np(&attrib, CPU_ALLOC_SIZE(cores), cpuset); CPU_FREE(cpuset); + if (err != 0) { + fprintf(stderr, "pthread_attr_setaffinity_np failed: %s\n", strerror(err)); + return; + } } - int err = pthread_create(&thread, &attrib, ThreadTrampoline, args.get()); + err = pthread_create(&thread, &attrib, ThreadTrampoline, args.get()); // Probably a stack size error since system limits can be different from PTHREAD_STACK_MIN // Attempt to grow the stack within reason. 
if ((err == EINVAL) && stackSize != 0) { while (stackSize < 20 * 1024 * 1024) { stackSize *= 2; - pthread_attr_setstacksize(&attrib, stackSize); + err = pthread_attr_setstacksize(&attrib, stackSize); + if (err != 0) { + fprintf(stderr, "pthread_attr_setstacksize failed: %s\n", strerror(err)); + return; + } err = pthread_create(&thread, &attrib, ThreadTrampoline, args.get()); if (err != EINVAL) break; + debug_print("pthread_create returned EINVAL, doubling stack size\n"); } } - pthread_attr_destroy(&attrib); if (err == 0) args.release(); else thread = 0; + + err = pthread_attr_destroy(&attrib); + if (err != 0) { + fprintf(stderr, "pthread_attr_destroy failed: %s\n", strerror(err)); + } } os_thread(os_thread&& rhs) { @@ -145,7 +171,10 @@ class os_thread { ~os_thread() { if (lock != nullptr) DestroyMutex(lock); - if ((state == RUNNING) && (thread != 0)) pthread_detach(thread); + if ((state == RUNNING) && (thread != 0)) { + int err = pthread_detach(thread); + if (err != 0) fprintf(stderr, "pthread_detach failed: %s\n", strerror(err)); + } } bool Valid() { return (lock != nullptr) && (thread != 0); } @@ -192,11 +221,17 @@ void* GetExportAddress(LibHandle lib, std::string export_name) { link_map* map; int err = dlinfo(*(void**)&lib, RTLD_DI_LINKMAP, &map); - assert(err != -1 && "dlinfo failed."); + if (err == -1) { + fprintf(stderr, "dlinfo failed: %s\n", dlerror()); + return nullptr; + } Dl_info info; err = dladdr(ret, &info); - assert(err != 0 && "dladdr failed."); + if (err == 0) { + fprintf(stderr, "dladdr failed.\n"); + return nullptr; + } if (strcmp(info.dli_fname, map->l_name) == 0) return ret; @@ -529,7 +564,10 @@ uint64_t ReadAccurateClock() { if (invPeriod == 0.0) AccurateClockFrequency(); timespec time; int err = clock_gettime(CLOCK_MONOTONIC_RAW, &time); - assert(err == 0 && "clock_gettime(CLOCK_MONOTONIC_RAW,...) failed"); + if (err != 0) { + perror("clock_gettime(CLOCK_MONOTONIC_RAW,...) failed"); + abort(); + } return (uint64_t(time.tv_sec) * 1000000000ull + uint64_t(time.tv_nsec)) * invPeriod; } @@ -558,13 +596,16 @@ uint64_t AccurateClockFrequency() { } timespec time; int err = clock_getres(clock, &time); - assert(err == 0 && "clock_getres(CLOCK_MONOTONIC(_RAW),...) failed"); - assert(time.tv_sec == 0 && - "clock_getres(CLOCK_MONOTONIC(_RAW),...) returned very low frequency " - "(<1Hz)."); - assert(time.tv_nsec < 0xFFFFFFFF && - "clock_getres(CLOCK_MONOTONIC(_RAW),...) returned very low frequency " - "(<1Hz)."); + if (err != 0) { + perror("clock_getres failed"); + abort(); + } + if (time.tv_sec != 0 || time.tv_nsec >= 0xFFFFFFFF) { + fprintf(stderr, + "clock_getres(CLOCK_MONOTONIC(_RAW),...) 
returned very low " + "frequency (<1Hz).\n"); + abort(); + } if (invPeriod == 0.0) invPeriod = 1.0 / double(time.tv_nsec); return 1000000000ull / uint64_t(time.tv_nsec); } @@ -573,16 +614,19 @@ SharedMutex CreateSharedMutex() { pthread_rwlockattr_t attrib; int err = pthread_rwlockattr_init(&attrib); if (err != 0) { - assert(false && "rw lock attribute init failed."); + fprintf(stderr, "rw lock attribute init failed: %s\n", strerror(err)); return nullptr; } err = pthread_rwlockattr_setkind_np(&attrib, PTHREAD_RWLOCK_PREFER_WRITER_NONRECURSIVE_NP); - assert(err == 0 && "Set rw lock attribute failure."); + if (err != 0) { + fprintf(stderr, "Set rw lock attribute failure: %s\n", strerror(err)); + return nullptr; + } pthread_rwlock_t* lock = new pthread_rwlock_t; err = pthread_rwlock_init(lock, &attrib); if (err != 0) { - assert(false && "rw lock init failed."); + fprintf(stderr, "rw lock init failed: %s\n", strerror(err)); return nullptr; } @@ -602,7 +646,10 @@ bool AcquireSharedMutex(SharedMutex lock) { void ReleaseSharedMutex(SharedMutex lock) { int err = pthread_rwlock_unlock(*(pthread_rwlock_t**)&lock); - assert(err == 0 && "SharedMutex unlock failed."); + if (err != 0) { + fprintf(stderr, "SharedMutex unlock failed: %s\n", strerror(err)); + abort(); + } } bool TrySharedAcquireSharedMutex(SharedMutex lock) { @@ -617,7 +664,10 @@ bool SharedAcquireSharedMutex(SharedMutex lock) { void SharedReleaseSharedMutex(SharedMutex lock) { int err = pthread_rwlock_unlock(*(pthread_rwlock_t**)&lock); - assert(err == 0 && "SharedMutex unlock failed."); + if (err != 0) { + fprintf(stderr, "SharedMutex unlock failed: %s\n", strerror(err)); + abort(); + } } void DestroySharedMutex(SharedMutex lock) { @@ -645,8 +695,8 @@ uint64_t SystemClockFrequency() { } bool ParseCpuID(cpuid_t* cpuinfo) { +#if defined(__i386__) || defined(__x86_64__) uint32_t eax, ebx, ecx, edx, max_eax = 0; - memset(cpuinfo, 0, sizeof(*cpuinfo)); /* Make sure current CPU supports at least EAX 4 */ @@ -665,6 +715,9 @@ bool ParseCpuID(cpuid_t* cpuinfo) { } } return true; +#else + return false; +#endif } } // namespace os diff --git a/src/image/blit_kernel.cpp b/src/image/blit_kernel.cpp index 3eea38d6e..067c668f7 100644 --- a/src/image/blit_kernel.cpp +++ b/src/image/blit_kernel.cpp @@ -85,6 +85,9 @@ extern uint8_t ocl_blit_object_gfx908[]; extern uint8_t ocl_blit_object_gfx909[]; extern uint8_t ocl_blit_object_gfx90a[]; extern uint8_t ocl_blit_object_gfx90c[]; +extern uint8_t ocl_blit_object_gfx940[]; +extern uint8_t ocl_blit_object_gfx941[]; +extern uint8_t ocl_blit_object_gfx942[]; extern uint8_t ocl_blit_object_gfx1010[]; extern uint8_t ocl_blit_object_gfx1011[]; extern uint8_t ocl_blit_object_gfx1012[]; @@ -1007,6 +1010,12 @@ hsa_status_t BlitKernel::GetPatchedBlitObject(const char* agent_name, *blit_code_object = ocl_blit_object_gfx90a; } else if (sname == "gfx90c") { *blit_code_object = ocl_blit_object_gfx90c; + } else if (sname == "gfx940") { + *blit_code_object = ocl_blit_object_gfx940; + } else if (sname == "gfx941") { + *blit_code_object = ocl_blit_object_gfx941; + } else if (sname == "gfx942") { + *blit_code_object = ocl_blit_object_gfx942; } else if (sname == "gfx1010") { *blit_code_object = ocl_blit_object_gfx1010; } else if (sname == "gfx1011") { diff --git a/src/image/blit_src/CMakeLists.txt b/src/image/blit_src/CMakeLists.txt index 21f998bf7..1b3c7cfa3 100644 --- a/src/image/blit_src/CMakeLists.txt +++ b/src/image/blit_src/CMakeLists.txt @@ -47,7 +47,7 @@ find_package(Clang REQUIRED HINTS ${CMAKE_PREFIX_PATH}/llvm PATHS 
/opt/rocm/llvm # Determine the target devices if not specified if (NOT DEFINED TARGET_DEVICES) - set (TARGET_DEVICES "gfx700;gfx701;gfx702;gfx801;gfx802;gfx803;gfx805;gfx810;gfx900;gfx902;gfx904;gfx906;gfx908;gfx909;gfx90a;gfx90c;gfx1010;gfx1011;gfx1012;gfx1013;gfx1030;gfx1031;gfx1032;gfx1033;gfx1034;gfx1035;gfx1036;gfx1100;gfx1101;gfx1102;gfx1103") + set(TARGET_DEVICES "gfx700;gfx701;gfx702;gfx801;gfx802;gfx803;gfx805;gfx810;gfx900;gfx902;gfx904;gfx906;gfx908;gfx909;gfx90a;gfx90c;gfx940;gfx941;gfx942;gfx1010;gfx1011;gfx1012;gfx1013;gfx1030;gfx1031;gfx1032;gfx1033;gfx1034;gfx1035;gfx1036;gfx1100;gfx1101;gfx1102;gfx1103") endif() set( TARGET_DEVICES ${TARGET_DEVICES} CACHE STRING "Build targets" FORCE ) diff --git a/src/inc/amd_hsa_elf.h b/src/inc/amd_hsa_elf.h index b2de2a20a..45962b1dd 100644 --- a/src/inc/amd_hsa_elf.h +++ b/src/inc/amd_hsa_elf.h @@ -119,17 +119,19 @@ enum : unsigned { EF_AMDGPU_MACH_AMDGCN_GFX1035 = 0x03d, EF_AMDGPU_MACH_AMDGCN_GFX1034 = 0x03e, EF_AMDGPU_MACH_AMDGCN_GFX90A = 0x03f, - EF_AMDGPU_MACH_AMDGCN_RESERVED_0X40 = 0x040, + EF_AMDGPU_MACH_AMDGCN_GFX940 = 0x040, EF_AMDGPU_MACH_AMDGCN_GFX1100 = 0x041, EF_AMDGPU_MACH_AMDGCN_GFX1013 = 0x042, EF_AMDGPU_MACH_AMDGCN_GFX1103 = 0x044, EF_AMDGPU_MACH_AMDGCN_GFX1036 = 0x045, EF_AMDGPU_MACH_AMDGCN_GFX1101 = 0x046, EF_AMDGPU_MACH_AMDGCN_GFX1102 = 0x047, + EF_AMDGPU_MACH_AMDGCN_GFX941 = 0x04b, + EF_AMDGPU_MACH_AMDGCN_GFX942 = 0x04c, // First/last AMDGCN-based processors. EF_AMDGPU_MACH_AMDGCN_FIRST = EF_AMDGPU_MACH_AMDGCN_GFX600, - EF_AMDGPU_MACH_AMDGCN_LAST = EF_AMDGPU_MACH_AMDGCN_GFX1102, + EF_AMDGPU_MACH_AMDGCN_LAST = EF_AMDGPU_MACH_AMDGCN_GFX942, // Indicates if the "xnack" target feature is enabled for all code contained // in the object. diff --git a/src/inc/hsa.h b/src/inc/hsa.h index 3c0db5d52..a70fd0f06 100644 --- a/src/inc/hsa.h +++ b/src/inc/hsa.h @@ -3218,7 +3218,16 @@ typedef enum { * region, the application must explicitely invoke ::hsa_memory_assign_agent * in order to transfer ownership to that agent for a particular buffer. */ - HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED = 4 + HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED = 4, + + /** + * Updates to memory in this region have extended scope, where the device-scope atomics + * to this memory type act as system-scope with respect to all variables located in + * memory regions of this type. + * Note: On non-compliant systems, the application may still be responsible for performing + * device-specific actions necessary to achieve system-scope coherence. + */ + HSA_REGION_GLOBAL_FLAG_EXTENDED_SCOPE_FINE_GRAINED = 8 } hsa_region_global_flag_t; /** diff --git a/src/inc/hsa_ext_amd.h b/src/inc/hsa_ext_amd.h index 508a45cf3..36f5ae146 100644 --- a/src/inc/hsa_ext_amd.h +++ b/src/inc/hsa_ext_amd.h @@ -51,9 +51,11 @@ /* * - 1.0 - initial version * - 1.1 - dmabuf export + * - 1.2 - hsa_amd_memory_async_copy_on_engine + * - 1.3 - HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_EXTENDED_SCOPE_FINE_GRAINED pool */ #define HSA_AMD_INTERFACE_VERSION_MAJOR 1 -#define HSA_AMD_INTERFACE_VERSION_MINOR 1 +#define HSA_AMD_INTERFACE_VERSION_MINOR 3 #ifdef __cplusplus extern "C" { @@ -387,7 +389,17 @@ typedef enum hsa_amd_agent_info_s { * Queries for version of IOMMU supported by agent. * The type of this attribute is hsa_amd_iommu_version_t. */ - HSA_AMD_AGENT_INFO_IOMMU_SUPPORT = 0xA110 + HSA_AMD_AGENT_INFO_IOMMU_SUPPORT = 0xA110, + /** + * Queries for number of XCCs within the agent. + * The type of this attribute is uint32_t. 
+ */ + HSA_AMD_AGENT_INFO_NUM_XCC = 0xA111, + /** + * Queries for driver unique identifier. + * The type of this attribute is uint32_t. + */ + HSA_AMD_AGENT_INFO_DRIVER_UID = 0xA112 } hsa_amd_agent_info_t; /** @@ -1008,7 +1020,14 @@ typedef enum hsa_amd_memory_pool_global_flag_s { /** * Writes to memory in this pool can be performed by a single agent at a time. */ - HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED = 4 + HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED = 4, + + /** Updates to memory in this memory pool have extended scope, acting as + * system-scope atomics for variables in memory regions of this type. + * Note: On non-compliant systems, device-specific actions may be required + * for system-scope coherence. */ + HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_EXTENDED_SCOPE_FINE_GRAINED = 8, + } hsa_amd_memory_pool_global_flag_t; typedef enum hsa_amd_memory_pool_location_s { @@ -1942,31 +1961,38 @@ typedef struct hsa_amd_pointer_info_s { */ hsa_amd_pointer_type_t type; /* - Base address at which non-host agents may access the allocation. + Base address at which non-host agents may access the allocation. This field is + not meaningful if the type of the allocation is HSA_EXT_POINTER_TYPE_UNKNOWN. */ void* agentBaseAddress; /* - Base address at which the host agent may access the allocation. + Base address at which the host agent may access the allocation. This field is + not meaningful if the type of the allocation is HSA_EXT_POINTER_TYPE_UNKNOWN. */ void* hostBaseAddress; /* - Size of the allocation + Size of the allocation. This field is not meaningful if the type of the allocation + is HSA_EXT_POINTER_TYPE_UNKNOWN. */ size_t sizeInBytes; /* - Application provided value. + Application provided value. This field is not meaningful if the type of the + allocation is HSA_EXT_POINTER_TYPE_UNKNOWN. */ void* userData; /* - Reports an agent which "owns" (ie has preferred access to) the pool in which the allocation was + Reports an agent which "owns" (ie has preferred access to) the pool in which the + allocation was made. When multiple agents share equal access to a pool (ex: multiple CPU agents, or multi-die - GPU boards) any such agent may be returned. + GPU boards) any such agent may be returned. This field is not meaningful if + the type of the allocation is HSA_EXT_POINTER_TYPE_UNKNOWN or if this agent is not available in + this process, for e.g if this agent is masked using ROCR_VISIBLE_DEVICES. */ hsa_agent_t agentOwner; /* Contains a bitfield of hsa_amd_memory_pool_global_flag_t values. - Reports the effective global flags bitmask for the allocation. This field is not meaningful if - the type of the allocation is HSA_EXT_POINTER_TYPE_UNKNOWN. + Reports the effective global flags bitmask for the allocation. This field is not + meaningful if the type of the allocation is HSA_EXT_POINTER_TYPE_UNKNOWN. */ uint32_t global_flags; } hsa_amd_pointer_info_t; @@ -1974,7 +2000,9 @@ typedef struct hsa_amd_pointer_info_s { /** * @brief Retrieves information about the allocation referenced by the given * pointer. Optionally returns the number and list of agents which can - * directly access the allocation. + * directly access the allocation. In case this virtual address is unknown, the + * pointer type returned will be HSA_EXT_POINTER_TYPE_UNKNOWN and the only fields + * that are valid after hsa_amd_pointer_info returns are size and type. * * @param[in] ptr Pointer which references the allocation to retrieve info for. 
* @@ -2047,9 +2075,10 @@ typedef struct hsa_amd_ipc_memory_s { * region has been attached (via hsa_amd_ipc_memory_attach) in the remote * process prior to releasing that memory in the local process. * Repeated calls for the same allocation may, but are not required to, return - * unique handles. + * unique handles. The allocation needs to be on memory on an agent of type + * HSA_DEVICE_TYPE_GPU. * - * @param[in] ptr Pointer to memory allocated via ROCr APIs to prepare for + * @param[in] ptr Pointer to device memory allocated via ROCr APIs to prepare for * sharing. * * @param[in] len Length in bytes of the allocation to share. diff --git a/src/inc/hsa_ven_amd_aqlprofile.h b/src/inc/hsa_ven_amd_aqlprofile.h index 169ab5278..7aea7858f 100644 --- a/src/inc/hsa_ven_amd_aqlprofile.h +++ b/src/inc/hsa_ven_amd_aqlprofile.h @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. -// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL @@ -122,6 +122,10 @@ typedef enum { HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GCR = 30, HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GUS = 31, + // UMC & MMEA System Blocks + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_UMC = 32, + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MMEA = 33, + HSA_VEN_AMD_AQLPROFILE_BLOCKS_NUMBER } hsa_ven_amd_aqlprofile_block_name_t; diff --git a/src/libamdhsacode/amd_elf_image.cpp b/src/libamdhsacode/amd_elf_image.cpp index 33871dd2b..831c1f7a7 100644 --- a/src/libamdhsacode/amd_elf_image.cpp +++ b/src/libamdhsacode/amd_elf_image.cpp @@ -711,16 +711,38 @@ namespace elf { GElfStringTable* shstrtab() override; GElfStringTable* strtab() override; - GElfSymbolTable* getSymtab(uint16_t index) override + GElfSymbolTable* getReferencedSymbolTable(uint16_t index) { return static_cast(section(index)); } + GElfSymbolTable* getSymtab(uint16_t index) override + { + if (section(index)->type() == SHT_SYMTAB) + return static_cast(section(index)); + return nullptr; + } + GElfSymbolTable* getDynsym(uint16_t index) override + { + if (section(index)->type() == SHT_DYNSYM) + return static_cast(section(index)); + return nullptr; + } + + GElfSymbolTable* getSymbolTable() override; + GElfSymbolTable* getSymbolTable(uint16_t index) override + { + const char *UseDynsym = getenv("LOADER_USE_DYNSYM"); + if (UseDynsym && std::strncmp(UseDynsym, "0", 1) != 0) + return getDynsym(index); + return getSymtab(index); + } GElfStringTable* addStringTable(const std::string& name) override; GElfStringTable* getStringTable(uint16_t index) override; GElfSymbolTable* addSymbolTable(const std::string& name, StringTable* stab = 0) override; GElfSymbolTable* symtab() override; + GElfSymbolTable* dynsym() override; GElfSegment* segment(size_t i) override { return segments[i].get(); } Segment* segmentByVAddr(uint64_t vaddr) override; @@ -759,6 +781,7 @@ namespace elf { GElfStringTable* shstrtabSection; GElfStringTable* strtabSection; GElfSymbolTable* symtabSection; + GElfSymbolTable* dynsymSection; GElfNoteSection* noteSection; std::vector> segments; std::vector> sections; @@ -1242,7 +1265,7 @@ namespace elf { bool GElfRelocationSection::pullData() { section = elf->section(hdr.sh_info); - symtab = elf->getSymtab(hdr.sh_link); + symtab = elf->getReferencedSymbolTable(hdr.sh_link); Elf_Scn *lScn = elf_getscn(elf->e, ndxscn); assert(lScn); Elf_Data *lData = elf_getdata(lScn, nullptr); @@ -1261,6 +1284,7 @@ namespace elf { e(0), shstrtabSection(0), strtabSection(0), symtabSection(0), + dynsymSection(0), noteSection(0) { if (EV_NONE == elf_version(EV_CURRENT)) { @@ -1436,6 +1460,7 @@ namespace elf { if (section->type() == SHT_STRTAB) { strtabSection = static_cast(section.get()); } if (section->type() == SHT_SYMTAB) { symtabSection = static_cast(section.get()); } if (section->type() == SHT_NOTE) { noteSection = static_cast(section.get()); } + if (section->type() == SHT_DYNSYM) { dynsymSection = static_cast(section.get()); } } size_t phnum; @@ -1553,7 +1578,7 @@ namespace elf { } } - GElfStringTable* GElfImage::addStringTable(const std::string& name) + GElfStringTable* GElfImage::addStringTable(const std::string& name) { GElfStringTable* stab = new GElfStringTable(this); sections.push_back(std::unique_ptr(stab)); @@ -1597,6 +1622,21 @@ namespace elf { return symtabSection; } + GElfSymbolTable* GElfImage::dynsym() + { + if (!dynsymSection) { + dynsymSection = addSymbolTable(".dynsym", strtab()); + } + return dynsymSection; + } + + GElfSymbolTable* GElfImage::getSymbolTable() + { + const char *UseDynsym = getenv("LOADER_USE_DYNSYM"); + 
if (UseDynsym && std::strncmp(UseDynsym, "0", 1) != 0) + return dynsym(); + return symtab(); + } GElfNoteSection* GElfImage::note() { diff --git a/src/libamdhsacode/amd_hsa_code.cpp b/src/libamdhsacode/amd_hsa_code.cpp index 64eac2e65..da0a6532e 100644 --- a/src/libamdhsacode/amd_hsa_code.cpp +++ b/src/libamdhsacode/amd_hsa_code.cpp @@ -580,6 +580,9 @@ namespace code { case ELF::EF_AMDGPU_MACH_AMDGCN_GFX909: MI.Name = "gfx909"; MI.XnackSupported = true; MI.SrameccSupported = false; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX90A: MI.Name = "gfx90a"; MI.XnackSupported = true; MI.SrameccSupported = true; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX90C: MI.Name = "gfx90c"; MI.XnackSupported = true; MI.SrameccSupported = false; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX940: MI.Name = "gfx940"; MI.XnackSupported = true; MI.SrameccSupported = true; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX941: MI.Name = "gfx941"; MI.XnackSupported = true; MI.SrameccSupported = true; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX942: MI.Name = "gfx942"; MI.XnackSupported = true; MI.SrameccSupported = true; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1010: MI.Name = "gfx1010"; MI.XnackSupported = true; MI.SrameccSupported = false; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1011: MI.Name = "gfx1011"; MI.XnackSupported = true; MI.SrameccSupported = false; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1012: MI.Name = "gfx1012"; MI.XnackSupported = true; MI.SrameccSupported = false; break; @@ -643,14 +646,6 @@ namespace code { mach = ELF::EF_AMDGPU_MACH_AMDGCN_GFX906; else if (old_name == "AMD:AMDGPU:9:0:12") mach = ELF::EF_AMDGPU_MACH_AMDGCN_GFX90C; - else if (old_name == "AMD:AMDGPU:11:0:0") - mach = ELF::EF_AMDGPU_MACH_AMDGCN_GFX1100; - else if (old_name == "AMD:AMDGPU:11:0:1") - mach = ELF::EF_AMDGPU_MACH_AMDGCN_GFX1101; - else if (old_name == "AMD:AMDGPU:11:0:2") - mach = ELF::EF_AMDGPU_MACH_AMDGCN_GFX1102; - else if (old_name == "AMD:AMDGPU:11:0:3") - mach = ELF::EF_AMDGPU_MACH_AMDGCN_GFX1103; else { // Code object v2 only supports asics up to gfx906 plus gfx90c. Do NOT // add handling of new asics into this if-else-if* block. 
@@ -1770,8 +1765,8 @@ namespace code { hsatext = sec; } } - for (size_t i = 0; i < img->symtab()->symbolCount(); ++i) { - amd::elf::Symbol* elfsym = img->symtab()->symbol(i); + for (size_t i = 0; i < img->getSymbolTable()->symbolCount(); ++i) { + amd::elf::Symbol* elfsym = img->getSymbolTable()->symbol(i); Symbol* sym = 0; switch (elfsym->type()) { case STT_AMDGPU_HSA_KERNEL: { diff --git a/src/libamdhsacode/amd_hsa_code_util.cpp b/src/libamdhsacode/amd_hsa_code_util.cpp index 150840dd4..a1247de7f 100644 --- a/src/libamdhsacode/amd_hsa_code_util.cpp +++ b/src/libamdhsacode/amd_hsa_code_util.cpp @@ -957,12 +957,8 @@ int OpenTempFile(const char* prefix) #ifdef _WIN32 char dir[MAX_PATH+1]; if (!GetTempPath(sizeof(dir), dir)) { return -1; } -#else // _WIN32 - char *dir = NULL; -#endif // _WIN32 char *name = _tempnam(dir, tname.c_str()); if (!name) { return -1; } -#ifdef _WIN32 HANDLE h = CreateFile( name, GENERIC_READ | GENERIC_WRITE, @@ -975,10 +971,10 @@ int OpenTempFile(const char* prefix) if (h == INVALID_HANDLE_VALUE) { continue; } return _open_osfhandle((intptr_t)h, 0); #else // _WIN32 - int d = _open(name, O_RDWR | O_CREAT | O_EXCL, S_IRUSR | S_IWUSR); - if (d < 0) { free(name); continue; } - if (unlink(name) < 0) { free(name); _close(d); return -1; } - free(name); + tname += "XXXXXX"; + int d = mkstemp((char*)tname.c_str()); + if (d < 0) { continue; } + if (unlink(tname.c_str()) < 0) { _close(d); return -1; } return d; #endif // _WIN32 } diff --git a/src/loader/executable.cpp b/src/loader/executable.cpp index dbad4b93b..2a3b96f24 100644 --- a/src/loader/executable.cpp +++ b/src/loader/executable.cpp @@ -80,7 +80,9 @@ __attribute__((noinline)) static void _loader_debug_state() { // 5: New trap handler ABI. Save the PC in ttmp11[22:7] ttmp6[31:0], and park the wave if stopped // 6: New trap handler ABI. ttmp6[25:0] contains dispatch index modulo queue size // 7: New trap handler ABI. Send interrupts as a bitmask, coalescing concurrent exceptions. -HSA_API r_debug _amdgpu_r_debug = {7, +// 8: New trap handler ABI. For gfx940: Initialize ttmp[4:5] if ttmp11[31] == 0. +// 9: New trap handler ABI. For gfx11: Save the PC in ttmp11[22:7] ttmp6[31:0], and park the wave if stopped. +HSA_API r_debug _amdgpu_r_debug = {9, nullptr, reinterpret_cast<uintptr_t>(&_loader_debug_state), r_debug::RT_CONSISTENT,
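
The hunks above introduce HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_EXTENDED_SCOPE_FINE_GRAINED (and the matching HSA_REGION_GLOBAL_FLAG_EXTENDED_SCOPE_FINE_GRAINED), plus the HSA_AMD_AGENT_INFO_NUM_XCC and HSA_AMD_AGENT_INFO_DRIVER_UID agent queries. The sketch below is not part of the patch; it only illustrates how a client might probe for and use these additions once the change is in place. It assumes an initialized HSA runtime and a valid GPU agent handle; gpu_agent, find_ext_scope_pool and example are illustrative names, the include paths assume installed ROCm headers, and error handling is abbreviated.

#include <hsa/hsa.h>
#include <hsa/hsa_ext_amd.h>

// Callback: remember the first pool that advertises the extended-scope
// fine-grained flag added to hsa_ext_amd.h above.
static hsa_status_t find_ext_scope_pool(hsa_amd_memory_pool_t pool, void* data) {
  uint32_t flags = 0;
  hsa_status_t err =
      hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &flags);
  if (err != HSA_STATUS_SUCCESS) return err;
  if (flags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_EXTENDED_SCOPE_FINE_GRAINED) {
    *static_cast<hsa_amd_memory_pool_t*>(data) = pool;
    return HSA_STATUS_INFO_BREAK;  // found one, stop iterating
  }
  return HSA_STATUS_SUCCESS;
}

// Illustrative use on a GPU agent: query the new per-agent attributes and
// allocate from an extended-scope fine-grained pool if one is exposed.
static void example(hsa_agent_t gpu_agent) {
  uint32_t num_xcc = 0, driver_uid = 0;
  hsa_agent_get_info(gpu_agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_NUM_XCC, &num_xcc);
  hsa_agent_get_info(gpu_agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_DRIVER_UID, &driver_uid);

  // Zero handle is used here only as a "not found" sentinel for the sketch.
  hsa_amd_memory_pool_t pool = {0};
  hsa_amd_agent_iterate_memory_pools(gpu_agent, find_ext_scope_pool, &pool);
  if (pool.handle == 0) return;  // no such pool (e.g. pre-gfx940 device or older KFD)

  void* buf = nullptr;
  if (hsa_amd_memory_pool_allocate(pool, 4096, 0, &buf) == HSA_STATUS_SUCCESS) {
    // Per the new flag's documentation, device-scope atomics on this
    // allocation act as system scope on compliant systems.
    hsa_amd_memory_pool_free(buf);
  }
}

Because the flag is only reported on devices and driver versions that support it (the gfx940-class parts targeted by this patch), probing the pool flags and falling back to an ordinary fine-grained pool is the safe adoption path.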