Skip to content

Commit

Permalink
ROCm 5.7.0 updates
Browse files Browse the repository at this point in the history
  • Loading branch information
dayatsin-amd committed Sep 15, 2023
1 parent 3454554 commit b2b6811
Show file tree
Hide file tree
Showing 34 changed files with 1,204 additions and 241 deletions.
15 changes: 10 additions & 5 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ if (ROCM_CCACHE_BUILD)
endif() # if (ROCM_CCACHE_BUILD)

## Get version strings
get_version ( "1.9.0" )
get_version ( "1.11.0" )
## Override the patch component when ROCM_PATCH_VERSION is supplied.
## The variable is tested by name: expanding it inside if() would make CMake
## re-interpret a non-numeric value as the name of another variable, which is
## normally undefined, so the override would be silently skipped.
if ( ROCM_PATCH_VERSION )
  set ( VERSION_PATCH "${ROCM_PATCH_VERSION}" )
endif()
Expand Down Expand Up @@ -125,11 +125,14 @@ target_include_directories( ${CORE_RUNTIME_TARGET}
${CMAKE_CURRENT_SOURCE_DIR}/libamdhsacode
${CMAKE_CURRENT_BINARY_DIR}/core/runtime/trap_handler)

## Install-time run-time search path; every entry is relative to $ORIGIN.
## The ../../lib entry covers use of the legacy symlink in /hsa/lib/.
set_target_properties ( ${CORE_RUNTIME_TARGET} PROPERTIES
  INSTALL_RPATH "$ORIGIN;$ORIGIN/../../lib;$ORIGIN/../../lib64;$ORIGIN/../lib64" )

## ------------------------- Linux Compiler and Linker options -------------------------
set ( HSA_CXX_FLAGS ${HSA_COMMON_CXX_FLAGS} -fexceptions -fno-rtti -fvisibility=hidden -Wno-error=missing-braces -Wno-error=sign-compare -Wno-sign-compare -Wno-write-strings -Wno-conversion-null -fno-math-errno -fno-threadsafe-statics -fmerge-all-constants -fms-extensions -Wno-error=comment -Wno-comment -Wno-error=pointer-arith -Wno-pointer-arith -Wno-error=unused-variable -Wno-error=unused-function -mmwaitx )
set ( HSA_CXX_FLAGS ${HSA_COMMON_CXX_FLAGS} -fexceptions -fno-rtti -fvisibility=hidden -Wno-error=missing-braces -Wno-error=sign-compare -Wno-sign-compare -Wno-write-strings -Wno-conversion-null -fno-math-errno -fno-threadsafe-statics -fmerge-all-constants -fms-extensions -Wno-error=comment -Wno-comment -Wno-error=pointer-arith -Wno-pointer-arith -Wno-error=unused-variable -Wno-error=unused-function )

## Extra x86 specific settings
## -mmwaitx is only appended when the target processor is an x86 variant.
## The pattern is anchored so that only whole processor names match;
## i[3-6]86 covers the 32-bit names i386 through i686.
if ( CMAKE_SYSTEM_PROCESSOR MATCHES "^(i[3-6]86|x86|x86[_-]64|amd64|AMD64)$" )
  list ( APPEND HSA_CXX_FLAGS -mmwaitx )
endif()

## Extra image settings (marked for audit): do not warn on deprecated declarations.
list ( APPEND HSA_CXX_FLAGS -Wno-deprecated-declarations )
Expand Down Expand Up @@ -306,7 +309,9 @@ install ( TARGETS ${CORE_RUNTIME_TARGET}
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} COMPONENT asan )

# Install license
install ( FILES ${CMAKE_CURRENT_SOURCE_DIR}/LICENSE.md DESTINATION ${CMAKE_INSTALL_DOCDIR}-asan COMPONENT asan )
## The asan component only receives its license copy when ASAN packaging is enabled;
## the binary component always receives one.
if ( ENABLE_ASAN_PACKAGING )
  install ( FILES ${CMAKE_CURRENT_SOURCE_DIR}/LICENSE.md
            DESTINATION ${CMAKE_INSTALL_DOCDIR}-asan
            COMPONENT asan )
endif()
install ( FILES ${CMAKE_CURRENT_SOURCE_DIR}/LICENSE.md
          DESTINATION ${CMAKE_INSTALL_DOCDIR}
          COMPONENT binary )

# Install public headers
Expand Down
10 changes: 9 additions & 1 deletion src/core/common/hsa_table_interface.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,15 @@ const HsaApiTable* hsa_table_interface_get_table() {
}

// Pass through stub functions
hsa_status_t HSA_API hsa_init() { return coreApiTable->hsa_init_fn(); }
hsa_status_t HSA_API hsa_init() {
  // Load the initial API table once more before dispatching. The table setup
  // earlier in this file is subject to a link-time ordering condition: the
  // globals of this compilation unit may be initialized before the global
  // objects of other compilation units, so Init::Init may have run before the
  // hsa_api_table_ object in hsa_api_trace.cpp was initialized.
  rocr::core::LoadInitialHsaApiTable();
  const hsa_status_t init_status = coreApiTable->hsa_init_fn();
  return init_status;
}

hsa_status_t HSA_API hsa_shut_down() { return coreApiTable->hsa_shut_down_fn(); }

Expand Down
56 changes: 56 additions & 0 deletions src/core/inc/amd_blit_shaders.h
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,62 @@ static const unsigned int kCodeFill8[] = {
0x00001902, 0xD11C6A03, 0x01A90103, 0xBF82FFF5, 0xBF810000,
};

// Table of 32-bit words for the "CopyAligned" blit shader, "940" variant.
// NOTE(review): presumably pre-assembled GPU machine code for the gfx940
// family; confirm against the code that selects this table.
static const unsigned int kCodeCopyAligned940[] = {
0xc00a0100, 0x00000000, 0xc00a0200, 0x00000010, 0xc00a0300, 0x00000020,
0xc00a0400, 0x00000030, 0xc00a0500, 0x00000040, 0xc0020600, 0x00000050,
0xbf8cc07f, 0x8e028602, 0x32000002, 0x7e060205, 0xd1196a02, 0x00000900,
0xd11c6a03, 0x01a90103, 0x7e0a0207, 0xd1196a04, 0x00000d00, 0xd11c6a05,
0x01a90105, 0xd0e9006a, 0x00001102, 0xbf86000f, 0x86fe6a7e, 0xde410000,
0x017f0002, 0xbf8c0f70, 0xd1196a02, 0x00003102, 0xd11c6a03, 0x01a90103,
0xde610000, 0x007f0104, 0xd1196a04, 0x00003104, 0xd11c6a05, 0x01a90105,
0xbf82ffee, 0xbefe01c1, 0x8e198418, 0x24020084, 0x7e060209, 0xd1196a02,
0x00001101, 0xd11c6a03, 0x01a90103, 0x7e0a020b, 0xd1196a04, 0x00001501,
0xd11c6a05, 0x01a90105, 0xd0e9006a, 0x00001902, 0xbf86000e, 0xde5d0000,
0x087f0002, 0xd1196a02, 0x00003302, 0xd11c6a03, 0x01a90103, 0xbf8c0f70,
0xde7d0000, 0x007f0804, 0xd1196a04, 0x00003304, 0xd11c6a05, 0x01a90105,
0xbf82ffef, 0x8e198218, 0x24020082, 0x7e06020d, 0xd1196a02, 0x00001901,
0xd11c6a03, 0x01a90103, 0x7e0a020f, 0xd1196a04, 0x00001d01, 0xd11c6a05,
0x01a90105, 0xd0e9006a, 0x00002102, 0xbf86000f, 0x86fe6a7e, 0xde510000,
0x017f0002, 0xd1196a02, 0x00003302, 0xd11c6a03, 0x01a90103, 0xbf8c0f70,
0xde710000, 0x007f0104, 0xd1196a04, 0x00003304, 0xd11c6a05, 0x01a90105,
0xbf82ffee, 0xbefe01c1, 0x7e060211, 0xd1196a02, 0x00002100, 0xd11c6a03,
0x01a90103, 0x7e0a0213, 0xd1196a04, 0x00002500, 0xd11c6a05, 0x01a90105,
0xd0e9006a, 0x00002902, 0xbf860006, 0x86fe6a7e, 0xde410000, 0x017f0002,
0xbf8c0f70, 0xde610000, 0x007f0104, 0xbf810000,
};

// Table of 32-bit words for the "CopyMisaligned" blit shader, "940" variant.
// NOTE(review): presumably pre-assembled GPU machine code for the gfx940
// family; confirm against the code that selects this table.
// The table ends with a 0x00000000 word after 0xbf810000 (presumably padding; confirm).
static const unsigned int kCodeCopyMisaligned940[] = {
0xc00a0100, 0x00000000, 0xc00a0200, 0x00000010, 0xc00a0300, 0x00000020,
0xc0020400, 0x00000030, 0xbf8cc07f, 0x8e028602, 0x32000002, 0x7e060205,
0xd1196a02, 0x00000900, 0xd11c6a03, 0x01a90103, 0x7e0a0207, 0xd1196a04,
0x00000d00, 0xd11c6a05, 0x01a90105, 0xd0e9006a, 0x00001102, 0xbf860032,
0xde410000, 0x067f0002, 0xd1196a02, 0x00002102, 0xd11c6a03, 0x01a90103,
0xde410000, 0x077f0002, 0xd1196a02, 0x00002102, 0xd11c6a03, 0x01a90103,
0xde410000, 0x087f0002, 0xd1196a02, 0x00002102, 0xd11c6a03, 0x01a90103,
0xde410000, 0x097f0002, 0xd1196a02, 0x00002102, 0xd11c6a03, 0x01a90103,
0xbf8c0f70, 0xde610000, 0x007f0604, 0xd1196a04, 0x00002104, 0xd11c6a05,
0x01a90105, 0xde610000, 0x007f0704, 0xd1196a04, 0x00002104, 0xd11c6a05,
0x01a90105, 0xde610000, 0x007f0804, 0xd1196a04, 0x00002104, 0xd11c6a05,
0x01a90105, 0xde610000, 0x007f0904, 0xd1196a04, 0x00002104, 0xd11c6a05,
0x01a90105, 0xbf82ffcb, 0x7e060209, 0xd1196a02, 0x00001100, 0xd11c6a03,
0x01a90103, 0x7e0a020b, 0xd1196a04, 0x00001500, 0xd11c6a05, 0x01a90105,
0xd0e9006a, 0x00001902, 0xbf86000f, 0x86fe6a7e, 0xde410000, 0x017f0002,
0xd1196a02, 0x00002102, 0xd11c6a03, 0x01a90103, 0xbf8c0f70, 0xde610000,
0x007f0104, 0xd1196a04, 0x00002104, 0xd11c6a05, 0x01a90105, 0xbf82ffee,
0xbf810000, 0x00000000,
};

// Table of 32-bit words for the "Fill" blit shader, "940" variant.
// NOTE(review): presumably pre-assembled GPU machine code for the gfx940
// family; confirm against the code that selects this table.
// The table ends with a 0x00000000 word after 0xbf810000 (presumably padding; confirm).
static const unsigned int kCodeFill940[] = {
0xc00a0100, 0x00000000, 0xc00a0200, 0x00000010, 0xbf8cc07f, 0x8e028602,
0x32000002, 0x7e08020a, 0x7e0a020a, 0x7e0c020a, 0x7e0e020a, 0x8e0c840b,
0x24020084, 0x7e060205, 0xd1196a02, 0x00000901, 0xd11c6a03, 0x01a90103,
0xd0e9006a, 0x00000d02, 0xbf860007, 0xde7d0000, 0x007f0402, 0xd1196a02,
0x00001902, 0xd11c6a03, 0x01a90103, 0xbf82fff6, 0x8e0c820b, 0x24020082,
0x7e060207, 0xd1196a02, 0x00000d01, 0xd11c6a03, 0x01a90103, 0xd0e9006a,
0x00001102, 0xbf860008, 0x86fe6a7e, 0xde710000, 0x007f0402, 0xd1196a02,
0x00001902, 0xd11c6a03, 0x01a90103, 0xbf82fff5, 0xbf810000, 0x00000000,
};

static const unsigned int kCodeCopyAligned10[] = {
0xF4080100, 0xFA000000, 0xF4080200, 0xFA000010, 0xF4080300, 0xFA000020,
0xF4080400, 0xFA000030, 0xF4080500, 0xFA000040, 0xF4000600, 0xFA000050,
Expand Down
4 changes: 4 additions & 0 deletions src/core/inc/amd_elf_image.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,10 @@ namespace elf {
virtual StringTable* strtab() = 0;
virtual SymbolTable* symtab() = 0;
virtual SymbolTable* getSymtab(uint16_t index) = 0;
virtual SymbolTable* dynsym() = 0;
virtual SymbolTable* getDynsym(uint16_t index) = 0;
virtual SymbolTable* getSymbolTable() = 0;
virtual SymbolTable* getSymbolTable(uint16_t index) = 0;

virtual StringTable* addStringTable(const std::string& name) = 0;
virtual StringTable* getStringTable(uint16_t index) = 0;
Expand Down
16 changes: 16 additions & 0 deletions src/core/inc/amd_gpu_agent.h
Original file line number Diff line number Diff line change
Expand Up @@ -550,6 +550,10 @@ class GpuAgent : public GpuAgentInt {
// Bind the Blit object that will drive the copy operation
lazy_ptr<core::Blit>& GetBlitObject(const core::Agent& dst_agent, const core::Agent& src_agent,
const size_t size);

// Bind the Blit object that will drive the copy operation by engine ID
lazy_ptr<core::Blit>& GetBlitObject(uint32_t engine_id);

// @brief Alternative aperture base address. Only on KV.
uintptr_t ape1_base_;

Expand All @@ -563,6 +567,15 @@ class GpuAgent : public GpuAgentInt {
KernelMutex lock_;
} gws_queue_;

// Sets and Tracks pending SDMA status check or request counts
void SetCopyRequestRefCount(bool set);
void SetCopyStatusCheckRefCount(bool set);
int pending_copy_req_ref_;
int pending_copy_stat_check_ref_;

// Tracks what SDMA blits have been used since initialization.
uint32_t sdma_blit_used_mask_;

ScratchCache scratch_cache_;

// System memory allocator in the nearest NUMA node.
Expand All @@ -572,6 +585,9 @@ class GpuAgent : public GpuAgentInt {
std::function<void(void*)> system_deallocator_;

DISALLOW_COPY_AND_ASSIGN(GpuAgent);

// Check if SDMA engine by ID is free
bool DmaEngineIsFree(uint32_t engine_id);
};

} // namespace amd
Expand Down
9 changes: 7 additions & 2 deletions src/core/inc/amd_memory_region.h
Original file line number Diff line number Diff line change
Expand Up @@ -95,8 +95,8 @@ class MemoryRegion : public core::MemoryRegion {
/// @brief Unpin memory.
static void MakeKfdMemoryUnresident(const void* ptr);

MemoryRegion(bool fine_grain, bool kernarg, bool full_profile, core::Agent* owner,
const HsaMemoryProperties& mem_props);
MemoryRegion(bool fine_grain, bool kernarg, bool full_profile, bool extended_scope_fine_grain,
core::Agent* owner, const HsaMemoryProperties& mem_props);

~MemoryRegion();

Expand Down Expand Up @@ -173,6 +173,8 @@ class MemoryRegion : public core::MemoryRegion {
return static_cast<uint32_t>(mem_props_.MemoryClockMax);
}

// Returns the extended_scope_fine_grain_ flag stored for this region.
__forceinline bool extended_scope_fine_grain() const { return extended_scope_fine_grain_; }

private:
const HsaMemoryProperties mem_props_;

Expand All @@ -182,6 +184,9 @@ class MemoryRegion : public core::MemoryRegion {

size_t max_single_alloc_size_;

// Enables creating an extended scope fine grained memory pool region
const bool extended_scope_fine_grain_;

// Used to collect total system memory
static size_t max_sysmem_alloc_size_;

Expand Down
1 change: 1 addition & 0 deletions src/core/inc/memory_region.h
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@ class MemoryRegion : public Checked<0x9C961F19EE175BB3> {
AllocateIPC = (1 << 4), // System memory that can be IPC-shared
AllocateNonPaged = (1 << 4), // Non-paged system memory (AllocateIPC alias)
AllocatePCIeRW = (1 << 5), // Enforce pseudo fine grain/RW memory
AllocateAsan = (1 << 6), // ASAN - First page of allocation remapped to system memory
};

typedef uint32_t AllocateFlags;
Expand Down
31 changes: 27 additions & 4 deletions src/core/inc/runtime.h
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,12 @@
#include "core/inc/amd_loader_context.hpp"
#include "core/inc/amd_hsa_code.hpp"

// Define SANITIZER_AMDGPU as 1 when compiling with clang and the
// address_sanitizer feature is reported by __has_feature.
// NOTE(review): the __has_feature test is presumably nested so that it is only
// evaluated by compilers that define __clang__ - confirm.
#if defined(__clang__)
#if __has_feature(address_sanitizer)
#define SANITIZER_AMDGPU 1
#endif
#endif

//---------------------------------------------------------------------------//
// Constants //
//---------------------------------------------------------------------------//
Expand Down Expand Up @@ -112,6 +118,7 @@ class Runtime {
// Kernel driver version record plus capability flags tracked alongside it.
struct KfdVersion_t {
// Version information as stored by KfdVersion(const HsaVersionInfo&).
HsaVersionInfo version;
// Stored by the KfdVersion(bool) overload.
bool supports_exception_debugging;
// Set to true by KfdVersion(const HsaVersionInfo&) when the kernel interface
// major version is 1 and the minor version is at least 14.
bool supports_event_age;
};

/// @brief Open connection to kernel driver and increment reference count.
Expand Down Expand Up @@ -395,7 +402,12 @@ class Runtime {

uint64_t sys_clock_freq() const { return sys_clock_freq_; }

void KfdVersion(const HsaVersionInfo& version) { kfd_version.version = version; }
// Records the kernel driver version and flags event-age support when the
// kernel interface version is 1.x with a minor version of 14 or higher.
// The flag is only ever raised here; it is left untouched otherwise.
void KfdVersion(const HsaVersionInfo& version) {
  kfd_version.version = version;

  const bool has_event_age = (version.KernelInterfaceMajorVersion == 1) &&
                             (version.KernelInterfaceMinorVersion >= 14);
  if (has_event_age) {
    kfd_version.supports_event_age = true;
  }
}

void KfdVersion(bool exception_debugging) {
kfd_version.supports_exception_debugging = exception_debugging;
Expand All @@ -407,9 +419,19 @@ class Runtime {
static void AsyncEventsLoop(void*);

struct AllocationRegion {
AllocationRegion() : region(NULL), size(0), size_requested(0), user_ptr(nullptr) {}
AllocationRegion(const MemoryRegion* region_arg, size_t size_arg, size_t size_requested)
: region(region_arg), size(size_arg), size_requested(size_requested), user_ptr(nullptr) {}
AllocationRegion()
: region(NULL),
size(0),
size_requested(0),
alloc_flags(core::MemoryRegion::AllocateNoFlags),
user_ptr(nullptr) {}
AllocationRegion(const MemoryRegion* region_arg, size_t size_arg, size_t size_requested,
MemoryRegion::AllocateFlags alloc_flags)
: region(region_arg),
size(size_arg),
size_requested(size_requested),
alloc_flags(alloc_flags),
user_ptr(nullptr) {}

struct notifier_t {
void* ptr;
Expand All @@ -420,6 +442,7 @@ class Runtime {
const MemoryRegion* region;
size_t size; /* actual size = align_up(size_requested, granularity) */
size_t size_requested; /* size requested by user */
MemoryRegion::AllocateFlags alloc_flags;
void* user_ptr;
std::unique_ptr<std::vector<notifier_t>> notifiers;
};
Expand Down
27 changes: 20 additions & 7 deletions src/core/runtime/amd_aql_queue.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1332,7 +1332,11 @@ void AqlQueue::FillBufRsrcWord1_Gfx11() {

void AqlQueue::FillBufRsrcWord2() {
SQ_BUF_RSRC_WORD2 srd2;
srd2.bits.NUM_RECORDS = uint32_t(queue_scratch_.size);
const auto& agent_props = agent_->properties();
const uint32_t num_xcc = agent_props.NumXcc;

// report size per XCC
srd2.bits.NUM_RECORDS = uint32_t(queue_scratch_.size / num_xcc);

amd_queue_.scratch_resource_descriptor[2] = srd2.u32All;
}
Expand Down Expand Up @@ -1403,8 +1407,10 @@ void AqlQueue::FillComputeTmpRingSize() {
return;
}

// Determine the maximum number of waves device can support
const auto& agent_props = agent_->properties();
const uint32_t num_xcc = agent_props.NumXcc;

// Determine the maximum number of waves device can support
uint32_t num_cus = agent_props.NumFComputeCores / agent_props.NumSIMDPerCU;
uint32_t max_scratch_waves = num_cus * agent_props.MaxSlotsScratchCU;

Expand All @@ -1416,10 +1422,11 @@ void AqlQueue::FillComputeTmpRingSize() {
tmpring_size.bits.WAVESIZE = wave_scratch;
assert(wave_scratch == tmpring_size.bits.WAVESIZE && "WAVESIZE Overflow.");
uint32_t num_waves =
queue_scratch_.size / (tmpring_size.bits.WAVESIZE * queue_scratch_.mem_alignment_size);
(queue_scratch_.size / num_xcc) / (tmpring_size.bits.WAVESIZE * queue_scratch_.mem_alignment_size);

tmpring_size.bits.WAVES = std::min(num_waves, max_scratch_waves);
amd_queue_.compute_tmpring_size = tmpring_size.u32All;
assert((tmpring_size.bits.WAVES % agent_props.NumShaderBanks == 0) &&
assert((tmpring_size.bits.WAVES % (agent_props.NumShaderBanks / num_xcc) == 0) &&
"Invalid scratch wave count. Must be divisible by #SEs.");
}

Expand All @@ -1431,9 +1438,11 @@ void AqlQueue::FillComputeTmpRingSize_Gfx11() {
return;
}

// Determine the maximum number of waves device can support
const auto& agent_props = agent_->properties();
uint32_t num_cus = agent_props.NumFComputeCores / agent_props.NumSIMDPerCU;
const uint32_t num_xcc = agent_props.NumXcc;

// Determine the maximum number of waves device can support
uint32_t num_cus = agent_props.NumFComputeCores / (agent_props.NumSIMDPerCU * num_xcc);
uint32_t max_scratch_waves = num_cus * agent_props.MaxSlotsScratchCU;

// Scratch is allocated program COMPUTE_TMPRING_SIZE register
Expand Down Expand Up @@ -1483,7 +1492,11 @@ void AqlQueue::InitScratchSRD() {

// Populate flat scratch parameters in amd_queue_.
amd_queue_.scratch_backing_memory_location = queue_scratch_.queue_process_offset;
amd_queue_.scratch_backing_memory_byte_size = queue_scratch_.size;

const auto& agent_props = agent_->properties();
const uint32_t num_xcc = agent_props.NumXcc;
// report size per XCC
amd_queue_.scratch_backing_memory_byte_size = queue_scratch_.size / num_xcc;

// For backwards compatibility this field records the per-lane scratch
// for a 64 lane wavefront. If scratch was allocated for 32 lane waves
Expand Down
Loading

0 comments on commit b2b6811

Please sign in to comment.