Skip to content

Commit

Permalink
ROCm 6.2.1 updates
Browse files Browse the repository at this point in the history
  • Loading branch information
dayatsin-amd committed Sep 19, 2024
1 parent 7f45923 commit df75490
Show file tree
Hide file tree
Showing 6 changed files with 198 additions and 86 deletions.
12 changes: 8 additions & 4 deletions src/core/inc/runtime.h
Original file line number Diff line number Diff line change
Expand Up @@ -667,11 +667,15 @@ class Runtime {
// Deprecated HSA Region API GPU (for legacy APU support only)
Agent* region_gpu_;

AsyncEventsControl async_events_control_;

AsyncEvents async_events_;
// Bundles the state owned by one async-events monitoring thread.
// Two instances exist on Runtime: asyncSignals_ (ordinary HSA_EVENTTYPE_SIGNAL
// signals) and asyncExceptions_ (signals backed by driver exception events).
struct AsyncEventsInfo {
  AsyncEventsControl control;   // Thread handle, wake signal and exit flag.
  AsyncEvents events;           // Signals currently being waited on by the loop.
  AsyncEvents new_events;       // Signals registered but not yet picked up by the loop.
  // True when the loop must wait via Signal::WaitAnyExceptions (driver events)
  // instead of hsa_amd_signal_wait_any. Default-initialized to avoid reading an
  // indeterminate value before the Runtime constructor assigns it.
  bool monitor_exceptions = false;
};

AsyncEvents new_async_events_;
struct AsyncEventsInfo asyncSignals_;
struct AsyncEventsInfo asyncExceptions_;

// System clock frequency.
uint64_t sys_clock_freq_;
Expand Down
14 changes: 14 additions & 0 deletions src/core/inc/signal.h
Original file line number Diff line number Diff line change
Expand Up @@ -356,6 +356,20 @@ class Signal {
uint64_t timeout_hint, hsa_wait_state_t wait_hint,
hsa_signal_value_t* satisfying_value);

/// @brief Dedicated function to wait on signals that are not of type HSA_EVENTTYPE_SIGNAL.
/// These events can only be received by calling the underlying driver (i.e. via the
/// hsaKmtWaitOnMultipleEvents_Ext function call). We still need to have one signal of type
/// HSA_EVENTTYPE_SIGNAL attached to the list of signals to be able to force
/// hsaKmtWaitOnMultipleEvents_Ext to return.
/// @param signal_count Number of hsa_signals
/// @param hsa_signals Pointer to array of signals. All signals should have a valid EopEvent()
/// @param conds list of conditions
/// @param values list of values
/// @param satisfying_value value to be satisfied
/// @return index of signal that satisfies condition
static uint32_t WaitAnyExceptions(uint32_t signal_count, const hsa_signal_t* hsa_signals,
const hsa_signal_condition_t* conds, const hsa_signal_value_t* values,
hsa_signal_value_t* satisfying_value);

__forceinline bool IsType(rtti_t id) { return _IsA(id); }

/// @brief Prevents the signal from being destroyed until the matching Release().
Expand Down
30 changes: 15 additions & 15 deletions src/core/runtime/amd_aql_queue.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1290,21 +1290,6 @@ bool AqlQueue::ExceptionHandler(hsa_signal_value_t error_code, void* arg) {
return false;
}

// Fallback if KFD does not support GPU core dump. In this case, there core dump is
// generated by hsa-runtime.
if (!core::Runtime::runtime_singleton_->KfdVersion().supports_core_dump &&
queue->agent_->isa()->GetMajorVersion() != 11) {

if (pcs::PcsRuntime::instance()->SessionsActive())
fprintf(stderr, "GPU core dump skipped because PC Sampling active\n");
else if (amd::coredump::dump_gpu_core())
fprintf(stderr, "GPU core dump failed\n");
// supports_core_dump flag is overwritten to avoid generate core dump file again
// caught by a different exception handler. Such as VMFaultHandler.
core::Runtime::runtime_singleton_->KfdVersion(
core::Runtime::runtime_singleton_->KfdVersion().supports_exception_debugging, true);
}

for (auto& error : QueueErrors) {
if (error_code & (1 << (error.code - 1))) {
errorCode = error.status;
Expand All @@ -1322,6 +1307,21 @@ bool AqlQueue::ExceptionHandler(hsa_signal_value_t error_code, void* arg) {
return false;
}

// Fallback if KFD does not support GPU core dump. In this case, the core dump is
// generated by hsa-runtime.
if (!core::Runtime::runtime_singleton_->KfdVersion().supports_core_dump &&
queue->agent_->isa()->GetMajorVersion() != 11) {

if (pcs::PcsRuntime::instance()->SessionsActive())
fprintf(stderr, "GPU core dump skipped because PC Sampling active\n");
else if (amd::coredump::dump_gpu_core())
fprintf(stderr, "GPU core dump failed\n");
// The supports_core_dump flag is overwritten to avoid generating the core dump file again
// if the error is caught by a different exception handler, such as VMFaultHandler.
core::Runtime::runtime_singleton_->KfdVersion(
core::Runtime::runtime_singleton_->KfdVersion().supports_exception_debugging, true);
}

queue->Suspend();
if (queue->errors_callback_ != nullptr) {
queue->errors_callback_(errorCode, queue->public_handle(), queue->errors_data_);
Expand Down
77 changes: 54 additions & 23 deletions src/core/runtime/runtime.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -755,35 +755,44 @@ hsa_status_t Runtime::SetAsyncSignalHandler(hsa_signal_t signal,
hsa_signal_value_t value,
hsa_amd_signal_handler handler,
void* arg) {
// Indicate that this signal is in use.
if (signal.handle != 0) hsa_signal_handle(signal)->Retain();

ScopedAcquire<HybridMutex> scope_lock(&async_events_control_.lock);
struct AsyncEventsInfo* asyncInfo = &asyncSignals_;

if (signal.handle != 0) {
// Indicate that this signal is in use.
hsa_signal_handle(signal)->Retain();

core::Signal* coreSignal = core::Signal::Convert(signal);
if (coreSignal->EopEvent() && coreSignal->EopEvent()->EventData.EventType != HSA_EVENTTYPE_SIGNAL)
asyncInfo = &asyncExceptions_;
}

ScopedAcquire<HybridMutex> scope_lock(&asyncInfo->control.lock);

// Lazy initializer
if (async_events_control_.async_events_thread_ == NULL) {
if (asyncInfo->control.async_events_thread_ == NULL) {
// Create monitoring thread control signal
auto err = HSA::hsa_signal_create(0, 0, NULL, &async_events_control_.wake);
auto err = HSA::hsa_signal_create(0, 0, NULL, &asyncInfo->control.wake);
if (err != HSA_STATUS_SUCCESS) {
assert(false && "Asyncronous events control signal creation error.");
return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
}
async_events_.PushBack(async_events_control_.wake, HSA_SIGNAL_CONDITION_NE,
0, NULL, NULL);
asyncInfo->events.PushBack(asyncInfo->control.wake, HSA_SIGNAL_CONDITION_NE,
0, NULL, NULL);

// Start event monitoring thread
async_events_control_.exit = false;
async_events_control_.async_events_thread_ =
os::CreateThread(AsyncEventsLoop, NULL);
if (async_events_control_.async_events_thread_ == NULL) {
asyncInfo->control.exit = false;
asyncInfo->control.async_events_thread_ =
os::CreateThread(AsyncEventsLoop, asyncInfo);
if (asyncInfo->control.async_events_thread_ == NULL) {
assert(false && "Asyncronous events thread creation error.");
return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
}
}

new_async_events_.PushBack(signal, cond, value, handler, arg);
asyncInfo->new_events.PushBack(signal, cond, value, handler, arg);

hsa_signal_handle(async_events_control_.wake)->StoreRelease(1);
hsa_signal_handle(asyncInfo->control.wake)->StoreRelease(1);

return HSA_STATUS_SUCCESS;
}
Expand Down Expand Up @@ -1499,18 +1508,35 @@ hsa_status_t Runtime::IPCDetach(void* ptr) {
return HSA_STATUS_SUCCESS;
}

void Runtime::AsyncEventsLoop(void*) {
auto& async_events_control_ = runtime_singleton_->async_events_control_;
auto& async_events_ = runtime_singleton_->async_events_;
auto& new_async_events_ = runtime_singleton_->new_async_events_;
void Runtime::AsyncEventsLoop(void* _eventsInfo) {
struct AsyncEventsInfo* eventsInfo = reinterpret_cast<struct AsyncEventsInfo*>(_eventsInfo);

auto& async_events_control_ = eventsInfo->control;
auto& async_events_ = eventsInfo->events;
auto& new_async_events_ = eventsInfo->new_events;

while (!async_events_control_.exit) {
// Wait for a signal
hsa_signal_value_t value;
uint32_t index = AMD::hsa_amd_signal_wait_any(
uint32_t(async_events_.Size()), &async_events_.signal_[0],
&async_events_.cond_[0], &async_events_.value_[0], uint64_t(-1),
HSA_WAIT_STATE_BLOCKED, &value);
uint32_t index = 0;

if (eventsInfo->monitor_exceptions) {
index = Signal::WaitAnyExceptions(
uint32_t(async_events_.Size()),
&async_events_.signal_[0],
&async_events_.cond_[0],
&async_events_.value_[0],
&value);
} else {
index = AMD::hsa_amd_signal_wait_any(
uint32_t(async_events_.Size()),
&async_events_.signal_[0],
&async_events_.cond_[0],
&async_events_.value_[0],
uint64_t(-1),
HSA_WAIT_STATE_BLOCKED,
&value);
}

// Reset the control signal
if (index == 0) {
Expand Down Expand Up @@ -1875,7 +1901,11 @@ Runtime::Runtime()
hw_exception_event_(nullptr),
hw_exception_signal_(nullptr),
ref_count_(0),
kfd_version{} {}
kfd_version{} {

asyncSignals_.monitor_exceptions = false;
asyncExceptions_.monitor_exceptions = true;
}

hsa_status_t Runtime::Load() {
os::cpuid_t cpuinfo;
Expand Down Expand Up @@ -1953,7 +1983,8 @@ void Runtime::Unload() {
std::for_each(disabled_gpu_agents_.begin(), disabled_gpu_agents_.end(), DeleteObject());
disabled_gpu_agents_.clear();

async_events_control_.Shutdown();
asyncSignals_.control.Shutdown();
asyncExceptions_.control.Shutdown();

if (vm_fault_signal_ != nullptr) {
vm_fault_signal_->DestroySignal();
Expand Down
124 changes: 107 additions & 17 deletions src/core/runtime/signal.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -255,23 +255,8 @@ uint32_t Signal::WaitAny(uint32_t signal_count, const hsa_signal_t* hsa_signals,
while (true) {
// Cannot mwaitx - polling multiple signals
for (uint32_t i = 0; i < signal_count; i++) {
if (!signals[i]->IsValid()) return uint32_t(-1);

// Handling special event.
if (signals[i]->EopEvent() != NULL) {
const HSA_EVENTTYPE event_type =
signals[i]->EopEvent()->EventData.EventType;
if (event_type == HSA_EVENTTYPE_MEMORY) {
const HsaMemoryAccessFault& fault =
signals[i]->EopEvent()->EventData.EventData.MemoryAccessFault;
if (fault.Flags == HSA_EVENTID_MEMORY_FATAL_PROCESS) {
return i;
}
} else if (event_type == HSA_EVENTTYPE_HW_EXCEPTION) {
const HsaHwException& exception = signals[i]->EopEvent()->EventData.EventData.HwException;
if (exception.MemoryLost) return i;
}
}
if (!signals[i]->IsValid())
return uint32_t(-1);

value =
atomic::Load(&signals[i]->signal_.value, std::memory_order_relaxed);
Expand Down Expand Up @@ -325,6 +310,111 @@ uint32_t Signal::WaitAny(uint32_t signal_count, const hsa_signal_t* hsa_signals,
}
}

/*
 * Special handler that waits for exception events reported by the underlying driver.
 *
 * Every signal in the list must be backed by a valid driver event (EopEvent()).
 * Returns the index of the first signal whose condition is satisfied, or whose
 * backing event reports a fatal memory fault / HW exception; uint32_t(-1) on error.
 */
uint32_t Signal::WaitAnyExceptions(uint32_t signal_count, const hsa_signal_t* hsa_signals,
                                   const hsa_signal_condition_t* conds, const hsa_signal_value_t* values,
                                   hsa_signal_value_t* satisfying_value) {

  uint32_t wait_ms = uint32_t(-1);
  hsa_signal_handle* signals =
      reinterpret_cast<hsa_signal_handle*>(const_cast<hsa_signal_t*>(hsa_signals));

  // Keep every signal alive for the full duration of the wait.
  for (uint32_t i = 0; i < signal_count; i++) signals[i]->Retain();

  MAKE_SCOPE_GUARD([&]() {
    for (uint32_t i = 0; i < signal_count; i++) signals[i]->Release();
  });

  // Register this thread as a waiter and remember how many waiters preceded it.
  uint32_t prior = 0;
  for (uint32_t i = 0; i < signal_count; i++) prior = Max(prior, signals[i]->waiting_++);

  MAKE_SCOPE_GUARD([&]() {
    for (uint32_t i = 0; i < signal_count; i++) signals[i]->waiting_--;
  });

  if (!core::Runtime::runtime_singleton_->KfdVersion().supports_event_age)
    // Allow only the first waiter to sleep. Without event age tracking,
    // race condition can cause some threads to sleep without wakeup since missing interrupt.
    if (prior != 0) wait_ms = 0;

  // Collect the set of unique driver events backing the signals; the driver is
  // waited on once per iteration with this deduplicated list.
  HsaEvent** evts = new HsaEvent* [signal_count];
  MAKE_SCOPE_GUARD([&]() { delete[] evts; });

  for (uint32_t i = 0; i < signal_count; i++) {
    assert(signals[i]->EopEvent() != NULL);
    evts[i] = signals[i]->EopEvent();
  }

  std::sort(evts, evts + signal_count);
  HsaEvent** end = std::unique(evts, evts + signal_count);
  uint32_t unique_evts = uint32_t(end - evts);

  // Heap-allocate the event-age array: the previous runtime-sized stack array
  // (uint64_t event_age[unique_evts]) is a VLA, a non-standard extension in C++.
  // The () value-initializes the array to zero, replacing the explicit memset.
  uint64_t* event_age = new uint64_t[unique_evts]();
  MAKE_SCOPE_GUARD([&]() { delete[] event_age; });
  if (core::Runtime::runtime_singleton_->KfdVersion().supports_event_age)
    for (uint32_t i = 0; i < unique_evts; i++)
      event_age[i] = 1;

  int64_t value;

  bool condition_met = false;
  while (true) {
    // Cannot mwaitx - polling multiple signals

    for (uint32_t i = 0; i < signal_count; i++) {
      if (!signals[i]->IsValid())
        return uint32_t(-1);

      // Fatal driver-reported events satisfy the wait immediately.
      const HSA_EVENTTYPE event_type = signals[i]->EopEvent()->EventData.EventType;
      if (event_type == HSA_EVENTTYPE_MEMORY) {
        const HsaMemoryAccessFault& fault =
            signals[i]->EopEvent()->EventData.EventData.MemoryAccessFault;
        if (fault.Flags == HSA_EVENTID_MEMORY_FATAL_PROCESS) return i;
      } else if (event_type == HSA_EVENTTYPE_HW_EXCEPTION) {
        const HsaHwException& exception =
            signals[i]->EopEvent()->EventData.EventData.HwException;
        if (exception.MemoryLost) return i;
      }

      value = atomic::Load(&signals[i]->signal_.value, std::memory_order_relaxed);

      switch (conds[i]) {
        case HSA_SIGNAL_CONDITION_EQ: {
          condition_met = (value == values[i]);
          break;
        }
        case HSA_SIGNAL_CONDITION_NE: {
          condition_met = (value != values[i]);
          break;
        }
        case HSA_SIGNAL_CONDITION_GTE: {
          condition_met = (value >= values[i]);
          break;
        }
        case HSA_SIGNAL_CONDITION_LT: {
          condition_met = (value < values[i]);
          break;
        }
        default: {
          // Unknown condition code.
          return uint32_t(-1);
        }
      }
      if (condition_met) {
        if (satisfying_value != NULL) *satisfying_value = value;
        // Some other signal in the list satisfied condition
        return i;
      }
    }

    // Block in the driver until any of the unique events fires (or wait_ms elapses),
    // then re-poll the signal list above.
    hsaKmtWaitOnMultipleEvents_Ext(evts, unique_evts, false, wait_ms, event_age);
  } //while
}

SignalGroup::SignalGroup(uint32_t num_signals, const hsa_signal_t* hsa_signals)
: count(num_signals) {
if (count != 0) {
Expand Down
27 changes: 0 additions & 27 deletions src/inc/hsa_ext_amd.h
Original file line number Diff line number Diff line change
Expand Up @@ -2822,33 +2822,6 @@ hsa_status_t hsa_amd_portable_close_dmabuf(int dmabuf);
hsa_status_t hsa_amd_vmem_address_reserve(void** va, size_t size, uint64_t address,
uint64_t flags);

/**
* @brief Allocate a reserved address range
*
* Reserve a virtual address range. The size must be a multiple of the system page size.
* If it is not possible to allocate the address specified by @p address, then @p va will be
* a different address range.
* Address range should be released by calling hsa_amd_vmem_address_free.
*
* @param[out] va virtual address allocated
* @param[in] size of address range requested
* @param[in] address requested
* @param[in] flags currently unsupported
*
* @retval ::HSA_STATUS_SUCCESS Address range allocated successfully
*
* @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been
* initialized.
*
* @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES Insufficient resources to allocate an address
* range of this size.
*
* Note that this API will be deprecated in a future release and replaced by
* hsa_amd_vmem_address_reserve_align
*/
hsa_status_t hsa_amd_vmem_address_reserve(void** va, size_t size, uint64_t address,
uint64_t flags);

/**
* @brief Allocate a reserved address range
*
Expand Down

0 comments on commit df75490

Please sign in to comment.