diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index e8280bf0f..9759e74ce 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -85,7 +85,7 @@ if (ROCM_CCACHE_BUILD) endif() # if (ROCM_CCACHE_BUILD) ## Get version strings -get_version ( "1.9.0" ) +get_version ( "1.11.0" ) if ( ${ROCM_PATCH_VERSION} ) set ( VERSION_PATCH ${ROCM_PATCH_VERSION}) endif() @@ -125,11 +125,14 @@ target_include_directories( ${CORE_RUNTIME_TARGET} ${CMAKE_CURRENT_SOURCE_DIR}/libamdhsacode ${CMAKE_CURRENT_BINARY_DIR}/core/runtime/trap_handler) -## Set RUNPATH - ../../lib covers use of the legacy symlink in /hsa/lib/ -set_property(TARGET ${CORE_RUNTIME_TARGET} PROPERTY INSTALL_RPATH "$ORIGIN;$ORIGIN/../../lib;$ORIGIN/../../lib64;$ORIGIN/../lib64" ) ## ------------------------- Linux Compiler and Linker options ------------------------- -set ( HSA_CXX_FLAGS ${HSA_COMMON_CXX_FLAGS} -fexceptions -fno-rtti -fvisibility=hidden -Wno-error=missing-braces -Wno-error=sign-compare -Wno-sign-compare -Wno-write-strings -Wno-conversion-null -fno-math-errno -fno-threadsafe-statics -fmerge-all-constants -fms-extensions -Wno-error=comment -Wno-comment -Wno-error=pointer-arith -Wno-pointer-arith -Wno-error=unused-variable -Wno-error=unused-function -mmwaitx ) +set ( HSA_CXX_FLAGS ${HSA_COMMON_CXX_FLAGS} -fexceptions -fno-rtti -fvisibility=hidden -Wno-error=missing-braces -Wno-error=sign-compare -Wno-sign-compare -Wno-write-strings -Wno-conversion-null -fno-math-errno -fno-threadsafe-statics -fmerge-all-constants -fms-extensions -Wno-error=comment -Wno-comment -Wno-error=pointer-arith -Wno-pointer-arith -Wno-error=unused-variable -Wno-error=unused-function ) + +## Extra x86 specific settings +if ( CMAKE_SYSTEM_PROCESSOR MATCHES "i?86|x86_64|amd64|AMD64" ) + set ( HSA_CXX_FLAGS ${HSA_CXX_FLAGS} -mmwaitx ) +endif() ## Extra image settings - audit! set ( HSA_CXX_FLAGS ${HSA_CXX_FLAGS} -Wno-deprecated-declarations ) @@ -306,7 +309,9 @@ install ( TARGETS ${CORE_RUNTIME_TARGET} LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} COMPONENT asan ) # Install license -install ( FILES ${CMAKE_CURRENT_SOURCE_DIR}/LICENSE.md DESTINATION ${CMAKE_INSTALL_DOCDIR}-asan COMPONENT asan ) +if(ENABLE_ASAN_PACKAGING) + install ( FILES ${CMAKE_CURRENT_SOURCE_DIR}/LICENSE.md DESTINATION ${CMAKE_INSTALL_DOCDIR}-asan COMPONENT asan ) +endif() install ( FILES ${CMAKE_CURRENT_SOURCE_DIR}/LICENSE.md DESTINATION ${CMAKE_INSTALL_DOCDIR} COMPONENT binary ) # Install public headers diff --git a/src/core/common/hsa_table_interface.cpp b/src/core/common/hsa_table_interface.cpp index 3802ae9b4..593ccb389 100644 --- a/src/core/common/hsa_table_interface.cpp +++ b/src/core/common/hsa_table_interface.cpp @@ -58,7 +58,15 @@ const HsaApiTable* hsa_table_interface_get_table() { } // Pass through stub functions -hsa_status_t HSA_API hsa_init() { return coreApiTable->hsa_init_fn(); } +hsa_status_t HSA_API hsa_init() { + // We initialize the api tables here once more since the code above is prone to a + // link-time ordering condition: This compilation unit here may get its global + // variables initialized earlier than the global objects in other compilation units. + // In particular Init::Init may get called earlier than that the underlying hsa_api_table_ + // object in hsa_api_trace.cpp has been initialized. 
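// (Illustrative aside, not part of the patch: the hazard described in the comment above is
//  the classic C++ static-initialization-order problem -- globals in different translation
//  units are constructed in an unspecified order. A minimal sketch with hypothetical names:
//
//      // tu_a.cpp                          // tu_b.cpp
//      Table table;                         struct Init { Init() { use(table); } } init;
//
//  init's constructor may legally run before table's constructor. Re-loading the table in
//  hsa_init() below makes the call independent of that ordering.)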
+ rocr::core::LoadInitialHsaApiTable(); + return coreApiTable->hsa_init_fn(); +} hsa_status_t HSA_API hsa_shut_down() { return coreApiTable->hsa_shut_down_fn(); } diff --git a/src/core/inc/amd_blit_shaders.h b/src/core/inc/amd_blit_shaders.h index 583d0bdd0..12c52cc4b 100644 --- a/src/core/inc/amd_blit_shaders.h +++ b/src/core/inc/amd_blit_shaders.h @@ -156,6 +156,62 @@ static const unsigned int kCodeFill8[] = { 0x00001902, 0xD11C6A03, 0x01A90103, 0xBF82FFF5, 0xBF810000, }; +static const unsigned int kCodeCopyAligned940[] = { + 0xc00a0100, 0x00000000, 0xc00a0200, 0x00000010, 0xc00a0300, 0x00000020, + 0xc00a0400, 0x00000030, 0xc00a0500, 0x00000040, 0xc0020600, 0x00000050, + 0xbf8cc07f, 0x8e028602, 0x32000002, 0x7e060205, 0xd1196a02, 0x00000900, + 0xd11c6a03, 0x01a90103, 0x7e0a0207, 0xd1196a04, 0x00000d00, 0xd11c6a05, + 0x01a90105, 0xd0e9006a, 0x00001102, 0xbf86000f, 0x86fe6a7e, 0xde410000, + 0x017f0002, 0xbf8c0f70, 0xd1196a02, 0x00003102, 0xd11c6a03, 0x01a90103, + 0xde610000, 0x007f0104, 0xd1196a04, 0x00003104, 0xd11c6a05, 0x01a90105, + 0xbf82ffee, 0xbefe01c1, 0x8e198418, 0x24020084, 0x7e060209, 0xd1196a02, + 0x00001101, 0xd11c6a03, 0x01a90103, 0x7e0a020b, 0xd1196a04, 0x00001501, + 0xd11c6a05, 0x01a90105, 0xd0e9006a, 0x00001902, 0xbf86000e, 0xde5d0000, + 0x087f0002, 0xd1196a02, 0x00003302, 0xd11c6a03, 0x01a90103, 0xbf8c0f70, + 0xde7d0000, 0x007f0804, 0xd1196a04, 0x00003304, 0xd11c6a05, 0x01a90105, + 0xbf82ffef, 0x8e198218, 0x24020082, 0x7e06020d, 0xd1196a02, 0x00001901, + 0xd11c6a03, 0x01a90103, 0x7e0a020f, 0xd1196a04, 0x00001d01, 0xd11c6a05, + 0x01a90105, 0xd0e9006a, 0x00002102, 0xbf86000f, 0x86fe6a7e, 0xde510000, + 0x017f0002, 0xd1196a02, 0x00003302, 0xd11c6a03, 0x01a90103, 0xbf8c0f70, + 0xde710000, 0x007f0104, 0xd1196a04, 0x00003304, 0xd11c6a05, 0x01a90105, + 0xbf82ffee, 0xbefe01c1, 0x7e060211, 0xd1196a02, 0x00002100, 0xd11c6a03, + 0x01a90103, 0x7e0a0213, 0xd1196a04, 0x00002500, 0xd11c6a05, 0x01a90105, + 0xd0e9006a, 0x00002902, 0xbf860006, 0x86fe6a7e, 0xde410000, 0x017f0002, + 0xbf8c0f70, 0xde610000, 0x007f0104, 0xbf810000, +}; + +static const unsigned int kCodeCopyMisaligned940[] = { + 0xc00a0100, 0x00000000, 0xc00a0200, 0x00000010, 0xc00a0300, 0x00000020, + 0xc0020400, 0x00000030, 0xbf8cc07f, 0x8e028602, 0x32000002, 0x7e060205, + 0xd1196a02, 0x00000900, 0xd11c6a03, 0x01a90103, 0x7e0a0207, 0xd1196a04, + 0x00000d00, 0xd11c6a05, 0x01a90105, 0xd0e9006a, 0x00001102, 0xbf860032, + 0xde410000, 0x067f0002, 0xd1196a02, 0x00002102, 0xd11c6a03, 0x01a90103, + 0xde410000, 0x077f0002, 0xd1196a02, 0x00002102, 0xd11c6a03, 0x01a90103, + 0xde410000, 0x087f0002, 0xd1196a02, 0x00002102, 0xd11c6a03, 0x01a90103, + 0xde410000, 0x097f0002, 0xd1196a02, 0x00002102, 0xd11c6a03, 0x01a90103, + 0xbf8c0f70, 0xde610000, 0x007f0604, 0xd1196a04, 0x00002104, 0xd11c6a05, + 0x01a90105, 0xde610000, 0x007f0704, 0xd1196a04, 0x00002104, 0xd11c6a05, + 0x01a90105, 0xde610000, 0x007f0804, 0xd1196a04, 0x00002104, 0xd11c6a05, + 0x01a90105, 0xde610000, 0x007f0904, 0xd1196a04, 0x00002104, 0xd11c6a05, + 0x01a90105, 0xbf82ffcb, 0x7e060209, 0xd1196a02, 0x00001100, 0xd11c6a03, + 0x01a90103, 0x7e0a020b, 0xd1196a04, 0x00001500, 0xd11c6a05, 0x01a90105, + 0xd0e9006a, 0x00001902, 0xbf86000f, 0x86fe6a7e, 0xde410000, 0x017f0002, + 0xd1196a02, 0x00002102, 0xd11c6a03, 0x01a90103, 0xbf8c0f70, 0xde610000, + 0x007f0104, 0xd1196a04, 0x00002104, 0xd11c6a05, 0x01a90105, 0xbf82ffee, + 0xbf810000, 0x00000000, +}; + +static const unsigned int kCodeFill940[] = { + 0xc00a0100, 0x00000000, 0xc00a0200, 0x00000010, 0xbf8cc07f, 0x8e028602, + 0x32000002, 
0x7e08020a, 0x7e0a020a, 0x7e0c020a, 0x7e0e020a, 0x8e0c840b, + 0x24020084, 0x7e060205, 0xd1196a02, 0x00000901, 0xd11c6a03, 0x01a90103, + 0xd0e9006a, 0x00000d02, 0xbf860007, 0xde7d0000, 0x007f0402, 0xd1196a02, + 0x00001902, 0xd11c6a03, 0x01a90103, 0xbf82fff6, 0x8e0c820b, 0x24020082, + 0x7e060207, 0xd1196a02, 0x00000d01, 0xd11c6a03, 0x01a90103, 0xd0e9006a, + 0x00001102, 0xbf860008, 0x86fe6a7e, 0xde710000, 0x007f0402, 0xd1196a02, + 0x00001902, 0xd11c6a03, 0x01a90103, 0xbf82fff5, 0xbf810000, 0x00000000, +}; + static const unsigned int kCodeCopyAligned10[] = { 0xF4080100, 0xFA000000, 0xF4080200, 0xFA000010, 0xF4080300, 0xFA000020, 0xF4080400, 0xFA000030, 0xF4080500, 0xFA000040, 0xF4000600, 0xFA000050, diff --git a/src/core/inc/amd_elf_image.hpp b/src/core/inc/amd_elf_image.hpp index 0da61f5cf..177465c05 100644 --- a/src/core/inc/amd_elf_image.hpp +++ b/src/core/inc/amd_elf_image.hpp @@ -183,6 +183,10 @@ namespace elf { virtual StringTable* strtab() = 0; virtual SymbolTable* symtab() = 0; virtual SymbolTable* getSymtab(uint16_t index) = 0; + virtual SymbolTable* dynsym() = 0; + virtual SymbolTable* getDynsym(uint16_t index) = 0; + virtual SymbolTable* getSymbolTable() = 0; + virtual SymbolTable* getSymbolTable(uint16_t index) = 0; virtual StringTable* addStringTable(const std::string& name) = 0; virtual StringTable* getStringTable(uint16_t index) = 0; diff --git a/src/core/inc/amd_gpu_agent.h b/src/core/inc/amd_gpu_agent.h index 359080c06..905472af4 100644 --- a/src/core/inc/amd_gpu_agent.h +++ b/src/core/inc/amd_gpu_agent.h @@ -550,6 +550,10 @@ class GpuAgent : public GpuAgentInt { // Bind the Blit object that will drive the copy operation lazy_ptr& GetBlitObject(const core::Agent& dst_agent, const core::Agent& src_agent, const size_t size); + + // Bind the Blit object that will drive the copy operation by engine ID + lazy_ptr& GetBlitObject(uint32_t engine_id); + // @brief Alternative aperture base address. Only on KV. uintptr_t ape1_base_; @@ -563,6 +567,15 @@ class GpuAgent : public GpuAgentInt { KernelMutex lock_; } gws_queue_; + // Sets and Tracks pending SDMA status check or request counts + void SetCopyRequestRefCount(bool set); + void SetCopyStatusCheckRefCount(bool set); + int pending_copy_req_ref_; + int pending_copy_stat_check_ref_; + + // Tracks what SDMA blits have been used since initialization. + uint32_t sdma_blit_used_mask_; + ScratchCache scratch_cache_; // System memory allocator in the nearest NUMA node. @@ -572,6 +585,9 @@ class GpuAgent : public GpuAgentInt { std::function system_deallocator_; DISALLOW_COPY_AND_ASSIGN(GpuAgent); + + // Check if SDMA engine by ID is free + bool DmaEngineIsFree(uint32_t engine_id); }; } // namespace amd diff --git a/src/core/inc/amd_memory_region.h b/src/core/inc/amd_memory_region.h index e12f0d760..cb5d17e2b 100644 --- a/src/core/inc/amd_memory_region.h +++ b/src/core/inc/amd_memory_region.h @@ -95,8 +95,8 @@ class MemoryRegion : public core::MemoryRegion { /// @brief Unpin memory. 
static void MakeKfdMemoryUnresident(const void* ptr); - MemoryRegion(bool fine_grain, bool kernarg, bool full_profile, core::Agent* owner, - const HsaMemoryProperties& mem_props); + MemoryRegion(bool fine_grain, bool kernarg, bool full_profile, bool extended_scope_fine_grain, + core::Agent* owner, const HsaMemoryProperties& mem_props); ~MemoryRegion(); @@ -173,6 +173,8 @@ class MemoryRegion : public core::MemoryRegion { return static_cast(mem_props_.MemoryClockMax); } + __forceinline bool extended_scope_fine_grain() const { return extended_scope_fine_grain_; } + private: const HsaMemoryProperties mem_props_; @@ -182,6 +184,9 @@ class MemoryRegion : public core::MemoryRegion { size_t max_single_alloc_size_; + // Enables creating an extended scope fine grained memory pool region + const bool extended_scope_fine_grain_; + // Used to collect total system memory static size_t max_sysmem_alloc_size_; diff --git a/src/core/inc/memory_region.h b/src/core/inc/memory_region.h index 48ecb3dcd..feba80695 100644 --- a/src/core/inc/memory_region.h +++ b/src/core/inc/memory_region.h @@ -91,6 +91,7 @@ class MemoryRegion : public Checked<0x9C961F19EE175BB3> { AllocateIPC = (1 << 4), // System memory that can be IPC-shared AllocateNonPaged = (1 << 4), // Non-paged system memory (AllocateIPC alias) AllocatePCIeRW = (1 << 5), // Enforce pseudo fine grain/RW memory + AllocateAsan = (1 << 6), // ASAN - First page of allocation remapped to system memory }; typedef uint32_t AllocateFlags; diff --git a/src/core/inc/runtime.h b/src/core/inc/runtime.h index 11e8ce82b..8f4db0783 100644 --- a/src/core/inc/runtime.h +++ b/src/core/inc/runtime.h @@ -70,6 +70,12 @@ #include "core/inc/amd_loader_context.hpp" #include "core/inc/amd_hsa_code.hpp" +#if defined(__clang__) +#if __has_feature(address_sanitizer) +#define SANITIZER_AMDGPU 1 +#endif +#endif + //---------------------------------------------------------------------------// // Constants // //---------------------------------------------------------------------------// @@ -112,6 +118,7 @@ class Runtime { struct KfdVersion_t { HsaVersionInfo version; bool supports_exception_debugging; + bool supports_event_age; }; /// @brief Open connection to kernel driver and increment reference count. 
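For reference, the SANITIZER_AMDGPU detection added to runtime.h above only covers clang's __has_feature check; a compiler-portable variant (a sketch, not what this patch does) would also honor GCC's __SANITIZE_ADDRESS__:

#if defined(__has_feature)
#  if __has_feature(address_sanitizer)      // clang
#    define SANITIZER_AMDGPU 1
#  endif
#endif
#if !defined(SANITIZER_AMDGPU) && defined(__SANITIZE_ADDRESS__)   // gcc
#  define SANITIZER_AMDGPU 1
#endif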
@@ -395,7 +402,12 @@ class Runtime { uint64_t sys_clock_freq() const { return sys_clock_freq_; } - void KfdVersion(const HsaVersionInfo& version) { kfd_version.version = version; } + void KfdVersion(const HsaVersionInfo& version) { + kfd_version.version = version; + if (version.KernelInterfaceMajorVersion == 1 && + version.KernelInterfaceMinorVersion >= 14) + kfd_version.supports_event_age = true; + } void KfdVersion(bool exception_debugging) { kfd_version.supports_exception_debugging = exception_debugging; @@ -407,9 +419,19 @@ class Runtime { static void AsyncEventsLoop(void*); struct AllocationRegion { - AllocationRegion() : region(NULL), size(0), size_requested(0), user_ptr(nullptr) {} - AllocationRegion(const MemoryRegion* region_arg, size_t size_arg, size_t size_requested) - : region(region_arg), size(size_arg), size_requested(size_requested), user_ptr(nullptr) {} + AllocationRegion() + : region(NULL), + size(0), + size_requested(0), + alloc_flags(core::MemoryRegion::AllocateNoFlags), + user_ptr(nullptr) {} + AllocationRegion(const MemoryRegion* region_arg, size_t size_arg, size_t size_requested, + MemoryRegion::AllocateFlags alloc_flags) + : region(region_arg), + size(size_arg), + size_requested(size_requested), + alloc_flags(alloc_flags), + user_ptr(nullptr) {} struct notifier_t { void* ptr; @@ -420,6 +442,7 @@ class Runtime { const MemoryRegion* region; size_t size; /* actual size = align_up(size_requested, granularity) */ size_t size_requested; /* size requested by user */ + MemoryRegion::AllocateFlags alloc_flags; void* user_ptr; std::unique_ptr> notifiers; }; diff --git a/src/core/runtime/amd_aql_queue.cpp b/src/core/runtime/amd_aql_queue.cpp index f2aedcd8f..b09ea82a7 100644 --- a/src/core/runtime/amd_aql_queue.cpp +++ b/src/core/runtime/amd_aql_queue.cpp @@ -1332,7 +1332,11 @@ void AqlQueue::FillBufRsrcWord1_Gfx11() { void AqlQueue::FillBufRsrcWord2() { SQ_BUF_RSRC_WORD2 srd2; - srd2.bits.NUM_RECORDS = uint32_t(queue_scratch_.size); + const auto& agent_props = agent_->properties(); + const uint32_t num_xcc = agent_props.NumXcc; + + // report size per XCC + srd2.bits.NUM_RECORDS = uint32_t(queue_scratch_.size / num_xcc); amd_queue_.scratch_resource_descriptor[2] = srd2.u32All; } @@ -1403,8 +1407,10 @@ void AqlQueue::FillComputeTmpRingSize() { return; } - // Determine the maximum number of waves device can support const auto& agent_props = agent_->properties(); + const uint32_t num_xcc = agent_props.NumXcc; + + // Determine the maximum number of waves device can support uint32_t num_cus = agent_props.NumFComputeCores / agent_props.NumSIMDPerCU; uint32_t max_scratch_waves = num_cus * agent_props.MaxSlotsScratchCU; @@ -1416,10 +1422,11 @@ void AqlQueue::FillComputeTmpRingSize() { tmpring_size.bits.WAVESIZE = wave_scratch; assert(wave_scratch == tmpring_size.bits.WAVESIZE && "WAVESIZE Overflow."); uint32_t num_waves = - queue_scratch_.size / (tmpring_size.bits.WAVESIZE * queue_scratch_.mem_alignment_size); + (queue_scratch_.size / num_xcc) / (tmpring_size.bits.WAVESIZE * queue_scratch_.mem_alignment_size); + tmpring_size.bits.WAVES = std::min(num_waves, max_scratch_waves); amd_queue_.compute_tmpring_size = tmpring_size.u32All; - assert((tmpring_size.bits.WAVES % agent_props.NumShaderBanks == 0) && + assert((tmpring_size.bits.WAVES % (agent_props.NumShaderBanks / num_xcc) == 0) && "Invalid scratch wave count. 
Must be divisible by #SEs."); } @@ -1431,9 +1438,11 @@ void AqlQueue::FillComputeTmpRingSize_Gfx11() { return; } - // Determine the maximum number of waves device can support const auto& agent_props = agent_->properties(); - uint32_t num_cus = agent_props.NumFComputeCores / agent_props.NumSIMDPerCU; + const uint32_t num_xcc = agent_props.NumXcc; + + // Determine the maximum number of waves device can support + uint32_t num_cus = agent_props.NumFComputeCores / (agent_props.NumSIMDPerCU * num_xcc); uint32_t max_scratch_waves = num_cus * agent_props.MaxSlotsScratchCU; // Scratch is allocated program COMPUTE_TMPRING_SIZE register @@ -1483,7 +1492,11 @@ void AqlQueue::InitScratchSRD() { // Populate flat scratch parameters in amd_queue_. amd_queue_.scratch_backing_memory_location = queue_scratch_.queue_process_offset; - amd_queue_.scratch_backing_memory_byte_size = queue_scratch_.size; + + const auto& agent_props = agent_->properties(); + const uint32_t num_xcc = agent_props.NumXcc; + // report size per XCC + amd_queue_.scratch_backing_memory_byte_size = queue_scratch_.size / num_xcc; // For backwards compatibility this field records the per-lane scratch // for a 64 lane wavefront. If scratch was allocated for 32 lane waves diff --git a/src/core/runtime/amd_blit_kernel.cpp b/src/core/runtime/amd_blit_kernel.cpp index 681b310a1..661e3be4d 100644 --- a/src/core/runtime/amd_blit_kernel.cpp +++ b/src/core/runtime/amd_blit_kernel.cpp @@ -491,6 +491,388 @@ static std::string kBlitKernelSource(R"( L_FILL_PHASE_2_DONE: s_endpgm end + +shader CopyAligned_940 + type(CS) + user_sgpr_count(2) + sgpr_count(32) + vgpr_count(8 + (kCopyAlignedUnroll * kCopyAlignedVecWidth)) + + // Retrieve kernel arguments. + s_load_dwordx4 s[4:7], s[0:1], s_load_dword_offset(0x0) + s_load_dwordx4 s[8:11], s[0:1], s_load_dword_offset(0x10) + s_load_dwordx4 s[12:15], s[0:1], s_load_dword_offset(0x20) + s_load_dwordx4 s[16:19], s[0:1], s_load_dword_offset(0x30) + s_load_dwordx4 s[20:23], s[0:1], s_load_dword_offset(0x40) + s_load_dword s24, s[0:1], s_load_dword_offset(0x50) + s_waitcnt lgkmcnt(0) + + // Compute workitem id. + s_lshl_b32 s2, s2, 0x6 + v_add_co_u32 v0, vcc, s2, v0 + + // ===================================================== + // Phase 1: Byte copy up to 0x100 destination alignment. + // ===================================================== + + // Compute phase source address. + v_mov_b32 v3, s5 + v_add_co_u32 v2, vcc, v0, s4 + v_addc_co_u32 v3, vcc, v3, 0x0, vcc + + // Compute phase destination address. + v_mov_b32 v5, s7 + v_add_co_u32 v4, vcc, v0, s6 + v_addc_co_u32 v5, vcc, v5, 0x0, vcc + +L_COPY_ALIGNED_PHASE_1_LOOP: + // Mask off lanes (or branch out) after phase end. + v_cmp_lt_u64 vcc, v[2:3], s[8:9] + s_cbranch_vccz L_COPY_ALIGNED_PHASE_1_DONE + s_and_b64 exec, exec, vcc + + // Load from/advance the source address. + flat_load_ubyte v1, v[2:3] sc0:1 sc1:1 + s_waitcnt vmcnt(0) + v_add_co_u32 v2, vcc, v2, s24 + v_addc_co_u32 v3, vcc, v3, 0x0, vcc + + // Write to/advance the destination address. + flat_store_byte v[4:5], v1 sc0:1 sc1:1 + v_add_co_u32 v4, vcc, v4, s24 + v_addc_co_u32 v5, vcc, v5, 0x0, vcc + + // Repeat until branched out. + s_branch L_COPY_ALIGNED_PHASE_1_LOOP + +L_COPY_ALIGNED_PHASE_1_DONE: + // Restore EXEC mask for all lanes. + s_mov_b64 exec, 0xFFFFFFFFFFFFFFFF + + // ======================================================== + // Phase 2: Unrolled dword[x4] copy up to last whole block. 
+ // ======================================================== + + // Compute unrolled dword[x4] stride across all threads. + if kCopyAlignedVecWidth == 4 + s_lshl_b32 s25, s24, 0x4 + else + s_lshl_b32 s25, s24, 0x2 + end + + // Compute phase source address. + if kCopyAlignedVecWidth == 4 + v_lshlrev_b32 v1, 0x4, v0 + else + v_lshlrev_b32 v1, 0x2, v0 + end + + v_mov_b32 v3, s9 + v_add_co_u32 v2, vcc, v1, s8 + v_addc_co_u32 v3, vcc, v3, 0x0, vcc + + // Compute phase destination address. + v_mov_b32 v5, s11 + v_add_co_u32 v4, vcc, v1, s10 + v_addc_co_u32 v5, vcc, v5, 0x0, vcc + +L_COPY_ALIGNED_PHASE_2_LOOP: + // Branch out after phase end. + v_cmp_lt_u64 vcc, v[2:3], s[12:13] + s_cbranch_vccz L_COPY_ALIGNED_PHASE_2_DONE + + // Load from/advance the source address. + for var i = 0; i < kCopyAlignedUnroll; i ++ + if kCopyAlignedVecWidth == 4 + flat_load_dwordx4 v[8 + (i * 4)], v[2:3] sc0:1 sc1:1 + else + flat_load_dword v[8 + i], v[2:3] sc0:1 sc1:1 + end + + v_add_co_u32 v2, vcc, v2, s25 + v_addc_co_u32 v3, vcc, v3, 0x0, vcc + end + + // Write to/advance the destination address. + s_waitcnt vmcnt(0) + + for var i = 0; i < kCopyAlignedUnroll; i ++ + if kCopyAlignedVecWidth == 4 + flat_store_dwordx4 v[4:5], v[8 + (i * 4)] sc0:1 sc1:1 + else + flat_store_dword v[4:5], v[8 + i] sc0:1 sc1:1 + end + + v_add_co_u32 v4, vcc, v4, s25 + v_addc_co_u32 v5, vcc, v5, 0x0, vcc + end + + // Repeat until branched out. + s_branch L_COPY_ALIGNED_PHASE_2_LOOP + +L_COPY_ALIGNED_PHASE_2_DONE: + + // =========================================== + // Phase 3: Dword copy up to last whole dword. + // =========================================== + + // Compute dword stride across all threads. + s_lshl_b32 s25, s24, 0x2 + + // Compute phase source address. + v_lshlrev_b32 v1, 0x2, v0 + v_mov_b32 v3, s13 + v_add_co_u32 v2, vcc, v1, s12 + v_addc_co_u32 v3, vcc, v3, 0x0, vcc + + // Compute phase destination address. + v_mov_b32 v5, s15 + v_add_co_u32 v4, vcc, v1, s14 + v_addc_co_u32 v5, vcc, v5, 0x0, vcc + +L_COPY_ALIGNED_PHASE_3_LOOP: + // Mask off lanes (or branch out) after phase end. + v_cmp_lt_u64 vcc, v[2:3], s[16:17] + s_cbranch_vccz L_COPY_ALIGNED_PHASE_3_DONE + s_and_b64 exec, exec, vcc + + // Load from/advance the source address. + flat_load_dword v1, v[2:3] sc0:1 sc1:1 + v_add_co_u32 v2, vcc, v2, s25 + v_addc_co_u32 v3, vcc, v3, 0x0, vcc + s_waitcnt vmcnt(0) + + // Write to/advance the destination address. + flat_store_dword v[4:5], v1 sc0:1 sc1:1 + v_add_co_u32 v4, vcc, v4, s25 + v_addc_co_u32 v5, vcc, v5, 0x0, vcc + + // Repeat until branched out. + s_branch L_COPY_ALIGNED_PHASE_3_LOOP + +L_COPY_ALIGNED_PHASE_3_DONE: + // Restore EXEC mask for all lanes. + s_mov_b64 exec, 0xFFFFFFFFFFFFFFFF + + // ============================= + // Phase 4: Byte copy up to end. + // ============================= + + // Compute phase source address. + v_mov_b32 v3, s17 + v_add_co_u32 v2, vcc, v0, s16 + v_addc_co_u32 v3, vcc, v3, 0x0, vcc + + // Compute phase destination address. + v_mov_b32 v5, s19 + v_add_co_u32 v4, vcc, v0, s18 + v_addc_co_u32 v5, vcc, v5, 0x0, vcc + + // Mask off lanes (or branch out) after phase end. + v_cmp_lt_u64 vcc, v[2:3], s[20:21] + s_cbranch_vccz L_COPY_ALIGNED_PHASE_4_DONE + s_and_b64 exec, exec, vcc + + // Load from the source address. + flat_load_ubyte v1, v[2:3] sc0:1 sc1:1 + s_waitcnt vmcnt(0) + + // Write to the destination address. 
+ flat_store_byte v[4:5], v1 sc0:1 sc1:1 + +L_COPY_ALIGNED_PHASE_4_DONE: + s_endpgm +end + +shader CopyMisaligned_940 + type(CS) + user_sgpr_count(2) + sgpr_count(23) + vgpr_count(6 + kCopyMisalignedUnroll) + + // Retrieve kernel arguments. + s_load_dwordx4 s[4:7], s[0:1], s_load_dword_offset(0x0) + s_load_dwordx4 s[8:11], s[0:1], s_load_dword_offset(0x10) + s_load_dwordx4 s[12:15], s[0:1], s_load_dword_offset(0x20) + s_load_dword s16, s[0:1], s_load_dword_offset(0x30) + s_waitcnt lgkmcnt(0) + + // Compute workitem id. + s_lshl_b32 s2, s2, 0x6 + v_add_co_u32 v0, vcc, s2, v0 + + // =================================================== + // Phase 1: Unrolled byte copy up to last whole block. + // =================================================== + + // Compute phase source address. + v_mov_b32 v3, s5 + v_add_co_u32 v2, vcc, v0, s4 + v_addc_co_u32 v3, vcc, v3, 0x0, vcc + + // Compute phase destination address. + v_mov_b32 v5, s7 + v_add_co_u32 v4, vcc, v0, s6 + v_addc_co_u32 v5, vcc, v5, 0x0, vcc + +L_COPY_MISALIGNED_PHASE_1_LOOP: + // Branch out after phase end. + v_cmp_lt_u64 vcc, v[2:3], s[8:9] + s_cbranch_vccz L_COPY_MISALIGNED_PHASE_1_DONE + + // Load from/advance the source address. + for var i = 0; i < kCopyMisalignedUnroll; i ++ + flat_load_ubyte v[6 + i], v[2:3] sc0:1 sc1:1 + v_add_co_u32 v2, vcc, v2, s16 + v_addc_co_u32 v3, vcc, v3, 0x0, vcc + end + + // Write to/advance the destination address. + s_waitcnt vmcnt(0) + + for var i = 0; i < kCopyMisalignedUnroll; i ++ + flat_store_byte v[4:5], v[6 + i] sc0:1 sc1:1 + v_add_co_u32 v4, vcc, v4, s16 + v_addc_co_u32 v5, vcc, v5, 0x0, vcc + end + + // Repeat until branched out. + s_branch L_COPY_MISALIGNED_PHASE_1_LOOP + +L_COPY_MISALIGNED_PHASE_1_DONE: + + // ============================= + // Phase 2: Byte copy up to end. + // ============================= + + // Compute phase source address. + v_mov_b32 v3, s9 + v_add_co_u32 v2, vcc, v0, s8 + v_addc_co_u32 v3, vcc, v3, 0x0, vcc + + // Compute phase destination address. + v_mov_b32 v5, s11 + v_add_co_u32 v4, vcc, v0, s10 + v_addc_co_u32 v5, vcc, v5, 0x0, vcc + +L_COPY_MISALIGNED_PHASE_2_LOOP: + // Mask off lanes (or branch out) after phase end. + v_cmp_lt_u64 vcc, v[2:3], s[12:13] + s_cbranch_vccz L_COPY_MISALIGNED_PHASE_2_DONE + s_and_b64 exec, exec, vcc + + // Load from/advance the source address. + flat_load_ubyte v1, v[2:3] sc0:1 sc1:1 + v_add_co_u32 v2, vcc, v2, s16 + v_addc_co_u32 v3, vcc, v3, 0x0, vcc + s_waitcnt vmcnt(0) + + // Write to/advance the destination address. + flat_store_byte v[4:5], v1 sc0:1 sc1:1 + v_add_co_u32 v4, vcc, v4, s16 + v_addc_co_u32 v5, vcc, v5, 0x0, vcc + + // Repeat until branched out. + s_branch L_COPY_MISALIGNED_PHASE_2_LOOP + +L_COPY_MISALIGNED_PHASE_2_DONE: + s_endpgm +end + +shader Fill_940 + type(CS) + user_sgpr_count(2) + sgpr_count(19) + vgpr_count(8) + + // Retrieve kernel arguments. + s_load_dwordx4 s[4:7], s[0:1], s_load_dword_offset(0x0) + s_load_dwordx4 s[8:11], s[0:1], s_load_dword_offset(0x10) + s_waitcnt lgkmcnt(0) + + // Compute workitem id. + s_lshl_b32 s2, s2, 0x6 + v_add_co_u32 v0, vcc, s2, v0 + + // Copy fill pattern into VGPRs. + for var i = 0; i < kFillVecWidth; i ++ + v_mov_b32 v[4 + i], s10 + end + + // ======================================================== + // Phase 1: Unrolled dword[x4] fill up to last whole block. + // ======================================================== + + // Compute unrolled dword[x4] stride across all threads. 
+ if kFillVecWidth == 4 + s_lshl_b32 s12, s11, 0x4 + else + s_lshl_b32 s12, s11, 0x2 + end + + // Compute phase destination address. + if kFillVecWidth == 4 + v_lshlrev_b32 v1, 0x4, v0 + else + v_lshlrev_b32 v1, 0x2, v0 + end + + v_mov_b32 v3, s5 + v_add_co_u32 v2, vcc, v1, s4 + v_addc_co_u32 v3, vcc, v3, 0x0, vcc + +L_FILL_PHASE_1_LOOP: + // Branch out after phase end. + v_cmp_lt_u64 vcc, v[2:3], s[6:7] + s_cbranch_vccz L_FILL_PHASE_1_DONE + + // Write to/advance the destination address. + for var i = 0; i < kFillUnroll; i ++ + if kFillVecWidth == 4 + flat_store_dwordx4 v[2:3], v[4:7] sc0:1 sc1:1 + else + flat_store_dword v[2:3], v4 sc0:1 sc1:1 + end + + v_add_co_u32 v2, vcc, v2, s12 + v_addc_co_u32 v3, vcc, v3, 0x0, vcc + end + + // Repeat until branched out. + s_branch L_FILL_PHASE_1_LOOP + +L_FILL_PHASE_1_DONE: + + // ============================== + // Phase 2: Dword fill up to end. + // ============================== + + // Compute dword stride across all threads. + s_lshl_b32 s12, s11, 0x2 + + // Compute phase destination address. + v_lshlrev_b32 v1, 0x2, v0 + v_mov_b32 v3, s7 + v_add_co_u32 v2, vcc, v1, s6 + v_addc_co_u32 v3, vcc, v3, 0x0, vcc + +L_FILL_PHASE_2_LOOP: + // Mask off lanes (or branch out) after phase end. + v_cmp_lt_u64 vcc, v[2:3], s[8:9] + s_cbranch_vccz L_FILL_PHASE_2_DONE + s_and_b64 exec, exec, vcc + + // Write to/advance the destination address. + flat_store_dword v[2:3], v4 sc0:1 sc1:1 + v_add_co_u32 v2, vcc, v2, s12 + v_addc_co_u32 v3, vcc, v3, 0x0, vcc + + // Repeat until branched out. + s_branch L_FILL_PHASE_2_LOOP + +L_FILL_PHASE_2_DONE: + s_endpgm +end )"); // Search kernel source for variable definition and return value. diff --git a/src/core/runtime/amd_cpu_agent.cpp b/src/core/runtime/amd_cpu_agent.cpp index 8b35d9955..bdb6070f8 100644 --- a/src/core/runtime/amd_cpu_agent.cpp +++ b/src/core/runtime/amd_cpu_agent.cpp @@ -85,15 +85,15 @@ void CpuAgent::InitRegionList() { if (system_prop != mem_props.end()) system_props = *system_prop; MemoryRegion* system_region_fine = - new MemoryRegion(true, false, is_apu_node, this, system_props); + new MemoryRegion(true, false, is_apu_node, false, this, system_props); regions_.push_back(system_region_fine); MemoryRegion* system_region_kernarg = - new MemoryRegion(true, true, is_apu_node, this, system_props); + new MemoryRegion(true, true, is_apu_node, false, this, system_props); regions_.push_back(system_region_kernarg); if (!is_apu_node) { MemoryRegion* system_region_coarse = - new MemoryRegion(false, false, is_apu_node, this, system_props); + new MemoryRegion(false, false, is_apu_node, false, this, system_props); regions_.push_back(system_region_coarse); } } @@ -385,6 +385,12 @@ hsa_status_t CpuAgent::GetInfo(hsa_agent_info_t attribute, void* value) const { case HSA_AMD_AGENT_INFO_IOMMU_SUPPORT: *((hsa_amd_iommu_version_t*)value) = HSA_IOMMU_SUPPORT_NONE; break; + case HSA_AMD_AGENT_INFO_NUM_XCC: + *((uint32_t*)value) = 0; + break; + case HSA_AMD_AGENT_INFO_DRIVER_UID: + *((uint32_t*)value) = 0; + break; default: return HSA_STATUS_ERROR_INVALID_ARGUMENT; break; diff --git a/src/core/runtime/amd_gpu_agent.cpp b/src/core/runtime/amd_gpu_agent.cpp index 7acbd8a74..88b270848 100644 --- a/src/core/runtime/amd_gpu_agent.cpp +++ b/src/core/runtime/amd_gpu_agent.cpp @@ -3,7 +3,7 @@ // The University of Illinois/NCSA // Open Source License (NCSA) // -// Copyright (c) 2014-2022, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2014-2023, Advanced Micro Devices, Inc. All rights reserved. 
// // Developed by: // @@ -104,6 +104,9 @@ GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props, bool xna enum_index_(index), ape1_base_(0), ape1_size_(0), + pending_copy_req_ref_(0), + pending_copy_stat_check_ref_(0), + sdma_blit_used_mask_(0), scratch_cache_( [this](void* base, size_t size, bool large) { ReleaseScratch(base, size, large); }) { const bool is_apu_node = (properties_.NumCPUCores > 0); @@ -124,8 +127,18 @@ GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props, bool xna rocr::core::IsaFeature sramecc = rocr::core::IsaFeature::Unsupported; if (isa_base->IsSrameccSupported()) { - sramecc = node_props.Capability.ui32.SRAM_EDCSupport == 1 ? core::IsaFeature::Enabled - : core::IsaFeature::Disabled; + switch (core::Runtime::runtime_singleton_->flag().sramecc_enable()) { + case Flag::SRAMECC_DISABLED: + sramecc = core::IsaFeature::Disabled; + break; + case Flag::SRAMECC_ENABLED: + sramecc = core::IsaFeature::Enabled; + break; + case Flag::SRAMECC_DEFAULT: + sramecc = node_props.Capability.ui32.SRAM_EDCSupport == 1 ? core::IsaFeature::Enabled + : core::IsaFeature::Disabled; + break; + } } rocr::core::IsaFeature xnack = rocr::core::IsaFeature::Unsupported; @@ -243,6 +256,8 @@ void GpuAgent::AssembleShader(const char* func_name, AssembleTarget assemble_tar ASICShader compute_8; ASICShader compute_9; ASICShader compute_90a; + ASICShader compute_940; + ASICShader compute_942; ASICShader compute_1010; ASICShader compute_10; ASICShader compute_11; @@ -251,53 +266,63 @@ void GpuAgent::AssembleShader(const char* func_name, AssembleTarget assemble_tar std::map compiled_shaders = { {"TrapHandler", { - {NULL, 0, 0, 0}, - {kCodeTrapHandler8, sizeof(kCodeTrapHandler8), 2, 4}, - {kCodeTrapHandler9, sizeof(kCodeTrapHandler9), 2, 4}, - {kCodeTrapHandler90a, sizeof(kCodeTrapHandler90a), 2, 4}, - {kCodeTrapHandler1010, sizeof(kCodeTrapHandler1010), 2, 4}, - {kCodeTrapHandler10, sizeof(kCodeTrapHandler10), 2, 4}, - {NULL, 0, 0, 0}, + {NULL, 0, 0, 0}, // gfx7 + {kCodeTrapHandler8, sizeof(kCodeTrapHandler8), 2, 4}, // gfx8 + {kCodeTrapHandler9, sizeof(kCodeTrapHandler9), 2, 4}, // gfx9 + {kCodeTrapHandler90a, sizeof(kCodeTrapHandler90a), 2, 4}, // gfx90a + {NULL, 0, 0, 0}, // gfx940 + {NULL, 0, 0, 0}, // gfx942 + {kCodeTrapHandler1010, sizeof(kCodeTrapHandler1010), 2, 4}, // gfx1010 + {kCodeTrapHandler10, sizeof(kCodeTrapHandler10), 2, 4}, // gfx10 + {NULL, 0, 0, 0}, // gfx11 }}, {"TrapHandlerKfdExceptions", { - {NULL, 0, 0, 0}, - {kCodeTrapHandler8, sizeof(kCodeTrapHandler8), 2, 4}, - {kCodeTrapHandlerV2_9, sizeof(kCodeTrapHandlerV2_9), 2, 4}, - {kCodeTrapHandlerV2_9, sizeof(kCodeTrapHandlerV2_9), 2, 4}, - {kCodeTrapHandlerV2_1010, sizeof(kCodeTrapHandlerV2_1010), 2, 4}, - {kCodeTrapHandlerV2_10, sizeof(kCodeTrapHandlerV2_10), 2, 4}, - {kCodeTrapHandlerV2_11, sizeof(kCodeTrapHandlerV2_11), 2, 4}, + {NULL, 0, 0, 0}, // gfx7 + {kCodeTrapHandler8, sizeof(kCodeTrapHandler8), 2, 4}, // gfx8 + {kCodeTrapHandlerV2_9, sizeof(kCodeTrapHandlerV2_9), 2, 4}, // gfx9 + {kCodeTrapHandlerV2_9, sizeof(kCodeTrapHandlerV2_9), 2, 4}, // gfx90a + {kCodeTrapHandlerV2_940, sizeof(kCodeTrapHandlerV2_940), 2, 4}, // gfx940 + {kCodeTrapHandlerV2_940, sizeof(kCodeTrapHandlerV2_940), 2, 4}, // gfx942 + {kCodeTrapHandlerV2_1010, sizeof(kCodeTrapHandlerV2_1010), 2, 4},// gfx1010 + {kCodeTrapHandlerV2_10, sizeof(kCodeTrapHandlerV2_10), 2, 4}, // gfx10 + {kCodeTrapHandlerV2_11, sizeof(kCodeTrapHandlerV2_11), 2, 4}, // gfx11 }}, {"CopyAligned", { - {kCodeCopyAligned7, sizeof(kCodeCopyAligned7), 32, 
12}, - {kCodeCopyAligned8, sizeof(kCodeCopyAligned8), 32, 12}, - {kCodeCopyAligned8, sizeof(kCodeCopyAligned8), 32, 12}, - {kCodeCopyAligned8, sizeof(kCodeCopyAligned8), 32, 12}, - {kCodeCopyAligned10, sizeof(kCodeCopyAligned10), 32, 12}, - {kCodeCopyAligned10, sizeof(kCodeCopyAligned10), 32, 12}, - {kCodeCopyAligned11, sizeof(kCodeCopyAligned11), 32, 12}, + {kCodeCopyAligned7, sizeof(kCodeCopyAligned7), 32, 12}, // gfx7 + {kCodeCopyAligned8, sizeof(kCodeCopyAligned8), 32, 12}, // gfx8 + {kCodeCopyAligned8, sizeof(kCodeCopyAligned8), 32, 12}, // gfx9 + {kCodeCopyAligned8, sizeof(kCodeCopyAligned8), 32, 12}, // gfx90a + {kCodeCopyAligned940, sizeof(kCodeCopyAligned940), 32, 12}, // gfx940 + {kCodeCopyAligned8, sizeof(kCodeCopyAligned8), 32, 12}, // gfx942 + {kCodeCopyAligned10, sizeof(kCodeCopyAligned10), 32, 12}, // gfx1010 + {kCodeCopyAligned10, sizeof(kCodeCopyAligned10), 32, 12}, // gfx10 + {kCodeCopyAligned11, sizeof(kCodeCopyAligned11), 32, 12}, // gfx11 }}, {"CopyMisaligned", { - {kCodeCopyMisaligned7, sizeof(kCodeCopyMisaligned7), 23, 10}, - {kCodeCopyMisaligned8, sizeof(kCodeCopyMisaligned8), 23, 10}, - {kCodeCopyMisaligned8, sizeof(kCodeCopyMisaligned8), 23, 10}, - {kCodeCopyMisaligned8, sizeof(kCodeCopyMisaligned8), 23, 10}, - {kCodeCopyMisaligned10, sizeof(kCodeCopyMisaligned10), 23, 10}, - {kCodeCopyMisaligned10, sizeof(kCodeCopyMisaligned10), 23, 10}, - {kCodeCopyMisaligned11, sizeof(kCodeCopyMisaligned11), 23, 10}, + {kCodeCopyMisaligned7, sizeof(kCodeCopyMisaligned7), 23, 10}, // gfx7 + {kCodeCopyMisaligned8, sizeof(kCodeCopyMisaligned8), 23, 10}, // gfx8 + {kCodeCopyMisaligned8, sizeof(kCodeCopyMisaligned8), 23, 10}, // gfx9 + {kCodeCopyMisaligned8, sizeof(kCodeCopyMisaligned8), 23, 10}, // gfx90a + {kCodeCopyMisaligned940, sizeof(kCodeCopyMisaligned940), 23, 10},// gfx940 + {kCodeCopyMisaligned8, sizeof(kCodeCopyMisaligned8), 23, 10}, // gfx942 + {kCodeCopyMisaligned10, sizeof(kCodeCopyMisaligned10), 23, 10}, // gfx1010 + {kCodeCopyMisaligned10, sizeof(kCodeCopyMisaligned10), 23, 10}, // gfx10 + {kCodeCopyMisaligned11, sizeof(kCodeCopyMisaligned11), 23, 10}, // gfx11 }}, {"Fill", { - {kCodeFill7, sizeof(kCodeFill7), 19, 8}, - {kCodeFill8, sizeof(kCodeFill8), 19, 8}, - {kCodeFill8, sizeof(kCodeFill8), 19, 8}, - {kCodeFill8, sizeof(kCodeFill8), 19, 8}, - {kCodeFill10, sizeof(kCodeFill10), 19, 8}, - {kCodeFill10, sizeof(kCodeFill10), 19, 8}, - {kCodeFill11, sizeof(kCodeFill11), 19, 8}, + {kCodeFill7, sizeof(kCodeFill7), 19, 8}, // gfx7 + {kCodeFill8, sizeof(kCodeFill8), 19, 8}, // gfx8 + {kCodeFill8, sizeof(kCodeFill8), 19, 8}, // gfx9 + {kCodeFill8, sizeof(kCodeFill8), 19, 8}, // gfx90a + {kCodeFill940, sizeof(kCodeFill940), 19, 8}, // gfx940 + {kCodeFill8, sizeof(kCodeFill8), 19, 8}, // gfx942 + {kCodeFill10, sizeof(kCodeFill10), 19, 8}, // gfx1010 + {kCodeFill10, sizeof(kCodeFill10), 19, 8}, // gfx10 + {kCodeFill11, sizeof(kCodeFill11), 19, 8}, // gfx11 }}}; auto compiled_shader_it = compiled_shaders.find(func_name); @@ -314,10 +339,22 @@ void GpuAgent::AssembleShader(const char* func_name, AssembleTarget assemble_tar asic_shader = &compiled_shader_it->second.compute_8; break; case 9: - if((isa_->GetMinorVersion() == 0) && (isa_->GetStepping() == 10)) + if((isa_->GetMinorVersion() == 0) && (isa_->GetStepping() == 10)) { asic_shader = &compiled_shader_it->second.compute_90a; - else + } else if(isa_->GetMinorVersion() == 4) { + switch(isa_->GetStepping()) { + case 0: + case 1: + asic_shader = &compiled_shader_it->second.compute_940; + break; + case 2: + default: + 
asic_shader = &compiled_shader_it->second.compute_942; + break; + } + } else { asic_shader = &compiled_shader_it->second.compute_9; + } break; case 10: if(isa_->GetMinorVersion() == 1) @@ -368,8 +405,10 @@ void GpuAgent::AssembleShader(const char* func_name, AssembleTarget assemble_tar AMD_HSA_BITS_SET(header->compute_pgm_rsrc2, AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_X, 1); - if ((isa_->GetMajorVersion() == 9) && (isa_->GetMinorVersion() == 0) && - (isa_->GetStepping() == 10)) { + // gfx90a, gfx940, gfx941, gfx942 + if ((isa_->GetMajorVersion() == 9) && + (((isa_->GetMinorVersion() == 0) && (isa_->GetStepping() == 10)) || + (isa_->GetMinorVersion() == 4))) { // Program COMPUTE_PGM_RSRC3.ACCUM_OFFSET for 0 ACC VGPRs on gfx90a. // FIXME: Assemble code objects from source at build time int gran_accvgprs = ((gran_vgprs + 1) * 8) / 4 - 1; @@ -411,15 +450,19 @@ void GpuAgent::InitRegionList() { memory_max_frequency_ = mem_props[mem_idx].MemoryClockMax; case HSA_HEAPTYPE_GPU_LDS: case HSA_HEAPTYPE_GPU_SCRATCH: { - MemoryRegion* region = new MemoryRegion(false, false, false, this, mem_props[mem_idx]); + MemoryRegion* region = + new MemoryRegion(false, false, false, false, this, mem_props[mem_idx]); regions_.push_back(region); if (region->IsLocalMemory()) { + regions_.push_back( + new MemoryRegion(false, false, false, true, this, mem_props[mem_idx])); // Expose VRAM as uncached/fine grain over PCIe (if enabled) or XGMI. if ((properties_.HiveID != 0) || (core::Runtime::runtime_singleton_->flag().fine_grain_pcie())) { - regions_.push_back(new MemoryRegion(true, false, false, this, mem_props[mem_idx])); + regions_.push_back( + new MemoryRegion(true, false, false, false, this, mem_props[mem_idx])); } } break; @@ -485,8 +528,11 @@ void GpuAgent::InitScratchPool() { void GpuAgent::ReserveScratch() { size_t reserved_sz = core::Runtime::runtime_singleton_->flag().scratch_single_limit(); + size_t available; + HSAKMT_STATUS err = hsaKmtAvailableMemory(node_id(), &available); + assert(err == HSAKMT_STATUS_SUCCESS && "hsaKmtAvailableMemory failed"); ScopedAcquire lock(&scratch_lock_); - if (!scratch_cache_.reserved_bytes() && reserved_sz) { + if (!scratch_cache_.reserved_bytes() && reserved_sz && available > 8 * reserved_sz) { HSAuint64 alt_va; void* reserved_base = scratch_pool_.alloc(reserved_sz); assert(reserved_base && "Could not allocate reserved memory"); @@ -679,6 +725,7 @@ void GpuAgent::InitDma() { // On gfx90a ensure that HostToDevice queue is created first and so is placed on SDMA0. if ((!use_xgmi) && (!isHostToDev) && (isa_->GetMajorVersion() == 9) && (isa_->GetMinorVersion() == 0) && (isa_->GetStepping() == 10)) { + GetBlitObject(BlitHostToDev); *blits_[BlitHostToDev]; } @@ -686,7 +733,12 @@ void GpuAgent::InitDma() { if (ret != nullptr) return ret; } - auto ret = CreateBlitKernel((*queue).get()); + // pending_copy_stat_check_ref_ will prevent unnecessary compute queue creation + // since there is no graceful way to handle lazy loading when the caller needs to know + // the status of available SDMA HW resources without a fallback. + // Call to isSDMA should be used as a proxy error check if !blit_copy_fallback. + auto ret = pending_copy_stat_check_ref_ ? 
new AMD::BlitKernel(NULL) : + CreateBlitKernel((*queue).get()); if (ret == nullptr) throw AMD::hsa_exception(HSA_STATUS_ERROR_OUT_OF_RESOURCES, "Blit creation failed."); return ret; @@ -774,11 +826,34 @@ hsa_status_t GpuAgent::DmaCopy(void* dst, const void* src, size_t size) { return blits_[BlitDevToDev]->SubmitLinearCopyCommand(dst, src, size); } +void GpuAgent::SetCopyRequestRefCount(bool set) { + ScopedAcquire lock(&blit_lock_); + while (pending_copy_stat_check_ref_) { + blit_lock_.Release(); + os::YieldThread(); + blit_lock_.Acquire(); + } + if (!set && pending_copy_req_ref_) pending_copy_req_ref_--; + else pending_copy_req_ref_++; +} + +void GpuAgent::SetCopyStatusCheckRefCount(bool set) { + ScopedAcquire lock(&blit_lock_); + while (pending_copy_req_ref_) { + blit_lock_.Release(); + os::YieldThread(); + blit_lock_.Acquire(); + } + if (!set && pending_copy_stat_check_ref_) pending_copy_stat_check_ref_--; + else pending_copy_stat_check_ref_++; +} + hsa_status_t GpuAgent::DmaCopy(void* dst, core::Agent& dst_agent, const void* src, core::Agent& src_agent, size_t size, std::vector& dep_signals, core::Signal& out_signal) { + SetCopyRequestRefCount(true); // Bind the Blit object that will drive this copy operation lazy_ptr& blit = GetBlitObject(dst_agent, src_agent, size); @@ -789,6 +864,7 @@ hsa_status_t GpuAgent::DmaCopy(void* dst, core::Agent& dst_agent, } hsa_status_t stat = blit->SubmitLinearCopyCommand(dst, src, size, dep_signals, out_signal); + SetCopyRequestRefCount(false); return stat; } @@ -812,28 +888,43 @@ hsa_status_t GpuAgent::DmaCopyOnEngine(void* dst, core::Agent& dst_agent, // check if dst and src are the same gpu or over xGMI. bool is_same_gpu = (src_agent.public_handle().handle == dst_agent.public_handle().handle) && - (dst_agent.public_handle().handle == public_handle_.handle); - bool is_xgmi = !is_same_gpu && - src_agent.device_type() == core::Agent::kAmdGpuDevice && - dst_agent.device_type() == core::Agent::kAmdGpuDevice && - dst_agent.HiveId() && src_agent.HiveId() == dst_agent.HiveId() && + (dst_agent.public_handle().handle == public_handle_.handle); + + bool is_p2p = !is_same_gpu && src_agent.device_type() == core::Agent::kAmdGpuDevice && + dst_agent.device_type() == core::Agent::kAmdGpuDevice; + + if ((is_p2p && + core::Runtime::runtime_singleton_->flag().enable_peer_sdma() == Flag::SDMA_DISABLE) || + core::Runtime::runtime_singleton_->flag().enable_sdma() == Flag::SDMA_DISABLE) { + // Note that VDI/HIP will call DmaCopy instead of DmaCopyOnEngine for P2P copies, but + // we still want to handle force Blit Kernels in this function in case other libraries + // decide to use DmaCopyOnEngine for P2P copies + + engine_offset = BlitDevToDev; + } else { + bool is_xgmi = is_p2p && dst_agent.HiveId() && src_agent.HiveId() == dst_agent.HiveId() && properties_.NumSdmaXgmiEngines; - // Due to a RAS issue, GFX90a can only support H2D copies on SDMA0 - bool is_h2d_blit = (src_agent.device_type() == core::Agent::kAmdCpuDevice && - dst_agent.device_type() == core::Agent::kAmdGpuDevice); - bool limit_h2d_blit = isa_->GetVersion() == core::Isa::Version(9, 0, 10); + // Due to a RAS issue, GFX90a can only support H2D copies on SDMA0 + bool is_h2d_blit = (src_agent.device_type() == core::Agent::kAmdCpuDevice && + dst_agent.device_type() == core::Agent::kAmdGpuDevice); + bool limit_h2d_blit = isa_->GetVersion() == core::Isa::Version(9, 0, 10); + + // Ensure engine selection is within proper range based on transfer type + if ((is_xgmi && engine_offset <= properties_.NumSdmaEngines) || + 
(!is_xgmi && engine_offset > (properties_.NumSdmaEngines + + properties_.NumSdmaXgmiEngines)) || + (!is_h2d_blit && !is_same_gpu && limit_h2d_blit && + engine_offset == BlitHostToDev)) { + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } - // Ensure engine selection is within proper range based on transfer type - if ((is_xgmi && engine_offset <= properties_.NumSdmaEngines) || - (!is_xgmi && engine_offset > properties_.NumSdmaEngines) || - (!is_h2d_blit && !is_same_gpu && limit_h2d_blit && engine_offset == BlitHostToDev)) { - return HSA_STATUS_ERROR_INVALID_ARGUMENT; + engine_offset = is_same_gpu ?(force_copy_on_sdma ? BlitDevToHost : + BlitDevToDev) : engine_offset; } - lazy_ptr& blit = is_same_gpu ? - (force_copy_on_sdma ? blits_[BlitDevToHost] : - blits_[BlitDevToDev]) : blits_[engine_offset]; + SetCopyRequestRefCount(true); + lazy_ptr& blit = GetBlitObject(engine_offset); if (profiling_enabled()) { // Track the agent so we could translate the resulting timestamp to system @@ -842,10 +933,20 @@ hsa_status_t GpuAgent::DmaCopyOnEngine(void* dst, core::Agent& dst_agent, } hsa_status_t stat = blit->SubmitLinearCopyCommand(dst, src, size, dep_signals, out_signal); + SetCopyRequestRefCount(false); return stat; } +bool GpuAgent::DmaEngineIsFree(uint32_t engine_offset) { + SetCopyStatusCheckRefCount(true); + bool is_free = !!!(sdma_blit_used_mask_ & (1 << engine_offset)) || + (blits_[engine_offset]->isSDMA() && + !!!blits_[engine_offset]->PendingBytes()); + SetCopyStatusCheckRefCount(false); + return is_free; +} + hsa_status_t GpuAgent::DmaCopyStatus(core::Agent& dst_agent, core::Agent& src_agent, uint32_t *engine_ids_mask) { assert(((src_agent.device_type() == core::Agent::kAmdGpuDevice) || @@ -857,10 +958,10 @@ hsa_status_t GpuAgent::DmaCopyStatus(core::Agent& dst_agent, core::Agent& src_ag dst_agent.device_type() == core::Agent::kAmdGpuDevice && dst_agent.HiveId() && src_agent.HiveId() == dst_agent.HiveId() && properties_.NumSdmaXgmiEngines) { - // Find a free xGMI SDMA engine + //Find a free xGMI SDMA engine for (int i = 0; i < properties_.NumSdmaXgmiEngines; i++) { - if (!!!blits_[DefaultBlitCount + i]->PendingBytes()) { - *engine_ids_mask |= (HSA_AMD_SDMA_ENGINE_2 << i); + if (DmaEngineIsFree(DefaultBlitCount + i)) { + *engine_ids_mask |= (HSA_AMD_SDMA_ENGINE_2 << i); } } } else { @@ -869,14 +970,24 @@ hsa_status_t GpuAgent::DmaCopyStatus(core::Agent& dst_agent, core::Agent& src_ag // Due to a RAS issue, GFX90a can only support H2D copies on SDMA0 bool limit_h2d_blit = isa_->GetVersion() == core::Isa::Version(9, 0, 10); - if (!!!blits_[BlitHostToDev]->PendingBytes()) { + // Check if H2D is free + if (DmaEngineIsFree(BlitHostToDev)) { if (is_h2d_blit || !limit_h2d_blit) { *engine_ids_mask |= HSA_AMD_SDMA_ENGINE_0; } } - if (!!!blits_[BlitDevToHost]->PendingBytes()) { - *engine_ids_mask |= HSA_AMD_SDMA_ENGINE_1; + // Check is D2H is free + if (DmaEngineIsFree(BlitDevToHost)) { + *engine_ids_mask |= properties_.NumSdmaEngines > 1 ? + HSA_AMD_SDMA_ENGINE_1 : + HSA_AMD_SDMA_ENGINE_0; + } + // Find a free xGMI SDMA engine for H2D/D2H though it may be lower bandwidth + for (int i = 0; i < properties_.NumSdmaXgmiEngines; i++) { + if (DmaEngineIsFree(DefaultBlitCount + i)) { + *engine_ids_mask |= (HSA_AMD_SDMA_ENGINE_2 << i); + } } } @@ -890,10 +1001,14 @@ hsa_status_t GpuAgent::DmaCopyRect(const hsa_pitched_ptr_t* dst, const hsa_dim3_ core::Signal& out_signal) { if (isa_->GetMajorVersion() < 9) return HSA_STATUS_ERROR_INVALID_AGENT; - lazy_ptr& blit = - (dir == hsaHostToDevice) ? 
blits_[BlitHostToDev] : blits_[BlitDevToHost]; + SetCopyRequestRefCount(true); + lazy_ptr& blit = GetBlitObject((dir == hsaHostToDevice) ? BlitHostToDev : + BlitDevToHost); - if (!blit->isSDMA()) return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + if (!blit->isSDMA()) { + SetCopyRequestRefCount(false); + return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + } if (profiling_enabled()) { // Track the agent so we could translate the resulting timestamp to system @@ -904,6 +1019,7 @@ hsa_status_t GpuAgent::DmaCopyRect(const hsa_pitched_ptr_t* dst, const hsa_dim3_ BlitSdmaBase* sdmaBlit = static_cast((*blit).get()); hsa_status_t stat = sdmaBlit->SubmitCopyRectCommand(dst, dst_offset, src, src_offset, range, dep_signals, out_signal); + SetCopyRequestRefCount(false); return stat; } @@ -922,6 +1038,12 @@ hsa_status_t GpuAgent::EnableDmaProfiling(bool enable) { } } + // If we did not update t1 since agent initialization, force a SyncClock. Otherwise computing + // the SystemClockCounter to GPUClockCounter ratio in TranslateTime(tick) results to a division + // by 0. We perform the check here because we do not want to check everytime there is a call to + // TranslateTime(tick) + if (enable && t0_.GPUClockCounter == t1_.GPUClockCounter) SyncClocks(); + return HSA_STATUS_SUCCESS; } @@ -931,6 +1053,9 @@ hsa_status_t GpuAgent::GetInfo(hsa_agent_info_t attribute, void* value) const { // agent, and vendor name length limit excluding terminating nul character. constexpr size_t hsa_name_size = 63; + const bool isa_has_image_support = + (isa_->GetMajorVersion() == 9 && isa_->GetMinorVersion() == 4) ? false : true; + switch (attribute_u) { case HSA_AGENT_INFO_NAME: { std::string name = isa_->GetProcessorName(); @@ -1066,18 +1191,21 @@ hsa_status_t GpuAgent::GetInfo(hsa_agent_info_t attribute, void* value) const { case HSA_EXT_AGENT_INFO_IMAGE_2DADEPTH_MAX_ELEMENTS: case HSA_EXT_AGENT_INFO_IMAGE_3D_MAX_ELEMENTS: case HSA_EXT_AGENT_INFO_IMAGE_ARRAY_MAX_LAYERS: - return hsa_amd_image_get_info_max_dim(public_handle(), attribute, value); + if (!isa_has_image_support) + *((uint32_t*)value) = 0; + else + return hsa_amd_image_get_info_max_dim(public_handle(), attribute, value); + break; case HSA_EXT_AGENT_INFO_MAX_IMAGE_RD_HANDLES: // TODO: hardcode based on OCL constants. - *((uint32_t*)value) = 128; + *((uint32_t*)value) = isa_has_image_support ? 128 : 0; break; case HSA_EXT_AGENT_INFO_MAX_IMAGE_RORW_HANDLES: - // TODO: hardcode based on OCL constants. - *((uint32_t*)value) = 64; + *((uint32_t*)value) = isa_has_image_support ? 64 : 0; break; case HSA_EXT_AGENT_INFO_MAX_SAMPLER_HANDLERS: - // TODO: hardcode based on OCL constants. - *((uint32_t*)value) = 16; + *((uint32_t*)value) = isa_has_image_support ? 
16 : 0; + break; case HSA_AMD_AGENT_INFO_CHIP_ID: *((uint32_t*)value) = properties_.DeviceId; break; @@ -1231,6 +1359,12 @@ hsa_status_t GpuAgent::GetInfo(hsa_agent_info_t attribute, void* value) const { else *((hsa_amd_iommu_version_t*)value) = HSA_IOMMU_SUPPORT_NONE; break; + case HSA_AMD_AGENT_INFO_NUM_XCC: + *((uint32_t*)value) = static_cast(properties_.NumXcc); + break; + case HSA_AMD_AGENT_INFO_DRIVER_UID: + *((uint32_t*)value) = KfdGpuID(); + break; default: return HSA_STATUS_ERROR_INVALID_ARGUMENT; break; @@ -1454,8 +1588,11 @@ void GpuAgent::AcquireQueueScratch(ScratchInfo& scratch) { debug_print("Failed to map requested scratch (%ld) - reducing queue occupancy.\n", scratch.size); const uint64_t num_cus = properties_.NumFComputeCores / properties_.NumSIMDPerCU; + const uint64_t se_per_xcc = properties_.NumShaderBanks / properties_.NumXcc; + const uint64_t total_waves = scratch.size / size_per_wave; - uint64_t waves_per_cu = total_waves / num_cus; + uint64_t waves_per_cu = AlignUp(total_waves / num_cus, scratch.waves_per_group); + while (waves_per_cu != 0) { size_t size = waves_per_cu * num_cus * size_per_wave; void* base = scratch_pool_.alloc_high(size); @@ -1475,7 +1612,14 @@ void GpuAgent::AcquireQueueScratch(ScratchInfo& scratch) { return; } scratch_pool_.free(base); - waves_per_cu = waves_per_cu - scratch.waves_per_group; + + // Wave count must be divisible by #SEs in an XCC. If occupancy must be reduced + // such that waves_per_cu < waves_per_group, continue reducing by #SEs per XCC + // (only allowed if waves_per_group is a multiple #SEs per XCC). + waves_per_cu -= (waves_per_cu <= scratch.waves_per_group && + se_per_xcc < scratch.waves_per_group && + scratch.waves_per_group % se_per_xcc == 0) ? + se_per_xcc : scratch.waves_per_group; } // Failed to allocate minimal scratch @@ -1658,6 +1802,12 @@ void GpuAgent::BindTrapHandler() { AssembleShader("TrapHandlerKfdExceptions", AssembleTarget::ISA, trap_code_buf_, trap_code_buf_size_); } else { + if (isa_->GetMajorVersion() >= 11 || + (isa_->GetMajorVersion() == 9 && isa_->GetMinorVersion() == 4)) { + // No trap handler support without exception handling, soft error. + return; + } + AssembleShader("TrapHandler", AssembleTarget::ISA, trap_code_buf_, trap_code_buf_size_); // Make an empty map from doorbell index to queue. 
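Usage note for the new GetInfo cases added earlier in this file: both attributes are plain uint32_t queries on the caller side. A minimal sketch (hypothetical agent handle, error handling omitted):

uint32_t num_xcc = 0, driver_uid = 0;
hsa_agent_get_info(agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_NUM_XCC, &num_xcc);
hsa_agent_get_info(agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_DRIVER_UID, &driver_uid);
// GPU agents report their XCC count and KFD gpu_id; CPU agents report 0 for both.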
@@ -1728,6 +1878,11 @@ void GpuAgent::InvalidateCodeCaches() { queues_[QueueUtility]->ExecutePM4(cache_inv, cache_inv_size_dw * sizeof(uint32_t)); } +lazy_ptr& GpuAgent::GetBlitObject(uint32_t engine_offset) { + sdma_blit_used_mask_ |= 1 << engine_offset; + return blits_[engine_offset]; +} + lazy_ptr& GpuAgent::GetXgmiBlit(const core::Agent& dst_agent) { // Determine if destination is a member xgmi peers list uint32_t xgmi_engine_cnt = properties_.NumSdmaXgmiEngines; @@ -1739,25 +1894,21 @@ lazy_ptr& GpuAgent::GetXgmiBlit(const core::Agent& dst_agent) { uint64_t dst_handle = dst_agent.public_handle().handle; uint64_t peer_handle = xgmi_peer_list_[idx]->public_handle().handle; if (peer_handle == dst_handle) { - return blits_[(idx % xgmi_engine_cnt) + DefaultBlitCount]; + return GetBlitObject((idx % xgmi_engine_cnt) + DefaultBlitCount); } } // Add agent to the xGMI neighbours list xgmi_peer_list_.push_back(&dst_agent); - return blits_[((xgmi_peer_list_.size() - 1) % xgmi_engine_cnt) + DefaultBlitCount]; + return GetBlitObject(((xgmi_peer_list_.size() - 1) % xgmi_engine_cnt) + DefaultBlitCount); } lazy_ptr& GpuAgent::GetPcieBlit(const core::Agent& dst_agent, const core::Agent& src_agent) { - lazy_ptr& blit = - (src_agent.device_type() == core::Agent::kAmdCpuDevice && - dst_agent.device_type() == core::Agent::kAmdGpuDevice) - ? blits_[BlitHostToDev] // CPU->GPU transfer. - : (src_agent.device_type() == core::Agent::kAmdGpuDevice && - dst_agent.device_type() == core::Agent::kAmdCpuDevice) - ? blits_[BlitDevToHost] // GPU->CPU transfer. - : blits_[BlitDevToHost]; // GPU->GPU transfer. + bool is_h2d = (src_agent.device_type() == core::Agent::kAmdCpuDevice && + dst_agent.device_type() == core::Agent::kAmdGpuDevice); + + lazy_ptr& blit = GetBlitObject(is_h2d ? BlitHostToDev : BlitDevToHost); return blit; } @@ -1776,11 +1927,17 @@ lazy_ptr& GpuAgent::GetBlitObject(const core::Agent& dst_agent, // If the copy is very small then cache flush overheads can dominate. // Choose a (potentially) SDMA enabled engine to avoid cache flushing. if (size < core::Runtime::runtime_singleton_->flag().force_sdma_size()) { - return blits_[BlitDevToHost]; + return GetBlitObject(BlitDevToHost); } return blits_[BlitDevToDev]; } + if (core::Runtime::runtime_singleton_->flag().enable_peer_sdma() == Flag::SDMA_DISABLE + && src_agent.device_type() == core::Agent::kAmdGpuDevice + && dst_agent.device_type() == core::Agent::kAmdGpuDevice) { + return blits_[BlitDevToDev]; + } + // Acquire Hive Id of Src and Dst devices - ignore hive id for CPU devices. // CPU-GPU connections should always use the host (aka pcie) facing SDMA engines, even if the // connection is XGMI. 
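The DmaCopyStatus / DmaCopyOnEngine changes above feed the hsa_amd_memory_copy_engine_status and hsa_amd_memory_async_copy_on_engine entry points. A rough caller-side sketch of how the returned engine mask is meant to be consumed (assumes those public APIs as declared in hsa_ext_amd.h; dst, src, size, the agents, and completion_signal are placeholders set up elsewhere):

uint32_t engine_mask = 0;
if (hsa_amd_memory_copy_engine_status(dst_agent, src_agent, &engine_mask) ==
        HSA_STATUS_SUCCESS &&
    (engine_mask & HSA_AMD_SDMA_ENGINE_2)) {
  // Engine 2 reported free: pin this transfer to it instead of the default H2D/D2H queues.
  hsa_amd_memory_async_copy_on_engine(dst, dst_agent, src, src_agent, size,
                                      0, NULL, completion_signal,
                                      HSA_AMD_SDMA_ENGINE_2,
                                      /*force_copy_on_sdma=*/true);
}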
diff --git a/src/core/runtime/amd_memory_region.cpp b/src/core/runtime/amd_memory_region.cpp index b6f91efa7..2218dfc9f 100644 --- a/src/core/runtime/amd_memory_region.cpp +++ b/src/core/runtime/amd_memory_region.cpp @@ -100,20 +100,28 @@ void MemoryRegion::MakeKfdMemoryUnresident(const void* ptr) { hsaKmtUnmapMemoryToGPU(const_cast(ptr)); } -MemoryRegion::MemoryRegion(bool fine_grain, bool kernarg, bool full_profile, core::Agent* owner, +MemoryRegion::MemoryRegion(bool fine_grain, bool kernarg, bool full_profile, + bool extended_scope_fine_grain, core::Agent* owner, const HsaMemoryProperties& mem_props) : core::MemoryRegion(fine_grain, kernarg, full_profile, owner), mem_props_(mem_props), + extended_scope_fine_grain_(extended_scope_fine_grain), max_single_alloc_size_(0), virtual_size_(0), fragment_allocator_(BlockAllocator(*this)) { virtual_size_ = GetPhysicalSize(); + // extended_scope_fine_grain and fine_grain memory regions are mutually exclusive + assert(!(fine_grain && extended_scope_fine_grain)); + mem_flag_.Value = 0; map_flag_.Value = 0; - static const HSAuint64 kGpuVmSize = (1ULL << 40); + // Bind the memory region based on whether it is + // coarse or fine grain or extended scope fine grain. + mem_flag_.ui32.CoarseGrain = (fine_grain || extended_scope_fine_grain) ? 0 : 1; + if (IsLocalMemory()) { mem_flag_.ui32.PageSize = HSA_PAGE_SIZE_4KB; mem_flag_.ui32.NoSubstitute = 1; @@ -122,6 +130,20 @@ MemoryRegion::MemoryRegion(bool fine_grain, bool kernarg, bool full_profile, cor mem_flag_.ui32.NonPaged = 1; virtual_size_ = kGpuVmSize; + + // If memory region is extended scope fine grained + // mark the page table entries for this memory region + // as MTYPE_UC. Full read and write ordering are guaranteed + // to this address. + if (extended_scope_fine_grain) { + AMD::GpuAgent* agent_ = + const_cast(reinterpret_cast(owner)); + if (agent_->isa()->GetVersion() == core::Isa::Version(9, 4, 0) || + agent_->isa()->GetVersion() == core::Isa::Version(9, 4, 1) || + agent_->isa()->GetVersion() == core::Isa::Version(9, 4, 2)) + mem_flag_.ui32.Uncached = 1; + } + } else if (IsSystem()) { mem_flag_.ui32.PageSize = HSA_PAGE_SIZE_4KB; mem_flag_.ui32.NoSubstitute = 0; @@ -134,8 +156,6 @@ MemoryRegion::MemoryRegion(bool fine_grain, bool kernarg, bool full_profile, cor (full_profile) ? os::GetUserModeVirtualMemorySize() : kGpuVmSize; } - // Bind if memory region is coarse or fine grain - mem_flag_.ui32.CoarseGrain = (fine_grain) ? 
0 : 1; // Adjust allocatable size per page align max_single_alloc_size_ = AlignDown(static_cast(GetPhysicalSize()), kPageSize_); @@ -199,6 +219,13 @@ hsa_status_t MemoryRegion::AllocateImpl(size_t& size, AllocateFlags alloc_flags, useSubAlloc &= ((alloc_flags & (~AllocateRestrict)) == 0); if (useSubAlloc) { *address = fragment_allocator_.alloc(size); + + if ((alloc_flags & AllocateAsan) && + hsaKmtReplaceAsanHeaderPage(*address) != HSAKMT_STATUS_SUCCESS) { + fragment_allocator_.free(*address); + *address = NULL; + return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + } return HSA_STATUS_SUCCESS; } } @@ -253,6 +280,12 @@ hsa_status_t MemoryRegion::AllocateImpl(size_t& size, AllocateFlags alloc_flags, return HSA_STATUS_ERROR_OUT_OF_RESOURCES; } + if ((alloc_flags & AllocateAsan) && + hsaKmtReplaceAsanHeaderPage(*address) != HSAKMT_STATUS_SUCCESS) { + FreeKfdMemory(*address, size); + *address = NULL; + return HSA_STATUS_ERROR_OUT_OF_RESOURCES; + } return HSA_STATUS_SUCCESS; } @@ -304,8 +337,12 @@ hsa_status_t MemoryRegion::GetInfo(hsa_region_info_t attribute, case HSA_HEAPTYPE_SYSTEM: case HSA_HEAPTYPE_FRAME_BUFFER_PUBLIC: case HSA_HEAPTYPE_FRAME_BUFFER_PRIVATE: { - uint32_t ret = fine_grain() ? HSA_REGION_GLOBAL_FLAG_FINE_GRAINED - : HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED; + uint32_t ret = 0; + + ret = fine_grain() ? HSA_REGION_GLOBAL_FLAG_FINE_GRAINED + : extended_scope_fine_grain() ? HSA_REGION_GLOBAL_FLAG_EXTENDED_SCOPE_FINE_GRAINED + : HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED; + if (kernarg()) ret |= HSA_REGION_GLOBAL_FLAG_KERNARG; *((uint32_t*)value) = ret; break; @@ -458,21 +495,16 @@ hsa_amd_memory_pool_access_t MemoryRegion::GetAccessInfo( // Determine access type for device local memory which is // guaranteed to be HSA_HEAPTYPE_FRAME_BUFFER_PUBLIC - // Return disallowed by default if framebuffer is coarse grained - // without regard to type of requesting device (CPU / GPU) - // Return disallowed by default if framebuffer is fine grained - // and requesting device is connected via xGMI link if (IsLocalMemory()) { - // Return disallowed by default if memory is coarse - // grained without regard to link type - if (fine_grain() == false) { + // grained or extended scope fine grained without regard to link type + if (fine_grain() == false) { return HSA_AMD_MEMORY_POOL_ACCESS_DISALLOWED_BY_DEFAULT; } // Return disallowed by default if memory is fine - // grained and link type is xGMI. 
+ // grained and requesting device is connected via xGMI link if (agent.HiveId() == owner()->HiveId()) { return HSA_AMD_MEMORY_POOL_ACCESS_DISALLOWED_BY_DEFAULT; } diff --git a/src/core/runtime/default_signal.cpp b/src/core/runtime/default_signal.cpp index bd2f7cf1f..820fc75ca 100644 --- a/src/core/runtime/default_signal.cpp +++ b/src/core/runtime/default_signal.cpp @@ -42,9 +42,11 @@ #include "core/inc/default_signal.h" #include "core/util/timer.h" -#include +#if defined(__i386__) || defined(__x86_64__) +#include #define MWAITX_ECX_TIMER_ENABLE 0x2 // BIT(1) +#endif namespace rocr { namespace core { @@ -103,7 +105,9 @@ hsa_signal_value_t BusyWaitSignal::WaitRelaxed(hsa_signal_condition_t condition, timer::duration_from_seconds( double(timeout) / double(hsa_freq)); +#if defined(__i386__) || defined(__x86_64__) if (g_use_mwaitx) _mm_monitorx(const_cast(&signal_.value), 0, 0); +#endif while (true) { if (!IsValid()) return 0; @@ -138,11 +142,13 @@ hsa_signal_value_t BusyWaitSignal::WaitRelaxed(hsa_signal_condition_t condition, return hsa_signal_value_t(value); } - if (time - start_time > kMaxElapsed) + if (time - start_time > kMaxElapsed) { os::uSleep(20); - else if (g_use_mwaitx) { +#if defined(__i386__) || defined(__x86_64__) + } else if (g_use_mwaitx) { _mm_mwaitx(0, 60000, MWAITX_ECX_TIMER_ENABLE); // 60000 ~20us on a 1.5Ghz CPU _mm_monitorx(const_cast(&signal_.value), 0, 0); +#endif } } } diff --git a/src/core/runtime/hsa.cpp b/src/core/runtime/hsa.cpp index bcc141457..c509fda5e 100644 --- a/src/core/runtime/hsa.cpp +++ b/src/core/runtime/hsa.cpp @@ -1084,6 +1084,8 @@ hsa_status_t TRY; IS_OPEN(); + core::MemoryRegion::AllocateFlags alloc_flag = core::MemoryRegion::AllocateNoFlags; + if (size == 0 || ptr == NULL) { return HSA_STATUS_ERROR_INVALID_ARGUMENT; } @@ -1091,8 +1093,7 @@ hsa_status_t const core::MemoryRegion* mem_region = core::MemoryRegion::Convert(region); IS_VALID(mem_region); - return core::Runtime::runtime_singleton_->AllocateMemory( - mem_region, size, core::MemoryRegion::AllocateNoFlags, ptr); + return core::Runtime::runtime_singleton_->AllocateMemory(mem_region, size, alloc_flag, ptr); CATCH; } diff --git a/src/core/runtime/hsa_ext_amd.cpp b/src/core/runtime/hsa_ext_amd.cpp index a4feac989..09c5976e4 100644 --- a/src/core/runtime/hsa_ext_amd.cpp +++ b/src/core/runtime/hsa_ext_amd.cpp @@ -761,6 +761,10 @@ hsa_status_t hsa_amd_memory_pool_allocate(hsa_amd_memory_pool_t memory_pool, siz if (flags == HSA_AMD_MEMORY_POOL_PCIE_FLAG) alloc_flag |= core::MemoryRegion::AllocatePCIeRW; +#ifdef SANITIZER_AMDGPU + alloc_flag |= core::MemoryRegion::AllocateAsan; +#endif + return core::Runtime::runtime_singleton_->AllocateMemory(mem_region, size, alloc_flag, ptr); CATCH; } diff --git a/src/core/runtime/interrupt_signal.cpp b/src/core/runtime/interrupt_signal.cpp index 773bbffa1..9d7691aa2 100644 --- a/src/core/runtime/interrupt_signal.cpp +++ b/src/core/runtime/interrupt_signal.cpp @@ -44,9 +44,11 @@ #include "core/inc/runtime.h" #include "core/util/timer.h" #include "core/util/locks.h" -#include +#if defined(__i386__) || defined(__x86_64__) +#include #define MWAITX_ECX_TIMER_ENABLE 0x2 // BIT(1) +#endif namespace rocr { namespace core { @@ -147,8 +149,15 @@ hsa_signal_value_t InterruptSignal::WaitRelaxed( uint32_t prior = waiting_++; MAKE_SCOPE_GUARD([&]() { waiting_--; }); - // Allow only the first waiter to sleep (temporary, known to be bad). 
- if (prior != 0) wait_hint = HSA_WAIT_STATE_ACTIVE; + + uint64_t event_age = 1; + + if (!core::Runtime::runtime_singleton_->KfdVersion().supports_event_age) { + event_age = 0; + // Allow only the first waiter to sleep. Without event age tracking, + // race condition can cause some threads to sleep without wakeup since missing interrupt. + if (prior != 0) wait_hint = HSA_WAIT_STATE_ACTIVE; + } int64_t value; @@ -165,7 +174,10 @@ hsa_signal_value_t InterruptSignal::WaitRelaxed( double(timeout) / double(hsa_freq)); bool condition_met = false; + +#if defined(__i386__) || defined(__x86_64__) if (g_use_mwaitx) _mm_monitorx(const_cast(&signal_.value), 0, 0); +#endif while (true) { if (!IsValid()) return 0; @@ -201,19 +213,23 @@ hsa_signal_value_t InterruptSignal::WaitRelaxed( } if (wait_hint == HSA_WAIT_STATE_ACTIVE) { +#if defined(__i386__) || defined(__x86_64__) if (g_use_mwaitx) { _mm_mwaitx(0, 0, 0); _mm_monitorx(const_cast(&signal_.value), 0, 0); } +#endif continue; } if (time - start_time < kMaxElapsed) { // os::uSleep(20); +#if defined(__i386__) || defined(__x86_64__) if (g_use_mwaitx) { _mm_mwaitx(0, 60000, MWAITX_ECX_TIMER_ENABLE); _mm_monitorx(const_cast(&signal_.value), 0, 0); } +#endif continue; } @@ -222,7 +238,7 @@ hsa_signal_value_t InterruptSignal::WaitRelaxed( uint64_t ct=timer::duration_cast( time_remaining).count(); wait_ms = (ct>0xFFFFFFFEu) ? 0xFFFFFFFEu : ct; - hsaKmtWaitOnEvent(event_, wait_ms); + hsaKmtWaitOnEvent_Ext(event_, wait_ms, &event_age); } } diff --git a/src/core/runtime/isa.cpp b/src/core/runtime/isa.cpp index 7d3cf906e..3674848a7 100755 --- a/src/core/runtime/isa.cpp +++ b/src/core/runtime/isa.cpp @@ -297,6 +297,33 @@ constexpr size_t hsa_name_size = 63; ISAREG_ENTRY_GEN("gfx90c", 9, 0, 12, unsupported, any, 64) ISAREG_ENTRY_GEN("gfx90c:xnack-", 9, 0, 12, unsupported, disabled, 64) ISAREG_ENTRY_GEN("gfx90c:xnack+", 9, 0, 12, unsupported, enabled, 64) + ISAREG_ENTRY_GEN("gfx940", 9, 4, 0, any, any, 64) + ISAREG_ENTRY_GEN("gfx940:xnack-", 9, 4, 0, any, disabled, 64) + ISAREG_ENTRY_GEN("gfx940:xnack+", 9, 4, 0, any, enabled, 64) + ISAREG_ENTRY_GEN("gfx940:sramecc-", 9, 4, 0, disabled, any, 64) + ISAREG_ENTRY_GEN("gfx940:sramecc+", 9, 4, 0, enabled, any, 64) + ISAREG_ENTRY_GEN("gfx940:sramecc-:xnack-", 9, 4, 0, disabled, disabled, 64) + ISAREG_ENTRY_GEN("gfx940:sramecc-:xnack+", 9, 4, 0, disabled, enabled, 64) + ISAREG_ENTRY_GEN("gfx940:sramecc+:xnack-", 9, 4, 0, enabled, disabled, 64) + ISAREG_ENTRY_GEN("gfx940:sramecc+:xnack+", 9, 4, 0, enabled, enabled, 64) + ISAREG_ENTRY_GEN("gfx941", 9, 4, 1, any, any, 64) + ISAREG_ENTRY_GEN("gfx941:xnack-", 9, 4, 1, any, disabled, 64) + ISAREG_ENTRY_GEN("gfx941:xnack+", 9, 4, 1, any, enabled, 64) + ISAREG_ENTRY_GEN("gfx941:sramecc-", 9, 4, 1, disabled, any, 64) + ISAREG_ENTRY_GEN("gfx941:sramecc+", 9, 4, 1, enabled, any, 64) + ISAREG_ENTRY_GEN("gfx941:sramecc-:xnack-", 9, 4, 1, disabled, disabled, 64) + ISAREG_ENTRY_GEN("gfx941:sramecc-:xnack+", 9, 4, 1, disabled, enabled, 64) + ISAREG_ENTRY_GEN("gfx941:sramecc+:xnack-", 9, 4, 1, enabled, disabled, 64) + ISAREG_ENTRY_GEN("gfx941:sramecc+:xnack+", 9, 4, 1, enabled, enabled, 64) + ISAREG_ENTRY_GEN("gfx942", 9, 4, 2, any, any, 64) + ISAREG_ENTRY_GEN("gfx942:xnack-", 9, 4, 2, any, disabled, 64) + ISAREG_ENTRY_GEN("gfx942:xnack+", 9, 4, 2, any, enabled, 64) + ISAREG_ENTRY_GEN("gfx942:sramecc-", 9, 4, 2, disabled, any, 64) + ISAREG_ENTRY_GEN("gfx942:sramecc+", 9, 4, 2, enabled, any, 64) + ISAREG_ENTRY_GEN("gfx942:sramecc-:xnack-", 9, 4, 2, disabled, disabled, 64) + 
ISAREG_ENTRY_GEN("gfx942:sramecc-:xnack+", 9, 4, 2, disabled, enabled, 64) + ISAREG_ENTRY_GEN("gfx942:sramecc+:xnack-", 9, 4, 2, enabled, disabled, 64) + ISAREG_ENTRY_GEN("gfx942:sramecc+:xnack+", 9, 4, 2, enabled, enabled, 64) ISAREG_ENTRY_GEN("gfx1010", 10, 1, 0, unsupported, any, 32) ISAREG_ENTRY_GEN("gfx1010:xnack-", 10, 1, 0, unsupported, disabled, 32) ISAREG_ENTRY_GEN("gfx1010:xnack+", 10, 1, 0, unsupported, enabled, 32) diff --git a/src/core/runtime/runtime.cpp b/src/core/runtime/runtime.cpp index 74d061749..9647c3690 100644 --- a/src/core/runtime/runtime.cpp +++ b/src/core/runtime/runtime.cpp @@ -291,7 +291,7 @@ hsa_status_t Runtime::AllocateMemory(const MemoryRegion* region, size_t size, // Track the allocation result so that it could be freed properly. if (status == HSA_STATUS_SUCCESS) { ScopedAcquire lock(&memory_lock_); - allocation_map_[*address] = AllocationRegion(region, size, size_requested); + allocation_map_[*address] = AllocationRegion(region, size, size_requested, alloc_flags); } return status; @@ -305,6 +305,7 @@ hsa_status_t Runtime::FreeMemory(void* ptr) { const MemoryRegion* region = nullptr; size_t size = 0; std::unique_ptr> notifiers; + MemoryRegion::AllocateFlags alloc_flags = core::MemoryRegion::AllocateNoFlags; { ScopedAcquire lock(&memory_lock_); @@ -317,6 +318,7 @@ hsa_status_t Runtime::FreeMemory(void* ptr) { } region = it->second.region; size = it->second.size; + alloc_flags = it->second.alloc_flags; // Imported fragments can't be released with FreeMemory. if (region == nullptr) { @@ -338,6 +340,9 @@ hsa_status_t Runtime::FreeMemory(void* ptr) { } } + if (alloc_flags & core::MemoryRegion::AllocateAsan) + assert(hsaKmtReturnAsanHeaderPage(ptr) == HSAKMT_STATUS_SUCCESS); + return region->Free(ptr, size); } @@ -690,8 +695,8 @@ hsa_status_t Runtime::GetSystemInfo(hsa_system_info_t attribute, void* value) { // Implemented in KFD in 1.12 if (kfd_version.KernelInterfaceMajorVersion > 1 || - kfd_version.KernelInterfaceMajorVersion == 1 && - kfd_version.KernelInterfaceMinorVersion >= 12) + (kfd_version.KernelInterfaceMajorVersion == 1 && + kfd_version.KernelInterfaceMinorVersion >= 12)) *(reinterpret_cast(value)) = true; else *(reinterpret_cast(value)) = false; @@ -1025,7 +1030,8 @@ hsa_status_t Runtime::IPCAttach(const hsa_amd_ipc_memory_t* handle, size_t len, len = Min(len, importSize - fragOffset); } ScopedAcquire lock(&memory_lock_); - allocation_map_[importAddress] = AllocationRegion(nullptr, len, len); + allocation_map_[importAddress] = + AllocationRegion(nullptr, len, len, core::MemoryRegion::AllocateNoFlags); }; if ((importHandle.handle[6] & 0x80000000) != 0) { @@ -1415,7 +1421,11 @@ hsa_status_t Runtime::Load() { // Assume features are not supported if parse CPUID fails if (!os::ParseCpuID(&cpuinfo)) { - fprintf(stderr, "Failed to parse CPUID\n"); + /* + * This is not a failure, in some environments such as SRIOV, not all CPUID info is + * exposed inside the guest + */ + debug_warning("Parsing CPUID failed."); } flag_.Refresh(); diff --git a/src/core/runtime/signal.cpp b/src/core/runtime/signal.cpp index 486a2a305..3d554a03b 100644 --- a/src/core/runtime/signal.cpp +++ b/src/core/runtime/signal.cpp @@ -197,8 +197,10 @@ uint32_t Signal::WaitAny(uint32_t signal_count, const hsa_signal_t* hsa_signals, for (uint32_t i = 0; i < signal_count; i++) signals[i]->waiting_--; }); - // Allow only the first waiter to sleep (temporary, known to be bad). 
- if (prior != 0) wait_hint = HSA_WAIT_STATE_ACTIVE; + if (!core::Runtime::runtime_singleton_->KfdVersion().supports_event_age) + // Allow only the first waiter to sleep. Without event age tracking, + // race condition can cause some threads to sleep without wakeup since missing interrupt. + if (prior != 0) wait_hint = HSA_WAIT_STATE_ACTIVE; // Ensure that all signals in the list can be slept on. if (wait_hint != HSA_WAIT_STATE_ACTIVE) { @@ -229,6 +231,12 @@ uint32_t Signal::WaitAny(uint32_t signal_count, const hsa_signal_t* hsa_signals, if (signal_count > small_size) delete[] evts; }); + uint64_t event_age[unique_evts]; + memset(event_age, 0, unique_evts * sizeof(uint64_t)); + if (core::Runtime::runtime_singleton_->KfdVersion().supports_event_age) + for (uint32_t i = 0; i < unique_evts; i++) + event_age[i] = 1; + int64_t value; timer::fast_clock::time_point start_time = timer::fast_clock::now(); @@ -310,7 +318,7 @@ uint32_t Signal::WaitAny(uint32_t signal_count, const hsa_signal_t* hsa_signals, uint64_t ct=timer::duration_cast( time_remaining).count(); wait_ms = (ct>0xFFFFFFFEu) ? 0xFFFFFFFEu : ct; - hsaKmtWaitOnMultipleEvents(evts, unique_evts, false, wait_ms); + hsaKmtWaitOnMultipleEvents_Ext(evts, unique_evts, false, wait_ms, event_age); } } diff --git a/src/core/runtime/trap_handler/CMakeLists.txt b/src/core/runtime/trap_handler/CMakeLists.txt index c98e10486..2196cb0e9 100644 --- a/src/core/runtime/trap_handler/CMakeLists.txt +++ b/src/core/runtime/trap_handler/CMakeLists.txt @@ -46,8 +46,8 @@ cmake_minimum_required ( VERSION 3.7 ) find_package(Clang REQUIRED HINTS ${CMAKE_PREFIX_PATH}/llvm PATHS /opt/rocm/llvm ) find_package(LLVM REQUIRED HINTS ${CMAKE_PREFIX_PATH}/llvm PATHS /opt/rocm/llvm ) -set (TARGET_DEVS "gfx900;gfx1010;gfx1030;gfx1100") -set (POSTFIX "9;1010;10;11") +set (TARGET_DEVS "gfx900;gfx940;gfx941;gfx942;gfx1010;gfx1030;gfx1100") +set (POSTFIX "9;940;941;942;1010;10;11") if(${CMAKE_VERBOSE_MAKEFILE}) get_property(clang_path TARGET clang PROPERTY LOCATION) diff --git a/src/core/runtime/trap_handler/trap_handler.s b/src/core/runtime/trap_handler/trap_handler.s index 9e1d56bd4..1803f4fa1 100644 --- a/src/core/runtime/trap_handler/trap_handler.s +++ b/src/core/runtime/trap_handler/trap_handler.s @@ -71,22 +71,27 @@ .set TTMP6_SAVED_TRAP_ID_SIZE , 4 .set TTMP6_SAVED_TRAP_ID_MASK , (((1 << TTMP6_SAVED_TRAP_ID_SIZE) - 1) << TTMP6_SAVED_TRAP_ID_SHIFT) .set TTMP6_SAVED_TRAP_ID_BFE , (TTMP6_SAVED_TRAP_ID_SHIFT | (TTMP6_SAVED_TRAP_ID_SIZE << 16)) -.set TTMP11_PC_HI_SHIFT , 7 -.set TTMP11_DEBUG_ENABLED_SHIFT , 23 + +.set TTMP_PC_HI_SHIFT , 7 +.set TTMP_DEBUG_ENABLED_SHIFT , 23 .if .amdgcn.gfx_generation_number == 9 - .set TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT , 26 + .set TTMP_SAVE_RCNT_FIRST_REPLAY_SHIFT , 26 .set SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT , 15 .set SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK , 0x1F8000 .elseif .amdgcn.gfx_generation_number == 10 && .amdgcn.gfx_generation_minor < 3 - .set TTMP11_SAVE_REPLAY_W64H_SHIFT , 31 - .set TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT , 24 + .set TTMP_SAVE_REPLAY_W64H_SHIFT , 31 + .set TTMP_SAVE_RCNT_FIRST_REPLAY_SHIFT , 24 .set SQ_WAVE_IB_STS_REPLAY_W64H_SHIFT , 25 .set SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT , 15 .set SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK , 0x3F8000 .set SQ_WAVE_IB_STS_REPLAY_W64H_MASK , 0x2000000 .endif +.if .amdgcn.gfx_generation_number == 9 && .amdgcn.gfx_generation_minor >= 4 + .set TTMP11_TTMPS_SETUP_SHIFT , 31 +.endif + // ABI between first and second level trap handler: // ttmp0 = PC[31:0] // ttmp12 = SQ_WAVE_STATUS @@ -94,7 
+99,10 @@ // ttmp15 = TMA[63:32] // gfx9: // ttmp1 = 0[2:0], PCRewind[3:0], HostTrap[0], TrapId[7:0], PC[47:32] +// gfx906/gfx908/gfx90a: // ttmp11 = SQ_WAVE_IB_STS[20:15], 0[1:0], DebugEnabled[0], 0[15:0], NoScratch[0], WaveIdInWG[5:0] +// gfx940/gfx941/gfx942: +// ttmp13 = SQ_WAVE_IB_STS[20:15], 0[1:0], DebugEnabled[0], 0[22:0] // gfx10: // ttmp1 = 0[0], PCRewind[5:0], HostTrap[0], TrapId[7:0], PC[47:32] // gfx1010: @@ -117,7 +125,11 @@ trap_entry: // If llvm.debugtrap and debugger is not attached. s_cmp_eq_u32 ttmp2, TRAP_ID_DEBUGTRAP s_cbranch_scc0 .no_skip_debugtrap - s_bitcmp0_b32 ttmp11, TTMP11_DEBUG_ENABLED_SHIFT +.if (.amdgcn.gfx_generation_number == 9 && .amdgcn.gfx_generation_minor < 4) || .amdgcn.gfx_generation_number == 10 + s_bitcmp0_b32 ttmp11, TTMP_DEBUG_ENABLED_SHIFT +.else + s_bitcmp0_b32 ttmp13, TTMP_DEBUG_ENABLED_SHIFT +.endif s_cbranch_scc0 .no_skip_debugtrap // Ignore llvm.debugtrap. @@ -200,6 +212,7 @@ trap_entry: s_mov_b32 m0, ttmp3 s_nop 0x0 // Manually inserted wait states s_sendmsg sendmsg(MSG_INTERRUPT) + s_waitcnt lgkmcnt(0) // Wait for the message to go out. s_mov_b32 m0, ttmp2 // Parking the wave requires saving the original pc in the preserved ttmps. @@ -212,26 +225,18 @@ trap_entry: // // ttmp7: pc_lo[31:0] // ttmp11: 1st_level_ttmp11[31:23] pc_hi[15:0] 1st_level_ttmp11[6:0] - -.if ((.amdgcn.gfx_generation_number == 10 && .amdgcn.gfx_generation_minor >= 3) || .amdgcn.gfx_generation_number > 10) - s_branch .halt_wave -.else +.if (.amdgcn.gfx_generation_number == 9 && .amdgcn.gfx_generation_minor < 4) || (.amdgcn.gfx_generation_number == 10 && .amdgcn.gfx_generation_minor < 3) || (.amdgcn.gfx_generation_number == 11) // Save the PC s_mov_b32 ttmp7, ttmp0 s_and_b32 ttmp1, ttmp1, SQ_WAVE_PC_HI_ADDRESS_MASK - s_lshl_b32 ttmp1, ttmp1, TTMP11_PC_HI_SHIFT - s_andn2_b32 ttmp11, ttmp11, (SQ_WAVE_PC_HI_ADDRESS_MASK << TTMP11_PC_HI_SHIFT) + s_lshl_b32 ttmp1, ttmp1, TTMP_PC_HI_SHIFT + s_andn2_b32 ttmp11, ttmp11, (SQ_WAVE_PC_HI_ADDRESS_MASK << TTMP_PC_HI_SHIFT) s_or_b32 ttmp11, ttmp11, ttmp1 // Park the wave s_getpc_b64 [ttmp0, ttmp1] s_add_u32 ttmp0, ttmp0, .parked - . s_addc_u32 ttmp1, ttmp1, 0x0 - s_branch .halt_wave - -.parked: - s_trap 0x2 - s_branch .parked .endif .halt_wave: @@ -239,17 +244,29 @@ trap_entry: s_bitset1_b32 ttmp6, TTMP6_WAVE_STOPPED_SHIFT s_bitset1_b32 ttmp12, SQ_WAVE_STATUS_HALT_SHIFT +.if (.amdgcn.gfx_generation_number == 9 && .amdgcn.gfx_generation_minor >= 4) + s_bitcmp1_b32 ttmp11, TTMP11_TTMPS_SETUP_SHIFT + s_cbranch_scc1 .ttmps_initialized + s_mov_b32 ttmp4, 0 + s_mov_b32 ttmp5, 0 + s_bitset1_b32 ttmp11, TTMP11_TTMPS_SETUP_SHIFT +.ttmps_initialized: +.endif + .exit_trap: // Restore SQ_WAVE_IB_STS. 
.if .amdgcn.gfx_generation_number == 9 - s_lshr_b32 ttmp2, ttmp11, (TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT - SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT) +.if .amdgcn.gfx_generation_minor < 4 + s_lshr_b32 ttmp2, ttmp11, (TTMP_SAVE_RCNT_FIRST_REPLAY_SHIFT - SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT) +.else + s_lshr_b32 ttmp2, ttmp13, (TTMP_SAVE_RCNT_FIRST_REPLAY_SHIFT - SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT) +.endif s_and_b32 ttmp2, ttmp2, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK s_setreg_b32 hwreg(HW_REG_IB_STS), ttmp2 -.endif -.if .amdgcn.gfx_generation_number == 10 && .amdgcn.gfx_generation_minor < 3 - s_lshr_b32 ttmp2, ttmp11, (TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT - SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT) +.elseif .amdgcn.gfx_generation_number == 10 && .amdgcn.gfx_generation_minor < 3 + s_lshr_b32 ttmp2, ttmp11, (TTMP_SAVE_RCNT_FIRST_REPLAY_SHIFT - SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT) s_and_b32 ttmp3, ttmp2, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK - s_lshr_b32 ttmp2, ttmp11, (TTMP11_SAVE_REPLAY_W64H_SHIFT - SQ_WAVE_IB_STS_REPLAY_W64H_SHIFT) + s_lshr_b32 ttmp2, ttmp11, (TTMP_SAVE_REPLAY_W64H_SHIFT - SQ_WAVE_IB_STS_REPLAY_W64H_SHIFT) s_and_b32 ttmp2, ttmp2, SQ_WAVE_IB_STS_REPLAY_W64H_MASK s_or_b32 ttmp2, ttmp2, ttmp3 s_setreg_b32 hwreg(HW_REG_IB_STS), ttmp2 @@ -262,3 +279,15 @@ trap_entry: // Return to original (possibly modified) PC. s_rfe_b64 [ttmp0, ttmp1] + +.parked: + s_trap 0x2 + s_branch .parked + +// For gfx11, add padding instructions so we can ensure instruction cache +// prefetch always has something to load. +.if .amdgcn.gfx_generation_number == 11 +.rept (256 - ((. - trap_entry) % 64)) / 4 + s_code_end +.endr +.endif diff --git a/src/core/util/flag.h b/src/core/util/flag.h index 5c5e8f310..507b29901 100644 --- a/src/core/util/flag.h +++ b/src/core/util/flag.h @@ -57,6 +57,7 @@ namespace rocr { class Flag { public: enum SDMA_OVERRIDE { SDMA_DISABLE, SDMA_ENABLE, SDMA_DEFAULT }; + enum SRAMECC_ENABLE { SRAMECC_DISABLED, SRAMECC_ENABLED, SRAMECC_DEFAULT }; // The values are meaningful and chosen to satisfy the thunk API. enum XNACK_REQUEST { XNACK_DISABLE = 0, XNACK_ENABLE = 1, XNACK_UNCHANGED = 2 }; @@ -86,6 +87,9 @@ class Flag { var = os::GetEnvVar("HSA_ENABLE_SDMA"); enable_sdma_ = (var == "0") ? SDMA_DISABLE : ((var == "1") ? SDMA_ENABLE : SDMA_DEFAULT); + var = os::GetEnvVar("HSA_ENABLE_PEER_SDMA"); + enable_peer_sdma_ = (var == "0") ? SDMA_DISABLE : ((var == "1") ? SDMA_ENABLE : SDMA_DEFAULT); + visible_gpus_ = os::GetEnvVar("ROCR_VISIBLE_DEVICES"); filter_visible_gpus_ = os::IsEnvVarSet("ROCR_VISIBLE_DEVICES"); @@ -175,6 +179,10 @@ class Flag { var = os::GetEnvVar("HSA_SVM_PROFILE"); svm_profile_ = var; + var = os::GetEnvVar("HSA_ENABLE_SRAMECC"); + sramecc_enable_ = + (var == "0") ? SRAMECC_DISABLED : ((var == "1") ? SRAMECC_ENABLED : SRAMECC_DEFAULT); + var = os::GetEnvVar("HSA_IMAGE_PRINT_SRD"); image_print_srd_ = (var == "1") ? 
true : false; @@ -220,6 +228,8 @@ class Flag { SDMA_OVERRIDE enable_sdma() const { return enable_sdma_; } + SDMA_OVERRIDE enable_peer_sdma() const { return enable_peer_sdma_; } + std::string visible_gpus() const { return visible_gpus_; } bool filter_visible_gpus() const { return filter_visible_gpus_; } @@ -269,6 +279,8 @@ class Flag { const std::string& svm_profile() const { return svm_profile_; } + SRAMECC_ENABLE sramecc_enable() const { return sramecc_enable_; } + private: bool check_flat_scratch_; bool enable_vm_fault_message_; @@ -295,6 +307,7 @@ class Flag { bool enable_mwaitx_; SDMA_OVERRIDE enable_sdma_; + SDMA_OVERRIDE enable_peer_sdma_; bool filter_visible_gpus_; std::string visible_gpus_; @@ -312,6 +325,8 @@ class Flag { // Indicates user preference for Xnack state. XNACK_REQUEST xnack_; + SRAMECC_ENABLE sramecc_enable_; + // Map GPU index post RVD to its default cu mask. std::map> cu_mask_; diff --git a/src/core/util/lnx/os_linux.cpp b/src/core/util/lnx/os_linux.cpp index 838b61983..f9f842cb5 100644 --- a/src/core/util/lnx/os_linux.cpp +++ b/src/core/util/lnx/os_linux.cpp @@ -60,7 +60,9 @@ #include #include #include "core/inc/runtime.h" +#if defined(__i386__) || defined(__x86_64__) #include +#endif namespace rocr { namespace os { @@ -84,6 +86,7 @@ class os_thread { public: explicit os_thread(ThreadEntry function, void* threadArgument, uint stackSize) : thread(0), lock(nullptr), state(RUNNING) { + int err; std::unique_ptr args(new ThreadArgs); lock = CreateMutex(); if (lock == nullptr) return; @@ -92,45 +95,68 @@ class os_thread { args->entry_function = function; pthread_attr_t attrib; - pthread_attr_init(&attrib); + err = pthread_attr_init(&attrib); + if (err != 0) { + fprintf(stderr, "pthread_attr_init failed: %s\n", strerror(err)); + return; + } if (stackSize != 0) { stackSize = Max(uint(PTHREAD_STACK_MIN), stackSize); stackSize = AlignUp(stackSize, 4096); - int err = pthread_attr_setstacksize(&attrib, stackSize); - assert(err == 0 && "pthread_attr_setstacksize failed."); + err = pthread_attr_setstacksize(&attrib, stackSize); + if (err != 0) { + fprintf(stderr, "pthread_attr_setstacksize failed: %s\n", strerror(err)); + return; + } } if (core::Runtime::runtime_singleton_->flag().override_cpu_affinity()) { int cores = get_nprocs_conf(); cpu_set_t* cpuset = CPU_ALLOC(cores); + if (cpuset == nullptr) { + fprintf(stderr, "CPU_ALLOC failed: %s\n", strerror(errno)); + return; + } CPU_ZERO_S(CPU_ALLOC_SIZE(cores), cpuset); for (int i = 0; i < cores; i++) { CPU_SET(i, cpuset); } - int err = pthread_attr_setaffinity_np(&attrib, CPU_ALLOC_SIZE(cores), cpuset); - assert(err == 0 && "pthread_attr_setaffinity_np failed."); + err = pthread_attr_setaffinity_np(&attrib, CPU_ALLOC_SIZE(cores), cpuset); CPU_FREE(cpuset); + if (err != 0) { + fprintf(stderr, "pthread_attr_setaffinity_np failed: %s\n", strerror(err)); + return; + } } - int err = pthread_create(&thread, &attrib, ThreadTrampoline, args.get()); + err = pthread_create(&thread, &attrib, ThreadTrampoline, args.get()); // Probably a stack size error since system limits can be different from PTHREAD_STACK_MIN // Attempt to grow the stack within reason. 
if ((err == EINVAL) && stackSize != 0) { while (stackSize < 20 * 1024 * 1024) { stackSize *= 2; - pthread_attr_setstacksize(&attrib, stackSize); + err = pthread_attr_setstacksize(&attrib, stackSize); + if (err != 0) { + fprintf(stderr, "pthread_attr_setstacksize failed: %s\n", strerror(err)); + return; + } err = pthread_create(&thread, &attrib, ThreadTrampoline, args.get()); if (err != EINVAL) break; + debug_print("pthread_create returned EINVAL, doubling stack size\n"); } } - pthread_attr_destroy(&attrib); if (err == 0) args.release(); else thread = 0; + + err = pthread_attr_destroy(&attrib); + if (err != 0) { + fprintf(stderr, "pthread_attr_destroy failed: %s\n", strerror(err)); + } } os_thread(os_thread&& rhs) { @@ -145,7 +171,10 @@ class os_thread { ~os_thread() { if (lock != nullptr) DestroyMutex(lock); - if ((state == RUNNING) && (thread != 0)) pthread_detach(thread); + if ((state == RUNNING) && (thread != 0)) { + int err = pthread_detach(thread); + if (err != 0) fprintf(stderr, "pthread_detach failed: %s\n", strerror(err)); + } } bool Valid() { return (lock != nullptr) && (thread != 0); } @@ -192,11 +221,17 @@ void* GetExportAddress(LibHandle lib, std::string export_name) { link_map* map; int err = dlinfo(*(void**)&lib, RTLD_DI_LINKMAP, &map); - assert(err != -1 && "dlinfo failed."); + if (err == -1) { + fprintf(stderr, "dlinfo failed: %s\n", dlerror()); + return nullptr; + } Dl_info info; err = dladdr(ret, &info); - assert(err != 0 && "dladdr failed."); + if (err == 0) { + fprintf(stderr, "dladdr failed.\n"); + return nullptr; + } if (strcmp(info.dli_fname, map->l_name) == 0) return ret; @@ -529,7 +564,10 @@ uint64_t ReadAccurateClock() { if (invPeriod == 0.0) AccurateClockFrequency(); timespec time; int err = clock_gettime(CLOCK_MONOTONIC_RAW, &time); - assert(err == 0 && "clock_gettime(CLOCK_MONOTONIC_RAW,...) failed"); + if (err != 0) { + perror("clock_gettime(CLOCK_MONOTONIC_RAW,...) failed"); + abort(); + } return (uint64_t(time.tv_sec) * 1000000000ull + uint64_t(time.tv_nsec)) * invPeriod; } @@ -558,13 +596,16 @@ uint64_t AccurateClockFrequency() { } timespec time; int err = clock_getres(clock, &time); - assert(err == 0 && "clock_getres(CLOCK_MONOTONIC(_RAW),...) failed"); - assert(time.tv_sec == 0 && - "clock_getres(CLOCK_MONOTONIC(_RAW),...) returned very low frequency " - "(<1Hz)."); - assert(time.tv_nsec < 0xFFFFFFFF && - "clock_getres(CLOCK_MONOTONIC(_RAW),...) returned very low frequency " - "(<1Hz)."); + if (err != 0) { + perror("clock_getres failed"); + abort(); + } + if (time.tv_sec != 0 || time.tv_nsec >= 0xFFFFFFFF) { + fprintf(stderr, + "clock_getres(CLOCK_MONOTONIC(_RAW),...) 
returned very low " + "frequency (<1Hz).\n"); + abort(); + } if (invPeriod == 0.0) invPeriod = 1.0 / double(time.tv_nsec); return 1000000000ull / uint64_t(time.tv_nsec); } @@ -573,16 +614,19 @@ SharedMutex CreateSharedMutex() { pthread_rwlockattr_t attrib; int err = pthread_rwlockattr_init(&attrib); if (err != 0) { - assert(false && "rw lock attribute init failed."); + fprintf(stderr, "rw lock attribute init failed: %s\n", strerror(err)); return nullptr; } err = pthread_rwlockattr_setkind_np(&attrib, PTHREAD_RWLOCK_PREFER_WRITER_NONRECURSIVE_NP); - assert(err == 0 && "Set rw lock attribute failure."); + if (err != 0) { + fprintf(stderr, "Set rw lock attribute failure: %s\n", strerror(err)); + return nullptr; + } pthread_rwlock_t* lock = new pthread_rwlock_t; err = pthread_rwlock_init(lock, &attrib); if (err != 0) { - assert(false && "rw lock init failed."); + fprintf(stderr, "rw lock init failed: %s\n", strerror(err)); return nullptr; } @@ -602,7 +646,10 @@ bool AcquireSharedMutex(SharedMutex lock) { void ReleaseSharedMutex(SharedMutex lock) { int err = pthread_rwlock_unlock(*(pthread_rwlock_t**)&lock); - assert(err == 0 && "SharedMutex unlock failed."); + if (err != 0) { + fprintf(stderr, "SharedMutex unlock failed: %s\n", strerror(err)); + abort(); + } } bool TrySharedAcquireSharedMutex(SharedMutex lock) { @@ -617,7 +664,10 @@ bool SharedAcquireSharedMutex(SharedMutex lock) { void SharedReleaseSharedMutex(SharedMutex lock) { int err = pthread_rwlock_unlock(*(pthread_rwlock_t**)&lock); - assert(err == 0 && "SharedMutex unlock failed."); + if (err != 0) { + fprintf(stderr, "SharedMutex unlock failed: %s\n", strerror(err)); + abort(); + } } void DestroySharedMutex(SharedMutex lock) { @@ -645,8 +695,8 @@ uint64_t SystemClockFrequency() { } bool ParseCpuID(cpuid_t* cpuinfo) { +#if defined(__i386__) || defined(__x86_64__) uint32_t eax, ebx, ecx, edx, max_eax = 0; - memset(cpuinfo, 0, sizeof(*cpuinfo)); /* Make sure current CPU supports at least EAX 4 */ @@ -665,6 +715,9 @@ bool ParseCpuID(cpuid_t* cpuinfo) { } } return true; +#else + return false; +#endif } } // namespace os diff --git a/src/image/blit_kernel.cpp b/src/image/blit_kernel.cpp index 3eea38d6e..067c668f7 100644 --- a/src/image/blit_kernel.cpp +++ b/src/image/blit_kernel.cpp @@ -85,6 +85,9 @@ extern uint8_t ocl_blit_object_gfx908[]; extern uint8_t ocl_blit_object_gfx909[]; extern uint8_t ocl_blit_object_gfx90a[]; extern uint8_t ocl_blit_object_gfx90c[]; +extern uint8_t ocl_blit_object_gfx940[]; +extern uint8_t ocl_blit_object_gfx941[]; +extern uint8_t ocl_blit_object_gfx942[]; extern uint8_t ocl_blit_object_gfx1010[]; extern uint8_t ocl_blit_object_gfx1011[]; extern uint8_t ocl_blit_object_gfx1012[]; @@ -1007,6 +1010,12 @@ hsa_status_t BlitKernel::GetPatchedBlitObject(const char* agent_name, *blit_code_object = ocl_blit_object_gfx90a; } else if (sname == "gfx90c") { *blit_code_object = ocl_blit_object_gfx90c; + } else if (sname == "gfx940") { + *blit_code_object = ocl_blit_object_gfx940; + } else if (sname == "gfx941") { + *blit_code_object = ocl_blit_object_gfx941; + } else if (sname == "gfx942") { + *blit_code_object = ocl_blit_object_gfx942; } else if (sname == "gfx1010") { *blit_code_object = ocl_blit_object_gfx1010; } else if (sname == "gfx1011") { diff --git a/src/image/blit_src/CMakeLists.txt b/src/image/blit_src/CMakeLists.txt index 21f998bf7..1b3c7cfa3 100644 --- a/src/image/blit_src/CMakeLists.txt +++ b/src/image/blit_src/CMakeLists.txt @@ -47,7 +47,7 @@ find_package(Clang REQUIRED HINTS ${CMAKE_PREFIX_PATH}/llvm PATHS 
/opt/rocm/llvm # Determine the target devices if not specified if (NOT DEFINED TARGET_DEVICES) - set (TARGET_DEVICES "gfx700;gfx701;gfx702;gfx801;gfx802;gfx803;gfx805;gfx810;gfx900;gfx902;gfx904;gfx906;gfx908;gfx909;gfx90a;gfx90c;gfx1010;gfx1011;gfx1012;gfx1013;gfx1030;gfx1031;gfx1032;gfx1033;gfx1034;gfx1035;gfx1036;gfx1100;gfx1101;gfx1102;gfx1103") + set(TARGET_DEVICES "gfx700;gfx701;gfx702;gfx801;gfx802;gfx803;gfx805;gfx810;gfx900;gfx902;gfx904;gfx906;gfx908;gfx909;gfx90a;gfx90c;gfx940;gfx941;gfx942;gfx1010;gfx1011;gfx1012;gfx1013;gfx1030;gfx1031;gfx1032;gfx1033;gfx1034;gfx1035;gfx1036;gfx1100;gfx1101;gfx1102;gfx1103") endif() set( TARGET_DEVICES ${TARGET_DEVICES} CACHE STRING "Build targets" FORCE ) diff --git a/src/inc/amd_hsa_elf.h b/src/inc/amd_hsa_elf.h index b2de2a20a..45962b1dd 100644 --- a/src/inc/amd_hsa_elf.h +++ b/src/inc/amd_hsa_elf.h @@ -119,17 +119,19 @@ enum : unsigned { EF_AMDGPU_MACH_AMDGCN_GFX1035 = 0x03d, EF_AMDGPU_MACH_AMDGCN_GFX1034 = 0x03e, EF_AMDGPU_MACH_AMDGCN_GFX90A = 0x03f, - EF_AMDGPU_MACH_AMDGCN_RESERVED_0X40 = 0x040, + EF_AMDGPU_MACH_AMDGCN_GFX940 = 0x040, EF_AMDGPU_MACH_AMDGCN_GFX1100 = 0x041, EF_AMDGPU_MACH_AMDGCN_GFX1013 = 0x042, EF_AMDGPU_MACH_AMDGCN_GFX1103 = 0x044, EF_AMDGPU_MACH_AMDGCN_GFX1036 = 0x045, EF_AMDGPU_MACH_AMDGCN_GFX1101 = 0x046, EF_AMDGPU_MACH_AMDGCN_GFX1102 = 0x047, + EF_AMDGPU_MACH_AMDGCN_GFX941 = 0x04b, + EF_AMDGPU_MACH_AMDGCN_GFX942 = 0x04c, // First/last AMDGCN-based processors. EF_AMDGPU_MACH_AMDGCN_FIRST = EF_AMDGPU_MACH_AMDGCN_GFX600, - EF_AMDGPU_MACH_AMDGCN_LAST = EF_AMDGPU_MACH_AMDGCN_GFX1102, + EF_AMDGPU_MACH_AMDGCN_LAST = EF_AMDGPU_MACH_AMDGCN_GFX942, // Indicates if the "xnack" target feature is enabled for all code contained // in the object. diff --git a/src/inc/hsa.h b/src/inc/hsa.h index 3c0db5d52..a70fd0f06 100644 --- a/src/inc/hsa.h +++ b/src/inc/hsa.h @@ -3218,7 +3218,16 @@ typedef enum { * region, the application must explicitely invoke ::hsa_memory_assign_agent * in order to transfer ownership to that agent for a particular buffer. */ - HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED = 4 + HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED = 4, + + /** + * Updates to memory in this region have extended scope, where the device-scope atomics + * to this memory type act as system-scope with respect to all variables located in + * memory regions of this type. + * Note: On non-compliant systems, the application may still be responsible for performing + * device-specific actions necessary to achieve system-scope coherence. + */ + HSA_REGION_GLOBAL_FLAG_EXTENDED_SCOPE_FINE_GRAINED = 8 } hsa_region_global_flag_t; /** diff --git a/src/inc/hsa_ext_amd.h b/src/inc/hsa_ext_amd.h index 508a45cf3..36f5ae146 100644 --- a/src/inc/hsa_ext_amd.h +++ b/src/inc/hsa_ext_amd.h @@ -51,9 +51,11 @@ /* * - 1.0 - initial version * - 1.1 - dmabuf export + * - 1.2 - hsa_amd_memory_async_copy_on_engine + * - 1.3 - HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_EXTENDED_SCOPE_FINE_GRAINED pool */ #define HSA_AMD_INTERFACE_VERSION_MAJOR 1 -#define HSA_AMD_INTERFACE_VERSION_MINOR 1 +#define HSA_AMD_INTERFACE_VERSION_MINOR 3 #ifdef __cplusplus extern "C" { @@ -387,7 +389,17 @@ typedef enum hsa_amd_agent_info_s { * Queries for version of IOMMU supported by agent. * The type of this attribute is hsa_amd_iommu_version_t. */ - HSA_AMD_AGENT_INFO_IOMMU_SUPPORT = 0xA110 + HSA_AMD_AGENT_INFO_IOMMU_SUPPORT = 0xA110, + /** + * Queries for number of XCCs within the agent. + * The type of this attribute is uint32_t. 
+ */ + HSA_AMD_AGENT_INFO_NUM_XCC = 0xA111, + /** + * Queries for driver unique identifier. + * The type of this attribute is uint32_t. + */ + HSA_AMD_AGENT_INFO_DRIVER_UID = 0xA112 } hsa_amd_agent_info_t; /** @@ -1008,7 +1020,14 @@ typedef enum hsa_amd_memory_pool_global_flag_s { /** * Writes to memory in this pool can be performed by a single agent at a time. */ - HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED = 4 + HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED = 4, + + /** Updates to memory in this memory pool have extended scope, acting as + * system-scope atomics for variables in memory regions of this type. + * Note: On non-compliant systems, device-specific actions may be required + * for system-scope coherence. */ + HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_EXTENDED_SCOPE_FINE_GRAINED = 8, + } hsa_amd_memory_pool_global_flag_t; typedef enum hsa_amd_memory_pool_location_s { @@ -1942,31 +1961,38 @@ typedef struct hsa_amd_pointer_info_s { */ hsa_amd_pointer_type_t type; /* - Base address at which non-host agents may access the allocation. + Base address at which non-host agents may access the allocation. This field is + not meaningful if the type of the allocation is HSA_EXT_POINTER_TYPE_UNKNOWN. */ void* agentBaseAddress; /* - Base address at which the host agent may access the allocation. + Base address at which the host agent may access the allocation. This field is + not meaningful if the type of the allocation is HSA_EXT_POINTER_TYPE_UNKNOWN. */ void* hostBaseAddress; /* - Size of the allocation + Size of the allocation. This field is not meaningful if the type of the allocation + is HSA_EXT_POINTER_TYPE_UNKNOWN. */ size_t sizeInBytes; /* - Application provided value. + Application provided value. This field is not meaningful if the type of the + allocation is HSA_EXT_POINTER_TYPE_UNKNOWN. */ void* userData; /* - Reports an agent which "owns" (ie has preferred access to) the pool in which the allocation was + Reports an agent which "owns" (ie has preferred access to) the pool in which the + allocation was made. When multiple agents share equal access to a pool (ex: multiple CPU agents, or multi-die - GPU boards) any such agent may be returned. + GPU boards) any such agent may be returned. This field is not meaningful if + the type of the allocation is HSA_EXT_POINTER_TYPE_UNKNOWN or if this agent is not available in + this process, for e.g if this agent is masked using ROCR_VISIBLE_DEVICES. */ hsa_agent_t agentOwner; /* Contains a bitfield of hsa_amd_memory_pool_global_flag_t values. - Reports the effective global flags bitmask for the allocation. This field is not meaningful if - the type of the allocation is HSA_EXT_POINTER_TYPE_UNKNOWN. + Reports the effective global flags bitmask for the allocation. This field is not + meaningful if the type of the allocation is HSA_EXT_POINTER_TYPE_UNKNOWN. */ uint32_t global_flags; } hsa_amd_pointer_info_t; @@ -1974,7 +2000,9 @@ typedef struct hsa_amd_pointer_info_s { /** * @brief Retrieves information about the allocation referenced by the given * pointer. Optionally returns the number and list of agents which can - * directly access the allocation. + * directly access the allocation. In case this virtual address is unknown, the + * pointer type returned will be HSA_EXT_POINTER_TYPE_UNKNOWN and the only fields + * that are valid after hsa_amd_pointer_info returns are size and type. * * @param[in] ptr Pointer which references the allocation to retrieve info for. 
* @@ -2047,9 +2075,10 @@ typedef struct hsa_amd_ipc_memory_s { * region has been attached (via hsa_amd_ipc_memory_attach) in the remote * process prior to releasing that memory in the local process. * Repeated calls for the same allocation may, but are not required to, return - * unique handles. + * unique handles. The allocation needs to be on memory on an agent of type + * HSA_DEVICE_TYPE_GPU. * - * @param[in] ptr Pointer to memory allocated via ROCr APIs to prepare for + * @param[in] ptr Pointer to device memory allocated via ROCr APIs to prepare for * sharing. * * @param[in] len Length in bytes of the allocation to share. diff --git a/src/inc/hsa_ven_amd_aqlprofile.h b/src/inc/hsa_ven_amd_aqlprofile.h index 169ab5278..7aea7858f 100644 --- a/src/inc/hsa_ven_amd_aqlprofile.h +++ b/src/inc/hsa_ven_amd_aqlprofile.h @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2017-2020, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. -// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL @@ -122,6 +122,10 @@ typedef enum { HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GCR = 30, HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_GUS = 31, + // UMC & MMEA System Blocks + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_UMC = 32, + HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_MMEA = 33, + HSA_VEN_AMD_AQLPROFILE_BLOCKS_NUMBER } hsa_ven_amd_aqlprofile_block_name_t; diff --git a/src/libamdhsacode/amd_elf_image.cpp b/src/libamdhsacode/amd_elf_image.cpp index 33871dd2b..831c1f7a7 100644 --- a/src/libamdhsacode/amd_elf_image.cpp +++ b/src/libamdhsacode/amd_elf_image.cpp @@ -711,16 +711,38 @@ namespace elf { GElfStringTable* shstrtab() override; GElfStringTable* strtab() override; - GElfSymbolTable* getSymtab(uint16_t index) override + GElfSymbolTable* getReferencedSymbolTable(uint16_t index) { return static_cast(section(index)); } + GElfSymbolTable* getSymtab(uint16_t index) override + { + if (section(index)->type() == SHT_SYMTAB) + return static_cast(section(index)); + return nullptr; + } + GElfSymbolTable* getDynsym(uint16_t index) override + { + if (section(index)->type() == SHT_DYNSYM) + return static_cast(section(index)); + return nullptr; + } + + GElfSymbolTable* getSymbolTable() override; + GElfSymbolTable* getSymbolTable(uint16_t index) override + { + const char *UseDynsym = getenv("LOADER_USE_DYNSYM"); + if (UseDynsym && std::strncmp(UseDynsym, "0", 1) != 0) + return getDynsym(index); + return getSymtab(index); + } GElfStringTable* addStringTable(const std::string& name) override; GElfStringTable* getStringTable(uint16_t index) override; GElfSymbolTable* addSymbolTable(const std::string& name, StringTable* stab = 0) override; GElfSymbolTable* symtab() override; + GElfSymbolTable* dynsym() override; GElfSegment* segment(size_t i) override { return segments[i].get(); } Segment* segmentByVAddr(uint64_t vaddr) override; @@ -759,6 +781,7 @@ namespace elf { GElfStringTable* shstrtabSection; GElfStringTable* strtabSection; GElfSymbolTable* symtabSection; + GElfSymbolTable* dynsymSection; GElfNoteSection* noteSection; std::vector> segments; std::vector> sections; @@ -1242,7 +1265,7 @@ namespace elf { bool GElfRelocationSection::pullData() { section = elf->section(hdr.sh_info); - symtab = elf->getSymtab(hdr.sh_link); + symtab = elf->getReferencedSymbolTable(hdr.sh_link); Elf_Scn *lScn = elf_getscn(elf->e, ndxscn); assert(lScn); Elf_Data *lData = elf_getdata(lScn, nullptr); @@ -1261,6 +1284,7 @@ namespace elf { e(0), shstrtabSection(0), strtabSection(0), symtabSection(0), + dynsymSection(0), noteSection(0) { if (EV_NONE == elf_version(EV_CURRENT)) { @@ -1436,6 +1460,7 @@ namespace elf { if (section->type() == SHT_STRTAB) { strtabSection = static_cast(section.get()); } if (section->type() == SHT_SYMTAB) { symtabSection = static_cast(section.get()); } if (section->type() == SHT_NOTE) { noteSection = static_cast(section.get()); } + if (section->type() == SHT_DYNSYM) { dynsymSection = static_cast(section.get()); } } size_t phnum; @@ -1553,7 +1578,7 @@ namespace elf { } } - GElfStringTable* GElfImage::addStringTable(const std::string& name) + GElfStringTable* GElfImage::addStringTable(const std::string& name) { GElfStringTable* stab = new GElfStringTable(this); sections.push_back(std::unique_ptr(stab)); @@ -1597,6 +1622,21 @@ namespace elf { return symtabSection; } + GElfSymbolTable* GElfImage::dynsym() + { + if (!dynsymSection) { + dynsymSection = addSymbolTable(".dynsym", strtab()); + } + return dynsymSection; + } + + GElfSymbolTable* GElfImage::getSymbolTable() + { + const char *UseDynsym = getenv("LOADER_USE_DYNSYM"); + 
if (UseDynsym && std::strncmp(UseDynsym, "0", 1) != 0) + return dynsym(); + return symtab(); + } GElfNoteSection* GElfImage::note() { diff --git a/src/libamdhsacode/amd_hsa_code.cpp b/src/libamdhsacode/amd_hsa_code.cpp index 64eac2e65..da0a6532e 100644 --- a/src/libamdhsacode/amd_hsa_code.cpp +++ b/src/libamdhsacode/amd_hsa_code.cpp @@ -580,6 +580,9 @@ namespace code { case ELF::EF_AMDGPU_MACH_AMDGCN_GFX909: MI.Name = "gfx909"; MI.XnackSupported = true; MI.SrameccSupported = false; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX90A: MI.Name = "gfx90a"; MI.XnackSupported = true; MI.SrameccSupported = true; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX90C: MI.Name = "gfx90c"; MI.XnackSupported = true; MI.SrameccSupported = false; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX940: MI.Name = "gfx940"; MI.XnackSupported = true; MI.SrameccSupported = true; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX941: MI.Name = "gfx941"; MI.XnackSupported = true; MI.SrameccSupported = true; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX942: MI.Name = "gfx942"; MI.XnackSupported = true; MI.SrameccSupported = true; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1010: MI.Name = "gfx1010"; MI.XnackSupported = true; MI.SrameccSupported = false; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1011: MI.Name = "gfx1011"; MI.XnackSupported = true; MI.SrameccSupported = false; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1012: MI.Name = "gfx1012"; MI.XnackSupported = true; MI.SrameccSupported = false; break; @@ -643,14 +646,6 @@ namespace code { mach = ELF::EF_AMDGPU_MACH_AMDGCN_GFX906; else if (old_name == "AMD:AMDGPU:9:0:12") mach = ELF::EF_AMDGPU_MACH_AMDGCN_GFX90C; - else if (old_name == "AMD:AMDGPU:11:0:0") - mach = ELF::EF_AMDGPU_MACH_AMDGCN_GFX1100; - else if (old_name == "AMD:AMDGPU:11:0:1") - mach = ELF::EF_AMDGPU_MACH_AMDGCN_GFX1101; - else if (old_name == "AMD:AMDGPU:11:0:2") - mach = ELF::EF_AMDGPU_MACH_AMDGCN_GFX1102; - else if (old_name == "AMD:AMDGPU:11:0:3") - mach = ELF::EF_AMDGPU_MACH_AMDGCN_GFX1103; else { // Code object v2 only supports asics up to gfx906 plus gfx90c. Do NOT // add handling of new asics into this if-else-if* block. 
@@ -1770,8 +1765,8 @@ namespace code { hsatext = sec; } } - for (size_t i = 0; i < img->symtab()->symbolCount(); ++i) { - amd::elf::Symbol* elfsym = img->symtab()->symbol(i); + for (size_t i = 0; i < img->getSymbolTable()->symbolCount(); ++i) { + amd::elf::Symbol* elfsym = img->getSymbolTable()->symbol(i); Symbol* sym = 0; switch (elfsym->type()) { case STT_AMDGPU_HSA_KERNEL: { diff --git a/src/libamdhsacode/amd_hsa_code_util.cpp b/src/libamdhsacode/amd_hsa_code_util.cpp index 150840dd4..a1247de7f 100644 --- a/src/libamdhsacode/amd_hsa_code_util.cpp +++ b/src/libamdhsacode/amd_hsa_code_util.cpp @@ -957,12 +957,8 @@ int OpenTempFile(const char* prefix) #ifdef _WIN32 char dir[MAX_PATH+1]; if (!GetTempPath(sizeof(dir), dir)) { return -1; } -#else // _WIN32 - char *dir = NULL; -#endif // _WIN32 char *name = _tempnam(dir, tname.c_str()); if (!name) { return -1; } -#ifdef _WIN32 HANDLE h = CreateFile( name, GENERIC_READ | GENERIC_WRITE, @@ -975,10 +971,10 @@ int OpenTempFile(const char* prefix) if (h == INVALID_HANDLE_VALUE) { continue; } return _open_osfhandle((intptr_t)h, 0); #else // _WIN32 - int d = _open(name, O_RDWR | O_CREAT | O_EXCL, S_IRUSR | S_IWUSR); - if (d < 0) { free(name); continue; } - if (unlink(name) < 0) { free(name); _close(d); return -1; } - free(name); + tname += "XXXXXX"; + int d = mkstemp((char*)tname.c_str()); + if (d < 0) { continue; } + if (unlink(tname.c_str()) < 0) { _close(d); return -1; } return d; #endif // _WIN32 } diff --git a/src/loader/executable.cpp b/src/loader/executable.cpp index dbad4b93b..2a3b96f24 100644 --- a/src/loader/executable.cpp +++ b/src/loader/executable.cpp @@ -80,7 +80,9 @@ __attribute__((noinline)) static void _loader_debug_state() { // 5: New trap handler ABI. Save the PC in ttmp11[22:7] ttmp6[31:0], and park the wave if stopped // 6: New trap handler ABI. ttmp6[25:0] contains dispatch index modulo queue size // 7: New trap handler ABI. Send interrupts as a bitmask, coalescing concurrent exceptions. -HSA_API r_debug _amdgpu_r_debug = {7, +// 8: New trap handler ABI. For gfx940: Initialize ttmp[4:5] if ttmp11[31] == 0. +// 9: New trap handler ABI. For gfx11: Save the PC in ttmp11[22:7] ttmp6[31:0], and park the wave if stopped. +HSA_API r_debug _amdgpu_r_debug = {9, nullptr, reinterpret_cast<uintptr_t>(&_loader_debug_state), r_debug::RT_CONSISTENT,
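
The hunks above introduce HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_EXTENDED_SCOPE_FINE_GRAINED (and the matching HSA_REGION_GLOBAL_FLAG_EXTENDED_SCOPE_FINE_GRAINED), plus the HSA_AMD_AGENT_INFO_NUM_XCC and HSA_AMD_AGENT_INFO_DRIVER_UID agent queries. The sketch below is not part of the patch; it only illustrates how a client might probe for and use these additions once the change is in place. It assumes an initialized HSA runtime and a valid GPU agent handle; gpu_agent, find_ext_scope_pool and example are illustrative names, the include paths assume installed ROCm headers, and error handling is abbreviated.

#include <hsa/hsa.h>
#include <hsa/hsa_ext_amd.h>

// Callback: remember the first pool that advertises the extended-scope
// fine-grained flag added to hsa_ext_amd.h above.
static hsa_status_t find_ext_scope_pool(hsa_amd_memory_pool_t pool, void* data) {
  uint32_t flags = 0;
  hsa_status_t err =
      hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &flags);
  if (err != HSA_STATUS_SUCCESS) return err;
  if (flags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_EXTENDED_SCOPE_FINE_GRAINED) {
    *static_cast<hsa_amd_memory_pool_t*>(data) = pool;
    return HSA_STATUS_INFO_BREAK;  // found one, stop iterating
  }
  return HSA_STATUS_SUCCESS;
}

// Illustrative use on a GPU agent: query the new per-agent attributes and
// allocate from an extended-scope fine-grained pool if one is exposed.
static void example(hsa_agent_t gpu_agent) {
  uint32_t num_xcc = 0, driver_uid = 0;
  hsa_agent_get_info(gpu_agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_NUM_XCC, &num_xcc);
  hsa_agent_get_info(gpu_agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_DRIVER_UID, &driver_uid);

  // Zero handle is used here only as a "not found" sentinel for the sketch.
  hsa_amd_memory_pool_t pool = {0};
  hsa_amd_agent_iterate_memory_pools(gpu_agent, find_ext_scope_pool, &pool);
  if (pool.handle == 0) return;  // no such pool (e.g. pre-gfx940 device or older KFD)

  void* buf = nullptr;
  if (hsa_amd_memory_pool_allocate(pool, 4096, 0, &buf) == HSA_STATUS_SUCCESS) {
    // Per the new flag's documentation, device-scope atomics on this
    // allocation act as system scope on compliant systems.
    hsa_amd_memory_pool_free(buf);
  }
}

Because the flag is only reported on devices and driver versions that support it (the gfx940-class parts targeted by this patch), probing the pool flags and falling back to an ordinary fine-grained pool is the safe adoption path.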