charmplusplus · trquinn · Oct 11, 2023 · Oct 10, 2023 · Oct 10, 2023
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -703,6 +703,9 @@ if(CUDA)
     set(CUDA_DIR "${CUDA_TOOLKIT_ROOT_DIR}")
   endif()
   add_library(cudahybridapi ${hybridAPI-cxx-sources})
+  if(TRACING)  # add the define only when TRACING evaluates to true
+    target_compile_definitions(cudahybridapi PRIVATE HAPI_TRACE)  # PRIVATE: used when compiling cudahybridapi's own sources, not propagated to targets that link it
+  endif()  # NOTE(review): gpumanager.h is also guarded by HAPI_TRACE; confirm nothing outside cudahybridapi includes it without this define
 endif()
 
 if(EXISTS ${CMAKE_SOURCE_DIR}/src/arch/${VDIR}/conv-mach-pxshm.sh)

diff --git a/examples/charm++/cuda/hello/Makefile b/examples/charm++/cuda/hello/Makefile
@@ -2,7 +2,7 @@
 CHARMC = ../../../../bin/charmc $(USE_WR) $(OPTS)
 
 # use deprecated hapiWorkRequest for GPU offloading
-USE_WR = #-DUSE_WR
+USE_WR = #-DUSE_WR -DHAPI_TRACE
 
 # set CUDATOOLKIT_HOME to the CUDA toolkit directory
 CUDATOOLKIT_HOME ?= /usr/local/cuda

diff --git a/examples/charm++/cuda/hello/hello.C b/examples/charm++/cuda/hello/hello.C
@@ -6,7 +6,7 @@
 /* readonly */ int nElements;
 /* readonly */ CProxy_Hello arr;
 
-extern void kernelSetup(cudaStream_t stream, void* cb);
+extern void kernelSetup(cudaStream_t stream, const CkCallback& cb);  // defined in hello.cu; the callback is passed by const reference
 
 /* mainchare */
 class Main : public CBase_Main {
@@ -66,9 +66,9 @@ class Hello : public CBase_Hello {
         thisIndex, CkMyPe(), device, prop.name);
 
     CkArrayIndex1D myIndex = CkArrayIndex1D(thisIndex);
-    CkCallback* cb = new CkCallback(CkIndex_Hello::pass(), myIndex, thisArrayID);
+    CkCallback cb(CkIndex_Hello::pass(), myIndex, thisArrayID);
 
-    kernelSetup(stream, (void*)cb);
+    kernelSetup(stream, cb);
   }
 
   void pass() {

diff --git a/examples/charm++/cuda/hello/hello.cu b/examples/charm++/cuda/hello/hello.cu
@@ -9,7 +9,7 @@ void runHello(struct hapiWorkRequest* wr, cudaStream_t kernel_stream,
   helloKernel<<<wr->grid_dim, wr->block_dim, wr->shared_mem, kernel_stream>>>();
 }
 
-void kernelSetup(cudaStream_t stream, void* cb) {
+void kernelSetup(cudaStream_t stream, const CkCallback& cb) {
 #ifdef USE_WR
   // DEPRECATED
   hapiWorkRequest* wr = hapiCreateWorkRequest();

diff --git a/src/arch/cuda/hybridAPI/gpumanager.h b/src/arch/cuda/hybridAPI/gpumanager.h
@@ -44,6 +44,33 @@ struct cuda_ipc_device_info {
   void* buffer;
 };
 
+#ifdef HAPI_TRACE
+#define QUEUE_SIZE_INIT 128  // NOTE(review): no use is visible in this patch; the name suggests an initial queue capacity - confirm
+extern "C" int traceRegisterUserEvent(const char* x, int e);  // declaration only (C linkage); the definition is not part of this header
+extern "C" void traceUserBracketEvent(int e, double beginT, double endT);  // declaration only (C linkage); the definition is not part of this header
+
+typedef struct gpuEventTimer {  // record with a stage, start/end times, an event type and a name; only compiled when HAPI_TRACE is defined
+  int stage;  // presumably a WorkRequestStage value - confirm against users
+  double cmi_start_time;  // start time; presumably a CmiWallTimer() reading - confirm
+  double cmi_end_time;  // end time; presumably a CmiWallTimer() reading - confirm
+  int event_type;  // presumably a ProfilingStage value - confirm against users
+  const char* trace_name;  // name string; ownership is not visible here
+} gpuEventTimer;
+#endif
+
+// Stages of a work request, used for profiling. Not conditional on HAPI_TRACE: declared after the guard's #endif.
+enum WorkRequestStage{
+  DataSetup        = 1,
+  KernelExecution  = 2,
+  DataCleanup      = 3
+};
+
+enum ProfilingStage{  // NOTE(review): values look like trace user-event ids (cf. traceRegisterUserEvent) - confirm; not conditional on HAPI_TRACE
+  GpuMemSetup   = 8800,
+  GpuKernelExec = 8801,
+  GpuMemCleanup = 8802
+};
+
 // Contains per-process data and methods needed by HAPI.
 struct GPUManager {
   std::vector<BufferPool> mempool_free_bufs_;

diff --git a/src/arch/cuda/hybridAPI/hapi_impl.cpp b/src/arch/cuda/hybridAPI/hapi_impl.cpp
@@ -25,36 +25,9 @@
 extern "C" double CmiWallTimer();
 #endif
 
-#ifdef HAPI_TRACE
-#define QUEUE_SIZE_INIT 128
-extern "C" int traceRegisterUserEvent(const char* x, int e);
-extern "C" void traceUserBracketEvent(int e, double beginT, double endT);
-
-typedef struct gpuEventTimer {
-  int stage;
-  double cmi_start_time;
-  double cmi_end_time;
-  int event_type;
-  const char* trace_name;
-} gpuEventTimer;
-#endif
-
 static void createPool(int *nbuffers, int n_slots, std::vector<BufferPool> &pools);
 static void releasePool(std::vector<BufferPool> &pools);
 
-// Event stages used for profiling.
-enum WorkRequestStage{
-  DataSetup        = 1,
-  KernelExecution  = 2,
-  DataCleanup      = 3
-};
-
-enum ProfilingStage{
-  GpuMemSetup   = 8800,
-  GpuKernelExec = 8801,
-  GpuMemCleanup = 8802
-};
-
 #ifdef HAPI_CUDA_CALLBACK
 struct hapiCallbackMessage {
   char header[CmiMsgHeaderSizeBytes];