diff --git a/src/ecg_library/ecg.c b/src/ecg_library/ecg.c index 5a5fd221..a17dbc34 100644 --- a/src/ecg_library/ecg.c +++ b/src/ecg_library/ecg.c @@ -9,9 +9,10 @@ #include "../utils/utils.h" #include -#ifdef COMPILE_CUDA +#if defined(COMPILE_CUDA) || defined(COMPILE_SYCL) #include "../gpu_utils/gpu_utils.h" +#include "../gpu_utils/accel_utils.h" // Precision to be used for the calculations on the GPU #ifdef CELL_MODEL_REAL_DOUBLE @@ -122,8 +123,8 @@ static void fill_discretization_matrix_elements(struct cell_node *grid_cell, voi for(size_t i = 0; i < max_elements; i++) { if(cell_elements[i].column == position) { - p = i; - insert = true; + p = i; + insert = true; break; } } @@ -283,7 +284,7 @@ END_CALC_ECG(end_pseudo_bidomain_cpu) { arrfree(PSEUDO_BIDOMAIN_DATA->leads); } -#ifdef COMPILE_CUDA +#if defined(COMPILE_CUDA) || defined(COMPILE_SYCL) INIT_CALC_ECG(init_pseudo_bidomain_gpu) { @@ -298,8 +299,8 @@ INIT_CALC_ECG(init_pseudo_bidomain_gpu) { //This is allocated when using the CPU code, but we do not need it in the gpu version free(PSEUDO_BIDOMAIN_DATA->beta_im); - check_cublas_error(cusparseCreate(&(PSEUDO_BIDOMAIN_DATA->cusparseHandle))); - check_cublas_error(cublasCreate(&(PSEUDO_BIDOMAIN_DATA->cublasHandle))); + create_sparse_handle(&(PSEUDO_BIDOMAIN_DATA->sparseHandle)); + create_blas_handle(&(PSEUDO_BIDOMAIN_DATA->blasHandle)); int_array I = NULL, J = NULL; f32_array val = NULL; @@ -318,31 +319,31 @@ INIT_CALC_ECG(init_pseudo_bidomain_gpu) { PSEUDO_BIDOMAIN_DATA->nz = nz; - check_cuda_error(cudaMalloc((void **)&(PSEUDO_BIDOMAIN_DATA->d_col), nz * sizeof(int))); - check_cuda_error(cudaMalloc((void **)&(PSEUDO_BIDOMAIN_DATA->d_row), (N + 1) * sizeof(int))); - check_cuda_error(cudaMalloc((void **)&(PSEUDO_BIDOMAIN_DATA->d_val), nz * sizeof(real))); - check_cuda_error(cudaMalloc((void **)&(PSEUDO_BIDOMAIN_DATA->beta_im), N * sizeof(real))); - check_cuda_error(cudaMalloc((void **)&(PSEUDO_BIDOMAIN_DATA->d_distances), PSEUDO_BIDOMAIN_DATA->n_leads * N * sizeof(real))); - check_cuda_error(cudaMalloc((void **)&(PSEUDO_BIDOMAIN_DATA->d_volumes), N * sizeof(real))); - check_cuda_error(cudaMalloc((void **)&(PSEUDO_BIDOMAIN_DATA->tmp_data), N * sizeof(real))); + malloc_device((void **)&(PSEUDO_BIDOMAIN_DATA->d_col), nz * sizeof(int)); + malloc_device((void **)&(PSEUDO_BIDOMAIN_DATA->d_row), (N + 1) * sizeof(int)); + malloc_device((void **)&(PSEUDO_BIDOMAIN_DATA->d_val), nz * sizeof(real)); + malloc_device((void **)&(PSEUDO_BIDOMAIN_DATA->beta_im), N * sizeof(real)); + malloc_device((void **)&(PSEUDO_BIDOMAIN_DATA->d_distances), PSEUDO_BIDOMAIN_DATA->n_leads * N * sizeof(real)); + malloc_device((void **)&(PSEUDO_BIDOMAIN_DATA->d_volumes), N * sizeof(real)); + malloc_device((void **)&(PSEUDO_BIDOMAIN_DATA->tmp_data), N * sizeof(real)); -#if CUBLAS_VER_MAJOR >= 11 +#if defined(COMPILE_CUDA) && CUBLAS_VER_MAJOR >= 11 check_cuda_error(cusparseCreateCsr(&(PSEUDO_BIDOMAIN_DATA->matA), N, N, nz, PSEUDO_BIDOMAIN_DATA->d_row, PSEUDO_BIDOMAIN_DATA->d_col, PSEUDO_BIDOMAIN_DATA->d_val, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUBLAS_SIZE)); check_cuda_error(cusparseCreateDnVec(&(PSEUDO_BIDOMAIN_DATA->vec_beta_im), N, PSEUDO_BIDOMAIN_DATA->beta_im, CUBLAS_SIZE)); check_cuda_error(cusparseCreateDnVec(&(PSEUDO_BIDOMAIN_DATA->vec_vm), N, the_ode_solver->sv, CUBLAS_SIZE)); -#else +#elif defined(COMPILE_CUDA) check_cuda_error((cudaError_t)cusparseCreateMatDescr(&(PSEUDO_BIDOMAIN_DATA->descr))); cusparseSetMatType(PSEUDO_BIDOMAIN_DATA->descr, CUSPARSE_MATRIX_TYPE_GENERAL); cusparseSetMatIndexBase(PSEUDO_BIDOMAIN_DATA->descr, CUSPARSE_INDEX_BASE_ZERO); PSEUDO_BIDOMAIN_DATA->local_sv = the_ode_solver->sv; #endif - check_cuda_error(cudaMemcpy(PSEUDO_BIDOMAIN_DATA->d_col, J, nz * sizeof(int), cudaMemcpyHostToDevice)); // JA - check_cuda_error(cudaMemcpy(PSEUDO_BIDOMAIN_DATA->d_row, I, (N + 1) * sizeof(int), cudaMemcpyHostToDevice)); // IA - check_cuda_error(cudaMemcpy(PSEUDO_BIDOMAIN_DATA->d_val, new_val, nz * sizeof(real), cudaMemcpyHostToDevice)); // A - check_cuda_error(cudaMemcpy(PSEUDO_BIDOMAIN_DATA->d_distances, PSEUDO_BIDOMAIN_DATA->distances, PSEUDO_BIDOMAIN_DATA->n_leads * N * sizeof(real), - cudaMemcpyHostToDevice)); + memcpy_device(PSEUDO_BIDOMAIN_DATA->d_col, J, nz * sizeof(int), HOST_TO_DEVICE); // JA + memcpy_device(PSEUDO_BIDOMAIN_DATA->d_row, I, (N + 1) * sizeof(int), HOST_TO_DEVICE); // IA + memcpy_device(PSEUDO_BIDOMAIN_DATA->d_val, new_val, nz * sizeof(real), HOST_TO_DEVICE); // A + memcpy_device(PSEUDO_BIDOMAIN_DATA->d_distances, PSEUDO_BIDOMAIN_DATA->distances, PSEUDO_BIDOMAIN_DATA->n_leads * N * sizeof(real), + HOST_TO_DEVICE); PSEUDO_BIDOMAIN_DATA->volumes = MALLOC_ARRAY_OF_TYPE(real, N); @@ -353,17 +354,17 @@ INIT_CALC_ECG(init_pseudo_bidomain_gpu) { PSEUDO_BIDOMAIN_DATA->volumes[i] = d.x * d.y * d.z; } - check_cuda_error(cudaMemcpy(PSEUDO_BIDOMAIN_DATA->d_volumes, PSEUDO_BIDOMAIN_DATA->volumes, N * sizeof(real), cudaMemcpyHostToDevice)); + memcpy_device(PSEUDO_BIDOMAIN_DATA->d_volumes, PSEUDO_BIDOMAIN_DATA->volumes, N * sizeof(real), HOST_TO_DEVICE); -#if CUSPARSE_VER_MAJOR >= 11 +#if defined(COMPILE_CUDA) && CUSPARSE_VER_MAJOR >= 11 real alpha = 1.0; real beta = 0.0; - check_cuda_error(cusparseSpMV_bufferSize(PSEUDO_BIDOMAIN_DATA->cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, PSEUDO_BIDOMAIN_DATA->matA, + check_cuda_error(cusparseSpMV_bufferSize(PSEUDO_BIDOMAIN_DATA->sparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, PSEUDO_BIDOMAIN_DATA->matA, PSEUDO_BIDOMAIN_DATA->vec_vm, &beta, PSEUDO_BIDOMAIN_DATA->vec_beta_im, CUBLAS_SIZE, CUSPARSE_ALG, &(PSEUDO_BIDOMAIN_DATA->bufferSize))); - check_cuda_error(cudaMalloc(&(PSEUDO_BIDOMAIN_DATA->buffer), PSEUDO_BIDOMAIN_DATA->bufferSize)); + malloc_device(&(PSEUDO_BIDOMAIN_DATA->buffer), PSEUDO_BIDOMAIN_DATA->bufferSize); #endif arrfree(I); @@ -383,17 +384,17 @@ CALC_ECG(pseudo_bidomain_gpu) { real alpha = 1.0; real beta = 0.0; #if CUSPARSE_VER_MAJOR >= 11 - check_cublas_error(cusparseSpMV(PSEUDO_BIDOMAIN_DATA->cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, PSEUDO_BIDOMAIN_DATA->matA, + check_cublas_error(cusparseSpMV(PSEUDO_BIDOMAIN_DATA->sparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, PSEUDO_BIDOMAIN_DATA->matA, PSEUDO_BIDOMAIN_DATA->vec_vm, &beta, PSEUDO_BIDOMAIN_DATA->vec_beta_im, CUBLAS_SIZE, CUSPARSE_ALG, PSEUDO_BIDOMAIN_DATA->buffer)); #else #ifdef CELL_MODEL_REAL_DOUBLE - cusparseDcsrmv(PSEUDO_BIDOMAIN_DATA->cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, n_active, n_active, PSEUDO_BIDOMAIN_DATA->nz, &alpha, + cusparseDcsrmv(PSEUDO_BIDOMAIN_DATA->sparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, n_active, n_active, PSEUDO_BIDOMAIN_DATA->nz, &alpha, PSEUDO_BIDOMAIN_DATA->descr, PSEUDO_BIDOMAIN_DATA->d_val, PSEUDO_BIDOMAIN_DATA->d_row, PSEUDO_BIDOMAIN_DATA->d_col, PSEUDO_BIDOMAIN_DATA->local_sv, &beta, PSEUDO_BIDOMAIN_DATA->beta_im); #else - cusparseScsrmv(PSEUDO_BIDOMAIN_DATA->cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, n_active, n_active, PSEUDO_BIDOMAIN_DATA->nz, &alpha, + cusparseScsrmv(PSEUDO_BIDOMAIN_DATA->sparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, n_active, n_active, PSEUDO_BIDOMAIN_DATA->nz, &alpha, PSEUDO_BIDOMAIN_DATA->descr, PSEUDO_BIDOMAIN_DATA->d_val, PSEUDO_BIDOMAIN_DATA->d_row, PSEUDO_BIDOMAIN_DATA->d_col, PSEUDO_BIDOMAIN_DATA->local_sv, &beta, PSEUDO_BIDOMAIN_DATA->beta_im); #endif @@ -412,10 +413,10 @@ CALC_ECG(pseudo_bidomain_gpu) { #ifdef CELL_MODEL_REAL_DOUBLE check_cublas_error( - cublasDdot(PSEUDO_BIDOMAIN_DATA->cublasHandle, n_active, PSEUDO_BIDOMAIN_DATA->tmp_data, 1, PSEUDO_BIDOMAIN_DATA->d_volumes, 1, &local_sum)); + cublasDdot(PSEUDO_BIDOMAIN_DATA->blasHandle, n_active, PSEUDO_BIDOMAIN_DATA->tmp_data, 1, PSEUDO_BIDOMAIN_DATA->d_volumes, 1, &local_sum)); #else check_cublas_error( - cublasSdot(PSEUDO_BIDOMAIN_DATA->cublasHandle, n_active, PSEUDO_BIDOMAIN_DATA->tmp_data, 1, PSEUDO_BIDOMAIN_DATA->d_volumes, 1, &local_sum)); + cublasSdot(PSEUDO_BIDOMAIN_DATA->blasHandle, n_active, PSEUDO_BIDOMAIN_DATA->tmp_data, 1, PSEUDO_BIDOMAIN_DATA->d_volumes, 1, &local_sum)); #endif fprintf(PSEUDO_BIDOMAIN_DATA->output_file, "%lf ", -PSEUDO_BIDOMAIN_DATA->scale_factor * local_sum); @@ -432,8 +433,8 @@ END_CALC_ECG(end_pseudo_bidomain_gpu) { if(!persistent_data) return; - check_cuda_error((cudaError_t)cusparseDestroy(persistent_data->cusparseHandle)); - check_cuda_error((cudaError_t)cublasDestroy(persistent_data->cublasHandle)); + check_cuda_error((cudaError_t)cusparseDestroy(persistent_data->sparseHandle)); + check_cuda_error((cudaError_t)cublasDestroy(persistent_data->blasHandle)); #if CUBLAS_VER_MAJOR >= 11 if(persistent_data->matA) { @@ -448,13 +449,13 @@ END_CALC_ECG(end_pseudo_bidomain_gpu) { #else check_cuda_error((cudaError_t)cusparseDestroyMatDescr(persistent_data->descr)); #endif - check_cuda_error(cudaFree(persistent_data->d_col)); - check_cuda_error(cudaFree(persistent_data->d_row)); - check_cuda_error(cudaFree(persistent_data->d_val)); - check_cuda_error(cudaFree(persistent_data->beta_im)); - check_cuda_error(cudaFree(persistent_data->tmp_data)); - check_cuda_error(cudaFree(persistent_data->d_distances)); - check_cuda_error(cudaFree(persistent_data->d_volumes)); + free_device(persistent_data->d_col); + free_device(persistent_data->d_row); + free_device(persistent_data->d_val); + free_device(persistent_data->beta_im); + free_device(persistent_data->tmp_data); + free_device(persistent_data->d_distances); + free_device(persistent_data->d_volumes); fclose(persistent_data->output_file); @@ -511,7 +512,7 @@ INIT_CALC_ECG(init_pseudo_bidomain_with_diffusive_current_cpu) { uint32_t nlen_output_dir = strlen(output_dir); char *filename = (char*)malloc(sizeof(char)*nlen_output_dir+10); sprintf(filename, "%s/ecg.txt", output_dir); - + char *dir = get_dir_from_path(filename); create_dir(dir); free(dir); @@ -587,7 +588,7 @@ CALC_ECG(pseudo_bidomain_with_diffusive_current_cpu) { // DIFFUSIVE CURRENT PLOT if (time_info->iteration % PSEUDO_BIDOMAIN_DATA->diff_curr_rate == 0 && time_info->current_t <= PSEUDO_BIDOMAIN_DATA->diff_curr_max_time) { - char *filename_diffusive_current[200]; + char filename_diffusive_current[200]; sprintf(filename_diffusive_current, "%s/diffusive_current_t_%g_ms.txt", output_dir, time_info->current_t); FILE *output_diffusive_current = fopen(filename_diffusive_current, "w"); for(uint32_t i = 0; i < n_active; i++) { @@ -615,7 +616,7 @@ CALC_ECG(pseudo_bidomain_with_diffusive_current_cpu) { fprintf(PSEUDO_BIDOMAIN_DATA->output_file, "\n"); } - + END_CALC_ECG(end_pseudo_bidomain_with_diffusive_current_cpu) { fclose(PSEUDO_BIDOMAIN_DATA->output_file); free(PSEUDO_BIDOMAIN_DATA->beta_im); @@ -637,8 +638,8 @@ INIT_CALC_ECG(init_pseudo_bidomain_with_diffusive_current_gpu) { //This is allocated when using the CPU code, but we do not need it in the gpu version free(PSEUDO_BIDOMAIN_DATA->beta_im); - check_cublas_error(cusparseCreate(&(PSEUDO_BIDOMAIN_DATA->cusparseHandle))); - check_cublas_error(cublasCreate(&(PSEUDO_BIDOMAIN_DATA->cublasHandle))); + check_cublas_error(cusparseCreate(&(PSEUDO_BIDOMAIN_DATA->sparseHandle))); + check_cublas_error(cublasCreate(&(PSEUDO_BIDOMAIN_DATA->blasHandle))); int_array I = NULL, J = NULL; f32_array val = NULL; @@ -661,13 +662,13 @@ INIT_CALC_ECG(init_pseudo_bidomain_with_diffusive_current_gpu) { PSEUDO_BIDOMAIN_DATA->nz = nz; - check_cuda_error(cudaMalloc((void **)&(PSEUDO_BIDOMAIN_DATA->d_col), nz * sizeof(int))); - check_cuda_error(cudaMalloc((void **)&(PSEUDO_BIDOMAIN_DATA->d_row), (N + 1) * sizeof(int))); - check_cuda_error(cudaMalloc((void **)&(PSEUDO_BIDOMAIN_DATA->d_val), nz * sizeof(real))); - check_cuda_error(cudaMalloc((void **)&(PSEUDO_BIDOMAIN_DATA->beta_im), N * sizeof(real))); - check_cuda_error(cudaMalloc((void **)&(PSEUDO_BIDOMAIN_DATA->d_distances), PSEUDO_BIDOMAIN_DATA->n_leads * N * sizeof(real))); - check_cuda_error(cudaMalloc((void **)&(PSEUDO_BIDOMAIN_DATA->d_volumes), N * sizeof(real))); - check_cuda_error(cudaMalloc((void **)&(PSEUDO_BIDOMAIN_DATA->tmp_data), N * sizeof(real))); + malloc_device((void **)&(PSEUDO_BIDOMAIN_DATA->d_col), nz * sizeof(int)); + malloc_device((void **)&(PSEUDO_BIDOMAIN_DATA->d_row), (N + 1) * sizeof(int)); + malloc_device((void **)&(PSEUDO_BIDOMAIN_DATA->d_val), nz * sizeof(real)); + malloc_device((void **)&(PSEUDO_BIDOMAIN_DATA->beta_im), N * sizeof(real)); + malloc_device((void **)&(PSEUDO_BIDOMAIN_DATA->d_distances), PSEUDO_BIDOMAIN_DATA->n_leads * N * sizeof(real)); + malloc_device((void **)&(PSEUDO_BIDOMAIN_DATA->d_volumes), N * sizeof(real)); + malloc_device((void **)&(PSEUDO_BIDOMAIN_DATA->tmp_data), N * sizeof(real)); #if CUBLAS_VER_MAJOR >= 11 check_cuda_error(cusparseCreateCsr(&(PSEUDO_BIDOMAIN_DATA->matA), N, N, nz, PSEUDO_BIDOMAIN_DATA->d_row, PSEUDO_BIDOMAIN_DATA->d_col, @@ -681,11 +682,11 @@ INIT_CALC_ECG(init_pseudo_bidomain_with_diffusive_current_gpu) { PSEUDO_BIDOMAIN_DATA->local_sv = the_ode_solver->sv; #endif - check_cuda_error(cudaMemcpy(PSEUDO_BIDOMAIN_DATA->d_col, J, nz * sizeof(int), cudaMemcpyHostToDevice)); // JA - check_cuda_error(cudaMemcpy(PSEUDO_BIDOMAIN_DATA->d_row, I, (N + 1) * sizeof(int), cudaMemcpyHostToDevice)); // IA - check_cuda_error(cudaMemcpy(PSEUDO_BIDOMAIN_DATA->d_val, new_val, nz * sizeof(real), cudaMemcpyHostToDevice)); // A - check_cuda_error(cudaMemcpy(PSEUDO_BIDOMAIN_DATA->d_distances, PSEUDO_BIDOMAIN_DATA->distances, PSEUDO_BIDOMAIN_DATA->n_leads * N * sizeof(real), - cudaMemcpyHostToDevice)); + memcpy_device(PSEUDO_BIDOMAIN_DATA->d_col, J, nz * sizeof(int), HOST_TO_DEVICE); // JA + memcpy_device(PSEUDO_BIDOMAIN_DATA->d_row, I, (N + 1) * sizeof(int), HOST_TO_DEVICE); // IA + memcpy_device(PSEUDO_BIDOMAIN_DATA->d_val, new_val, nz * sizeof(real), HOST_TO_DEVICE); // A + memcpy_device(PSEUDO_BIDOMAIN_DATA->d_distances, PSEUDO_BIDOMAIN_DATA->distances, PSEUDO_BIDOMAIN_DATA->n_leads * N * sizeof(real), + HOST_TO_DEVICE); PSEUDO_BIDOMAIN_DATA->volumes = MALLOC_ARRAY_OF_TYPE(real, N); @@ -696,17 +697,17 @@ INIT_CALC_ECG(init_pseudo_bidomain_with_diffusive_current_gpu) { PSEUDO_BIDOMAIN_DATA->volumes[i] = d.x * d.y * d.z; } - check_cuda_error(cudaMemcpy(PSEUDO_BIDOMAIN_DATA->d_volumes, PSEUDO_BIDOMAIN_DATA->volumes, N * sizeof(real), cudaMemcpyHostToDevice)); + memcpy_device(PSEUDO_BIDOMAIN_DATA->d_volumes, PSEUDO_BIDOMAIN_DATA->volumes, N * sizeof(real), HOST_TO_DEVICE); #if CUSPARSE_VER_MAJOR >= 11 real alpha = 1.0; real beta = 0.0; - check_cuda_error(cusparseSpMV_bufferSize(PSEUDO_BIDOMAIN_DATA->cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, PSEUDO_BIDOMAIN_DATA->matA, + check_cuda_error(cusparseSpMV_bufferSize(PSEUDO_BIDOMAIN_DATA->sparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, PSEUDO_BIDOMAIN_DATA->matA, PSEUDO_BIDOMAIN_DATA->vec_vm, &beta, PSEUDO_BIDOMAIN_DATA->vec_beta_im, CUBLAS_SIZE, CUSPARSE_ALG, &(PSEUDO_BIDOMAIN_DATA->bufferSize))); - check_cuda_error(cudaMalloc(&(PSEUDO_BIDOMAIN_DATA->buffer), PSEUDO_BIDOMAIN_DATA->bufferSize)); + malloc_device(&(PSEUDO_BIDOMAIN_DATA->buffer), PSEUDO_BIDOMAIN_DATA->bufferSize); #endif arrfree(I); @@ -726,17 +727,17 @@ CALC_ECG(pseudo_bidomain_with_diffusive_current_gpu) { real alpha = 1.0; real beta = 0.0; #if CUSPARSE_VER_MAJOR >= 11 - check_cublas_error(cusparseSpMV(PSEUDO_BIDOMAIN_DATA->cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, PSEUDO_BIDOMAIN_DATA->matA, + check_cublas_error(cusparseSpMV(PSEUDO_BIDOMAIN_DATA->sparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, PSEUDO_BIDOMAIN_DATA->matA, PSEUDO_BIDOMAIN_DATA->vec_vm, &beta, PSEUDO_BIDOMAIN_DATA->vec_beta_im, CUBLAS_SIZE, CUSPARSE_ALG, PSEUDO_BIDOMAIN_DATA->buffer)); #else #ifdef CELL_MODEL_REAL_DOUBLE - cusparseDcsrmv(PSEUDO_BIDOMAIN_DATA->cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, n_active, n_active, PSEUDO_BIDOMAIN_DATA->nz, &alpha, + cusparseDcsrmv(PSEUDO_BIDOMAIN_DATA->sparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, n_active, n_active, PSEUDO_BIDOMAIN_DATA->nz, &alpha, PSEUDO_BIDOMAIN_DATA->descr, PSEUDO_BIDOMAIN_DATA->d_val, PSEUDO_BIDOMAIN_DATA->d_row, PSEUDO_BIDOMAIN_DATA->d_col, PSEUDO_BIDOMAIN_DATA->local_sv, &beta, PSEUDO_BIDOMAIN_DATA->beta_im); #else - cusparseScsrmv(PSEUDO_BIDOMAIN_DATA->cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, n_active, n_active, PSEUDO_BIDOMAIN_DATA->nz, &alpha, + cusparseScsrmv(PSEUDO_BIDOMAIN_DATA->sparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, n_active, n_active, PSEUDO_BIDOMAIN_DATA->nz, &alpha, PSEUDO_BIDOMAIN_DATA->descr, PSEUDO_BIDOMAIN_DATA->d_val, PSEUDO_BIDOMAIN_DATA->d_row, PSEUDO_BIDOMAIN_DATA->d_col, PSEUDO_BIDOMAIN_DATA->local_sv, &beta, PSEUDO_BIDOMAIN_DATA->beta_im); #endif @@ -749,7 +750,7 @@ CALC_ECG(pseudo_bidomain_with_diffusive_current_gpu) { // DIFFUSIVE CURRENT if (time_info->iteration % PSEUDO_BIDOMAIN_DATA->diff_curr_rate == 0 && time_info->current_t <= PSEUDO_BIDOMAIN_DATA->diff_curr_max_time) { - char *filename_diffusive_current[200]; + char filename_diffusive_current[200]; sprintf(filename_diffusive_current, "%s/diffusive_current_t_%g_ms.txt", output_dir, time_info->current_t); FILE *output_diffusive_current = fopen(filename_diffusive_current, "w"); check_cuda_error(cudaMemcpy(PSEUDO_BIDOMAIN_DATA->beta_im_cpu, PSEUDO_BIDOMAIN_DATA->beta_im, n_active * sizeof(real), cudaMemcpyDeviceToHost)); @@ -767,10 +768,10 @@ CALC_ECG(pseudo_bidomain_with_diffusive_current_gpu) { #ifdef CELL_MODEL_REAL_DOUBLE check_cublas_error( - cublasDdot(PSEUDO_BIDOMAIN_DATA->cublasHandle, n_active, PSEUDO_BIDOMAIN_DATA->tmp_data, 1, PSEUDO_BIDOMAIN_DATA->d_volumes, 1, &local_sum)); + cublasDdot(PSEUDO_BIDOMAIN_DATA->blasHandle, n_active, PSEUDO_BIDOMAIN_DATA->tmp_data, 1, PSEUDO_BIDOMAIN_DATA->d_volumes, 1, &local_sum)); #else check_cublas_error( - cublasSdot(PSEUDO_BIDOMAIN_DATA->cublasHandle, n_active, PSEUDO_BIDOMAIN_DATA->tmp_data, 1, PSEUDO_BIDOMAIN_DATA->d_volumes, 1, &local_sum)); + cublasSdot(PSEUDO_BIDOMAIN_DATA->blasHandle, n_active, PSEUDO_BIDOMAIN_DATA->tmp_data, 1, PSEUDO_BIDOMAIN_DATA->d_volumes, 1, &local_sum)); #endif fprintf(PSEUDO_BIDOMAIN_DATA->output_file, "%lf ", -PSEUDO_BIDOMAIN_DATA->scale_factor * local_sum); @@ -787,8 +788,8 @@ END_CALC_ECG(end_pseudo_bidomain_with_diffusive_current_gpu) { if(!persistent_data) return; - check_cuda_error((cudaError_t)cusparseDestroy(persistent_data->cusparseHandle)); - check_cuda_error((cudaError_t)cublasDestroy(persistent_data->cublasHandle)); + check_cuda_error((cudaError_t)cusparseDestroy(persistent_data->sparseHandle)); + check_cuda_error((cudaError_t)cublasDestroy(persistent_data->blasHandle)); #if CUBLAS_VER_MAJOR >= 11 if(persistent_data->matA) { @@ -803,13 +804,13 @@ END_CALC_ECG(end_pseudo_bidomain_with_diffusive_current_gpu) { #else check_cuda_error((cudaError_t)cusparseDestroyMatDescr(persistent_data->descr)); #endif - check_cuda_error(cudaFree(persistent_data->d_col)); - check_cuda_error(cudaFree(persistent_data->d_row)); - check_cuda_error(cudaFree(persistent_data->d_val)); - check_cuda_error(cudaFree(persistent_data->beta_im)); - check_cuda_error(cudaFree(persistent_data->tmp_data)); - check_cuda_error(cudaFree(persistent_data->d_distances)); - check_cuda_error(cudaFree(persistent_data->d_volumes)); + free_device(persistent_data->d_col); + free_device(persistent_data->d_row); + free_device(persistent_data->d_val); + free_device(persistent_data->beta_im); + free_device(persistent_data->tmp_data); + free_device(persistent_data->d_distances); + free_device(persistent_data->d_volumes); free(persistent_data->beta_im_cpu); fclose(persistent_data->output_file); diff --git a/src/ecg_library/ecg.h b/src/ecg_library/ecg.h index 3d6b5bc3..4acd53a4 100644 --- a/src/ecg_library/ecg.h +++ b/src/ecg_library/ecg.h @@ -16,9 +16,14 @@ struct pseudo_bidomain_persistent_data { uint32_t diff_curr_rate; real_cpu diff_curr_max_time; +#if defined(COMPILE_CUDA) || defined(COMPILE_SYCL) #ifdef COMPILE_CUDA - cusparseHandle_t cusparseHandle; - cublasHandle_t cublasHandle; + cusparseHandle_t sparseHandle; + cublasHandle_t blasHandle; +#else + dpct::sparse::descriptor_ptr sparseHandle; + dpct::blas::descriptor_ptr blasHandle; +#endif int *d_col, *d_row, nz; real *d_distances; real *d_volumes; @@ -26,20 +31,26 @@ struct pseudo_bidomain_persistent_data { real *tmp_data; real *d_val; real *beta_im_cpu; - size_t bufferSize; void *buffer; - -#if CUBLAS_VER_MAJOR <= 10 +#if defined(COMPILE_CUDA) && CUBLAS_VER_MAJOR <= 10 cusparseMatDescr_t descr; real *local_sv; -#else +#elif defined(COMPILE_CUDA) cusparseSpMatDescr_t matA; cusparseDnVecDescr_t vec_vm; cusparseDnVecDescr_t vec_beta_im; #endif +#ifdef COMPILE_SYCL + dpct::sparse::sparse_matrix_desc_t matA; + std::shared_ptr vec_vm; + std::shared_ptr vec_beta_im; #endif + +#endif + + }; #define PSEUDO_BIDOMAIN_DATA ((struct pseudo_bidomain_persistent_data *)config->persistent_data) diff --git a/src/gpu_utils/accel_utils.cpp b/src/gpu_utils/accel_utils.cpp new file mode 100644 index 00000000..91c22e7e --- /dev/null +++ b/src/gpu_utils/accel_utils.cpp @@ -0,0 +1,84 @@ +// +// Created by sachetto on 28/11/24. +// + +#include "accel_utils.h" +#include "gpu_utils.h" + +#ifdef COMPILE_CUDA +#include +#include +#endif + +extern "C" void malloc_device(void **ptr, size_t n) { + +#ifdef COMPILE_CUDA + check_cuda_error(cudaMalloc(ptr, n)); +#elif defined(COMPILE_SYCL) + DPCT_CHECK_ERROR(ptr = sycl::malloc_device(n, dpct::get_in_order_queue())); +#endif + +} + +extern "C" void free_device(void *ptr) { +#ifdef COMPILE_CUDA + check_cuda_error(cudaFree(ptr)); +#elif defined(COMPILE_SYCL) + DPCT_CHECK_ERROR(dpct::dpct_free(persistent_data->d_col, dpct::get_in_order_queue())); +#endif +} + +extern "C" void memcpy_device(void *dest, const void *src, size_t n, copy_direction kind) { + + if(kind == HOST_TO_DEVICE) { + #ifdef COMPILE_CUDA + check_cuda_error(cudaMemcpy(dest, src, n, cudaMemcpyHostToDevice)); + #elif defined(COMPILE_SYCL) + sycl::device dev_ct1; + sycl::queue q_ct1(dev_ct1, sycl::property_list{sycl::property::queue::in_order()}); + q_ct1.memcpy(dest, src, n).wait(); + #endif + } else if(kind == DEVICE_TO_HOST) { + #ifdef COMPILE_CUDA + check_cuda_error(cudaMemcpy(dest, src, n, cudaMemcpyDeviceToHost)); + #elif defined(COMPILE_SYCL) + dpct::device_ext &dev_ct1 = dpct::get_current_device(); + sycl::queue &q_ct1 = dev_ct1.default_queue(); + q_ct1.memcpy(dest, src, n).wait(); + #endif + } +} + +extern "C" void create_sparse_handle(void *handle) { +#ifdef COMPILE_CUDA + check_cublas_error(cusparseCreate((cusparseHandle_t *)handle)); +#elif defined(COMPILE_SYCL) + DPCT_CHECK_ERROR(handle = new dpct::sparse::descriptor(); +#endif +} + +extern "C" void create_blas_handle(void *handle) { +#ifdef COMPILE_CUDA + check_cublas_error(cublasCreate((cublasHandle_t *)handle)); +#elif defined(COMPILE_SYCL) + DPCT_CHECK_ERROR(handle = new dpct::blas::descriptor(); +#endif +} + +extern "C" void sparse_create_scr(void *mat, int64_t rows, int64_t cols, int64_t nnz, + void* csrRowOffsets, + void* csrColInd, + void* csrValues, + cusparseIndexType_t csrRowOffsetsType, + cusparseIndexType_t csrColIndType, + cusparseIndexBase_t idxBase, + cudaDataType valueType) { +#ifdef COMPILE_CUDA + check_cuda_error(cusparseCreateCsr(&(PSEUDO_BIDOMAIN_DATA->matA), N, N, nz, PSEUDO_BIDOMAIN_DATA->d_row, PSEUDO_BIDOMAIN_DATA->d_col, + PSEUDO_BIDOMAIN_DATA->d_val, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUBLAS_SIZE)); +#elif defined(COMPILE_SYCL) + DPCT_CHECK_ERROR(mat = new dpct::sparse::sparse_matrix_desc( + rows, cols, nnz, csrRowOffsets,csrColInd, csrValues, dpct::library_data_t::real_int32, + dpct::library_data_t::real_int32, oneapi::mkl::index_base::zero, CUBLAS_SIZE, dpct::sparse::matrix_format::csr)); +#endif +} \ No newline at end of file diff --git a/src/gpu_utils/accel_utils.h b/src/gpu_utils/accel_utils.h new file mode 100644 index 00000000..942959f2 --- /dev/null +++ b/src/gpu_utils/accel_utils.h @@ -0,0 +1,26 @@ +// +// Created by sachetto on 28/11/24. +// + +#ifndef ACCEL_UTILS_H +#define ACCEL_UTILS_H + +#include + +typedef enum { + HOST_TO_DEVICE, + DEVICE_TO_HOST, +} copy_direction; + +#ifdef __cplusplus +extern "C" { +#endif + void malloc_device(void **ptr, size_t n); + void free_device(void *ptr); + void memcpy_device(void *dest, const void *src, size_t n, copy_direction kind); + void create_sparse_handle(void *handle); + void create_blas_handle(void *handle); +#ifdef __cplusplus +} +#endif +#endif //ACCEL_UTILS_H diff --git a/src/gpu_utils/build.sh b/src/gpu_utils/build.sh index 9fe0dbdf..9503fe7c 100755 --- a/src/gpu_utils/build.sh +++ b/src/gpu_utils/build.sh @@ -1,12 +1,11 @@ -GPU_UTILS_SOURCE_FILES="gpu_utils.c" -GPU_UTILS_HEADER_FILES="gpu_utils.h" +GPU_UTILS_SOURCE_FILES="gpu_utils.c accel_utils.cpp" +GPU_UTILS_HEADER_FILES="gpu_utils.h accel_utils.h" if [ -n "$CUDA_FOUND" ]; then GPU_UTILS_EXTRA_LIB_PATH=$CUDA_LIBRARY_PATH - GPU_UTILS_DYNAMIC_LIBS="c cudart" + GPU_UTILS_DYNAMIC_LIBS="c cudart cublas cusparse" GPU_UTILS_SOURCE_FILES="$GPU_UTILS_SOURCE_FILES gpu_utils.cu" fi -COMPILE_SHARED_LIB "gpu_utils" "$GPU_UTILS_SOURCE_FILES" "$GPU_UTILS_HEADER_FILES" "" "$GPU_UTILS_DYNAMIC_LIBS" "$GPU_UTILS_EXTRA_LIB_PATH" "" "$CUDA_FOUND" -#COMPILE_STATIC_LIB "gpu_utils" "$GPU_UTILS_SOURCE_FILES" "$GPU_UTILS_HEADER_FILES" +COMPILE_SHARED_LIB "gpu_utils" "$GPU_UTILS_SOURCE_FILES" "$GPU_UTILS_HEADER_FILES" "" "$GPU_UTILS_DYNAMIC_LIBS" "$GPU_UTILS_EXTRA_LIB_PATH" "" "$CUDA_FOUND" \ No newline at end of file diff --git a/src/gpu_utils/gpu_utils.cu b/src/gpu_utils/gpu_utils.cu index 3e81c180..7fec3e51 100644 --- a/src/gpu_utils/gpu_utils.cu +++ b/src/gpu_utils/gpu_utils.cu @@ -3,7 +3,8 @@ __global__ void gpu_ecg_integral_kernel(const real *beta_im, const real* distances, const real *volumes, int n, real *result); __global__ void kernel_gpu_vec_div_vec(real *vec1, real *vec2, real *vec3, size_t n); - extern "C" void gpu_vec_div_vec(real *vec1, real *vec2, real *res, size_t n) { + +extern "C" void gpu_vec_div_vec(real *vec1, real *vec2, real *res, size_t n) { const int GRID = (n + BLOCK_SIZE - 1)/BLOCK_SIZE; kernel_gpu_vec_div_vec<<>>(vec1, vec2, res, n); cudaDeviceSynchronize(); diff --git a/src/save_mesh_library/save_mesh.c b/src/save_mesh_library/save_mesh.c index 6284895f..d03d6174 100644 --- a/src/save_mesh_library/save_mesh.c +++ b/src/save_mesh_library/save_mesh.c @@ -344,8 +344,7 @@ SAVE_MESH(save_as_text_or_binary) { float value; if(ode_solver->gpu) { value = (float) sv_cpu[i*ode_solver->original_num_cells]; - } - else { + } else { value = sv_cpu[i]; } @@ -357,8 +356,7 @@ SAVE_MESH(save_as_text_or_binary) { } fprintf(output_file, "\n"); - } - else { + } else { fprintf(output_file, "%g,%g,%g,%g,%g,%g,%g\n", center_x, center_y, center_z, dx, dy, dz, v); } }