diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 8c1f7a472e..fb6bc7055c 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -75,7 +75,7 @@ stages: include: - local: '.gitlab/custom-jobs-and-variables.yml' - project: 'radiuss/radiuss-shared-ci' - ref: 'v2023.12.3' + ref: 'v2024.04.0' file: 'pipelines/${CI_MACHINE}.yml' - artifact: '${CI_MACHINE}-jobs.yml' job: 'generate-job-lists' @@ -100,9 +100,11 @@ trigger-rajaperf: strategy: depend include: + - project: 'lc-templates/id_tokens' + file: 'id_tokens.yml' # [Optional] checks preliminary to running the actual CI test - project: 'radiuss/radiuss-shared-ci' - ref: 'v2023.12.3' + ref: 'v2024.04.0' file: 'utilities/preliminary-ignore-draft-pr.yml' # pipelines subscribed by the project - local: '.gitlab/subscribed-pipelines.yml' diff --git a/.gitlab/custom-jobs-and-variables.yml b/.gitlab/custom-jobs-and-variables.yml index eb7011b78a..62d7908945 100644 --- a/.gitlab/custom-jobs-and-variables.yml +++ b/.gitlab/custom-jobs-and-variables.yml @@ -19,17 +19,17 @@ variables: # Note: We repeat the reservation, necessary when jobs are manually re-triggered. RUBY_JOB_ALLOC: "--reservation=ci --nodes=1" # Project specific variants for ruby - PROJECT_RUBY_VARIANTS: "~shared +openmp +tests" + PROJECT_RUBY_VARIANTS: "~shared +openmp +vectorization +tests" # Project specific deps for ruby PROJECT_RUBY_DEPS: "" # Poodle # Arguments for top level allocation - POODLE_SHARED_ALLOC: "--exclusive --time=60 --nodes=1" + POODLE_SHARED_ALLOC: "--exclusive --time=120 --nodes=1" # Arguments for job level allocation POODLE_JOB_ALLOC: "--nodes=1" # Project specific variants for poodle - PROJECT_POODLE_VARIANTS: "~shared +openmp +tests" + PROJECT_POODLE_VARIANTS: "~shared +openmp +vectorization +tests" # Project specific deps for poodle PROJECT_POODLE_DEPS: "" @@ -39,26 +39,26 @@ variables: # Arguments for job level allocation CORONA_JOB_ALLOC: "--nodes=1 --begin-time=+5s" # Project specific variants for corona - PROJECT_CORONA_VARIANTS: "~shared ~openmp +tests" + PROJECT_CORONA_VARIANTS: "~shared ~openmp +vectorization +tests" # Project specific deps for corona PROJECT_CORONA_DEPS: "^blt@develop " # Tioga # Arguments for top level allocation - TIOGA_SHARED_ALLOC: "--exclusive --time-limit=60m --nodes=1 -o per-resource.count=2" + TIOGA_SHARED_ALLOC: "--exclusive --queue=pci --time-limit=60m --nodes=1 -o per-resource.count=2" # Arguments for job level allocation TIOGA_JOB_ALLOC: "--nodes=1 --begin-time=+5s" # Project specific variants for corona - PROJECT_TIOGA_VARIANTS: "~shared ~openmp +tests" + PROJECT_TIOGA_VARIANTS: "~shared +openmp +vectorization +tests" # Project specific deps for corona PROJECT_TIOGA_DEPS: "^blt@develop " # Lassen and Butte use a different job scheduler (spectrum lsf) that does not # allow pre-allocation the same way slurm does. 
  # Arguments for job level allocation
-  LASSEN_JOB_ALLOC: "1 -W 30 -q pci"
+  LASSEN_JOB_ALLOC: "1 -W 40 -q pci"
  # Project specific variants for lassen
-  PROJECT_LASSEN_VARIANTS: "~shared +openmp +tests cuda_arch=70"
+  PROJECT_LASSEN_VARIANTS: "~shared +openmp +vectorization +tests cuda_arch=70"
  # Project specific deps for lassen
  PROJECT_LASSEN_DEPS: "^blt@develop "
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1e4823564b..9e5ecec0b7 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -16,7 +16,7 @@ include(CMakeDependentOption)
 # Set version number
 set(RAJA_VERSION_MAJOR 2024)
 set(RAJA_VERSION_MINOR 02)
-set(RAJA_VERSION_PATCHLEVEL 1)
+set(RAJA_VERSION_PATCHLEVEL 2)

 if (RAJA_LOADED AND (NOT RAJA_LOADED STREQUAL "${RAJA_VERSION_MAJOR}.${RAJA_VERSION_MINOR}.${RAJA_VERSION_PATCHLEVEL}"))
   message(FATAL_ERROR "You are mixing RAJA versions. Loaded is ${RAJA_LOADED}, expected ${RAJA_VERSION_MAJOR}.${RAJA_VERSION_MINOR}.${RAJA_VERSION_PATCHLEVEL}")
diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md
index 2e26861191..c2df2a03ea 100644
--- a/RELEASE_NOTES.md
+++ b/RELEASE_NOTES.md
@@ -20,6 +20,39 @@ Notable changes include:

   * Bug fixes/improvements:

+Version 2024.02.2 -- Release date 2024-05-08
+============================================
+
+This release contains a bugfix and new execution policies that improve
+performance for GPU kernels with reductions.
+
+Notable changes include:
+
+  * New features / API changes:
+    * New GPU execution policies for CUDA and HIP added which provide
+      improved performance for GPU kernels with reductions. Please see the
+      RAJA User Guide for more information. Short summary:
+      * Option added to change max grid size in policies that use the
+        occupancy calculator.
+      * Policies added to run with max occupancy, a fraction of the
+        max occupancy, and to run with a "concretizer" which allows a
+        user to determine how to run based on what the occupancy
+        calculator determines about a kernel.
+      * Additional options to tune kernels containing reductions, such as
+        * an option to initialize data on host for reductions that use
+          atomic operations
+        * an option to avoid device scope memory fences
+    * Changed the SYCL thread index ordering in RAJA::launch to
+      follow the SYCL "row-major" convention. Please see the RAJA User
+      Guide for more information.
+
+  * Build changes/improvements:
+    * NONE.
+
+  * Bug fixes/improvements:
+    * Fixed issue in bump-style allocator used internally in RAJA::launch.
+
+
 Version 2024.02.1 -- Release date 2024-04-03
 ============================================
diff --git a/docs/Licenses/rocprim-license.txt b/docs/Licenses/rocprim-license.txt
new file mode 100644
index 0000000000..976ca2abb3
--- /dev/null
+++ b/docs/Licenses/rocprim-license.txt
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/docs/conf.py b/docs/conf.py
index 1570ed2888..3212170b30 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -88,7 +88,7 @@
 # The short X.Y version.
 version = u'2024.02'
 # The full version, including alpha/beta/rc tags.
-release = u'2024.02.1'
+release = u'2024.02.2'

 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.
diff --git a/docs/sphinx/user_guide/cook_book.rst b/docs/sphinx/user_guide/cook_book.rst
new file mode 100644
index 0000000000..91494f3674
--- /dev/null
+++ b/docs/sphinx/user_guide/cook_book.rst
@@ -0,0 +1,23 @@
+.. ##
+.. ## Copyright (c) 2016-24, Lawrence Livermore National Security, LLC
+.. ## and RAJA project contributors. See the RAJA/LICENSE file
+.. ## for details.
+.. ##
+.. ## SPDX-License-Identifier: (BSD-3-Clause)
+.. ##
+
+.. _cook-book-label:
+
+************************
+RAJA Cook Book
+************************
+
+The following sections show common use case patterns and the recommended
+RAJA features and policies to use with them. They are intended to provide
+users with complete usage examples beyond what can be found in other parts
+of the RAJA User Guide. In particular, the examples and discussion provide
+guidance on RAJA execution policy selection to improve the performance of
+user application codes.
+
+.. toctree::
+   :maxdepth: 2
+
+   cook_book/reduction
+
diff --git a/docs/sphinx/user_guide/cook_book/reduction.rst b/docs/sphinx/user_guide/cook_book/reduction.rst
new file mode 100644
index 0000000000..73843ebb40
--- /dev/null
+++ b/docs/sphinx/user_guide/cook_book/reduction.rst
@@ -0,0 +1,110 @@
+.. ##
+.. ## Copyright (c) 2016-24, Lawrence Livermore National Security, LLC
+.. ## and other RAJA project contributors. See the RAJA/LICENSE file
+.. ## for details.
+.. ##
+.. ## SPDX-License-Identifier: (BSD-3-Clause)
+.. ##
+
+.. _cook-book-reductions-label:
+
+=======================
+Cooking with Reductions
+=======================
+
+Please see the following section for overview discussion about RAJA reductions:
+
+ * :ref:`feat-reductions-label`.
+
+
+----------------------------
+Reductions with RAJA::forall
+----------------------------
+
+Here is the setup for a simple reduction example::
+
+  const int N = 1000;
+
+  int vec[N];
+
+  for (int i = 0; i < N; ++i) {
+
+    vec[i] = 1;
+
+  }
+
+Here a simple sum reduction is performed in a for loop::
+
+  int vsum = 0;
+
+  // Run a kernel using the reduction objects
+  for (int i = 0; i < N; ++i) {
+
+    vsum += vec[i];
+
+  }
+
+The results of these operations will yield the following values:
+
+ * ``vsum == 1000``
+
+RAJA uses policy types to specify how things are implemented.
+
+The forall *execution policy* specifies how the loop is run by the
+``RAJA::forall`` method. The following discussion includes examples of
+several other RAJA execution policies that could be applied.
+For example ``RAJA::seq_exec`` runs a C-style for loop sequentially on a CPU.
The
+``RAJA::cuda_exec_with_reduce<256>`` runs the loop as a CUDA GPU kernel with
+256 threads per block and other CUDA kernel launch parameters, like the
+number of blocks, optimized for performance with reducers::
+
+  using exec_policy = RAJA::seq_exec;
+  // using exec_policy = RAJA::omp_parallel_for_exec;
+  // using exec_policy = RAJA::omp_target_parallel_for_exec<256>;
+  // using exec_policy = RAJA::cuda_exec_with_reduce<256>;
+  // using exec_policy = RAJA::hip_exec_with_reduce<256>;
+  // using exec_policy = RAJA::sycl_exec<256>;
+
+The reduction policy specifies how the reduction is done and must match the
+execution policy. For example ``RAJA::seq_reduce`` does a sequential reduction
+and can only be used with sequential execution policies. The
+``RAJA::cuda_reduce_atomic`` policy uses atomics, if possible with the given
+data type, and can only be used with CUDA execution policies. The same pattern
+holds for other RAJA back-ends, such as HIP and OpenMP. Here are example RAJA
+reduction policies whose names indicate which execution policies they work
+with::
+
+  using reduce_policy = RAJA::seq_reduce;
+  // using reduce_policy = RAJA::omp_reduce;
+  // using reduce_policy = RAJA::omp_target_reduce;
+  // using reduce_policy = RAJA::cuda_reduce_atomic;
+  // using reduce_policy = RAJA::hip_reduce_atomic;
+  // using reduce_policy = RAJA::sycl_reduce;
+
+
+Here a simple sum reduction is performed using RAJA::
+
+  RAJA::ReduceSum<reduce_policy, int> vsum(0);
+
+  RAJA::forall<exec_policy>( RAJA::RangeSegment(0, N),
+    [=](RAJA::Index_type i) {
+
+    vsum += vec[i];
+
+  });
+
+The results of these operations will yield the following values:
+
+ * ``vsum.get() == 1000``
+
+
+Another option for the execution policy when using the CUDA or HIP back-ends is
+to use the base policies, which have a boolean parameter to choose between the
+general use ``cuda/hip_exec`` policy and the ``cuda/hip_exec_with_reduce``
+policy::
+
+  // static constexpr bool with_reduce = ...;
+  // using exec_policy = RAJA::cuda_exec_base<with_reduce, 256>;
+  // using exec_policy = RAJA::hip_exec_base<with_reduce, 256>;
+
+Another option for the reduction policy when using the CUDA or HIP back-ends is
+to use the base policies, which have a boolean parameter to choose between the
+atomic ``cuda/hip_reduce_atomic`` policy and the non-atomic ``cuda/hip_reduce``
+policy::
+
+  // static constexpr bool with_atomic = ...;
+  // using reduce_policy = RAJA::cuda_reduce_base<with_atomic>;
+  // using reduce_policy = RAJA::hip_reduce_base<with_atomic>;
diff --git a/docs/sphinx/user_guide/feature/policies.rst b/docs/sphinx/user_guide/feature/policies.rst
index e61be4e598..facde1da5d 100644
--- a/docs/sphinx/user_guide/feature/policies.rst
+++ b/docs/sphinx/user_guide/feature/policies.rst
@@ -236,180 +236,264 @@ RAJA policies for GPU execution using CUDA or HIP are essentially identical.
The only difference is that CUDA policies have the prefix ``cuda_`` and
HIP policies have the prefix ``hip_``.
- ========================================= ============= =======================================
- CUDA/HIP Execution Policies Works with Brief description
- ========================================= ============= =======================================
- cuda/hip_exec forall, Execute loop iterations
- scan, directly mapped to global threads
- sort in a GPU kernel launched
- with given thread-block
- size and unbounded grid size.
- Note that the thread-block
- size must be provided,
- there is no default.
- cuda/hip_exec_grid forall, Execute loop iterations - mapped to global threads via - grid striding with multiple - iterations per global thread - in a GPU kernel launched - with given thread-block - size and grid size. - Note that the thread-block - size and grid size must be - provided, there is no default. - cuda/hip_exec_occ_calc forall Execute loop iterations - mapped to global threads via - grid striding with multiple - iterations per global thread - in a GPU kernel launched - with given thread-block - size and grid size bounded - by the maximum occupancy of - the kernel. Note that the - thread-block size must - be provided, there is no - default. Note this can improve - reducer performance in kernels - with large iteration counts. - cuda/hip_launch_t launch Launches a device kernel, - any code expressed within - the lambda is executed - on the device. - cuda/hip_thread_x_direct kernel (For) Map loop iterates - launch (loop) directly to GPU threads - in x-dimension, one - iterate per thread - (see note below about - limitations) - cuda/hip_thread_y_direct kernel (For) Same as above, but map - launch (loop) to threads in y-dim - cuda/hip_thread_z_direct kernel (For) Same as above, but map - launch (loop) to threads in z-dim - cuda/hip_thread_x_loop kernel (For) Similar to - launch (loop) thread-x-direct - policy, but use a - block-stride loop which - doesn't limit number of - loop iterates - cuda/hip_thread_y_loop kernel (For) Same as above, but for - launch (loop) threads in y-dimension - cuda/hip_thread_z_loop kernel (For) Same as above, but for - launch (loop) threads in z-dimension - cuda/hip_thread_syncable_loop kernel (For) Similar to thread-loop - launch (loop) policy, but safe to use - with Cuda/HipSyncThreads - cuda/hip_thread_size_x_direct kernel (For) Same as thread_x_direct - launch (loop) policy above but with - a compile time number of - threads - cuda/hip_thread_size_y_direct kernel (For) Same as above, but map - launch (loop) to threads in y-dim - cuda/hip_thread_size_z_direct kernel (For) Same as above, but map - launch (loop) to threads in z-dim - cuda/hip_flatten_threads_{xyz}_direct launch (loop) Reshapes threads in a - multi-dimensional thread - team into one-dimension, - accepts any permutation - of dimensions - cuda/hip_block_x_direct kernel (For) Map loop iterates - launch (loop) directly to GPU thread - blocks in x-dimension, - one iterate per block - cuda/hip_block_y_direct kernel (For) Same as above, but map - launch (loop) to blocks in y-dimension - cuda/hip_block_z_direct kernel (For) Same as above, but map - launch (loop) to blocks in z-dimension - cuda/hip_block_x_loop kernel (For) Similar to - launch (loop) block-x-direct policy, - but use a grid-stride - loop. - cuda/hip_block_y_loop kernel (For) Same as above, but use - launch (loop) blocks in y-dimension - cuda/hip_block_z_loop kernel (For) Same as above, but use - launch (loop) blocks in z-dimension - cuda/hip_block_size_x_direct kernel (For) Same as block_x_direct - launch (loop) policy above but with - a compile time number of - blocks - cuda/hip_block_size_y_direct kernel (For) Same as above, but map - launch (loop) to blocks in y-dim - cuda/hip_block_size_z_direct kernel (For) Same as above, but map - launch (loop) to blocks in z-dim - cuda/hip_global_x_direct kernel (For) Creates a unique thread - launch (loop) id for each thread on - x-dimension of the grid. - Same as computing - threadIdx.x + - threadDim.x * blockIdx.x. 
- cuda/hip_global_y_direct kernel (For) Same as above, but uses - launch (loop) globals in y-dimension. - cuda/hip_global_z_direct kernel (For) Same as above, but uses - launch (loop) globals in z-dimension. - cuda/hip_global_x_loop kernel (For) Similar to - launch (loop) global-x-direct policy, - but use a grid-stride - loop. - cuda/hip_global_y_loop kernel (For) Same as above, but use - launch (loop) globals in y-dimension - cuda/hip_global_z_loop kernel (For) Same as above, but use - launch (loop) globals in z-dimension - cuda/hip_global_size_x_direct kernel (For) Same as global_x_direct - launch (loop) policy above but with - a compile time block - size - cuda/hip_global_size_y_direct kernel (For) Same as above, but map - launch (loop) to globals in y-dim - cuda/hip_global_size_z_direct kernel (For) Same as above, but map - launch (loop) to globals in z-dim - cuda/hip_warp_direct kernel (For) Map work to threads - in a warp directly. - Cannot be used in - conjunction with - cuda/hip_thread_x_* - policies. - Multiple warps can be - created by using - cuda/hip_thread_y/z_* - policies. - cuda/hip_warp_loop kernel (For) Policy to map work to - threads in a warp using - a warp-stride loop. - Cannot be used in - conjunction with - cuda/hip_thread_x_* - policies. - Multiple warps can be - created by using - cuda/hip_thread_y/z_* - policies. - cuda/hip_warp_masked_direct> kernel (For) Policy to map work - directly to threads in a - warp using a bit mask. - Cannot be used in - conjunction with - cuda/hip_thread_x_* - policies. - Multiple warps can - be created by using - cuda/hip_thread_y/z_* - policies. - cuda/hip_warp_masked_loop> kernel (For) Policy to map work to - threads in a warp using - a bit mask and a - warp-stride loop. Cannot - be used in conjunction - with cuda/hip_thread_x_* - policies. Multiple warps - can be created by using - cuda/hip_thread_y/z_* - policies. - cuda/hip_block_reduce kernel Perform a reduction - (Reduce) across a single GPU - thread block. - cuda/hip_warp_reduce kernel Perform a reduction - (Reduce) across a single GPU - thread warp. - ========================================= ============= ======================================= ++----------------------------------------------------+---------------+---------------------------------+ +| CUDA/HIP Execution Policies | Works with | Brief description | ++====================================================+===============+=================================+ +| cuda/hip_exec | forall, | Execute loop iterations | +| | scan, | directly mapped to global | +| | sort | threads in a GPU kernel | +| | | launched with given threadblock | +| | | size and unbounded grid size. | +| | | Note that the threadblock | +| | | size must be provided. | +| | | There is no default. | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_exec_with_reduce | forall | The cuda/hip exec policy | +| | | recommended for use with | +| | | kernels containing reductions. | +| | | In general, using the occupancy | +| | | calculator policies improves | +| | | performance of kernels with | +| | | reductions. Exactly how much | +| | | occupancy to use differs by | +| | | platform. This policy provides | +| | | a simple way to get what works | +| | | well for a platform without | +| | | having to know the details. 
| ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_exec_base | forall | Choose between cuda/hip_exec | +| | | and cuda/hip_exec_with_reduce | +| | | policies based on the boolean | +| | | template parameter 'with_reduce'| ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_exec_grid | forall | Execute loop iterations | +| | | mapped to global threads via | +| | | grid striding with multiple | +| | | iterations per global thread | +| | | in a GPU kernel launched | +| | | with given thread-block | +| | | size and grid size. | +| | | Note that the thread-block | +| | | size and grid size must be | +| | | provided, there is no default. | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_exec_occ_max | forall | Execute loop iterations | +| | | mapped to global threads via | +| | | grid striding with multiple | +| | | iterations per global thread | +| | | in a GPU kernel launched | +| | | with given thread-block | +| | | size and grid size bounded | +| | | by the maximum occupancy of | +| | | the kernel. | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_exec_occ_calc | forall | Similar to the occ_max | +| | | policy but may use less | +| | | than the maximum occupancy | +| | | determined by the occupancy | +| | | calculator of the kernel for | +| | | performance reasons. | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_exec_occ_fraction> | | but use a fraction of the | +| | | maximum occupancy of the kernel.| +| | | | +| | | | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_exec_occ_custom | forall | Similar to the occ_max policy | +| | | policy but the grid size is | +| | | is determined by concretizer. | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_launch_t | launch | Launches a device kernel, any | +| | | code inside the lambda | +| | | expression is executed | +| | | on the device. | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_thread_x_direct | kernel (For) | Map loop iterates directly to | +| | launch (loop) | GPU threads in x-dimension, one | +| | | iterate per thread. See note | +| | | below about limitations. | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_thread_y_direct | kernel (For) | Same as above, but map | +| | launch (loop) | to threads in y-dimension. | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_thread_z_direct | kernel (For) | Same as above, but map | +| | launch (loop) | to threads in z-dimension. | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_thread_x_loop | kernel (For) | Similar to thread-x-direct | +| | launch (loop) | policy, but use a block-stride | +| | | loop which doesn't limit total | +| | | number of loop iterates. 
| ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_thread_y_loop | kernel (For) | Same as above, but for | +| | launch (loop) | threads in y-dimension. | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_thread_z_loop | kernel (For) | Same as above, but for | +| | launch (loop) | threads in z-dimension. | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_thread_syncable_loop | kernel (For) | Similar to thread-loop | +| | launch (loop) | policy, but safe to use | +| | | with Cuda/HipSyncThreads. | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_thread_size_x_direct | kernel (For) | Same as thread_x_direct | +| | launch (loop) | policy above but with | +| | | a compile time number of | +| | | threads. | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_thread_size_y_direct | kernel (For) | Same as above, but map | +| | launch (loop) | to threads in y-dimension | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_thread_size_z_direct | kernel (For) | Same as above, but map | +| | launch (loop) | to threads in z-dimension. | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_flatten_threads_{xyz}_direct | launch (loop) | Reshapes threads in a | +| | | multi-dimensional thread | +| | | team into one-dimension, | +| | | accepts any permutation | +| | | of dimensions | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_block_x_direct | kernel (For) | Map loop iterates | +| | launch (loop) | directly to GPU thread | +| | | blocks in x-dimension, | +| | | one iterate per block | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_block_y_direct | kernel (For) | Same as above, but map | +| | launch (loop) | to blocks in y-dimension | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_block_z_direct | kernel (For) | Same as above, but map | +| | launch (loop) | to blocks in z-dimension | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_block_x_loop | kernel (For) | Similar to | +| | launch (loop) | block-x-direct policy, | +| | | but use a grid-stride | +| | | loop. 
| ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_block_y_loop | kernel (For) | Same as above, but use | +| | launch (loop) | blocks in y-dimension | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_block_z_loop | kernel (For) | Same as above, but use | +| | launch (loop) | blocks in z-dimension | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_block_size_x_direct | kernel (For) | Same as block_x_direct | +| | launch (loop) | policy above but with | +| | | a compile time number of | +| | | blocks | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_block_size_y_direct | kernel (For) | Same as above, but map | +| | launch (loop) | to blocks in y-dim | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_block_size_z_direct | kernel (For) | Same as above, but map | +| | launch (loop) | to blocks in z-dim | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_global_x_direct | kernel (For) | Creates a unique thread | +| | launch (loop) | id for each thread on | +| | | x-dimension of the grid. | +| | | Same as computing | +| | | threadIdx.x + | +| | | threadDim.x * blockIdx.x. | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_global_y_direct | kernel (For) | Same as above, but uses | +| | launch (loop) | globals in y-dimension. | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_global_z_direct | kernel (For) | Same as above, but uses | +| | launch (loop) | globals in z-dimension. | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_global_x_loop | kernel (For) | Similar to | +| | launch (loop) | global-x-direct policy, | +| | | but use a grid-stride | +| | | loop. 
| ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_global_y_loop | kernel (For) | Same as above, but use | +| | launch (loop) | globals in y-dimension | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_global_z_loop | kernel (For) | Same as above, but use | +| | launch (loop) | globals in z-dimension | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_global_size_x_direct | kernel (For) | Same as global_x_direct | +| | launch (loop) | policy above but with | +| | | a compile time block | +| | | size | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_global_size_y_direct | kernel (For) | Same as above, but map | +| | launch (loop) | to globals in y-dim | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_global_size_z_direct | kernel (For) | Same as above, but map | +| | launch (loop) | to globals in z-dim | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_warp_direct | kernel (For) | Map work to threads | +| | | in a warp directly. | +| | | Cannot be used in | +| | | conjunction with | +| | | cuda/hip_thread_x_* | +| | | policies. | +| | | Multiple warps can be | +| | | created by using | +| | | cuda/hip_thread_y/z_* | +| | | policies. | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_warp_loop | kernel (For) | Map work to threads in a warp | +| | | using a warp-stride loop. | +| | | Cannot be used with | +| | | cuda/hip_thread_x_* policies. | +| | | Multiple warps can be created | +| | | by using cuda/hip_thread_y/z_* | +| | | policies. | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_warp_masked_direct> | kernel | Mmap work directly to threads | +| | (For) | in a warp using a bit mask. | +| | | Cannot be used with | +| | | cuda/hip_thread_x_* policies. | +| | | Multiple warps can be created | +| | | by using cuda/hip_thread_y/z_* | +| | | policies. | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_warp_masked_loop> | kernel | Map work to threads in a warp | +| | (For) | using a bit mask and a warp- | +| | | stride loop. | +| | | Cannot be used with | +| | | cuda/hip_thread_x_* policies. | +| | | Multiple warps can be created | +| | | by using cuda/hip_thread_y/z_* | +| | | policies. | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_block_reduce | kernel | Perform a reduction across a | +| | (Reduce) | single GPU thread block. | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_warp_reduce | kernel | Perform a reduction across a | +| | (Reduce) | single GPU thread warp. | +| | | thread warp. | ++----------------------------------------------------+---------------+---------------------------------+ + +When a CUDA or HIP policy leaves parameters like the block size and/or grid size +unspecified a concretizer object is used to decide those parameters. 
The +following concretizers are available to use in the ``cuda/hip_exec_occ_custom`` +policies: + ++----------------------------------------------------+-----------------------------------------+ +| Execution Policy | Brief description | ++====================================================+=========================================+ +| Cuda/HipDefaultConcretizer | The default concretizer, expected to | +| | provide good performance in general. | +| | Note that it may not use max occupancy. | ++----------------------------------------------------+-----------------------------------------+ +| Cuda/HipRecForReduceConcretizer | Expected to provide good performance | +| | in loops with reducers. | +| | Note that it may not use max occupancy. | ++----------------------------------------------------+-----------------------------------------+ +| Cuda/HipMaxOccupancyConcretizer | Uses max occupancy. | ++----------------------------------------------------+-----------------------------------------+ +| Cuda/HipAvoidDeviceMaxThreadOccupancyConcretizer | Avoids using the max occupancy of the | +| | device in terms of threads. | +| | Note that it may use the max occupancy | +| | of the kernel if that is below the max | +| | occupancy of the device. | ++----------------------------------------------------+-----------------------------------------+ +| Cuda/HipFractionOffsetOccupancyConcretizer< | Uses a fraction and offset to choose an | +| Fraction, | occupancy based on the max occupancy | +| BLOCKS_PER_SM_OFFSET> | Using the following formula: | +| | (Fraction * kernel_max_blocks_per_sm + | +| | BLOCKS_PER_SM_OFFSET) * sm_per_device | ++----------------------------------------------------+-----------------------------------------+ Several notable constraints apply to RAJA CUDA/HIP *direct* policies. @@ -473,99 +557,133 @@ write more explicit policies. ignored. For example in cuda_thread_x_direct block_size is unspecified so a runtime number of threads is used, but grid_size is ignored so blocks are ignored when getting indices. - + GPU Policies for SYCL ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - ======================================== ============= ============================== - SYCL Execution Policies Works with Brief description - ======================================== ============= ============================== - sycl_exec forall, Execute loop iterations - in a GPU kernel launched - with given work group - size. - sycl_launch_t launch Launches a sycl kernel, - any code express within - the lambda is executed - on the device. - sycl_global_0 kernel (For) Map loop iterates - directly to GPU global - ids in first - dimension, one iterate - per work item. Group - execution into work - groups of given size. - sycl_global_1 kernel (For) Same as above, but map - to global ids in second - dim - sycl_global_2 kernel (For) Same as above, but map - to global ids in third - dim - sycl_global_item_0 launch (loop) Creates a unique thread - id for each thread for - dimension 0 of the grid. - Same as computing - itm.get_group(0) * - itm.get_local_range(0) + - itm.get_local_id(0). - sycl_global_item_1 launch (loop) Same as above, but uses - threads in dimension 1 - Same as computing - itm.get_group(1) + - itm.get_local_range(1) * - itm.get_local_id(1). - sycl_global_item_2 launch (loop) Same as above, but uses - threads in dimension 2 - Same as computing - itm.get_group(2) + - itm.get_local_range(2) * - itm.get_local_id(2). 
- sycl_local_0_direct kernel (For) Map loop iterates - launch (loop) directly to GPU work - items in first - dimension, one iterate - per work item (see note - below about limitations) - sycl_local_1_direct kernel (For) Same as above, but map - launch (loop) to work items in second - dim - sycl_local_2_direct kernel (For) Same as above, but map - launch (loop) to work items in third - dim - sycl_local_0_loop kernel (For) Similar to - launch (loop) local-1-direct policy, - but use a work - group-stride loop which - doesn't limit number of - loop iterates - sycl_local_1_loop kernel (For) Same as above, but for - launch (loop) work items in second - dimension - sycl_local_2_loop kernel (For) Same as above, but for - launch (loop) work items in third - dimension - sycl_group_0_direct kernel (For) Map loop iterates - launch (loop) directly to GPU group - ids in first dimension, - one iterate per group - sycl_group_1_direct kernel (For) Same as above, but map - launch (loop) to groups in second - dimension - sycl_group_2_direct kernel (For) Same as above, but map - launch (loop) to groups in third - dimension - sycl_group_0_loop kernel (For) Similar to - launch (loop) group-1-direct policy, - but use a group-stride - loop. - sycl_group_1_loop kernel (For) Same as above, but use - launch (loop) groups in second - dimension - sycl_group_2_loop kernel (For) Same as above, but use - launch (loop) groups in third - dimension - - ======================================== ============= ============================== +.. note:: SYCL uses C++-style ordering for its work group and global thread + dimension/indexing types. This is due, in part, to SYCL's closer + alignment with C++ multi-dimensional indexing, which is "row-major". + This is the reverse of the thread indexing used in CUDA or HIP, + which is "column-major". For example, suppose we have a thread-block + or work-group where we specify the shape as (nx, ny, nz). Consider + an element in the thread-block or work-group with id (x, y, z). + In CUDA or HIP, the element index is x + y * nx + z * nx * ny. In + SYCL, the element index is z + y * nz + x * nz * ny. + + In terms of the CUDA or HIP built-in variables to support threads, + we have:: + + Thread ID: threadIdx.x/y/z + Block ID: blockIdx.x/y/z + Block dimension: blockDim.x/y/z + Grid dimension: gridDim.x/y/z + + The analogues in SYCL are:: + + Thread ID: sycl::nd_item.get_local_id(2/1/0) + Work-group ID: sycl::nd_item.get_group(2/1/0) + Work-group dimensions: sycl::nd_item.get_local_range().get(2/1/0) + ND-range dimensions: sycl::nd_item.get_group_range(2/1/0) + + When using ``RAJA::launch``, thread and block configuration + follows CUDA and HIP programming models and is always + configured in three-dimensions. This means that SYCL dimension + 2 always exists and should be used as one would use the + x dimension for CUDA and HIP. + + Similarly, ``RAJA::kernel`` uses a three-dimensional work-group + configuration. SYCL imension 2 always exists and should be used as + one would use the x dimension in CUDA and HIP. + +======================================== ============= ============================== +SYCL Execution Policies Works with Brief description +======================================== ============= ============================== +sycl_exec forall, Execute loop iterations + in a GPU kernel launched + with given work group + size. +sycl_launch_t launch Launches a sycl kernel, + any code express within + the lambda is executed + on the device. 
+sycl_global_0 kernel (For) Map loop iterates + directly to GPU global + ids in first + dimension, one iterate + per work item. Group + execution into work + groups of given size. +sycl_global_1 kernel (For) Same as above, but map + to global ids in second + dim +sycl_global_2 kernel (For) Same as above, but map + to global ids in third + dim +sycl_global_item_0 launch (loop) Creates a unique thread + id for each thread for + dimension 0 of the grid. + Same as computing + itm.get_group(0) * + itm.get_local_range(0) + + itm.get_local_id(0). +sycl_global_item_1 launch (loop) Same as above, but uses + threads in dimension 1 + Same as computing + itm.get_group(1) + + itm.get_local_range(1) * + itm.get_local_id(1). +sycl_global_item_2 launch (loop) Same as above, but uses + threads in dimension 2 + Same as computing + itm.get_group(2) + + itm.get_local_range(2) * + itm.get_local_id(2). +sycl_local_0_direct kernel (For) Map loop iterates + launch (loop) directly to GPU work + items in first + dimension, one iterate + per work item (see note + below about limitations) +sycl_local_1_direct kernel (For) Same as above, but map + launch (loop) to work items in second + dim +sycl_local_2_direct kernel (For) Same as above, but map + launch (loop) to work items in third + dim +sycl_local_0_loop kernel (For) Similar to + launch (loop) local-1-direct policy, + but use a work + group-stride loop which + doesn't limit number of + loop iterates +sycl_local_1_loop kernel (For) Same as above, but for + launch (loop) work items in second + dimension +sycl_local_2_loop kernel (For) Same as above, but for + launch (loop) work items in third + dimension +sycl_group_0_direct kernel (For) Map loop iterates + launch (loop) directly to GPU group + ids in first dimension, + one iterate per group +sycl_group_1_direct kernel (For) Same as above, but map + launch (loop) to groups in second + dimension +sycl_group_2_direct kernel (For) Same as above, but map + launch (loop) to groups in third + dimension +sycl_group_0_loop kernel (For) Similar to + launch (loop) group-1-direct policy, + but use a group-stride + loop. +sycl_group_1_loop kernel (For) Same as above, but use + launch (loop) groups in second + dimension +sycl_group_2_loop kernel (For) Same as above, but use + launch (loop) groups in third + dimension +======================================== ============= ============================== OpenMP Target Offload Policies ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -676,26 +794,57 @@ It is important to note the following constraints about RAJA reduction usage: The following table summarizes RAJA reduction policy types: -======================= ============= ========================================== -Reduction Policy Loop Policies Brief description - to Use With -======================= ============= ========================================== -seq_reduce seq_exec, Non-parallel (sequential) reduction. -omp_reduce any OpenMP OpenMP parallel reduction. - policy -omp_reduce_ordered any OpenMP OpenMP parallel reduction with result - policy guaranteed to be reproducible. -omp_target_reduce any OpenMP OpenMP parallel target offload reduction. - target policy -cuda/hip_reduce any CUDA/HIP Parallel reduction in a CUDA/HIP kernel - policy (device synchronization will occur when - reduction value is finalized). -cuda/hip_reduce_atomic any CUDA/HIP Same as above, but reduction may use CUDA - policy atomic operations. 
-sycl_reduce any SYCL Reduction in a SYCL kernel (device - policy synchronization will occur when the - reduction value is finalized). -======================= ============= ========================================== +================================================= ============= ========================================== +Reduction Policy Loop Policies Brief description + to Use With +================================================= ============= ========================================== +seq_reduce seq_exec, Non-parallel (sequential) reduction. +omp_reduce any OpenMP OpenMP parallel reduction. + policy +omp_reduce_ordered any OpenMP OpenMP parallel reduction with result + policy guaranteed to be reproducible. +omp_target_reduce any OpenMP OpenMP parallel target offload reduction. + target policy +cuda/hip_reduce any CUDA/HIP Parallel reduction in a CUDA/HIP kernel + policy (device synchronization will occur when + reduction value is finalized). +cuda/hip_reduce_atomic any CUDA/HIP Same as above, but reduction may use + policy atomic operations leading to run to run + variability in the results. +cuda/hip_reduce_base any CUDA/HIP Choose between cuda/hip_reduce and + policy cuda/hip_reduce_atomic policies based on + the with_atomic boolean. +cuda/hip_reduce_device_fence any CUDA/HIP Same as above, and reduction uses normal + policy memory accesses that are not visible across + the whole device and device scope fences + to ensure visibility and ordering. + This works on all architectures but + incurs higher overheads on some architectures. +cuda/hip_reduce_block_fence any CUDA/HIP Same as above, and reduction uses special + policy memory accesses to a level of cache + visible to the whole device and block scope + fences to ensure ordering. This improves + performance on some architectures. +cuda/hip_reduce_atomic_host_init_device_fence any CUDA/HIP Same as above with device fence, but + policy initializes the memory used for atomics + on the host. This works well on recent + architectures and incurs lower overheads. +cuda/hip_reduce_atomic_host_init_block_fence any CUDA/HIP Same as above with block fence, but + policy initializes the memory used for atomics + on the host. This works well on recent + architectures and incurs lower overheads. +cuda/hip_reduce_atomic_device_init_device_fence any CUDA/HIP Same as above with device fence, but + policy initializes the memory used for atomics + on the device. This works on all architectures + but incurs higher overheads. +cuda/hip_reduce_atomic_device_init_block_fence any CUDA/HIP Same as above with block fence, but + policy initializes the memory used for atomics + on the device. This works on all architectures + but incurs higher overheads. +sycl_reduce any SYCL Reduction in a SYCL kernel (device + policy synchronization will occur when the + reduction value is finalized). +================================================= ============= ========================================== .. note:: RAJA reductions used with SIMD execution policies are not guaranteed to generate correct results. So they should not be used diff --git a/docs/sphinx/user_guide/feature/reduction.rst b/docs/sphinx/user_guide/feature/reduction.rst index 8643e4a225..5f2f09afad 100644 --- a/docs/sphinx/user_guide/feature/reduction.rst +++ b/docs/sphinx/user_guide/feature/reduction.rst @@ -39,6 +39,10 @@ RAJA reductions: * :ref:`tut-reduction-label`. +Please see the following cook book sections for guidance on policy usage: + + * :ref:`cook-book-reductions-label`. 
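+
+For example, pairing an execution policy with its matching reduction policy
+and a ``RAJA::ReduceSum`` object gives a complete sum reduction. The minimal
+sketch below uses the sequential policies; the cook book lists the CUDA, HIP,
+OpenMP, and SYCL policy pairs to substitute for other back-ends::
+
+    const int N = 1000;
+    int vec[N];
+    for (int i = 0; i < N; ++i) { vec[i] = 1; }
+
+    using exec_policy   = RAJA::seq_exec;
+    using reduce_policy = RAJA::seq_reduce;
+
+    RAJA::ReduceSum<reduce_policy, int> vsum(0);
+
+    RAJA::forall<exec_policy>(RAJA::RangeSegment(0, N),
+      [=](RAJA::Index_type i) {
+        vsum += vec[i];
+    });
+
+    // vsum.get() == 1000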
+ ---------------- Reduction Types diff --git a/docs/sphinx/user_guide/index.rst b/docs/sphinx/user_guide/index.rst index f2fb6ca46d..f73f4d9449 100644 --- a/docs/sphinx/user_guide/index.rst +++ b/docs/sphinx/user_guide/index.rst @@ -32,5 +32,6 @@ to use RAJA in an application can be found in :ref:`app-considerations-label`. using_raja config_options features + cook_book app_considerations tutorial diff --git a/include/RAJA/RAJA.hpp b/include/RAJA/RAJA.hpp index f41aad477b..c37ac997a4 100644 --- a/include/RAJA/RAJA.hpp +++ b/include/RAJA/RAJA.hpp @@ -33,8 +33,10 @@ #include "RAJA/util/camp_aliases.hpp" #include "RAJA/util/macros.hpp" #include "RAJA/util/types.hpp" +#include "RAJA/util/math.hpp" #include "RAJA/util/plugins.hpp" #include "RAJA/util/Registry.hpp" +#include "RAJA/util/for_each.hpp" // @@ -57,13 +59,6 @@ // #include "RAJA/policy/sequential.hpp" -// -// NOTE: LOOP POLCIES WERE DEPRECATED IN 2023.03.0 RELEASE. -// THEY ARE RE-ADDED HERE AT REQUEST OF USERS. -// THEY WILL BE REMOVED AGAIN IN THE FUTURE. -// -#include "RAJA/policy/loop.hpp" - // // All platforms should support simd and vector execution. // @@ -155,6 +150,11 @@ // #include "RAJA/util/sort.hpp" +// +// reduce algorithms +// +#include "RAJA/util/reduce.hpp" + // // WorkPool, WorkGroup, WorkSite objects // diff --git a/include/RAJA/pattern/launch/launch_core.hpp b/include/RAJA/pattern/launch/launch_core.hpp index 6f56f4ed65..213c435236 100644 --- a/include/RAJA/pattern/launch/launch_core.hpp +++ b/include/RAJA/pattern/launch/launch_core.hpp @@ -174,10 +174,14 @@ class LaunchContext template RAJA_HOST_DEVICE T* getSharedMemory(size_t bytes) { - T * mem_ptr = &((T*) shared_mem_ptr)[shared_mem_offset]; + + //Calculate offset in bytes with a char pointer + void* mem_ptr = static_cast(shared_mem_ptr) + shared_mem_offset; shared_mem_offset += bytes*sizeof(T); - return mem_ptr; + + //convert to desired type + return static_cast(mem_ptr); } /* diff --git a/include/RAJA/policy/cuda/MemUtils_CUDA.hpp b/include/RAJA/policy/cuda/MemUtils_CUDA.hpp index 2a8f848825..43d927acab 100644 --- a/include/RAJA/policy/cuda/MemUtils_CUDA.hpp +++ b/include/RAJA/policy/cuda/MemUtils_CUDA.hpp @@ -61,7 +61,7 @@ struct PinnedAllocator { return ptr; } - // returns true on success, false on failure + // returns true on success, throws a run time error exception on failure bool free(void* ptr) { cudaErrchk(cudaFreeHost(ptr)); @@ -80,7 +80,7 @@ struct DeviceAllocator { return ptr; } - // returns true on success, false on failure + // returns true on success, throws a run time error exception on failure bool free(void* ptr) { cudaErrchk(cudaFree(ptr)); @@ -103,7 +103,31 @@ struct DeviceZeroedAllocator { return ptr; } - // returns true on success, false on failure + // returns true on success, throws a run time error exception on failure + bool free(void* ptr) + { + cudaErrchk(cudaFree(ptr)); + return true; + } +}; + +//! 
Allocator for device pinned memory for use in basic_mempool +struct DevicePinnedAllocator { + + // returns a valid pointer on success, nullptr on failure + void* malloc(size_t nbytes) + { + int device; + cudaErrchk(cudaGetDevice(&device)); + void* ptr; + cudaErrchk(cudaMallocManaged(&ptr, nbytes, cudaMemAttachGlobal)); + cudaErrchk(cudaMemAdvise(ptr, nbytes, cudaMemAdviseSetPreferredLocation, device)); + cudaErrchk(cudaMemAdvise(ptr, nbytes, cudaMemAdviseSetAccessedBy, cudaCpuDeviceId)); + + return ptr; + } + + // returns true on success, throws a run time error exception on failure bool free(void* ptr) { cudaErrchk(cudaFree(ptr)); @@ -114,6 +138,7 @@ struct DeviceZeroedAllocator { using device_mempool_type = basic_mempool::MemPool; using device_zeroed_mempool_type = basic_mempool::MemPool; +using device_pinned_mempool_type = basic_mempool::MemPool; using pinned_mempool_type = basic_mempool::MemPool; namespace detail @@ -279,6 +304,7 @@ RAJA_INLINE typename std::remove_reference::type make_launch_body( return return_type(std::forward(loop_body)); } +//! Get the properties of the current device RAJA_INLINE cudaDeviceProp get_device_prop() { @@ -289,194 +315,217 @@ cudaDeviceProp get_device_prop() return prop; } +//! Get a copy of the device properties, this copy is cached on first use to speedup later calls RAJA_INLINE cudaDeviceProp& device_prop() { - static cudaDeviceProp prop = get_device_prop(); + static thread_local cudaDeviceProp prop = get_device_prop(); return prop; } +static constexpr int cuda_occupancy_uninitialized_int = -1; +static constexpr size_t cuda_occupancy_uninitialized_size_t = + std::numeric_limits::max(); + +//! Struct with the maximum theoretical occupancy of the device struct CudaFixedMaxBlocksData { - int multiProcessorCount; - int maxThreadsPerMultiProcessor; + int device_sm_per_device = cuda::device_prop().multiProcessorCount; + int device_max_threads_per_sm = cuda::device_prop().maxThreadsPerMultiProcessor; }; +//! Get the maximum theoretical occupancy of the device RAJA_INLINE -size_t cuda_max_blocks(size_t block_size) +CudaFixedMaxBlocksData cuda_max_blocks() { - static CudaFixedMaxBlocksData data = []() { - cudaDeviceProp& prop = cuda::device_prop(); - return CudaFixedMaxBlocksData{prop.multiProcessorCount, - prop.maxThreadsPerMultiProcessor}; - }(); - - size_t max_blocks = data.multiProcessorCount * - (data.maxThreadsPerMultiProcessor / block_size); + static thread_local CudaFixedMaxBlocksData data; - return max_blocks; + return data; } +//! Struct with the maximum occupancy of a kernel in simple terms struct CudaOccMaxBlocksThreadsData { - size_t prev_shmem_size; - int max_blocks; - int max_threads; + size_t func_dynamic_shmem_per_block = cuda_occupancy_uninitialized_size_t; + int func_max_blocks_per_device = cuda_occupancy_uninitialized_int; + int func_max_threads_per_block = cuda_occupancy_uninitialized_int; }; -template < typename RAJA_UNUSED_ARG(UniqueMarker), typename Func > +//! 
Get the maximum occupancy of a kernel with unknown threads per block +template < typename RAJA_UNUSED_ARG(UniqueMarker) > RAJA_INLINE -void cuda_occupancy_max_blocks_threads(Func&& func, size_t shmem_size, - int &max_blocks, int &max_threads) +CudaOccMaxBlocksThreadsData cuda_occupancy_max_blocks_threads(const void* func, + size_t func_dynamic_shmem_per_block) { - static constexpr int uninitialized = -1; - static constexpr size_t uninitialized_size_t = std::numeric_limits::max(); - static thread_local CudaOccMaxBlocksThreadsData data = { - uninitialized_size_t, uninitialized, uninitialized}; + static thread_local CudaOccMaxBlocksThreadsData data; - if (data.prev_shmem_size != shmem_size) { + if (data.func_dynamic_shmem_per_block != func_dynamic_shmem_per_block) { - cudaErrchk(cudaOccupancyMaxPotentialBlockSize( - &data.max_blocks, &data.max_threads, func, shmem_size)); + data.func_dynamic_shmem_per_block = func_dynamic_shmem_per_block; - data.prev_shmem_size = shmem_size; + cudaErrchk(cudaOccupancyMaxPotentialBlockSize( + &data.func_max_blocks_per_device, &data.func_max_threads_per_block, func, func_dynamic_shmem_per_block)); } - max_blocks = data.max_blocks; - max_threads = data.max_threads; - + return data; } -struct CudaOccMaxBlocksFixedThreadsData +//! Struct with the maximum occupancy of a kernel in specific terms +struct CudaOccMaxBlocksData : CudaFixedMaxBlocksData { - size_t prev_shmem_size; - int max_blocks; - int multiProcessorCount; + size_t func_dynamic_shmem_per_block = cuda_occupancy_uninitialized_size_t; + int func_threads_per_block = cuda_occupancy_uninitialized_int; + int func_max_blocks_per_sm = cuda_occupancy_uninitialized_int; }; -template < typename RAJA_UNUSED_ARG(UniqueMarker), int num_threads, typename Func > +//! Get the maximum occupancy of a kernel with compile time threads per block +template < typename RAJA_UNUSED_ARG(UniqueMarker), int func_threads_per_block > RAJA_INLINE -void cuda_occupancy_max_blocks(Func&& func, size_t shmem_size, - int &max_blocks) +CudaOccMaxBlocksData cuda_occupancy_max_blocks(const void* func, + size_t func_dynamic_shmem_per_block) { - static constexpr int uninitialized = -1; - static constexpr size_t uninitialized_size_t = std::numeric_limits::max(); - static thread_local CudaOccMaxBlocksFixedThreadsData data = { - uninitialized_size_t, uninitialized, uninitialized}; - - if (data.prev_shmem_size != shmem_size) { - - cudaErrchk(cudaOccupancyMaxActiveBlocksPerMultiprocessor( - &data.max_blocks, func, num_threads, shmem_size)); + static thread_local CudaOccMaxBlocksData data; - if (data.multiProcessorCount == uninitialized) { + if (data.func_dynamic_shmem_per_block != func_dynamic_shmem_per_block) { - data.multiProcessorCount = cuda::device_prop().multiProcessorCount; + data.func_dynamic_shmem_per_block = func_dynamic_shmem_per_block; + data.func_threads_per_block = func_threads_per_block; - } - - data.max_blocks *= data.multiProcessorCount; - - data.prev_shmem_size = shmem_size; + cudaErrchk(cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &data.func_max_blocks_per_sm, func, func_threads_per_block, func_dynamic_shmem_per_block)); } - max_blocks = data.max_blocks; - + return data; } -struct CudaOccMaxBlocksVariableThreadsData -{ - size_t prev_shmem_size; - int prev_num_threads; - int max_blocks; - int multiProcessorCount; -}; - -template < typename RAJA_UNUSED_ARG(UniqueMarker), typename Func > +//! 
Get the maximum occupancy of a kernel with runtime threads per block +template < typename RAJA_UNUSED_ARG(UniqueMarker) > RAJA_INLINE -void cuda_occupancy_max_blocks(Func&& func, size_t shmem_size, - int &max_blocks, int num_threads) +CudaOccMaxBlocksData cuda_occupancy_max_blocks(const void* func, + size_t func_dynamic_shmem_per_block, int func_threads_per_block) { - static constexpr int uninitialized = 0; - static constexpr size_t uninitialized_size_t = std::numeric_limits::max(); - static thread_local CudaOccMaxBlocksVariableThreadsData data = { - uninitialized_size_t, uninitialized, uninitialized, uninitialized}; + static thread_local CudaOccMaxBlocksData data; - if ( data.prev_shmem_size != shmem_size || - data.prev_num_threads != num_threads ) { + if ( data.func_dynamic_shmem_per_block != func_dynamic_shmem_per_block || + data.func_threads_per_block != func_threads_per_block ) { - cudaErrchk(cudaOccupancyMaxActiveBlocksPerMultiprocessor( - &data.max_blocks, func, num_threads, shmem_size)); - - if (data.multiProcessorCount == uninitialized) { - - data.multiProcessorCount = cuda::device_prop().multiProcessorCount; - - } - - data.max_blocks *= data.multiProcessorCount; + data.func_dynamic_shmem_per_block = func_dynamic_shmem_per_block; + data.func_threads_per_block = func_threads_per_block; - data.prev_shmem_size = shmem_size; - data.prev_num_threads = num_threads; + cudaErrchk(cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &data.func_max_blocks_per_sm, func, func_threads_per_block, func_dynamic_shmem_per_block)); } - max_blocks = data.max_blocks; - + return data; } -struct CudaOccupancyDefaults + +/*! + ****************************************************************************** + * + * \brief Concretizer Implementation that chooses block size and/or grid + * size when one or both has not been specified at compile time. + * + * \tparam IdxT Index type to use for integer calculations. + * \tparam Concretizer Class that determines the max number of blocks to use + * when fitting for the device. + * \tparam UniqueMarker A type that is unique to each global function, used to + * help cache the occupancy data for that global function. + * + * The methods come in two flavors: + * - The fit_len methods choose grid and block sizes that result in a total + * number of threads of at least the len given in the constructor or 0 if + * that is not possible. + * - The fit_device methods choose grid and block sizes that best fit the + * occupancy of the global function according to the occupancy calculator and + * the Concretizer class. 
+ * + * Common terms: + * - block size - threads per block + * - grid size - blocks per device + * + ****************************************************************************** + */ +template < typename IdxT, typename Concretizer, typename UniqueMarker> +struct ConcretizerImpl { - CudaOccupancyDefaults(const void* RAJA_UNUSED_ARG(func)) + ConcretizerImpl(const void* func, size_t func_dynamic_shmem_per_block, IdxT len) + : m_func(func) + , m_func_dynamic_shmem_per_block(func_dynamic_shmem_per_block) + , m_len(len) { } - template < typename IdxT > - inline auto get_max_grid_size(size_t RAJA_UNUSED_ARG(dynamic_shmem_size), - IdxT RAJA_UNUSED_ARG(block_size)) const + IdxT get_max_block_size() const { - return std::numeric_limits::max(); + auto data = cuda_occupancy_max_blocks_threads( + m_func, m_func_dynamic_shmem_per_block); + IdxT func_max_threads_per_block = data.func_max_threads_per_block; + return func_max_threads_per_block; } - template < typename IdxT = cuda_dim_member_t > - inline auto get_max_block_size_and_grid_size(size_t RAJA_UNUSED_ARG(dynamic_shmem_size)) const + //! Get a block size when grid size is specified + IdxT get_block_size_to_fit_len(IdxT func_blocks_per_device) const { - return std::make_pair(static_cast(::RAJA::policy::cuda::MAX_BLOCK_SIZE), - std::numeric_limits::max()); + IdxT func_max_threads_per_block = this->get_max_block_size(); + IdxT func_threads_per_block = RAJA_DIVIDE_CEILING_INT(m_len, func_blocks_per_device); + if (func_threads_per_block <= func_max_threads_per_block) { + return func_threads_per_block; + } else { + return IdxT(0); + } } -}; -template < typename UniqueMarker > -struct CudaOccupancyCalculator -{ - CudaOccupancyCalculator(const void* func) - : m_func(func) - { } + //! Get a grid size when block size is specified + IdxT get_grid_size_to_fit_len(IdxT func_threads_per_block) const + { + IdxT func_blocks_per_device = RAJA_DIVIDE_CEILING_INT(m_len, func_threads_per_block); + return func_blocks_per_device; + } + + //! Get a block size and grid size when neither is specified + auto get_block_and_grid_size_to_fit_len() const + { + IdxT func_max_threads_per_block = this->get_max_block_size(); + IdxT func_blocks_per_device = RAJA_DIVIDE_CEILING_INT(m_len, func_max_threads_per_block); + return std::make_pair(func_max_threads_per_block, + func_blocks_per_device); + } + + //! Get a block size when grid size is specified + IdxT get_block_size_to_fit_device(IdxT func_blocks_per_device) const + { + IdxT func_max_threads_per_block = this->get_max_block_size(); + IdxT func_threads_per_block = RAJA_DIVIDE_CEILING_INT(m_len, func_blocks_per_device); + return std::min(func_threads_per_block, func_max_threads_per_block); + } - template < typename IdxT > - inline auto get_max_grid_size(size_t dynamic_shmem_size, IdxT block_size) const + //! 
Get a grid size when block size is specified + IdxT get_grid_size_to_fit_device(IdxT func_threads_per_block) const { - int max_grid_size = -1; - ::RAJA::cuda::cuda_occupancy_max_blocks( - m_func, dynamic_shmem_size, max_grid_size, block_size); - return static_cast(max_grid_size); + auto data = cuda_occupancy_max_blocks( + m_func, m_func_dynamic_shmem_per_block, func_threads_per_block); + IdxT func_max_blocks_per_device = Concretizer::template get_max_grid_size(data); + IdxT func_blocks_per_device = RAJA_DIVIDE_CEILING_INT(m_len, func_threads_per_block); + return std::min(func_blocks_per_device, func_max_blocks_per_device); } - template < typename IdxT = cuda_dim_member_t > - inline auto get_max_block_size_and_grid_size(size_t dynamic_shmem_size) const + //! Get a block size and grid size when neither is specified + auto get_block_and_grid_size_to_fit_device() const { - int max_block_size = -1; - int max_grid_size = -1; - ::RAJA::cuda::cuda_occupancy_max_blocks_threads( - m_func, dynamic_shmem_size, max_grid_size, max_block_size); - return std::make_pair(static_cast(max_block_size), - static_cast(max_grid_size)); + IdxT func_max_threads_per_block = this->get_max_block_size(); + IdxT func_blocks_per_device = this->get_grid_size_to_fit_device(func_max_threads_per_block); + return std::make_pair(func_max_threads_per_block, + func_blocks_per_device); } private: const void* m_func; + size_t m_func_dynamic_shmem_per_block; + IdxT m_len; }; } // namespace cuda diff --git a/include/RAJA/policy/cuda/forall.hpp b/include/RAJA/policy/cuda/forall.hpp index 3837a8b062..333f0f90e8 100644 --- a/include/RAJA/policy/cuda/forall.hpp +++ b/include/RAJA/policy/cuda/forall.hpp @@ -70,16 +70,17 @@ namespace impl * ****************************************************************************** */ -template +template struct ForallDimensionCalculator; // The general cases handle fixed BLOCK_SIZE > 0 and/or GRID_SIZE > 0 // there are specializations for named_usage::unspecified // but named_usage::ignored is not supported so no specializations are provided // and static_asserts in the general case catch unsupported values -template +template struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, ::RAJA::cuda::IndexGlobal, + Concretizer, UniqueMarker> { static_assert(BLOCK_SIZE > 0, "block size must be > 0 or named_usage::unspecified with forall"); @@ -91,8 +92,10 @@ struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, static void set_dimensions(internal::CudaDims& dims, IdxT len, const void* RAJA_UNUSED_ARG(func), size_t RAJA_UNUSED_ARG(dynamic_shmem_size)) { - if ( len > (static_cast(IndexGetter::block_size) * - static_cast(IndexGetter::grid_size)) ) { + const IdxT block_size = static_cast(IndexGetter::block_size); + const IdxT grid_size = static_cast(IndexGetter::grid_size); + + if ( len > (block_size * grid_size) ) { RAJA_ABORT_OR_THROW("len exceeds the size of the directly mapped index space"); } @@ -101,9 +104,10 @@ struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, } }; -template +template struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, ::RAJA::cuda::IndexGlobal, + Concretizer, UniqueMarker> { static_assert(GRID_SIZE > 0, "grid size must be > 0 or named_usage::unspecified with forall"); @@ -112,17 +116,26 @@ struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, template < typename IdxT > static void set_dimensions(internal::CudaDims& dims, IdxT len, - const void* RAJA_UNUSED_ARG(func), size_t RAJA_UNUSED_ARG(dynamic_shmem_size)) + 
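As a rough standalone sketch (not part of the patch), the "fit_len" sizing that `ConcretizerImpl` performs reduces to a ceiling divide plus a cap at the occupancy calculator's maximum block size; the function and parameter names below are illustrative only.

```cpp
#include <cstddef>

// Same rounding as RAJA_DIVIDE_CEILING_INT.
constexpr std::size_t divide_ceiling(std::size_t n, std::size_t d)
{
  return (n + d - 1) / d;
}

// Grid size when the block size is fixed (the get_grid_size_to_fit_len case).
std::size_t grid_to_fit_len(std::size_t len, std::size_t block_size)
{
  return divide_ceiling(len, block_size);
}

// Block size when the grid size is fixed (the get_block_size_to_fit_len case):
// returns 0 when even the kernel's occupancy-calculator maximum block size
// cannot cover len with that many blocks.
std::size_t block_to_fit_len(std::size_t len, std::size_t grid_size,
                             std::size_t occ_max_threads_per_block)
{
  const std::size_t block_size = divide_ceiling(len, grid_size);
  return (block_size <= occ_max_threads_per_block) ? block_size : 0;
}
```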
const void* func, size_t dynamic_shmem_size) { - // BEWARE: if calculated block_size is too high then the kernel launch will fail - internal::set_cuda_dim(dims.threads, RAJA_DIVIDE_CEILING_INT(len, static_cast(IndexGetter::grid_size))); - internal::set_cuda_dim(dims.blocks, static_cast(IndexGetter::grid_size)); + ::RAJA::cuda::ConcretizerImpl concretizer{func, dynamic_shmem_size, len}; + + const IdxT grid_size = static_cast(IndexGetter::grid_size); + const IdxT block_size = concretizer.get_block_size_to_fit_len(grid_size); + + if ( block_size == IdxT(0) ) { + RAJA_ABORT_OR_THROW("len exceeds the size of the directly mapped index space"); + } + + internal::set_cuda_dim(dims.threads, block_size); + internal::set_cuda_dim(dims.blocks, grid_size); } }; -template +template struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, ::RAJA::cuda::IndexGlobal, + Concretizer, UniqueMarker> { static_assert(BLOCK_SIZE > 0, "block size must be > 0 or named_usage::unspecified with forall"); @@ -131,16 +144,22 @@ struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, template < typename IdxT > static void set_dimensions(internal::CudaDims& dims, IdxT len, - const void* RAJA_UNUSED_ARG(func), size_t RAJA_UNUSED_ARG(dynamic_shmem_size)) + const void* func, size_t dynamic_shmem_size) { - internal::set_cuda_dim(dims.threads, static_cast(IndexGetter::block_size)); - internal::set_cuda_dim(dims.blocks, RAJA_DIVIDE_CEILING_INT(len, static_cast(IndexGetter::block_size))); + ::RAJA::cuda::ConcretizerImpl concretizer{func, dynamic_shmem_size, len}; + + const IdxT block_size = static_cast(IndexGetter::block_size); + const IdxT grid_size = concretizer.get_grid_size_to_fit_len(block_size); + + internal::set_cuda_dim(dims.threads, block_size); + internal::set_cuda_dim(dims.blocks, grid_size); } }; -template +template struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, ::RAJA::cuda::IndexGlobal, + Concretizer, UniqueMarker> { using IndexGetter = ::RAJA::cuda::IndexGlobal; @@ -149,104 +168,104 @@ struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, static void set_dimensions(internal::CudaDims& dims, IdxT len, const void* func, size_t dynamic_shmem_size) { - ::RAJA::cuda::CudaOccupancyCalculator oc(func); - auto max_sizes = oc.get_max_block_size_and_grid_size(dynamic_shmem_size); + ::RAJA::cuda::ConcretizerImpl concretizer{func, dynamic_shmem_size, len}; + + const auto sizes = concretizer.get_block_and_grid_size_to_fit_len(); - internal::set_cuda_dim(dims.threads, static_cast(max_sizes.first)); - internal::set_cuda_dim(dims.blocks, RAJA_DIVIDE_CEILING_INT(len, static_cast(max_sizes.first))); + internal::set_cuda_dim(dims.threads, sizes.first); + internal::set_cuda_dim(dims.blocks, sizes.second); } }; -template -struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, +template +struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, ::RAJA::cuda::IndexGlobal, + Concretizer, UniqueMarker> { static_assert(BLOCK_SIZE > 0, "block size must be > 0 or named_usage::unspecified with forall"); static_assert(GRID_SIZE > 0, "grid size must be > 0 or named_usage::unspecified with forall"); - using IndexMapper = ::RAJA::cuda::IndexGlobal; + using IndexGetter = ::RAJA::cuda::IndexGlobal; template < typename IdxT > static void set_dimensions(internal::CudaDims& dims, IdxT RAJA_UNUSED_ARG(len), const void* RAJA_UNUSED_ARG(func), size_t RAJA_UNUSED_ARG(dynamic_shmem_size)) { - internal::set_cuda_dim(dims.threads, static_cast(IndexMapper::block_size)); 
- internal::set_cuda_dim(dims.blocks, static_cast(IndexMapper::grid_size)); + const IdxT block_size = static_cast(IndexGetter::block_size); + const IdxT grid_size = static_cast(IndexGetter::grid_size); + + internal::set_cuda_dim(dims.threads, block_size); + internal::set_cuda_dim(dims.blocks, grid_size); } }; -template -struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, +template +struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, ::RAJA::cuda::IndexGlobal, + Concretizer, UniqueMarker> { static_assert(GRID_SIZE > 0, "grid size must be > 0 or named_usage::unspecified with forall"); - using IndexMapper = ::RAJA::cuda::IndexGlobal; + using IndexGetter = ::RAJA::cuda::IndexGlobal; template < typename IdxT > static void set_dimensions(internal::CudaDims& dims, IdxT len, const void* func, size_t dynamic_shmem_size) { - ::RAJA::cuda::CudaOccupancyCalculator oc(func); - auto max_sizes = oc.get_max_block_size_and_grid_size(dynamic_shmem_size); + ::RAJA::cuda::ConcretizerImpl concretizer{func, dynamic_shmem_size, len}; - IdxT calculated_block_size = std::min( - RAJA_DIVIDE_CEILING_INT(len, static_cast(IndexMapper::grid_size)), - static_cast(max_sizes.first)); + const IdxT grid_size = static_cast(IndexGetter::grid_size); + const IdxT block_size = concretizer.get_block_size_to_fit_device(grid_size); - internal::set_cuda_dim(dims.threads, calculated_block_size); - internal::set_cuda_dim(dims.blocks, static_cast(IndexMapper::grid_size)); + internal::set_cuda_dim(dims.threads, block_size); + internal::set_cuda_dim(dims.blocks, grid_size); } }; -template -struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, +template +struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, ::RAJA::cuda::IndexGlobal, + Concretizer, UniqueMarker> { static_assert(BLOCK_SIZE > 0, "block size must be > 0 or named_usage::unspecified with forall"); - using IndexMapper = ::RAJA::cuda::IndexGlobal; + using IndexGetter = ::RAJA::cuda::IndexGlobal; template < typename IdxT > static void set_dimensions(internal::CudaDims& dims, IdxT len, const void* func, size_t dynamic_shmem_size) { - ::RAJA::cuda::CudaOccupancyCalculator oc(func); - auto max_grid_size = oc.get_max_grid_size(dynamic_shmem_size, - static_cast(IndexMapper::block_size)); + ::RAJA::cuda::ConcretizerImpl concretizer{func, dynamic_shmem_size, len}; - IdxT calculated_grid_size = std::min( - RAJA_DIVIDE_CEILING_INT(len, static_cast(IndexMapper::block_size)), - static_cast(max_grid_size)); + const IdxT block_size = static_cast(IndexGetter::block_size); + const IdxT grid_size = concretizer.get_grid_size_to_fit_device(block_size); - internal::set_cuda_dim(dims.threads, static_cast(IndexMapper::block_size)); - internal::set_cuda_dim(dims.blocks, calculated_grid_size); + internal::set_cuda_dim(dims.threads, block_size); + internal::set_cuda_dim(dims.blocks, grid_size); } }; -template -struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, +template +struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, ::RAJA::cuda::IndexGlobal, + Concretizer, UniqueMarker> { - using IndexMapper = ::RAJA::cuda::IndexGlobal; + using IndexGetter = ::RAJA::cuda::IndexGlobal; template < typename IdxT > static void set_dimensions(internal::CudaDims& dims, IdxT len, const void* func, size_t dynamic_shmem_size) { - ::RAJA::cuda::CudaOccupancyCalculator oc(func); - auto max_sizes = oc.get_max_block_size_and_grid_size(dynamic_shmem_size); + ::RAJA::cuda::ConcretizerImpl concretizer{func, 
dynamic_shmem_size, len}; - IdxT calculated_grid_size = std::min( - RAJA_DIVIDE_CEILING_INT(len, static_cast(max_sizes.first)), - static_cast(max_sizes.second)); + const auto sizes = concretizer.get_block_and_grid_size_to_fit_device(); - internal::set_cuda_dim(dims.threads, static_cast(max_sizes.first)); - internal::set_cuda_dim(dims.blocks, calculated_grid_size); + internal::set_cuda_dim(dims.threads, sizes.first); + internal::set_cuda_dim(dims.blocks, sizes.second); } }; @@ -273,7 +292,7 @@ template ::value && + std::is_base_of::value && (IterationGetter::block_size > 0), size_t > BlockSize = IterationGetter::block_size> __launch_bounds__(BlockSize, BlocksPerSM) __global__ @@ -298,7 +317,7 @@ template ::value && + std::is_base_of::value && (IterationGetter::block_size <= 0), size_t > RAJA_UNUSED_ARG(BlockSize) = 0> __global__ @@ -324,7 +343,7 @@ template ::value && + std::is_base_of::value && (IterationGetter::block_size > 0), size_t > BlockSize = IterationGetter::block_size> __launch_bounds__(BlockSize, BlocksPerSM) __global__ @@ -352,7 +371,7 @@ template ::value && + std::is_base_of::value && (IterationGetter::block_size <= 0), size_t > RAJA_UNUSED_ARG(BlockSize) = 0> __global__ @@ -379,7 +398,8 @@ template ::value && + std::is_base_of::value && + std::is_base_of::value && (IterationGetter::block_size > 0), size_t > BlockSize = IterationGetter::block_size> __launch_bounds__(BlockSize, BlocksPerSM) __global__ @@ -405,7 +425,8 @@ template ::value && + std::is_base_of::value && + std::is_base_of::value && (IterationGetter::block_size <= 0), size_t > RAJA_UNUSED_ARG(BlockSize) = 0> __global__ @@ -433,7 +454,8 @@ template ::value && + std::is_base_of::value && + std::is_base_of::value && (IterationGetter::block_size > 0), size_t > BlockSize = IterationGetter::block_size> __launch_bounds__(BlockSize, BlocksPerSM) __global__ @@ -462,7 +484,8 @@ template ::value && + std::is_base_of::value && + std::is_base_of::value && (IterationGetter::block_size <= 0), size_t > RAJA_UNUSED_ARG(BlockSize) = 0> __global__ @@ -494,7 +517,7 @@ void forallp_cuda_kernel(LOOP_BODY loop_body, template RAJA_INLINE concepts::enable_if_t< @@ -502,7 +525,7 @@ concepts::enable_if_t< RAJA::expt::type_traits::is_ForallParamPack, RAJA::expt::type_traits::is_ForallParamPack_empty> forall_impl(resources::Cuda cuda_res, - ::RAJA::policy::cuda::cuda_exec_explicitconst&, + ::RAJA::policy::cuda::cuda_exec_explicitconst&, Iterable&& iter, LoopBody&& loop_body, ForallParam) @@ -510,9 +533,9 @@ forall_impl(resources::Cuda cuda_res, using Iterator = camp::decay; using LOOP_BODY = camp::decay; using IndexType = camp::decay; - using EXEC_POL = ::RAJA::policy::cuda::cuda_exec_explicit; + using EXEC_POL = ::RAJA::policy::cuda::cuda_exec_explicit; using UniqueMarker = ::camp::list; - using DimensionCalculator = impl::ForallDimensionCalculator; + using DimensionCalculator = impl::ForallDimensionCalculator; // // Compute the requested iteration space size @@ -563,7 +586,7 @@ forall_impl(resources::Cuda cuda_res, template RAJA_INLINE concepts::enable_if_t< @@ -571,7 +594,7 @@ concepts::enable_if_t< RAJA::expt::type_traits::is_ForallParamPack, concepts::negate< RAJA::expt::type_traits::is_ForallParamPack_empty> > forall_impl(resources::Cuda cuda_res, - ::RAJA::policy::cuda::cuda_exec_explicit const&, + ::RAJA::policy::cuda::cuda_exec_explicit const&, Iterable&& iter, LoopBody&& loop_body, ForallParam f_params) @@ -579,9 +602,9 @@ forall_impl(resources::Cuda cuda_res, using Iterator = camp::decay; using LOOP_BODY = camp::decay; using 
IndexType = camp::decay; - using EXEC_POL = ::RAJA::policy::cuda::cuda_exec_explicit; + using EXEC_POL = ::RAJA::policy::cuda::cuda_exec_explicit; using UniqueMarker = ::camp::list, LOOP_BODY, Iterator, ForallParam>; - using DimensionCalculator = impl::ForallDimensionCalculator; + using DimensionCalculator = impl::ForallDimensionCalculator; // // Compute the requested iteration space size @@ -659,11 +682,11 @@ forall_impl(resources::Cuda cuda_res, */ template RAJA_INLINE resources::EventProxy forall_impl(resources::Cuda r, - ExecPolicy>, + ExecPolicy>, const TypedIndexSet& iset, LoopBody&& loop_body) { @@ -672,7 +695,7 @@ forall_impl(resources::Cuda r, iset.segmentCall(r, isi, detail::CallForall(), - ::RAJA::policy::cuda::cuda_exec_explicit(), + ::RAJA::policy::cuda::cuda_exec_explicit(), loop_body); } // iterate over segments of index set diff --git a/include/RAJA/policy/cuda/intrinsics.hpp b/include/RAJA/policy/cuda/intrinsics.hpp new file mode 100644 index 0000000000..b0d2ea7cf1 --- /dev/null +++ b/include/RAJA/policy/cuda/intrinsics.hpp @@ -0,0 +1,467 @@ +/*! + ****************************************************************************** + * + * \file + * + * \brief Header file containing RAJA intrinsics templates for CUDA execution. + * + * These methods should work on any platform that supports + * CUDA devices. + * + ****************************************************************************** + */ + +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#ifndef RAJA_cuda_intrinsics_HPP +#define RAJA_cuda_intrinsics_HPP + +#include "RAJA/config.hpp" + +#if defined(RAJA_ENABLE_CUDA) + +#include + +#include + +#include "RAJA/util/macros.hpp" +#include "RAJA/util/SoAArray.hpp" +#include "RAJA/util/types.hpp" + +#include "RAJA/policy/cuda/policy.hpp" + + +namespace RAJA +{ + +namespace cuda +{ + +namespace impl +{ + +/*! + * \brief Abstracts access to memory when coordinating between threads at + * device scope. The fences provided here are to be used with relaxed + * atomics in order to guarantee memory ordering and visibility of the + * accesses done through this class. + * + * \Note This uses device scope fences to ensure ordering and to flush local + * caches so that memory accesses become visible to the whole device. + * \Note This class uses normal memory accesses that are cached in local caches + * so device scope fences are required to make memory accesses visible + * to the whole device. + */ +struct AccessorDeviceScopeUseDeviceFence : RAJA::detail::DefaultAccessor +{ + static RAJA_DEVICE RAJA_INLINE void fence_acquire() + { + __threadfence(); + } + + static RAJA_DEVICE RAJA_INLINE void fence_release() + { + __threadfence(); + } +}; + +/*! + ****************************************************************************** + * + * \brief Abstracts access to memory when coordinating between threads at + * device scope. The fences provided here are to be used with relaxed + * atomics in order to guarantee memory ordering and visibility of the + * accesses done through this class. + * + * \Note This may use block scope fences to ensure ordering and avoid flushing + * local caches so special memory accesses are used to ensure visibility + * to the whole device. 
+ * \Note This class uses device scope atomic memory accesses to bypass local + * caches so memory accesses are visible to the whole device without + * device scope fences. + * \Note A memory access may be split into multiple memory accesses, so + * even though atomic instructions are used concurrent accesses between + * different threads are not thread safe. + * + ****************************************************************************** + */ +struct AccessorDeviceScopeUseBlockFence +{ + // cuda has 32 and 64 bit atomics + static constexpr size_t min_atomic_int_type_size = sizeof(unsigned int); + static constexpr size_t max_atomic_int_type_size = sizeof(unsigned long long); + + template < typename T > + static RAJA_DEVICE RAJA_INLINE T get(T* in_ptr, size_t idx) + { + using ArrayType = RAJA::detail::AsIntegerArray; + using integer_type = typename ArrayType::integer_type; + + ArrayType u; + auto ptr = const_cast(reinterpret_cast(in_ptr + idx)); + + for (size_t i = 0; i < u.array_size(); ++i) { + u.array[i] = atomicAdd(&ptr[i], integer_type(0)); + } + + return u.get_value(); + } + + template < typename T > + static RAJA_DEVICE RAJA_INLINE void set(T* in_ptr, size_t idx, T val) + { + using ArrayType = RAJA::detail::AsIntegerArray; + using integer_type = typename ArrayType::integer_type; + + ArrayType u; + u.set_value(val); + auto ptr = reinterpret_cast(in_ptr + idx); + + for (size_t i = 0; i < u.array_size(); ++i) { + atomicExch(&ptr[i], u.array[i]); + } + } + + static RAJA_DEVICE RAJA_INLINE void fence_acquire() + { + __threadfence(); + } + + static RAJA_DEVICE RAJA_INLINE void fence_release() + { + __threadfence(); + } +}; + + +// cuda 8 only has shfl primitives for 32 bits while cuda 9 has 32 and 64 bits +constexpr size_t min_shfl_int_type_size = sizeof(unsigned int); +#if defined(CUDART_VERSION) && CUDART_VERSION >= 9000 +constexpr size_t max_shfl_int_type_size = sizeof(unsigned long long); +#else +constexpr size_t max_shfl_int_type_size = sizeof(unsigned int); +#endif + +/*! + ****************************************************************************** + * + * \brief Method to shuffle 32b registers in sum reduction for arbitrary type. + * + * \Note Returns an undefined value if src lane is inactive (divergence). + * Returns this lane's value if src lane is out of bounds or has exited. 
+ * + ****************************************************************************** + */ +template +RAJA_DEVICE RAJA_INLINE T shfl_xor_sync(T var, int laneMask) +{ + RAJA::detail::AsIntegerArray u; + u.set_value(var); + + for (size_t i = 0; i < u.array_size(); ++i) { +#if defined(CUDART_VERSION) && CUDART_VERSION >= 9000 + u.array[i] = ::__shfl_xor_sync(0xffffffffu, u.array[i], laneMask); +#else + u.array[i] = ::__shfl_xor(u.array[i], laneMask); +#endif + } + return u.get_value(); +} + +template +RAJA_DEVICE RAJA_INLINE T shfl_sync(T var, int srcLane) +{ + RAJA::detail::AsIntegerArray u; + u.set_value(var); + + for (size_t i = 0; i < u.array_size(); ++i) { +#if defined(CUDART_VERSION) && CUDART_VERSION >= 9000 + u.array[i] = ::__shfl_sync(0xffffffffu, u.array[i], srcLane); +#else + u.array[i] = ::__shfl(u.array[i], srcLane); +#endif + } + return u.get_value(); +} + +#if defined(CUDART_VERSION) && CUDART_VERSION >= 9000 + +template <> +RAJA_DEVICE RAJA_INLINE int shfl_xor_sync(int var, int laneMask) +{ + return ::__shfl_xor_sync(0xffffffffu, var, laneMask); +} + +template <> +RAJA_DEVICE RAJA_INLINE unsigned int shfl_xor_sync(unsigned int var, int laneMask) +{ + return ::__shfl_xor_sync(0xffffffffu, var, laneMask); +} + +template <> +RAJA_DEVICE RAJA_INLINE long shfl_xor_sync(long var, int laneMask) +{ + return ::__shfl_xor_sync(0xffffffffu, var, laneMask); +} + +template <> +RAJA_DEVICE RAJA_INLINE unsigned long shfl_xor_sync(unsigned long var, int laneMask) +{ + return ::__shfl_xor_sync(0xffffffffu, var, laneMask); +} + +template <> +RAJA_DEVICE RAJA_INLINE long long shfl_xor_sync(long long var, int laneMask) +{ + return ::__shfl_xor_sync(0xffffffffu, var, laneMask); +} + +template <> +RAJA_DEVICE RAJA_INLINE unsigned long long shfl_xor_sync(unsigned long long var, int laneMask) +{ + return ::__shfl_xor_sync(0xffffffffu, var, laneMask); +} + +template <> +RAJA_DEVICE RAJA_INLINE float shfl_xor_sync(float var, int laneMask) +{ + return ::__shfl_xor_sync(0xffffffffu, var, laneMask); +} + +template <> +RAJA_DEVICE RAJA_INLINE double shfl_xor_sync(double var, int laneMask) +{ + return ::__shfl_xor_sync(0xffffffffu, var, laneMask); +} + +#else + +template <> +RAJA_DEVICE RAJA_INLINE int shfl_xor_sync(int var, int laneMask) +{ + return ::__shfl_xor(var, laneMask); +} + +template <> +RAJA_DEVICE RAJA_INLINE float shfl_xor_sync(float var, int laneMask) +{ + return ::__shfl_xor(var, laneMask); +} + +#endif + + +#if defined(CUDART_VERSION) && CUDART_VERSION >= 9000 + +template <> +RAJA_DEVICE RAJA_INLINE int shfl_sync(int var, int srcLane) +{ + return ::__shfl_sync(0xffffffffu, var, srcLane); +} + +template <> +RAJA_DEVICE RAJA_INLINE unsigned int shfl_sync(unsigned int var, int srcLane) +{ + return ::__shfl_sync(0xffffffffu, var, srcLane); +} + +template <> +RAJA_DEVICE RAJA_INLINE long shfl_sync(long var, int srcLane) +{ + return ::__shfl_sync(0xffffffffu, var, srcLane); +} + +template <> +RAJA_DEVICE RAJA_INLINE unsigned long shfl_sync(unsigned long var, int srcLane) +{ + return ::__shfl_sync(0xffffffffu, var, srcLane); +} + +template <> +RAJA_DEVICE RAJA_INLINE long long shfl_sync(long long var, int srcLane) +{ + return ::__shfl_sync(0xffffffffu, var, srcLane); +} + +template <> +RAJA_DEVICE RAJA_INLINE unsigned long long shfl_sync(unsigned long long var, int srcLane) +{ + return ::__shfl_sync(0xffffffffu, var, srcLane); +} + +template <> +RAJA_DEVICE RAJA_INLINE float shfl_sync(float var, int srcLane) +{ + return ::__shfl_sync(0xffffffffu, var, srcLane); +} + +template <> +RAJA_DEVICE 
RAJA_INLINE double shfl_sync(double var, int srcLane) +{ + return ::__shfl_sync(0xffffffffu, var, srcLane); +} + +#else + +template <> +RAJA_DEVICE RAJA_INLINE int shfl_sync(int var, int srcLane) +{ + return ::__shfl(var, srcLane); +} + +template <> +RAJA_DEVICE RAJA_INLINE float shfl_sync(float var, int srcLane) +{ + return ::__shfl(var, srcLane); +} + +#endif + + +//! reduce values in block into thread 0 +template +RAJA_DEVICE RAJA_INLINE T warp_reduce(T val, T RAJA_UNUSED_ARG(identity)) +{ + int numThreads = blockDim.x * blockDim.y * blockDim.z; + + int threadId = threadIdx.x + blockDim.x * threadIdx.y + + (blockDim.x * blockDim.y) * threadIdx.z; + + T temp = val; + + if (numThreads % policy::cuda::WARP_SIZE == 0) { + + // reduce each warp + for (int i = 1; i < policy::cuda::WARP_SIZE; i *= 2) { + T rhs = shfl_xor_sync(temp, i); + Combiner{}(temp, rhs); + } + + } else { + + // reduce each warp + for (int i = 1; i < policy::cuda::WARP_SIZE; i *= 2) { + int srcLane = threadId ^ i; + T rhs = shfl_sync(temp, srcLane); + // only add from threads that exist (don't double count own value) + if (srcLane < numThreads) { + Combiner{}(temp, rhs); + } + } + } + + return temp; +} + +/*! + * Allreduce values in a warp. + * + * + * This does a butterfly pattern leaving each lane with the full reduction + * + */ +template +RAJA_DEVICE RAJA_INLINE T warp_allreduce(T val) +{ + T temp = val; + + for (int i = 1; i < policy::cuda::WARP_SIZE; i *= 2) { + T rhs = __shfl_xor_sync(0xffffffff, temp, i); + Combiner{}(temp, rhs); + } + + return temp; +} + + +//! reduce values in block into thread 0 +template +RAJA_DEVICE RAJA_INLINE T block_reduce(T val, T identity) +{ + int numThreads = blockDim.x * blockDim.y * blockDim.z; + + int threadId = threadIdx.x + blockDim.x * threadIdx.y + + (blockDim.x * blockDim.y) * threadIdx.z; + + int warpId = threadId % policy::cuda::WARP_SIZE; + int warpNum = threadId / policy::cuda::WARP_SIZE; + + T temp = val; + + if (numThreads % policy::cuda::WARP_SIZE == 0) { + + // reduce each warp + for (int i = 1; i < policy::cuda::WARP_SIZE; i *= 2) { + T rhs = shfl_xor_sync(temp, i); + Combiner{}(temp, rhs); + } + + } else { + + // reduce each warp + for (int i = 1; i < policy::cuda::WARP_SIZE; i *= 2) { + int srcLane = threadId ^ i; + T rhs = shfl_sync(temp, srcLane); + // only add from threads that exist (don't double count own value) + if (srcLane < numThreads) { + Combiner{}(temp, rhs); + } + } + } + + // reduce per warp values + if (numThreads > policy::cuda::WARP_SIZE) { + + static_assert(policy::cuda::MAX_WARPS <= policy::cuda::WARP_SIZE, + "Max Warps must be less than or equal to Warp Size for this algorithm to work"); + + // Need to separate declaration and initialization for clang-cuda + __shared__ unsigned char tmpsd[sizeof(RAJA::detail::SoAArray)]; + + // Partial placement new: Should call new(tmpsd) here but recasting memory + // to avoid calling constructor/destructor in shared memory. 
+ RAJA::detail::SoAArray* sd = + reinterpret_cast *>(tmpsd); + + // write per warp values to shared memory + if (warpId == 0) { + sd->set(warpNum, temp); + } + + __syncthreads(); + + if (warpNum == 0) { + + // read per warp values + if (warpId * policy::cuda::WARP_SIZE < numThreads) { + temp = sd->get(warpId); + } else { + temp = identity; + } + + for (int i = 1; i < policy::cuda::MAX_WARPS; i *= 2) { + T rhs = shfl_xor_sync(temp, i); + Combiner{}(temp, rhs); + } + } + + __syncthreads(); + } + + return temp; +} + +} // end namespace impl + +} // end namespace cuda + +} // end namespace RAJA + +#endif // closing endif for RAJA_ENABLE_CUDA guard + +#endif // closing endif for header file include guard diff --git a/include/RAJA/policy/cuda/kernel/CudaKernel.hpp b/include/RAJA/policy/cuda/kernel/CudaKernel.hpp index 6497a64f42..c070d618ea 100644 --- a/include/RAJA/policy/cuda/kernel/CudaKernel.hpp +++ b/include/RAJA/policy/cuda/kernel/CudaKernel.hpp @@ -87,7 +87,7 @@ namespace statement */ template struct CudaKernelExt - : public internal::Statement<::RAJA::policy::cuda::cuda_exec_explicit, EnclosedStmts...> { + : public internal::Statement<::RAJA::policy::cuda::cuda_exec_explicit, EnclosedStmts...> { }; @@ -284,7 +284,7 @@ struct CudaLaunchHelper(kernelGetter_t::get()); if (num_blocks <= 0) { @@ -294,8 +294,10 @@ struct CudaLaunchHelper( - func, shmem_size, recommended_blocks, recommended_threads); + auto data = ::RAJA::cuda::cuda_occupancy_max_blocks_threads( + func, shmem_size); + recommended_blocks = data.func_max_blocks_per_device; + recommended_threads = data.func_max_threads_per_block; } else { @@ -305,8 +307,9 @@ struct CudaLaunchHelper( - func, shmem_size, recommended_blocks); + auto data = ::RAJA::cuda::cuda_occupancy_max_blocks( + func, shmem_size); + recommended_blocks = data.func_max_blocks_per_sm * data.device_sm_per_device; } @@ -360,7 +363,7 @@ struct CudaLaunchHelper(kernelGetter_t::get()); if (num_blocks <= 0) { @@ -373,16 +376,18 @@ struct CudaLaunchHelper( - func, shmem_size, max_blocks, actual_threads); + auto data = ::RAJA::cuda::cuda_occupancy_max_blocks( + func, shmem_size, actual_threads); + max_blocks = data.func_max_blocks_per_sm * data.device_sm_per_device; } else { // // determine blocks when actual_threads == num_threads // - ::RAJA::cuda::cuda_occupancy_max_blocks( - func, shmem_size, max_blocks); + auto data = ::RAJA::cuda::cuda_occupancy_max_blocks( + func, shmem_size); + max_blocks = data.func_max_blocks_per_sm * data.device_sm_per_device; } diff --git a/include/RAJA/policy/cuda/kernel/For.hpp b/include/RAJA/policy/cuda/kernel/For.hpp index 11870f13b0..9de20c7b4b 100644 --- a/include/RAJA/policy/cuda/kernel/For.hpp +++ b/include/RAJA/policy/cuda/kernel/For.hpp @@ -108,7 +108,7 @@ template , + RAJA::policy::cuda::cuda_indexer, kernel_sync_requirement::sync, IndexMapper>, EnclosedStmts...>, Types> { @@ -123,7 +123,7 @@ struct CudaStatementExecutor< using diff_t = segment_diff_type; using DimensionCalculator = RAJA::internal::KernelDimensionCalculator< - RAJA::policy::cuda::cuda_indexer>; + RAJA::policy::cuda::cuda_indexer, kernel_sync_requirement::sync, IndexMapper>>; static inline RAJA_DEVICE @@ -180,7 +180,7 @@ template , + RAJA::policy::cuda::cuda_indexer, kernel_sync_requirement::none, IndexMapper>, EnclosedStmts...>, Types> { @@ -195,7 +195,7 @@ struct CudaStatementExecutor< using diff_t = segment_diff_type; using DimensionCalculator = RAJA::internal::KernelDimensionCalculator< - RAJA::policy::cuda::cuda_indexer>; + RAJA::policy::cuda::cuda_indexer, 
kernel_sync_requirement::none, IndexMapper>>; static inline RAJA_DEVICE @@ -246,7 +246,7 @@ struct CudaStatementExecutor< statement::For, Types> : CudaStatementExecutor, kernel_sync_requirement::none, cuda::IndexGlobal>, EnclosedStmts...>, Types> diff --git a/include/RAJA/policy/cuda/kernel/ForICount.hpp b/include/RAJA/policy/cuda/kernel/ForICount.hpp index dd7c4c4ffe..8486abaa2c 100644 --- a/include/RAJA/policy/cuda/kernel/ForICount.hpp +++ b/include/RAJA/policy/cuda/kernel/ForICount.hpp @@ -103,20 +103,20 @@ template , + RAJA::policy::cuda::cuda_indexer, kernel_sync_requirement::sync, IndexMapper>, EnclosedStmts...>, Types> : public CudaStatementExecutor< Data, statement::For, + RAJA::policy::cuda::cuda_indexer, kernel_sync_requirement::sync, IndexMapper>, EnclosedStmts...>, Types> { using Base = CudaStatementExecutor< Data, statement::For, + RAJA::policy::cuda::cuda_indexer, kernel_sync_requirement::sync, IndexMapper>, EnclosedStmts...>, Types>; @@ -166,20 +166,20 @@ template , + RAJA::policy::cuda::cuda_indexer, kernel_sync_requirement::none, IndexMapper>, EnclosedStmts...>, Types> : public CudaStatementExecutor< Data, statement::For, + RAJA::policy::cuda::cuda_indexer, kernel_sync_requirement::none, IndexMapper>, EnclosedStmts...>, Types> { using Base = CudaStatementExecutor< Data, statement::For, + RAJA::policy::cuda::cuda_indexer, kernel_sync_requirement::none, IndexMapper>, EnclosedStmts...>, Types>; @@ -226,7 +226,7 @@ struct CudaStatementExecutor< statement::ForICount, Types> : CudaStatementExecutor, kernel_sync_requirement::none, cuda::IndexGlobal>, EnclosedStmts...>, Types> diff --git a/include/RAJA/policy/cuda/kernel/Tile.hpp b/include/RAJA/policy/cuda/kernel/Tile.hpp index ad54c86a54..ad901f6b02 100644 --- a/include/RAJA/policy/cuda/kernel/Tile.hpp +++ b/include/RAJA/policy/cuda/kernel/Tile.hpp @@ -143,7 +143,7 @@ struct CudaStatementExecutor< Data, statement::Tile, - RAJA::policy::cuda::cuda_indexer, + RAJA::policy::cuda::cuda_indexer, kernel_sync_requirement::sync, IndexMapper>, EnclosedStmts...>, Types> { @@ -153,7 +153,7 @@ struct CudaStatementExecutor< using diff_t = segment_diff_type; - using DimensionCalculator = KernelDimensionCalculator>; + using DimensionCalculator = KernelDimensionCalculator, kernel_sync_requirement::sync, IndexMapper>>; static inline RAJA_DEVICE void exec(Data &data, bool thread_active) @@ -233,7 +233,7 @@ struct CudaStatementExecutor< Data, statement::Tile, - RAJA::policy::cuda::cuda_indexer, + RAJA::policy::cuda::cuda_indexer, kernel_sync_requirement::none, IndexMapper>, EnclosedStmts...>, Types> { @@ -243,7 +243,7 @@ struct CudaStatementExecutor< using diff_t = segment_diff_type; - using DimensionCalculator = KernelDimensionCalculator>; + using DimensionCalculator = KernelDimensionCalculator, kernel_sync_requirement::none, IndexMapper>>; static inline RAJA_DEVICE void exec(Data &data, bool thread_active) @@ -318,7 +318,7 @@ struct CudaStatementExecutor< Data, statement::Tile, Types> : CudaStatementExecutor, kernel_sync_requirement::none, cuda::IndexGlobal>, EnclosedStmts...>, Types> diff --git a/include/RAJA/policy/cuda/kernel/TileTCount.hpp b/include/RAJA/policy/cuda/kernel/TileTCount.hpp index 84a0bec412..c611346d46 100644 --- a/include/RAJA/policy/cuda/kernel/TileTCount.hpp +++ b/include/RAJA/policy/cuda/kernel/TileTCount.hpp @@ -131,14 +131,14 @@ struct CudaStatementExecutor< Data, statement::TileTCount, - RAJA::policy::cuda::cuda_indexer, + RAJA::policy::cuda::cuda_indexer, kernel_sync_requirement::sync, IndexMapper>, EnclosedStmts...>, 
Types> : public CudaStatementExecutor< Data, statement::Tile, - RAJA::policy::cuda::cuda_indexer, + RAJA::policy::cuda::cuda_indexer, kernel_sync_requirement::sync, IndexMapper>, EnclosedStmts...>, Types> { @@ -146,7 +146,7 @@ struct CudaStatementExecutor< Data, statement::Tile, - RAJA::policy::cuda::cuda_indexer, + RAJA::policy::cuda::cuda_indexer, kernel_sync_requirement::sync, IndexMapper>, EnclosedStmts...>, Types>; @@ -209,14 +209,14 @@ struct CudaStatementExecutor< Data, statement::TileTCount, - RAJA::policy::cuda::cuda_indexer, + RAJA::policy::cuda::cuda_indexer, kernel_sync_requirement::none, IndexMapper>, EnclosedStmts...>, Types> : public CudaStatementExecutor< Data, statement::Tile, - RAJA::policy::cuda::cuda_indexer, + RAJA::policy::cuda::cuda_indexer, kernel_sync_requirement::none, IndexMapper>, EnclosedStmts...>, Types> { @@ -224,7 +224,7 @@ struct CudaStatementExecutor< Data, statement::Tile, - RAJA::policy::cuda::cuda_indexer, + RAJA::policy::cuda::cuda_indexer, kernel_sync_requirement::none, IndexMapper>, EnclosedStmts...>, Types>; @@ -281,7 +281,7 @@ struct CudaStatementExecutor< Data, statement::TileTCount, Types> : CudaStatementExecutor, kernel_sync_requirement::none, cuda::IndexGlobal>, EnclosedStmts...>, Types> diff --git a/include/RAJA/policy/cuda/kernel/internal.hpp b/include/RAJA/policy/cuda/kernel/internal.hpp index a33b564309..9c904ea45a 100644 --- a/include/RAJA/policy/cuda/kernel/internal.hpp +++ b/include/RAJA/policy/cuda/kernel/internal.hpp @@ -388,7 +388,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, sync, cuda::IndexGlobal>> { @@ -402,7 +402,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, sync, cuda::IndexGlobal>> { @@ -418,7 +418,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, sync, cuda::IndexGlobal>> { @@ -436,7 +436,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, sync, cuda::IndexGlobal>> { @@ -451,7 +451,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, sync, cuda::IndexGlobal>> { @@ -469,7 +469,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, sync, cuda::IndexGlobal>> { @@ -488,7 +488,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, sync, cuda::IndexGlobal>> { @@ -508,7 +508,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, sync, cuda::IndexGlobal>> { @@ -527,7 +527,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, sync, cuda::IndexGlobal>> { diff --git a/include/RAJA/policy/cuda/launch.hpp b/include/RAJA/policy/cuda/launch.hpp index 26e56e5cda..602221e58a 100644 --- a/include/RAJA/policy/cuda/launch.hpp +++ b/include/RAJA/policy/cuda/launch.hpp @@ -433,7 +433,7 @@ struct LoopExecute -struct LoopExecute, kernel_sync_requirement::none, IndexMapper>, SEGMENT> { @@ -457,7 +457,7 @@ struct LoopExecute -struct LoopExecute, kernel_sync_requirement::none, IndexMapper0, IndexMapper1>, @@ -493,7 +493,7 @@ struct LoopExecute -struct LoopExecute, kernel_sync_requirement::none, IndexMapper0, IndexMapper1, @@ -625,7 +625,7 @@ struct LoopICountExecute -struct LoopICountExecute, kernel_sync_requirement::none, IndexMapper>, SEGMENT> { @@ -649,7 +649,7 @@ struct LoopICountExecute -struct LoopICountExecute, kernel_sync_requirement::none, IndexMapper0, IndexMapper1>, @@ -686,7 +686,7 @@ struct LoopICountExecute -struct LoopICountExecute, kernel_sync_requirement::none, IndexMapper0, IndexMapper1, @@ -810,18 +810,18 @@ struct LoopExecute -struct LoopExecute, 
sync, IndexMapper0>, SEGMENT> - : LoopExecute, sync, IndexMapper0>, SEGMENT> {}; template -struct LoopExecute, kernel_sync_requirement::none, IndexMapper0, IndexMapper1>, @@ -852,7 +852,7 @@ struct LoopExecute -struct LoopExecute, kernel_sync_requirement::none, IndexMapper0, IndexMapper1, @@ -914,7 +914,7 @@ struct TileExecute -struct TileExecute, kernel_sync_requirement::none, IndexMapper>, SEGMENT> { @@ -964,7 +964,7 @@ struct TileTCountExecute -struct TileTCountExecute, kernel_sync_requirement::none, IndexMapper>, SEGMENT> { diff --git a/include/RAJA/policy/cuda/policy.hpp b/include/RAJA/policy/cuda/policy.hpp index 92c1f1c701..84cd8a301c 100644 --- a/include/RAJA/policy/cuda/policy.hpp +++ b/include/RAJA/policy/cuda/policy.hpp @@ -22,6 +22,7 @@ #if defined(RAJA_CUDA_ACTIVE) +#include #include #include "RAJA/pattern/reduce.hpp" @@ -78,6 +79,110 @@ struct IndexGlobal; template struct IndexFlatten; +/*! + * Use the max occupancy of a kernel on the current device when launch + * parameters are not fully determined. + * Note that the maximum occupancy of the kernel may be less than the maximum + * occupancy of the device in terms of total threads. + */ +struct MaxOccupancyConcretizer +{ + template < typename IdxT, typename Data > + static IdxT get_max_grid_size(Data const& data) + { + IdxT device_sm_per_device = data.device_sm_per_device; + IdxT func_max_blocks_per_sm = data.func_max_blocks_per_sm; + + IdxT func_max_blocks_per_device = func_max_blocks_per_sm * device_sm_per_device; + + return func_max_blocks_per_device; + } +}; + +/*! + * Use a fraction and an offset of the max occupancy of a kernel on the current + * device when launch parameters are not fully determined. + * The following formula is used, with care to avoid zero, to determine the + * maximum grid size: + * (Fraction * kernel_max_blocks_per_sm + BLOCKS_PER_SM_OFFSET) * device_sm + */ +template < typename t_Fraction, std::ptrdiff_t BLOCKS_PER_SM_OFFSET > +struct FractionOffsetOccupancyConcretizer +{ + template < typename IdxT, typename Data > + static IdxT get_max_grid_size(Data const& data) + { + using Fraction = typename t_Fraction::template rebind; + + IdxT device_sm_per_device = data.device_sm_per_device; + IdxT func_max_blocks_per_sm = data.func_max_blocks_per_sm; + + if (Fraction::multiply(func_max_blocks_per_sm) > IdxT(0)) { + func_max_blocks_per_sm = Fraction::multiply(func_max_blocks_per_sm); + } + + if (IdxT(std::ptrdiff_t(func_max_blocks_per_sm) + BLOCKS_PER_SM_OFFSET) > IdxT(0)) { + func_max_blocks_per_sm = IdxT(std::ptrdiff_t(func_max_blocks_per_sm) + BLOCKS_PER_SM_OFFSET); + } + + IdxT func_max_blocks_per_device = func_max_blocks_per_sm * device_sm_per_device; + + return func_max_blocks_per_device; + } +}; + +/*! + * Use an occupancy that is less than the max occupancy of the device when + * launch parameters are not fully determined. + * Use the MaxOccupancyConcretizer if the maximum occupancy of the kernel is + * below the maximum occupancy of the device. + * Otherwise use the given AvoidMaxOccupancyCalculator to determine the + * maximum grid size. 
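The concretizers described above boil down to simple integer arithmetic on the occupancy-calculator results. A hedged standalone sketch follows; the fraction, offset, and device numbers are purely illustrative and do not claim to be the defaults chosen by this patch.

```cpp
#include <cstddef>

// FractionOffsetOccupancyConcretizer-style sizing:
// (Fraction * func_max_blocks_per_sm + offset) * device_sm_per_device,
// taking care never to drop to zero.
std::size_t fraction_offset_grid_size(std::size_t func_max_blocks_per_sm,
                                      std::size_t frac_num, std::size_t frac_den,
                                      std::ptrdiff_t blocks_per_sm_offset,
                                      std::size_t device_sm_per_device)
{
  std::size_t blocks_per_sm = func_max_blocks_per_sm;
  if ((blocks_per_sm * frac_num) / frac_den > 0) {
    blocks_per_sm = (blocks_per_sm * frac_num) / frac_den;
  }
  const std::ptrdiff_t shifted =
      static_cast<std::ptrdiff_t>(blocks_per_sm) + blocks_per_sm_offset;
  if (shifted > 0) {
    blocks_per_sm = static_cast<std::size_t>(shifted);
  }
  return blocks_per_sm * device_sm_per_device;
}

// AvoidDeviceMaxThreadOccupancyConcretizer-style selection: back off from max
// occupancy only when the kernel would otherwise saturate the SM thread limit.
std::size_t avoid_max_thread_occupancy_grid_size(
    std::size_t func_threads_per_block, std::size_t func_max_blocks_per_sm,
    std::size_t device_max_threads_per_sm, std::size_t device_sm_per_device)
{
  if (func_threads_per_block * func_max_blocks_per_sm < device_max_threads_per_sm) {
    return func_max_blocks_per_sm * device_sm_per_device;  // max occupancy
  }
  // Illustrative back-off: three quarters of max occupancy, minus one block/SM.
  return fraction_offset_grid_size(func_max_blocks_per_sm, 3, 4, -1,
                                   device_sm_per_device);
}
```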
+ */ +template < typename AvoidMaxOccupancyConcretizer > +struct AvoidDeviceMaxThreadOccupancyConcretizer +{ + template < typename IdxT, typename Data > + static IdxT get_max_grid_size(Data const& data) + { + IdxT device_max_threads_per_sm = data.device_max_threads_per_sm; + IdxT func_max_blocks_per_sm = data.func_max_blocks_per_sm; + IdxT func_threads_per_block = data.func_threads_per_block; + + IdxT func_max_threads_per_sm = func_threads_per_block * func_max_blocks_per_sm; + + if (func_max_threads_per_sm < device_max_threads_per_sm) { + return MaxOccupancyConcretizer::template get_max_grid_size(data); + } else { + return AvoidMaxOccupancyConcretizer::template get_max_grid_size(data); + } + } +}; + + +enum struct reduce_algorithm : int +{ + combine_last_block, + init_device_combine_atomic_block, + init_host_combine_atomic_block +}; + +enum struct block_communication_mode : int +{ + device_fence, + block_fence +}; + +template < reduce_algorithm t_algorithm, block_communication_mode t_comm_mode, + size_t t_replication, size_t t_atomic_stride > +struct ReduceTuning +{ + static constexpr reduce_algorithm algorithm = t_algorithm; + static constexpr block_communication_mode comm_mode = t_comm_mode; + static constexpr size_t replication = t_replication; + static constexpr size_t atomic_stride = t_atomic_stride; +}; + } // namespace cuda namespace policy @@ -100,7 +205,8 @@ struct cuda_flatten_indexer : public RAJA::make_policy_pattern_launch_platform_t using IterationGetter = RAJA::cuda::IndexFlatten<_IterationGetters...>; }; -template +template struct cuda_exec_explicit : public RAJA::make_policy_pattern_launch_platform_t< RAJA::Policy::cuda, RAJA::Pattern::forall, @@ -108,9 +214,11 @@ struct cuda_exec_explicit : public RAJA::make_policy_pattern_launch_platform_t< RAJA::Platform::cuda> { using IterationMapping = _IterationMapping; using IterationGetter = _IterationGetter; + using LaunchConcretizer = _LaunchConcretizer; }; -template +template struct cuda_launch_explicit_t : public RAJA::make_policy_pattern_launch_platform_t< RAJA::Policy::cuda, RAJA::Pattern::region, @@ -119,8 +227,6 @@ struct cuda_launch_explicit_t : public RAJA::make_policy_pattern_launch_platform }; - - // // NOTE: There is no Index set segment iteration policy for CUDA // @@ -156,8 +262,8 @@ struct unordered_cuda_loop_y_block_iter_x_threadblock_average /////////////////////////////////////////////////////////////////////// /// -template -struct cuda_reduce_base +template < typename tuning > +struct cuda_reduce_policy : public RAJA:: make_policy_pattern_launch_platform_t; -using cuda_reduce = cuda_reduce_base; -using cuda_reduce_atomic = cuda_reduce_base; +template < RAJA::cuda::reduce_algorithm algorithm, + RAJA::cuda::block_communication_mode comm_mode, + size_t replication = named_usage::unspecified, + size_t atomic_stride = named_usage::unspecified > +using cuda_reduce_tuning = cuda_reduce_policy< RAJA::cuda::ReduceTuning< + algorithm, comm_mode, replication, atomic_stride> >; + +// Policies for RAJA::Reduce* objects with specific behaviors. +// - *atomic* policies may use atomics to combine partial results and falls back +// on a non-atomic policy when atomics can't be used with the given type. The +// use of atomics leads to order of operation differences which change the +// results of floating point sum reductions run to run. The memory used with +// atomics is initialized on the device which can be expensive on some HW. +// On some HW this is faster overall than the non-atomic policies. 
+// - *atomic_host* policies are similar to the atomic policies above. However +// the memory used with atomics is initialized on the host which is +// significantly cheaper on some HW. On some HW this is faster overall than +// the non-atomic and atomic policies. +// - *device_fence policies use normal memory accesses with device scope fences +// in the implementation. This works on all HW. +// - *block_fence policies use special (atomic) memory accesses that only cache +// in a cache shared by the whole device to avoid having to use +// device scope fences. This improves performance on some HW but +// is more difficult to code correctly. +using cuda_reduce_device_fence = cuda_reduce_tuning< + RAJA::cuda::reduce_algorithm::combine_last_block, + RAJA::cuda::block_communication_mode::device_fence, + named_usage::unspecified, named_usage::unspecified>; +/// +using cuda_reduce_block_fence = cuda_reduce_tuning< + RAJA::cuda::reduce_algorithm::combine_last_block, + RAJA::cuda::block_communication_mode::block_fence, + named_usage::unspecified, named_usage::unspecified>; +/// +using cuda_reduce_atomic_device_init_device_fence = cuda_reduce_tuning< + RAJA::cuda::reduce_algorithm::init_device_combine_atomic_block, + RAJA::cuda::block_communication_mode::device_fence, + named_usage::unspecified, named_usage::unspecified>; +/// +using cuda_reduce_atomic_device_init_block_fence = cuda_reduce_tuning< + RAJA::cuda::reduce_algorithm::init_device_combine_atomic_block, + RAJA::cuda::block_communication_mode::block_fence, + named_usage::unspecified, named_usage::unspecified>; +/// +using cuda_reduce_atomic_host_init_device_fence = cuda_reduce_tuning< + RAJA::cuda::reduce_algorithm::init_host_combine_atomic_block, + RAJA::cuda::block_communication_mode::device_fence, + named_usage::unspecified, named_usage::unspecified>; +/// +using cuda_reduce_atomic_host_init_block_fence = cuda_reduce_tuning< + RAJA::cuda::reduce_algorithm::init_host_combine_atomic_block, + RAJA::cuda::block_communication_mode::block_fence, + named_usage::unspecified, named_usage::unspecified>; + +// Policy for RAJA::Reduce* objects that gives the same answer every time when +// used in the same way +using cuda_reduce = cuda_reduce_device_fence; + +// Policy for RAJA::Reduce* objects that may use atomics and may not give the +// same answer every time when used in the same way +using cuda_reduce_atomic = cuda_reduce_atomic_host_init_device_fence; + +// Policy for RAJA::Reduce* objects that lets you select the default atomic or +// non-atomic policy with a bool +template < bool with_atomic > +using cuda_reduce_base = std::conditional_t; // Policy for RAJA::statement::Reduce that reduces threads in a block @@ -235,6 +405,7 @@ struct cuda_thread_masked_loop {}; // Operations in the included files are parametrized using the following // values for CUDA warp size and max block size. 
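A hedged usage sketch of how the new reducer policies are meant to be selected in user code; the array `a`, its length `N`, the block size of 256, and the exact template arity of `cuda_exec_with_reduce` are assumptions for illustration rather than details taken from this patch.

```cpp
#include "RAJA/RAJA.hpp"

// Sum a device-accessible array two ways: once with the run-to-run
// reproducible policy and once with the host-initialized atomic policy.
double sum_example(const double* a, int N)
{
  // Same-answer-every-time behavior (what plain cuda_reduce aliases).
  RAJA::ReduceSum<RAJA::cuda_reduce_device_fence, double> ordered_sum(0.0);

  // Atomic combination with host-side initialization of the atomic memory;
  // floating-point additions may be reordered from run to run.
  RAJA::ReduceSum<RAJA::cuda_reduce_atomic_host_init_device_fence, double>
      atomic_sum(0.0);

  // cuda_exec_with_reduce picks the grid-sizing concretizer intended for
  // kernels that carry reducers; plain cuda_exec<256> would also work here.
  RAJA::forall<RAJA::cuda_exec_with_reduce<256>>(
      RAJA::TypedRangeSegment<int>(0, N),
      [=] RAJA_DEVICE (int i) {
        ordered_sum += a[i];
        atomic_sum += a[i];
      });

  // Both reducers hold the same mathematical result; only the floating-point
  // rounding of atomic_sum may differ between runs.
  return ordered_sum.get();
}
```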
// +constexpr const RAJA::Index_type ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE = 32; constexpr const RAJA::Index_type WARP_SIZE = 32; constexpr const RAJA::Index_type MAX_BLOCK_SIZE = 1024; constexpr const RAJA::Index_type MAX_WARPS = MAX_BLOCK_SIZE / WARP_SIZE; @@ -882,54 +1053,181 @@ using global_z = IndexGlobal; } // namespace cuda +// contretizers used in forall, scan, and sort policies + +using CudaAvoidDeviceMaxThreadOccupancyConcretizer = cuda::AvoidDeviceMaxThreadOccupancyConcretizer, -1>>; + +template < typename Fraction, std::ptrdiff_t BLOCKS_PER_SM_OFFSET > +using CudaFractionOffsetOccupancyConcretizer = cuda::FractionOffsetOccupancyConcretizer; + +using CudaMaxOccupancyConcretizer = cuda::MaxOccupancyConcretizer; + +using CudaReduceDefaultConcretizer = CudaMaxOccupancyConcretizer; + +using CudaDefaultConcretizer = CudaMaxOccupancyConcretizer; + // policies usable with forall, scan, and sort + template using cuda_exec_grid_explicit = policy::cuda::cuda_exec_explicit< - iteration_mapping::StridedLoop, cuda::global_x, BLOCKS_PER_SM, Async>; + iteration_mapping::StridedLoop, cuda::global_x, + CudaDefaultConcretizer, BLOCKS_PER_SM, Async>; template using cuda_exec_grid_explicit_async = policy::cuda::cuda_exec_explicit< - iteration_mapping::StridedLoop, cuda::global_x, BLOCKS_PER_SM, true>; + iteration_mapping::StridedLoop, cuda::global_x, + CudaDefaultConcretizer, BLOCKS_PER_SM, true>; template using cuda_exec_grid = policy::cuda::cuda_exec_explicit< - iteration_mapping::StridedLoop, cuda::global_x, policy::cuda::MIN_BLOCKS_PER_SM, Async>; + iteration_mapping::StridedLoop, cuda::global_x, + CudaDefaultConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, Async>; template using cuda_exec_grid_async = policy::cuda::cuda_exec_explicit< - iteration_mapping::StridedLoop, cuda::global_x, policy::cuda::MIN_BLOCKS_PER_SM, true>; + iteration_mapping::StridedLoop, cuda::global_x, + CudaDefaultConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, true>; template using cuda_exec_explicit = policy::cuda::cuda_exec_explicit< - iteration_mapping::Direct, cuda::global_x, BLOCKS_PER_SM, Async>; + iteration_mapping::Direct, cuda::global_x, + CudaDefaultConcretizer, BLOCKS_PER_SM, Async>; template using cuda_exec_explicit_async = policy::cuda::cuda_exec_explicit< - iteration_mapping::Direct, cuda::global_x, BLOCKS_PER_SM, true>; + iteration_mapping::Direct, cuda::global_x, + CudaDefaultConcretizer, BLOCKS_PER_SM, true>; template using cuda_exec = policy::cuda::cuda_exec_explicit< - iteration_mapping::Direct, cuda::global_x, policy::cuda::MIN_BLOCKS_PER_SM, Async>; + iteration_mapping::Direct, cuda::global_x, + CudaDefaultConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, Async>; template using cuda_exec_async = policy::cuda::cuda_exec_explicit< - iteration_mapping::Direct, cuda::global_x, policy::cuda::MIN_BLOCKS_PER_SM, true>; + iteration_mapping::Direct, cuda::global_x, + CudaDefaultConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, true>; template using cuda_exec_occ_calc_explicit = policy::cuda::cuda_exec_explicit< - iteration_mapping::StridedLoop, cuda::global_x, BLOCKS_PER_SM, Async>; + iteration_mapping::StridedLoop, cuda::global_x, + CudaDefaultConcretizer, BLOCKS_PER_SM, Async>; template using cuda_exec_occ_calc_explicit_async = policy::cuda::cuda_exec_explicit< - iteration_mapping::StridedLoop, cuda::global_x, BLOCKS_PER_SM, true>; + iteration_mapping::StridedLoop, cuda::global_x, + CudaDefaultConcretizer, BLOCKS_PER_SM, true>; template using cuda_exec_occ_calc = policy::cuda::cuda_exec_explicit< - 
iteration_mapping::StridedLoop, cuda::global_x, policy::cuda::MIN_BLOCKS_PER_SM, Async>; + iteration_mapping::StridedLoop, cuda::global_x, + CudaDefaultConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, Async>; template using cuda_exec_occ_calc_async = policy::cuda::cuda_exec_explicit< - iteration_mapping::StridedLoop, cuda::global_x, policy::cuda::MIN_BLOCKS_PER_SM, true>; + iteration_mapping::StridedLoop, cuda::global_x, + CudaDefaultConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, true>; + +template +using cuda_exec_occ_max_explicit = policy::cuda::cuda_exec_explicit< + iteration_mapping::StridedLoop, cuda::global_x, + CudaMaxOccupancyConcretizer, BLOCKS_PER_SM, Async>; + +template +using cuda_exec_occ_max_explicit_async = policy::cuda::cuda_exec_explicit< + iteration_mapping::StridedLoop, cuda::global_x, + CudaMaxOccupancyConcretizer, BLOCKS_PER_SM, true>; + +template +using cuda_exec_occ_max = policy::cuda::cuda_exec_explicit< + iteration_mapping::StridedLoop, cuda::global_x, + CudaMaxOccupancyConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, Async>; + +template +using cuda_exec_occ_max_async = policy::cuda::cuda_exec_explicit< + iteration_mapping::StridedLoop, cuda::global_x, + CudaMaxOccupancyConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, true>; + +template +using cuda_exec_occ_fraction_explicit = policy::cuda::cuda_exec_explicit< + iteration_mapping::StridedLoop, cuda::global_x, + CudaFractionOffsetOccupancyConcretizer, BLOCKS_PER_SM, Async>; + +template +using cuda_exec_occ_fraction_explicit_async = policy::cuda::cuda_exec_explicit< + iteration_mapping::StridedLoop, cuda::global_x, + CudaFractionOffsetOccupancyConcretizer, BLOCKS_PER_SM, true>; + +template +using cuda_exec_occ_fraction = policy::cuda::cuda_exec_explicit< + iteration_mapping::StridedLoop, cuda::global_x, + CudaFractionOffsetOccupancyConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, Async>; + +template +using cuda_exec_occ_fraction_async = policy::cuda::cuda_exec_explicit< + iteration_mapping::StridedLoop, cuda::global_x, + CudaFractionOffsetOccupancyConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, true>; + +template +using cuda_exec_occ_custom_explicit = policy::cuda::cuda_exec_explicit< + iteration_mapping::StridedLoop, cuda::global_x, + Concretizer, BLOCKS_PER_SM, Async>; + +template +using cuda_exec_occ_custom_explicit_async = policy::cuda::cuda_exec_explicit< + iteration_mapping::StridedLoop, cuda::global_x, + Concretizer, BLOCKS_PER_SM, true>; + +template +using cuda_exec_occ_custom = policy::cuda::cuda_exec_explicit< + iteration_mapping::StridedLoop, cuda::global_x, + Concretizer, policy::cuda::MIN_BLOCKS_PER_SM, Async>; + +template +using cuda_exec_occ_custom_async = policy::cuda::cuda_exec_explicit< + iteration_mapping::StridedLoop, cuda::global_x, + Concretizer, policy::cuda::MIN_BLOCKS_PER_SM, true>; + +template +using cuda_exec_with_reduce_explicit = policy::cuda::cuda_exec_explicit< + iteration_mapping::StridedLoop, cuda::global_x, + CudaReduceDefaultConcretizer, BLOCKS_PER_SM, Async>; + +template +using cuda_exec_with_reduce_explicit_async = policy::cuda::cuda_exec_explicit< + iteration_mapping::StridedLoop, cuda::global_x, + CudaReduceDefaultConcretizer, BLOCKS_PER_SM, true>; + +template +using cuda_exec_with_reduce = policy::cuda::cuda_exec_explicit< + iteration_mapping::StridedLoop, cuda::global_x, + CudaReduceDefaultConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, Async>; + +template +using cuda_exec_with_reduce_async = policy::cuda::cuda_exec_explicit< + iteration_mapping::StridedLoop, cuda::global_x, + 
CudaReduceDefaultConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, true>; + +template +using cuda_exec_base_explicit = std::conditional_t, + cuda_exec_explicit>; + +template +using cuda_exec_base_explicit_async = std::conditional_t, + cuda_exec_explicit_async>; + +template +using cuda_exec_base = std::conditional_t, + cuda_exec>; + +template +using cuda_exec_base_async = std::conditional_t, + cuda_exec_async>; + // policies usable with WorkGroup template @@ -951,6 +1249,12 @@ using policy::cuda::cuda_atomic; using policy::cuda::cuda_atomic_explicit; // policies usable with reducers +using policy::cuda::cuda_reduce_device_fence; +using policy::cuda::cuda_reduce_block_fence; +using policy::cuda::cuda_reduce_atomic_device_init_device_fence; +using policy::cuda::cuda_reduce_atomic_device_init_block_fence; +using policy::cuda::cuda_reduce_atomic_host_init_device_fence; +using policy::cuda::cuda_reduce_atomic_host_init_block_fence; using policy::cuda::cuda_reduce_base; using policy::cuda::cuda_reduce; using policy::cuda::cuda_reduce_atomic; @@ -964,7 +1268,7 @@ using cuda_warp_direct = RAJA::policy::cuda::cuda_indexer< kernel_sync_requirement::none, cuda::thread_x>; using cuda_warp_loop = RAJA::policy::cuda::cuda_indexer< - iteration_mapping::StridedLoop, + iteration_mapping::StridedLoop, kernel_sync_requirement::none, cuda::thread_x>; @@ -996,13 +1300,13 @@ using cuda_indexer_direct = policy::cuda::cuda_indexer< template < typename ... indexers > using cuda_indexer_loop = policy::cuda::cuda_indexer< - iteration_mapping::StridedLoop, + iteration_mapping::StridedLoop, kernel_sync_requirement::none, indexers...>; template < typename ... indexers > using cuda_indexer_syncable_loop = policy::cuda::cuda_indexer< - iteration_mapping::StridedLoop, + iteration_mapping::StridedLoop, kernel_sync_requirement::sync, indexers...>; @@ -1014,7 +1318,7 @@ using cuda_flatten_indexer_direct = policy::cuda::cuda_flatten_indexer< template < typename ... indexers > using cuda_flatten_indexer_loop = policy::cuda::cuda_flatten_indexer< - iteration_mapping::StridedLoop, + iteration_mapping::StridedLoop, kernel_sync_requirement::none, indexers...>; diff --git a/include/RAJA/policy/cuda/reduce.hpp b/include/RAJA/policy/cuda/reduce.hpp index 115f652e11..516b02383c 100644 --- a/include/RAJA/policy/cuda/reduce.hpp +++ b/include/RAJA/policy/cuda/reduce.hpp @@ -25,6 +25,8 @@ #if defined(RAJA_ENABLE_CUDA) +#include + #include #include "RAJA/util/macros.hpp" @@ -33,11 +35,13 @@ #include "RAJA/util/basic_mempool.hpp" #include "RAJA/util/mutex.hpp" #include "RAJA/util/types.hpp" +#include "RAJA/util/reduce.hpp" #include "RAJA/pattern/detail/reduce.hpp" #include "RAJA/pattern/reduce.hpp" #include "RAJA/policy/cuda/MemUtils_CUDA.hpp" +#include "RAJA/policy/cuda/intrinsics.hpp" #if defined(RAJA_ENABLE_DESUL_ATOMICS) #include "RAJA/policy/desul/atomic.hpp" @@ -56,6 +60,7 @@ namespace reduce namespace cuda { + //! 
atomic operator version of Combiner object template struct atomic; @@ -64,7 +69,7 @@ template struct atomic> { RAJA_DEVICE RAJA_INLINE void operator()(T& val, const T v) { - RAJA::atomicAdd(RAJA::cuda_atomic{}, &val, v); + RAJA::atomicAdd(RAJA::cuda_atomic{}, &val, v); } }; @@ -72,7 +77,7 @@ template struct atomic> { RAJA_DEVICE RAJA_INLINE void operator()(T& val, const T v) { - RAJA::atomicMin(RAJA::cuda_atomic{}, &val, v); + RAJA::atomicMin(RAJA::cuda_atomic{}, &val, v); } }; @@ -80,7 +85,23 @@ template struct atomic> { RAJA_DEVICE RAJA_INLINE void operator()(T& val, const T v) { - RAJA::atomicMax(RAJA::cuda_atomic{}, &val, v); + RAJA::atomicMax(RAJA::cuda_atomic{}, &val, v); + } +}; + +template +struct atomic> { + RAJA_DEVICE RAJA_INLINE void operator()(T& val, const T v) + { + RAJA::atomicAnd(RAJA::cuda_atomic{}, &val, v); + } +}; + +template +struct atomic> { + RAJA_DEVICE RAJA_INLINE void operator()(T& val, const T v) + { + RAJA::atomicOr(RAJA::cuda_atomic{}, &val, v); } }; @@ -101,418 +122,69 @@ namespace cuda namespace impl { -/*! - * \brief Abstracts T into an equal or greater size array of integers whose - * size is between min_integer_type_size and max_interger_type_size inclusive. - */ -template -union AsIntegerArray { - - static_assert(min_integer_type_size <= max_integer_type_size, - "incompatible min and max integer type size"); - using integer_type = typename std::conditional< - ((alignof(T) >= alignof(long long) && - sizeof(long long) <= max_integer_type_size) || - sizeof(long) < min_integer_type_size), - long long, - typename std::conditional< - ((alignof(T) >= alignof(long) && - sizeof(long) <= max_integer_type_size) || - sizeof(int) < min_integer_type_size), - long, - typename std::conditional< - ((alignof(T) >= alignof(int) && - sizeof(int) <= max_integer_type_size) || - sizeof(short) < min_integer_type_size), - int, - typename std::conditional< - ((alignof(T) >= alignof(short) && - sizeof(short) <= max_integer_type_size) || - sizeof(char) < min_integer_type_size), - short, - typename std::conditional< - ((alignof(T) >= alignof(char) && - sizeof(char) <= max_integer_type_size)), - char, - void>::type>::type>::type>::type>::type; - static_assert(!std::is_same::value, - "could not find a compatible integer type"); - static_assert(sizeof(integer_type) >= min_integer_type_size, - "integer_type smaller than min integer type size"); - static_assert(sizeof(integer_type) <= max_integer_type_size, - "integer_type greater than max integer type size"); - - static constexpr size_t num_integer_type = - (sizeof(T) + sizeof(integer_type) - 1) / sizeof(integer_type); - - T value; - integer_type array[num_integer_type]; - - RAJA_HOST_DEVICE constexpr AsIntegerArray(T value_) : value(value_){}; - - RAJA_HOST_DEVICE constexpr size_t array_size() const - { - return num_integer_type; - } -}; - -// cuda 8 only has shfl primitives for 32 bits while cuda 9 has 32 and 64 bits -constexpr const size_t min_shfl_int_type_size = sizeof(int); -#if defined(CUDART_VERSION) && CUDART_VERSION >= 9000 -constexpr const size_t max_shfl_int_type_size = sizeof(long long); -#else -constexpr const size_t max_shfl_int_type_size = sizeof(int); -#endif - -/*! - ****************************************************************************** - * - * \brief Method to shuffle 32b registers in sum reduction for arbitrary type. - * - * \Note Returns an undefined value if src lane is inactive (divergence). - * Returns this lane's value if src lane is out of bounds or has exited. 
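(Editorial aside, not part of the patch.) The shuffle helpers documented here implement a butterfly exchange: in each round every lane swaps values with the lane whose index differs in one bit, so after log2(warp size) rounds every lane holds the full combination. A minimal sketch of that pattern, assuming CUDA 9 or newer (__shfl_xor_sync with a full-warp mask), all 32 lanes active, and a type T that the shuffle intrinsics accept directly; warp_allreduce_sketch is a hypothetical name, not a RAJA API:

    // Hedged sketch: butterfly all-reduce across one warp; every lane ends up
    // holding the combination of all 32 lane values.
    template <typename Combiner, typename T>
    __device__ T warp_allreduce_sketch(T val)
    {
      for (int mask = 1; mask < 32; mask *= 2) {
        // exchange with the lane whose id differs in bit 'mask'
        T rhs = __shfl_xor_sync(0xffffffffu, val, mask);
        Combiner{}(val, rhs);  // same Combiner{}(lhs, rhs) convention used in this file
      }
      return val;
    }

The guarded variants in the surrounding code exist for blocks whose size is not a multiple of the warp size: they only combine values from lanes that map to real threads, so partial warps do not double-count.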
- * - ****************************************************************************** - */ -template -RAJA_DEVICE RAJA_INLINE T shfl_xor_sync(T var, int laneMask) -{ - AsIntegerArray u(var); - - for (size_t i = 0; i < u.array_size(); ++i) { -#if defined(CUDART_VERSION) && CUDART_VERSION >= 9000 - u.array[i] = ::__shfl_xor_sync(0xffffffffu, u.array[i], laneMask); -#else - u.array[i] = ::__shfl_xor(u.array[i], laneMask); -#endif - } - return u.value; -} - -template -RAJA_DEVICE RAJA_INLINE T shfl_sync(T var, int srcLane) -{ - AsIntegerArray u(var); - - for (size_t i = 0; i < u.array_size(); ++i) { -#if defined(CUDART_VERSION) && CUDART_VERSION >= 9000 - u.array[i] = ::__shfl_sync(0xffffffffu, u.array[i], srcLane); -#else - u.array[i] = ::__shfl(u.array[i], srcLane); -#endif - } - return u.value; -} - -#if defined(CUDART_VERSION) && CUDART_VERSION >= 9000 - -template <> -RAJA_DEVICE RAJA_INLINE int shfl_xor_sync(int var, int laneMask) -{ - return ::__shfl_xor_sync(0xffffffffu, var, laneMask); -} - -template <> -RAJA_DEVICE RAJA_INLINE unsigned int shfl_xor_sync(unsigned int var, int laneMask) -{ - return ::__shfl_xor_sync(0xffffffffu, var, laneMask); -} - -template <> -RAJA_DEVICE RAJA_INLINE long shfl_xor_sync(long var, int laneMask) -{ - return ::__shfl_xor_sync(0xffffffffu, var, laneMask); -} - -template <> -RAJA_DEVICE RAJA_INLINE unsigned long shfl_xor_sync(unsigned long var, int laneMask) -{ - return ::__shfl_xor_sync(0xffffffffu, var, laneMask); -} - -template <> -RAJA_DEVICE RAJA_INLINE long long shfl_xor_sync(long long var, int laneMask) -{ - return ::__shfl_xor_sync(0xffffffffu, var, laneMask); -} - -template <> -RAJA_DEVICE RAJA_INLINE unsigned long long shfl_xor_sync(unsigned long long var, int laneMask) -{ - return ::__shfl_xor_sync(0xffffffffu, var, laneMask); -} - -template <> -RAJA_DEVICE RAJA_INLINE float shfl_xor_sync(float var, int laneMask) -{ - return ::__shfl_xor_sync(0xffffffffu, var, laneMask); -} - -template <> -RAJA_DEVICE RAJA_INLINE double shfl_xor_sync(double var, int laneMask) -{ - return ::__shfl_xor_sync(0xffffffffu, var, laneMask); -} - -#else - -template <> -RAJA_DEVICE RAJA_INLINE int shfl_xor_sync(int var, int laneMask) -{ - return ::__shfl_xor(var, laneMask); -} - -template <> -RAJA_DEVICE RAJA_INLINE float shfl_xor_sync(float var, int laneMask) -{ - return ::__shfl_xor(var, laneMask); -} - -#endif - - -#if defined(CUDART_VERSION) && CUDART_VERSION >= 9000 - -template <> -RAJA_DEVICE RAJA_INLINE int shfl_sync(int var, int srcLane) -{ - return ::__shfl_sync(0xffffffffu, var, srcLane); -} - -template <> -RAJA_DEVICE RAJA_INLINE unsigned int shfl_sync(unsigned int var, int srcLane) -{ - return ::__shfl_sync(0xffffffffu, var, srcLane); -} - -template <> -RAJA_DEVICE RAJA_INLINE long shfl_sync(long var, int srcLane) -{ - return ::__shfl_sync(0xffffffffu, var, srcLane); -} - -template <> -RAJA_DEVICE RAJA_INLINE unsigned long shfl_sync(unsigned long var, int srcLane) -{ - return ::__shfl_sync(0xffffffffu, var, srcLane); -} - -template <> -RAJA_DEVICE RAJA_INLINE long long shfl_sync(long long var, int srcLane) -{ - return ::__shfl_sync(0xffffffffu, var, srcLane); -} - -template <> -RAJA_DEVICE RAJA_INLINE unsigned long long shfl_sync(unsigned long long var, int srcLane) -{ - return ::__shfl_sync(0xffffffffu, var, srcLane); -} - -template <> -RAJA_DEVICE RAJA_INLINE float shfl_sync(float var, int srcLane) -{ - return ::__shfl_sync(0xffffffffu, var, srcLane); -} - -template <> -RAJA_DEVICE RAJA_INLINE double shfl_sync(double var, int srcLane) -{ - return 
::__shfl_sync(0xffffffffu, var, srcLane); -} - -#else - -template <> -RAJA_DEVICE RAJA_INLINE int shfl_sync(int var, int srcLane) +//! reduce values in grid into thread 0 of last running block +// returns true if put reduced value in val +template +RAJA_DEVICE RAJA_INLINE int grid_reduce_last_block(T& val, + T identity, + TempIterator in_device_mem, + unsigned int* device_count) { - return ::__shfl(var, srcLane); -} - -template <> -RAJA_DEVICE RAJA_INLINE float shfl_sync(float var, int srcLane) -{ - return ::__shfl(var, srcLane); -} - -#endif - -//! reduce values in block into thread 0 -template -RAJA_DEVICE RAJA_INLINE T warp_reduce(T val, T RAJA_UNUSED_ARG(identity)) -{ - int numThreads = blockDim.x * blockDim.y * blockDim.z; + typename TempIterator::template rebind_accessor device_mem(in_device_mem); int threadId = threadIdx.x + blockDim.x * threadIdx.y + (blockDim.x * blockDim.y) * threadIdx.z; - - T temp = val; - - if (numThreads % policy::cuda::WARP_SIZE == 0) { - - // reduce each warp - for (int i = 1; i < policy::cuda::WARP_SIZE; i *= 2) { - T rhs = shfl_xor_sync(temp, i); - Combiner{}(temp, rhs); - } - - } else { - - // reduce each warp - for (int i = 1; i < policy::cuda::WARP_SIZE; i *= 2) { - int srcLane = threadId ^ i; - T rhs = shfl_sync(temp, srcLane); - // only add from threads that exist (don't double count own value) - if (srcLane < numThreads) { - Combiner{}(temp, rhs); - } - } - } - - return temp; -} - -/*! - * Allreduce values in a warp. - * - * - * This does a butterfly pattern leaving each lane with the full reduction - * - */ -template -RAJA_DEVICE RAJA_INLINE T warp_allreduce(T val) -{ - T temp = val; - - for (int i = 1; i < policy::cuda::WARP_SIZE; i *= 2) { - T rhs = __shfl_xor_sync(0xffffffff, temp, i); - Combiner{}(temp, rhs); - } - - return temp; -} - - -//! reduce values in block into thread 0 -template -RAJA_DEVICE RAJA_INLINE T block_reduce(T val, T identity) -{ int numThreads = blockDim.x * blockDim.y * blockDim.z; - int threadId = threadIdx.x + blockDim.x * threadIdx.y + - (blockDim.x * blockDim.y) * threadIdx.z; - - int warpId = threadId % policy::cuda::WARP_SIZE; - int warpNum = threadId / policy::cuda::WARP_SIZE; - - T temp = val; - - if (numThreads % policy::cuda::WARP_SIZE == 0) { - - // reduce each warp - for (int i = 1; i < policy::cuda::WARP_SIZE; i *= 2) { - T rhs = shfl_xor_sync(temp, i); - Combiner{}(temp, rhs); - } - - } else { - - // reduce each warp - for (int i = 1; i < policy::cuda::WARP_SIZE; i *= 2) { - int srcLane = threadId ^ i; - T rhs = shfl_sync(temp, srcLane); - // only add from threads that exist (don't double count own value) - if (srcLane < numThreads) { - Combiner{}(temp, rhs); - } - } - } - - // reduce per warp values - if (numThreads > policy::cuda::WARP_SIZE) { - - static_assert(policy::cuda::MAX_WARPS <= policy::cuda::WARP_SIZE, - "Max Warps must be less than or equal to Warp Size for this algorithm to work"); - - // Need to separate declaration and initialization for clang-cuda - __shared__ unsigned char tmpsd[sizeof(RAJA::detail::SoAArray)]; - - // Partial placement new: Should call new(tmpsd) here but recasting memory - // to avoid calling constructor/destructor in shared memory. 
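(Editorial aside, not part of the patch.) The block-level reduction around this point follows a common two-stage pattern: reduce within each warp using shuffles, publish one partial result per warp to shared memory, then let the first warp combine those partials so thread 0 holds the block-wide value. The real code avoids constructing T objects in shared memory (hence the placement-new note above); the simplified sketch below assumes a trivially constructible, shuffle-compatible T, blockDim.x a multiple of 32, and at most 32 warps per block; block_reduce_sketch is a hypothetical name, not a RAJA API:

    // Hedged sketch of a two-stage block reduction.
    template <typename Combiner, typename T>
    __device__ T block_reduce_sketch(T val, T identity)
    {
      __shared__ T warp_vals[32];          // one slot per warp (<= 32 warps assumed)

      const int tid     = threadIdx.x;
      const int lane    = tid % 32;
      const int warp_id = tid / 32;
      const int nwarps  = blockDim.x / 32;

      // stage 1: butterfly reduce within each warp
      for (int mask = 1; mask < 32; mask *= 2) {
        T rhs = __shfl_xor_sync(0xffffffffu, val, mask);
        Combiner{}(val, rhs);
      }

      // stage 2: lane 0 of each warp publishes its partial result
      if (lane == 0) { warp_vals[warp_id] = val; }
      __syncthreads();

      // stage 3: the first warp combines the per-warp partials;
      // thread 0 ends up with the block-wide result
      if (warp_id == 0) {
        val = (lane < nwarps) ? warp_vals[lane] : identity;
        for (int mask = 1; mask < 32; mask *= 2) {
          T rhs = __shfl_xor_sync(0xffffffffu, val, mask);
          Combiner{}(val, rhs);
        }
      }
      return val;
    }

The grid-level routines introduced in this hunk build on the same block reduction and then combine per-block results either in the last block to finish or with atomics, replicated across several memory slots to reduce contention.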
- RAJA::detail::SoAArray* sd = - reinterpret_cast *>(tmpsd); + int blockId = blockIdx.x + gridDim.x * blockIdx.y + + (gridDim.x * gridDim.y) * blockIdx.z; + int numBlocks = gridDim.x * gridDim.y * gridDim.z; - // write per warp values to shared memory - if (warpId == 0) { - sd->set(warpNum, temp); - } + int replicationId = blockId % replication; + int slotId = blockId / replication; - __syncthreads(); + int maxNumSlots = (numBlocks + replication - 1) / replication; + unsigned int numSlots = (numBlocks / replication) + + ((replicationId < (numBlocks % replication)) ? 1 : 0); - if (warpNum == 0) { + int atomicOffset = replicationId * atomic_stride; + int beginSlots = replicationId * maxNumSlots; + int blockSlot = beginSlots + slotId; - // read per warp values - if (warpId * policy::cuda::WARP_SIZE < numThreads) { - temp = sd->get(warpId); - } else { - temp = identity; - } + T temp = block_reduce(val, identity); - for (int i = 1; i < policy::cuda::MAX_WARPS; i *= 2) { - T rhs = shfl_xor_sync(temp, i); - Combiner{}(temp, rhs); - } + if (numSlots <= 1u) { + if (threadId == 0) { + val = temp; } - - __syncthreads(); + return (threadId == 0) ? replicationId : replication; } - return temp; -} - - -//! reduce values in grid into thread 0 of last running block -// returns true if put reduced value in val -template -RAJA_DEVICE RAJA_INLINE bool grid_reduce(T& val, - T identity, - TempIterator device_mem, - unsigned int* device_count) -{ - int numBlocks = gridDim.x * gridDim.y * gridDim.z; - int numThreads = blockDim.x * blockDim.y * blockDim.z; - unsigned int wrap_around = numBlocks - 1; - - int blockId = blockIdx.x + gridDim.x * blockIdx.y + - (gridDim.x * gridDim.y) * blockIdx.z; - - int threadId = threadIdx.x + blockDim.x * threadIdx.y + - (blockDim.x * blockDim.y) * threadIdx.z; - - T temp = block_reduce(val, identity); - // one thread per block writes to device_mem - bool lastBlock = false; + bool isLastBlock = false; if (threadId == 0) { - device_mem.set(blockId, temp); + device_mem.set(blockSlot, temp); // ensure write visible to all threadblocks - __threadfence(); - // increment counter, (wraps back to zero if old count == wrap_around) - unsigned int old_count = ::atomicInc(device_count, wrap_around); - lastBlock = (old_count == wrap_around); + Accessor::fence_release(); + // increment counter, (wraps back to zero if old count == (numSlots-1)) + unsigned int old_count = ::atomicInc(&device_count[atomicOffset], (numSlots-1)); + isLastBlock = (old_count == (numSlots-1)); } // returns non-zero value if any thread passes in a non-zero value - lastBlock = __syncthreads_or(lastBlock); + isLastBlock = __syncthreads_or(isLastBlock); // last block accumulates values from device_mem - if (lastBlock) { + if (isLastBlock) { temp = identity; + Accessor::fence_acquire(); - for (int i = threadId; i < numBlocks; i += numThreads) { - Combiner{}(temp, device_mem.get(i)); + for (unsigned int i = threadId; + i < numSlots; + i += numThreads) { + Combiner{}(temp, device_mem.get(beginSlots + i)); } temp = block_reduce(temp, identity); @@ -523,7 +195,7 @@ RAJA_DEVICE RAJA_INLINE bool grid_reduce(T& val, } } - return lastBlock && threadId == 0; + return (isLastBlock && threadId == 0) ? 
replicationId : replication; } namespace expt { @@ -634,6 +306,7 @@ RAJA_DEVICE RAJA_INLINE void grid_reduce(RAJA::expt::detail::Reducer& red // last block accumulates values from device_mem if (lastBlock) { temp = OP::identity(); + __threadfence(); for (int i = threadId; i < numBlocks; i += numThreads) { temp = OP{}(temp, red.device_mem.get(i)); @@ -653,64 +326,104 @@ RAJA_DEVICE RAJA_INLINE void grid_reduce(RAJA::expt::detail::Reducer& red //! reduce values in grid into thread 0 of last running block // returns true if put reduced value in val -template -RAJA_DEVICE RAJA_INLINE bool grid_reduce_atomic(T& val, - T identity, - T* device_mem, - unsigned int* device_count) +template +RAJA_DEVICE RAJA_INLINE int grid_reduce_atomic_device_init(T& val, + T identity, + T* device_mem, + unsigned int* device_count) { - int numBlocks = gridDim.x * gridDim.y * gridDim.z; - unsigned int wrap_around = numBlocks + 1; - int threadId = threadIdx.x + blockDim.x * threadIdx.y + (blockDim.x * blockDim.y) * threadIdx.z; - // one thread in first block initializes device_mem + int blockId = blockIdx.x + gridDim.x * blockIdx.y + + (gridDim.x * gridDim.y) * blockIdx.z; + int numBlocks = gridDim.x * gridDim.y * gridDim.z; + + int replicationId = (blockId%replication); + int atomicOffset = replicationId*atomic_stride; + + unsigned int numSlots = (numBlocks / replication) + + ((replicationId < (numBlocks % replication)) ? 1 : 0); + + if (numSlots <= 1u) { + T temp = block_reduce(val, identity); + if (threadId == 0) { + val = temp; + } + return (threadId == 0) ? replicationId : replication; + } + + // the first block of each replication initializes device_mem if (threadId == 0) { - unsigned int old_val = ::atomicCAS(device_count, 0u, 1u); + unsigned int old_val = ::atomicCAS(&device_count[atomicOffset], 0u, 1u); if (old_val == 0u) { - device_mem[0] = identity; - __threadfence(); - ::atomicAdd(device_count, 1u); + Accessor::set(device_mem, atomicOffset, identity); + Accessor::fence_release(); + ::atomicAdd(&device_count[atomicOffset], 1u); } } T temp = block_reduce(val, identity); - // one thread per block performs atomic on device_mem - bool lastBlock = false; + // one thread per block performs an atomic on device_mem + bool isLastBlock = false; if (threadId == 0) { - // thread waits for device_mem to be initialized - while (static_cast(device_count)[0] < 2u) + // wait for device_mem to be initialized + while (::atomicAdd(&device_count[atomicOffset], 0u) < 2u) ; - __threadfence(); - RAJA::reduce::cuda::atomic{}(device_mem[0], temp); - __threadfence(); - // increment counter, (wraps back to zero if old count == wrap_around) - unsigned int old_count = ::atomicInc(device_count, wrap_around); - lastBlock = (old_count == wrap_around); - - // last block gets value from device_mem - if (lastBlock) { - val = device_mem[0]; + Accessor::fence_acquire(); + RAJA::reduce::cuda::atomic{}(device_mem[atomicOffset], temp); + Accessor::fence_release(); + // increment counter, (wraps back to zero if old count == (numSlots+1)) + unsigned int old_count = ::atomicInc(&device_count[atomicOffset], (numSlots+1)); + isLastBlock = (old_count == (numSlots+1)); + + // the last block for each replication gets the value from device_mem + if (isLastBlock) { + Accessor::fence_acquire(); + val = Accessor::get(device_mem, atomicOffset); } } - return lastBlock; + return isLastBlock ? replicationId : replication; +} + +//! 
reduce values in block into thread 0 and atomically combines into device_mem +template +RAJA_DEVICE RAJA_INLINE void grid_reduce_atomic_host_init(T& val, + T identity, + T* device_mem) +{ + int threadId = threadIdx.x + blockDim.x * threadIdx.y + + (blockDim.x * blockDim.y) * threadIdx.z; + + int blockId = blockIdx.x + gridDim.x * blockIdx.y + + (gridDim.x * gridDim.y) * blockIdx.z; + + int replicationId = (blockId%replication); + int atomicOffset = replicationId*atomic_stride; + + T temp = block_reduce(val, identity); + + // one thread per block performs an atomic on device_mem + if (threadId == 0 && temp != identity) { + RAJA::reduce::cuda::atomic{}(device_mem[atomicOffset], temp); + } } } // namespace impl //! Object that manages pinned memory buffers for reduction results // use one per reducer object -template +template class PinnedTally { public: //! Object put in Pinned memory with value and pointer to next Node struct Node { Node* next; - T value; + T values[num_slots]; }; //! Object per resource to keep track of pinned memory nodes struct ResourceNode { @@ -785,7 +498,7 @@ class PinnedTally return ret; } - T& operator*() { return m_n->value; } + auto operator*() -> T(&)[num_slots] { return m_n->values; } bool operator==(const ResourceNodeIterator& rhs) const { @@ -822,7 +535,7 @@ class PinnedTally ResourceNodeIterator end() { return {nullptr, nullptr}; } //! get new value for use in resource - T* new_value(::RAJA::resources::Cuda res) + auto new_value(::RAJA::resources::Cuda res) -> T(&)[num_slots] { #if defined(RAJA_ENABLE_OPENMP) lock_guard lock(m_mutex); @@ -839,10 +552,10 @@ class PinnedTally rn->node_list = nullptr; resource_list = rn; } - Node* n = cuda::pinned_mempool_type::getInstance().template malloc(1); + Node* n = mempool::getInstance().template malloc(1); n->next = rn->node_list; rn->node_list = n; - return &n->value; + return n->values; } //! synchronize all resources used @@ -862,7 +575,7 @@ class PinnedTally while (rn->node_list) { Node* n = rn->node_list; rn->node_list = n->next; - cuda::pinned_mempool_type::getInstance().free(n); + mempool::getInstance().free(n); } resource_list = rn->next; free(rn); @@ -889,46 +602,59 @@ class PinnedTally //! Reduction data for Cuda Offload -- stores value, host pointer, and device //! pointer -template -struct Reduce_Data { +template +struct ReduceLastBlock_Data +{ + using tally_mempool_type = pinned_mempool_type; + using data_mempool_type = device_mempool_type; + using count_mempool_type = device_zeroed_mempool_type; + + static constexpr size_t tally_slots = replication; mutable T value; T identity; unsigned int* device_count; - RAJA::detail::SoAPtr device; - bool own_device_ptr; + RAJA::detail::SoAPtr device; + bool owns_device_pointer; - Reduce_Data() : Reduce_Data(T(), T()){}; + ReduceLastBlock_Data() : ReduceLastBlock_Data(T(), T()){} /*! 
\brief create from a default value and offload information * * allocates PinnedTally to hold device values */ - Reduce_Data(T initValue, T identity_) + ReduceLastBlock_Data(T initValue, T identity_) : value{initValue}, identity{identity_}, device_count{nullptr}, device{}, - own_device_ptr{false} + owns_device_pointer{false} { } RAJA_HOST_DEVICE - Reduce_Data(const Reduce_Data& other) + ReduceLastBlock_Data(const ReduceLastBlock_Data& other) : value{other.identity}, identity{other.identity}, device_count{other.device_count}, device{other.device}, - own_device_ptr{false} + owns_device_pointer{false} { } - Reduce_Data& operator=(const Reduce_Data&) = default; + ReduceLastBlock_Data& operator=(const ReduceLastBlock_Data&) = default; //! initialize output to identity to ensure never read // uninitialized memory - void init_grid_val(T* output) { *output = identity; } + T* init_grid_vals(T(&output)[tally_slots]) + { + for (size_t r = 0; r < tally_slots; ++r) { + output[r] = identity; + } + return &output[0]; + } //! reduce values in grid to single value, store in output RAJA_DEVICE @@ -936,8 +662,11 @@ struct Reduce_Data { { T temp = value; - if (impl::grid_reduce(temp, identity, device, device_count)) { - *output = temp; + size_t replicationId = impl::grid_reduce_last_block< + Combiner, Accessor, replication, atomic_stride>( + temp, identity, device, device_count); + if (replicationId != replication) { + output[replicationId] = temp; } } @@ -949,10 +678,11 @@ struct Reduce_Data { if (act) { cuda_dim_t gridDim = currentGridDim(); size_t numBlocks = gridDim.x * gridDim.y * gridDim.z; - device.allocate(numBlocks); - device_count = device_zeroed_mempool_type::getInstance() - .template malloc(1); - own_device_ptr = true; + size_t maxNumSlots = (numBlocks + replication - 1) / replication; + device.allocate(maxNumSlots*replication); + device_count = count_mempool_type::getInstance() + .template malloc(replication*atomic_stride); + owns_device_pointer = true; } return act; } @@ -961,54 +691,147 @@ struct Reduce_Data { // free device pointers bool teardownForDevice() { - bool act = own_device_ptr; + bool act = owns_device_pointer; if (act) { device.deallocate(); - device_zeroed_mempool_type::getInstance().free(device_count); + count_mempool_type::getInstance().free(device_count); device_count = nullptr; - own_device_ptr = false; + owns_device_pointer = false; } return act; } }; +//! Reduction data for Cuda Offload -- stores value, host pointer +template +struct ReduceAtomicHostInit_Data +{ + using tally_mempool_type = device_pinned_mempool_type; + + static constexpr size_t tally_slots = replication * atomic_stride; + + mutable T value; + T identity; + bool is_setup; + bool owns_device_pointer; + + ReduceAtomicHostInit_Data() : ReduceAtomicHostInit_Data(T(), T()){}; + + ReduceAtomicHostInit_Data(T initValue, T identity_) + : value{initValue}, + identity{identity_}, + is_setup{false}, + owns_device_pointer{false} + { + } + + RAJA_HOST_DEVICE + ReduceAtomicHostInit_Data(const ReduceAtomicHostInit_Data& other) + : value{other.identity}, + identity{other.identity}, + is_setup{other.is_setup}, + owns_device_pointer{false} + { + } + + ReduceAtomicHostInit_Data& operator=(const ReduceAtomicHostInit_Data&) = default; + + //! initialize output to identity to ensure never read + // uninitialized memory + T* init_grid_vals(T(&output)[tally_slots]) + { + for (size_t r = 0; r < tally_slots; ++r) { + output[r] = identity; + } + return &output[0]; + } + + //! 
reduce values in grid to single value, store in output + RAJA_DEVICE + void grid_reduce(T* output) + { + T temp = value; + + impl::grid_reduce_atomic_host_init( + temp, identity, output); + } + + //! check and setup for device + // allocate device pointers and get a new result buffer from the pinned tally + bool setupForDevice() + { + bool act = !is_setup && setupReducers(); + if (act) { + is_setup = true; + owns_device_pointer = true; + } + return act; + } + + //! if own resources teardown device setup + // free device pointers + bool teardownForDevice() + { + bool act = owns_device_pointer; + if (act) { + is_setup = false; + owns_device_pointer = false; + } + return act; + } +}; //! Reduction data for Cuda Offload -- stores value, host pointer -template -struct ReduceAtomic_Data { +template +struct ReduceAtomicDeviceInit_Data +{ + using tally_mempool_type = pinned_mempool_type; + using data_mempool_type = device_mempool_type; + using count_mempool_type = device_zeroed_mempool_type; + + static constexpr size_t tally_slots = replication; mutable T value; T identity; unsigned int* device_count; T* device; - bool own_device_ptr; + bool owns_device_pointer; - ReduceAtomic_Data() : ReduceAtomic_Data(T(), T()){}; + ReduceAtomicDeviceInit_Data() : ReduceAtomicDeviceInit_Data(T(), T()){}; - ReduceAtomic_Data(T initValue, T identity_) + ReduceAtomicDeviceInit_Data(T initValue, T identity_) : value{initValue}, identity{identity_}, device_count{nullptr}, device{nullptr}, - own_device_ptr{false} + owns_device_pointer{false} { } RAJA_HOST_DEVICE - ReduceAtomic_Data(const ReduceAtomic_Data& other) + ReduceAtomicDeviceInit_Data(const ReduceAtomicDeviceInit_Data& other) : value{other.identity}, identity{other.identity}, device_count{other.device_count}, device{other.device}, - own_device_ptr{false} + owns_device_pointer{false} { } - ReduceAtomic_Data& operator=(const ReduceAtomic_Data&) = default; + ReduceAtomicDeviceInit_Data& operator=(const ReduceAtomicDeviceInit_Data&) = default; //! initialize output to identity to ensure never read // uninitialized memory - void init_grid_val(T* output) { *output = identity; } + T* init_grid_vals(T(&output)[tally_slots]) + { + for (size_t r = 0; r < tally_slots; ++r) { + output[r] = identity; + } + return &output[0]; + } //! 
reduce values in grid to single value, store in output RAJA_DEVICE @@ -1016,9 +839,11 @@ struct ReduceAtomic_Data { { T temp = value; - if (impl::grid_reduce_atomic( - temp, identity, device, device_count)) { - *output = temp; + size_t replicationId = impl::grid_reduce_atomic_device_init< + Combiner, Accessor, replication, atomic_stride>( + temp, identity, device, device_count); + if (replicationId != replication) { + output[replicationId] = temp; } } @@ -1028,10 +853,10 @@ struct ReduceAtomic_Data { { bool act = !device && setupReducers(); if (act) { - device = device_mempool_type::getInstance().template malloc(1); - device_count = device_zeroed_mempool_type::getInstance() - .template malloc(1); - own_device_ptr = true; + device = data_mempool_type::getInstance().template malloc(replication*atomic_stride); + device_count = count_mempool_type::getInstance() + .template malloc(replication*atomic_stride); + owns_device_pointer = true; } return act; } @@ -1040,22 +865,68 @@ struct ReduceAtomic_Data { // free device pointers bool teardownForDevice() { - bool act = own_device_ptr; + bool act = owns_device_pointer; if (act) { - device_mempool_type::getInstance().free(device); + data_mempool_type::getInstance().free(device); device = nullptr; - device_zeroed_mempool_type::getInstance().free(device_count); + count_mempool_type::getInstance().free(device_count); device_count = nullptr; - own_device_ptr = false; + owns_device_pointer = false; } return act; } }; + //! Cuda Reduction entity -- generalize on reduction, and type -template +template class Reduce { + static constexpr size_t replication = (tuning::replication > 0) + ? tuning::replication + : 1; + static constexpr size_t atomic_stride = (tuning::atomic_stride > 0) + ? tuning::atomic_stride + : ((policy::cuda::ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE > sizeof(T)) + ? RAJA_DIVIDE_CEILING_INT(policy::cuda::ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE, sizeof(T)) + : 1); + + using Accessor = std::conditional_t<(tuning::comm_mode == block_communication_mode::block_fence), + impl::AccessorDeviceScopeUseBlockFence, + std::conditional_t<(tuning::comm_mode == block_communication_mode::device_fence), + impl::AccessorDeviceScopeUseDeviceFence, + void>>; + + static constexpr bool atomic_policy = + (tuning::algorithm == reduce_algorithm::init_device_combine_atomic_block) || + (tuning::algorithm == reduce_algorithm::init_host_combine_atomic_block); + static constexpr bool atomic_available = RAJA::reduce::cuda::cuda_atomic_available::value; + + //! cuda reduction data storage class and folding algorithm + using reduce_data_type = std::conditional_t<(tuning::algorithm == reduce_algorithm::combine_last_block) || + (atomic_policy && !atomic_available), + cuda::ReduceLastBlock_Data, + std::conditional_t, + std::conditional_t<(tuning::algorithm == reduce_algorithm::init_host_combine_atomic_block), + cuda::ReduceAtomicHostInit_Data, + void>>, + void>>; + + static constexpr size_t tally_slots = reduce_data_type::tally_slots; + + using TallyType = PinnedTally; + + //! 
union to hold either pointer to PinnedTally or pointer to value + // only use list before setup for device and only use val_ptr after + union tally_u { + TallyType* list; + T* val_ptr; + constexpr tally_u(TallyType* l) : list(l){}; + constexpr tally_u(T* v_ptr) : val_ptr(v_ptr){}; + }; + public: Reduce() : Reduce(T(), Combiner::identity()) {} @@ -1063,7 +934,7 @@ class Reduce // the original object's parent is itself explicit Reduce(T init_val, T identity_ = Combiner::identity()) : parent{this}, - tally_or_val_ptr{new PinnedTally}, + tally_or_val_ptr{new TallyType}, val(init_val, identity_) { } @@ -1090,9 +961,8 @@ class Reduce #if !defined(RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE) if (parent) { if (val.setupForDevice()) { - tally_or_val_ptr.val_ptr = - tally_or_val_ptr.list->new_value(currentResource()); - val.init_grid_val(tally_or_val_ptr.val_ptr); + tally_or_val_ptr.val_ptr = val.init_grid_vals( + tally_or_val_ptr.list->new_value(currentResource())); parent = nullptr; } } @@ -1136,9 +1006,15 @@ class Reduce auto end = tally_or_val_ptr.list->end(); if (n != end) { tally_or_val_ptr.list->synchronize_resources(); + ::RAJA::detail::HighAccuracyReduce + reducer(std::move(val.value)); for (; n != end; ++n) { - Combiner{}(val.value, *n); + T(&values)[tally_slots] = *n; + for (size_t r = 0; r < tally_slots; ++r) { + reducer.combine(std::move(values[r])); + } } + val.value = reducer.get_and_clear(); tally_or_val_ptr.list->free_list(); } return val.value; @@ -1159,38 +1035,20 @@ class Reduce private: const Reduce* parent; - - //! union to hold either pointer to PinnedTally or poiter to value - // only use list before setup for device and only use val_ptr after - union tally_u { - PinnedTally* list; - T* val_ptr; - constexpr tally_u(PinnedTally* l) : list(l){}; - constexpr tally_u(T* v_ptr) : val_ptr(v_ptr){}; - }; - tally_u tally_or_val_ptr; - - //! cuda reduction data storage class and folding algorithm - using reduce_data_type = typename std::conditional< - maybe_atomic && RAJA::reduce::cuda::cuda_atomic_available::value, - cuda::ReduceAtomic_Data, - cuda::Reduce_Data>::type; - - //! storage for reduction data reduce_data_type val; }; } // end namespace cuda //! specialization of ReduceSum for cuda_reduce -template -class ReduceSum, T> - : public cuda::Reduce, T, maybe_atomic> +template +class ReduceSum, T> + : public cuda::Reduce, T, tuning> { public: - using Base = cuda::Reduce, T, maybe_atomic>; + using Base = cuda::Reduce, T, tuning>; using Base::Base; //! enable operator+= for ReduceSum -- alias for combine() RAJA_HOST_DEVICE @@ -1202,13 +1060,13 @@ class ReduceSum, T> }; //! specialization of ReduceBitOr for cuda_reduce -template -class ReduceBitOr, T> - : public cuda::Reduce, T, maybe_atomic> +template +class ReduceBitOr, T> + : public cuda::Reduce, T, tuning> { public: - using Base = cuda::Reduce, T, maybe_atomic>; + using Base = cuda::Reduce, T, tuning>; using Base::Base; //! enable operator|= for ReduceBitOr -- alias for combine() RAJA_HOST_DEVICE @@ -1220,13 +1078,13 @@ class ReduceBitOr, T> }; //! specialization of ReduceBitAnd for cuda_reduce -template -class ReduceBitAnd, T> - : public cuda::Reduce, T, maybe_atomic> +template +class ReduceBitAnd, T> + : public cuda::Reduce, T, tuning> { public: - using Base = cuda::Reduce, T, maybe_atomic>; + using Base = cuda::Reduce, T, tuning>; using Base::Base; //! enable operator&= for ReduceBitAnd -- alias for combine() RAJA_HOST_DEVICE @@ -1238,13 +1096,13 @@ class ReduceBitAnd, T> }; //! 
specialization of ReduceMin for cuda_reduce -template -class ReduceMin, T> - : public cuda::Reduce, T, maybe_atomic> +template +class ReduceMin, T> + : public cuda::Reduce, T, tuning> { public: - using Base = cuda::Reduce, T, maybe_atomic>; + using Base = cuda::Reduce, T, tuning>; using Base::Base; //! enable min() for ReduceMin -- alias for combine() RAJA_HOST_DEVICE @@ -1256,13 +1114,13 @@ class ReduceMin, T> }; //! specialization of ReduceMax for cuda_reduce -template -class ReduceMax, T> - : public cuda::Reduce, T, maybe_atomic> +template +class ReduceMax, T> + : public cuda::Reduce, T, tuning> { public: - using Base = cuda::Reduce, T, maybe_atomic>; + using Base = cuda::Reduce, T, tuning>; using Base::Base; //! enable max() for ReduceMax -- alias for combine() RAJA_HOST_DEVICE @@ -1274,18 +1132,18 @@ class ReduceMax, T> }; //! specialization of ReduceMinLoc for cuda_reduce -template -class ReduceMinLoc, T, IndexType> +template +class ReduceMinLoc, T, IndexType> : public cuda::Reduce>, RAJA::reduce::detail::ValueLoc, - maybe_atomic> + tuning> { public: using value_type = RAJA::reduce::detail::ValueLoc; using Combiner = RAJA::reduce::min; using NonLocCombiner = RAJA::reduce::min; - using Base = cuda::Reduce; + using Base = cuda::Reduce; using Base::Base; //! constructor requires a default value for the reducer @@ -1324,18 +1182,18 @@ class ReduceMinLoc, T, IndexType> }; //! specialization of ReduceMaxLoc for cuda_reduce -template -class ReduceMaxLoc, T, IndexType> +template +class ReduceMaxLoc, T, IndexType> : public cuda:: Reduce>, RAJA::reduce::detail::ValueLoc, - maybe_atomic> + tuning> { public: using value_type = RAJA::reduce::detail::ValueLoc; using Combiner = RAJA::reduce::max; using NonLocCombiner = RAJA::reduce::max; - using Base = cuda::Reduce; + using Base = cuda::Reduce; using Base::Base; //! constructor requires a default value for the reducer diff --git a/include/RAJA/policy/cuda/scan.hpp b/include/RAJA/policy/cuda/scan.hpp index 5d89844e3c..0a9b0bf305 100644 --- a/include/RAJA/policy/cuda/scan.hpp +++ b/include/RAJA/policy/cuda/scan.hpp @@ -44,6 +44,7 @@ namespace scan */ template inclusive_inplace( resources::Cuda cuda_res, - ::RAJA::policy::cuda::cuda_exec_explicit, + ::RAJA::policy::cuda::cuda_exec_explicit, InputIter begin, InputIter end, Function binary_op) @@ -96,6 +97,7 @@ inclusive_inplace( */ template exclusive_inplace( resources::Cuda cuda_res, - ::RAJA::policy::cuda::cuda_exec_explicit, + ::RAJA::policy::cuda::cuda_exec_explicit, InputIter begin, InputIter end, Function binary_op, @@ -152,6 +154,7 @@ exclusive_inplace( */ template inclusive( resources::Cuda cuda_res, - ::RAJA::policy::cuda::cuda_exec_explicit, + ::RAJA::policy::cuda::cuda_exec_explicit, InputIter begin, InputIter end, OutputIter out, @@ -206,6 +209,7 @@ inclusive( */ template exclusive( resources::Cuda cuda_res, - ::RAJA::policy::cuda::cuda_exec_explicit, + ::RAJA::policy::cuda::cuda_exec_explicit, InputIter begin, InputIter end, OutputIter out, diff --git a/include/RAJA/policy/cuda/sort.hpp b/include/RAJA/policy/cuda/sort.hpp index 6e6e4c5696..c5a353b704 100644 --- a/include/RAJA/policy/cuda/sort.hpp +++ b/include/RAJA/policy/cuda/sort.hpp @@ -44,7 +44,9 @@ namespace sort /*! 
\brief static assert unimplemented stable sort */ -template +template concepts::enable_if_t, concepts::negate>, @@ -54,7 +56,7 @@ concepts::enable_if_t, camp::is_same>>>>>> stable( resources::Cuda cuda_res, - ::RAJA::policy::cuda::cuda_exec_explicit, + ::RAJA::policy::cuda::cuda_exec_explicit, Iter, Iter, Compare) @@ -75,13 +77,15 @@ stable( /*! \brief stable sort given range in ascending order */ -template +template concepts::enable_if_t, type_traits::is_arithmetic>, std::is_pointer> stable( resources::Cuda cuda_res, - ::RAJA::policy::cuda::cuda_exec_explicit, + ::RAJA::policy::cuda::cuda_exec_explicit, Iter begin, Iter end, operators::less>) @@ -143,13 +147,15 @@ stable( /*! \brief stable sort given range in descending order */ -template +template concepts::enable_if_t, type_traits::is_arithmetic>, std::is_pointer> stable( resources::Cuda cuda_res, - ::RAJA::policy::cuda::cuda_exec_explicit, + ::RAJA::policy::cuda::cuda_exec_explicit, Iter begin, Iter end, operators::greater>) @@ -212,7 +218,9 @@ stable( /*! \brief static assert unimplemented sort */ -template +template concepts::enable_if_t, concepts::negate>, @@ -222,7 +230,7 @@ concepts::enable_if_t, camp::is_same>>>>>> unstable( resources::Cuda cuda_res, - ::RAJA::policy::cuda::cuda_exec_explicit, + ::RAJA::policy::cuda::cuda_exec_explicit, Iter, Iter, Compare) @@ -243,13 +251,15 @@ unstable( /*! \brief sort given range in ascending order */ -template +template concepts::enable_if_t, type_traits::is_arithmetic>, std::is_pointer> unstable( resources::Cuda cuda_res, - ::RAJA::policy::cuda::cuda_exec_explicit p, + ::RAJA::policy::cuda::cuda_exec_explicit p, Iter begin, Iter end, operators::less> comp) @@ -260,13 +270,15 @@ unstable( /*! \brief sort given range in descending order */ -template +template concepts::enable_if_t, type_traits::is_arithmetic>, std::is_pointer> unstable( resources::Cuda cuda_res, - ::RAJA::policy::cuda::cuda_exec_explicit p, + ::RAJA::policy::cuda::cuda_exec_explicit p, Iter begin, Iter end, operators::greater> comp) @@ -278,7 +290,8 @@ unstable( /*! \brief static assert unimplemented stable sort pairs */ -template concepts::enable_if_t, concepts::negate, camp::is_same>>>>>> stable_pairs( resources::Cuda cuda_res, - ::RAJA::policy::cuda::cuda_exec_explicit, + ::RAJA::policy::cuda::cuda_exec_explicit, KeyIter, KeyIter, ValIter, @@ -314,7 +327,8 @@ stable_pairs( /*! \brief stable sort given range of pairs in ascending order of keys */ -template concepts::enable_if_t, type_traits::is_arithmetic>, @@ -322,7 +336,7 @@ concepts::enable_if_t, std::is_pointer> stable_pairs( resources::Cuda cuda_res, - ::RAJA::policy::cuda::cuda_exec_explicit, + ::RAJA::policy::cuda::cuda_exec_explicit, KeyIter keys_begin, KeyIter keys_end, ValIter vals_begin, @@ -396,7 +410,8 @@ stable_pairs( /*! \brief stable sort given range of pairs in descending order of keys */ -template concepts::enable_if_t, type_traits::is_arithmetic>, @@ -404,7 +419,7 @@ concepts::enable_if_t, std::is_pointer> stable_pairs( resources::Cuda cuda_res, - ::RAJA::policy::cuda::cuda_exec_explicit, + ::RAJA::policy::cuda::cuda_exec_explicit, KeyIter keys_begin, KeyIter keys_end, ValIter vals_begin, @@ -479,7 +494,8 @@ stable_pairs( /*! \brief static assert unimplemented sort pairs */ -template concepts::enable_if_t, concepts::negate, camp::is_same>>>>>> unstable_pairs( resources::Cuda cuda_res, - ::RAJA::policy::cuda::cuda_exec_explicit, + ::RAJA::policy::cuda::cuda_exec_explicit, KeyIter, KeyIter, ValIter, @@ -515,7 +531,8 @@ unstable_pairs( /*! 
\brief stable sort given range of pairs in ascending order of keys */ -template concepts::enable_if_t, type_traits::is_arithmetic>, @@ -523,7 +540,7 @@ concepts::enable_if_t, std::is_pointer> unstable_pairs( resources::Cuda cuda_res, - ::RAJA::policy::cuda::cuda_exec_explicit p, + ::RAJA::policy::cuda::cuda_exec_explicit p, KeyIter keys_begin, KeyIter keys_end, ValIter vals_begin, @@ -535,7 +552,8 @@ unstable_pairs( /*! \brief stable sort given range of pairs in descending order of keys */ -template concepts::enable_if_t, type_traits::is_arithmetic>, @@ -543,7 +561,7 @@ concepts::enable_if_t, std::is_pointer> unstable_pairs( resources::Cuda cuda_res, - ::RAJA::policy::cuda::cuda_exec_explicit p, + ::RAJA::policy::cuda::cuda_exec_explicit p, KeyIter keys_begin, KeyIter keys_end, ValIter vals_begin, diff --git a/include/RAJA/policy/hip/MemUtils_HIP.hpp b/include/RAJA/policy/hip/MemUtils_HIP.hpp index e45d3a6aff..84c6d1fa38 100644 --- a/include/RAJA/policy/hip/MemUtils_HIP.hpp +++ b/include/RAJA/policy/hip/MemUtils_HIP.hpp @@ -63,7 +63,7 @@ struct PinnedAllocator { return ptr; } - // returns true on success, false on failure + // returns true on success, throws a run time error exception on failure bool free(void* ptr) { hipErrchk(hipHostFree(ptr)); @@ -82,7 +82,7 @@ struct DeviceAllocator { return ptr; } - // returns true on success, false on failure + // returns true on success, throws a run time error exception on failure bool free(void* ptr) { hipErrchk(hipFree(ptr)); @@ -105,7 +105,26 @@ struct DeviceZeroedAllocator { return ptr; } - // returns true on success, false on failure + // returns true on success, throws a run time error exception on failure + bool free(void* ptr) + { + hipErrchk(hipFree(ptr)); + return true; + } +}; + +//! Allocator for device pinned memory for use in basic_mempool +struct DevicePinnedAllocator { + + // returns a valid pointer on success, nullptr on failure + void* malloc(size_t nbytes) + { + void* ptr; + hipErrchk(hipMalloc(&ptr, nbytes)); + return ptr; + } + + // returns true on success, throws a run time error exception on failure bool free(void* ptr) { hipErrchk(hipFree(ptr)); @@ -116,6 +135,7 @@ struct DeviceZeroedAllocator { using device_mempool_type = basic_mempool::MemPool; using device_zeroed_mempool_type = basic_mempool::MemPool; +using device_pinned_mempool_type = basic_mempool::MemPool; using pinned_mempool_type = basic_mempool::MemPool; namespace detail @@ -281,6 +301,7 @@ RAJA_INLINE typename std::remove_reference::type make_launch_body( return return_type(std::forward(loop_body)); } +//! Get the properties of the current device RAJA_INLINE hipDeviceProp_t get_device_prop() { @@ -291,213 +312,236 @@ hipDeviceProp_t get_device_prop() return prop; } +//! Get a copy of the device properties, this copy is cached on first use to speedup later calls RAJA_INLINE hipDeviceProp_t& device_prop() { - static hipDeviceProp_t prop = get_device_prop(); + static thread_local hipDeviceProp_t prop = get_device_prop(); return prop; } +static constexpr int hip_occupancy_uninitialized_int = -1; +static constexpr size_t hip_occupancy_uninitialized_size_t = + std::numeric_limits::max(); + +//! Struct with the maximum theoretical occupancy of the device struct HipFixedMaxBlocksData { - int multiProcessorCount; - int maxThreadsPerMultiProcessor; + int device_sm_per_device = hip::device_prop().multiProcessorCount; + int device_max_threads_per_sm = hip::device_prop().maxThreadsPerMultiProcessor; }; +//! 
Get the maximum theoretical occupancy of the device RAJA_INLINE -int hip_max_blocks(int block_size) +HipFixedMaxBlocksData hip_max_blocks() { - static HipFixedMaxBlocksData data = []() { - hipDeviceProp_t& prop = hip::device_prop(); - return HipFixedMaxBlocksData{prop.multiProcessorCount, - prop.maxThreadsPerMultiProcessor}; - }(); - - int max_blocks = data.multiProcessorCount * - (data.maxThreadsPerMultiProcessor / block_size); + static thread_local HipFixedMaxBlocksData data; - return max_blocks; + return data; } +//! Struct with the maximum occupancy of a kernel in simple terms struct HipOccMaxBlocksThreadsData { - size_t prev_shmem_size; - int max_blocks; - int max_threads; + size_t func_dynamic_shmem_per_block = hip_occupancy_uninitialized_size_t; + int func_max_blocks_per_device = hip_occupancy_uninitialized_int; + int func_max_threads_per_block = hip_occupancy_uninitialized_int; }; -template < typename RAJA_UNUSED_ARG(UniqueMarker), typename Func > +//! Get the maximum occupancy of a kernel with unknown threads per block +template < typename RAJA_UNUSED_ARG(UniqueMarker) > RAJA_INLINE -void hip_occupancy_max_blocks_threads(Func&& func, size_t shmem_size, - int &max_blocks, int &max_threads) +HipOccMaxBlocksThreadsData hip_occupancy_max_blocks_threads(const void* func, + size_t func_dynamic_shmem_per_block) { - static constexpr int uninitialized = -1; - static constexpr size_t uninitialized_size_t = std::numeric_limits::max(); - static thread_local HipOccMaxBlocksThreadsData data = { - uninitialized_size_t, uninitialized, uninitialized}; + static thread_local HipOccMaxBlocksThreadsData data; - if (data.prev_shmem_size != shmem_size) { + if (data.func_dynamic_shmem_per_block != func_dynamic_shmem_per_block) { + + data.func_dynamic_shmem_per_block = func_dynamic_shmem_per_block; #ifdef RAJA_ENABLE_HIP_OCCUPANCY_CALCULATOR hipErrchk(hipOccupancyMaxPotentialBlockSize( - &data.max_blocks, &data.max_threads, func, shmem_size)); + &data.func_max_blocks_per_device, &data.func_max_threads_per_block, func, func_dynamic_shmem_per_block)); #else RAJA_UNUSED_VAR(func); hipDeviceProp_t& prop = hip::device_prop(); - data.max_blocks = prop.multiProcessorCount; - data.max_threads = 1024; + data.func_max_blocks_per_device = prop.multiProcessorCount; + data.func_max_threads_per_block = 1024; #endif - data.prev_shmem_size = shmem_size; - } - max_blocks = data.max_blocks; - max_threads = data.max_threads; - + return data; } -struct HipOccMaxBlocksFixedThreadsData +//! Struct with the maximum occupancy of a kernel in specific terms +struct HipOccMaxBlocksData : HipFixedMaxBlocksData { - size_t prev_shmem_size; - int max_blocks; - int multiProcessorCount; + size_t func_dynamic_shmem_per_block = hip_occupancy_uninitialized_size_t; + int func_threads_per_block = hip_occupancy_uninitialized_int; + int func_max_blocks_per_sm = hip_occupancy_uninitialized_int; }; -template < typename RAJA_UNUSED_ARG(UniqueMarker), int num_threads, typename Func > +//! 
Get the maximum occupancy of a kernel with compile time threads per block +template < typename RAJA_UNUSED_ARG(UniqueMarker), int func_threads_per_block > RAJA_INLINE -void hip_occupancy_max_blocks(Func&& func, size_t shmem_size, - int &max_blocks) +HipOccMaxBlocksData hip_occupancy_max_blocks(const void* func, + size_t func_dynamic_shmem_per_block) { - static constexpr int uninitialized = -1; - static constexpr size_t uninitialized_size_t = std::numeric_limits::max(); - static thread_local HipOccMaxBlocksFixedThreadsData data = { - uninitialized_size_t, uninitialized, uninitialized}; + static thread_local HipOccMaxBlocksData data; + + if (data.func_dynamic_shmem_per_block != func_dynamic_shmem_per_block) { - if (data.prev_shmem_size != shmem_size) { + data.func_dynamic_shmem_per_block = func_dynamic_shmem_per_block; + data.func_threads_per_block = func_threads_per_block; #ifdef RAJA_ENABLE_HIP_OCCUPANCY_CALCULATOR hipErrchk(hipOccupancyMaxActiveBlocksPerMultiprocessor( - &data.max_blocks, func, num_threads, shmem_size)); + &data.func_max_blocks_per_sm, func, func_threads_per_block, func_dynamic_shmem_per_block)); #else RAJA_UNUSED_VAR(func); - data.max_blocks = hip::device_prop().maxThreadsPerMultiProcessor/1024; - if (data.max_blocks <= 0) { data.max_blocks = 1 } + data.func_max_blocks_per_sm = hip::device_prop().maxThreadsPerMultiProcessor/1024; + if (data.func_max_blocks_per_sm <= 0) { data.func_max_blocks_per_sm = 1 } #endif - if (data.multiProcessorCount == uninitialized) { - - data.multiProcessorCount = hip::device_prop().multiProcessorCount; - - } - - data.max_blocks *= data.multiProcessorCount; - - data.prev_shmem_size = shmem_size; - } - max_blocks = data.max_blocks; - + return data; } -struct HipOccMaxBlocksVariableThreadsData -{ - size_t prev_shmem_size; - int prev_num_threads; - int max_blocks; - int multiProcessorCount; -}; - -template < typename RAJA_UNUSED_ARG(UniqueMarker), typename Func > +//! 
Get the maximum occupancy of a kernel with runtime threads per block +template < typename RAJA_UNUSED_ARG(UniqueMarker) > RAJA_INLINE -void hip_occupancy_max_blocks(Func&& func, size_t shmem_size, - int &max_blocks, int num_threads) +HipOccMaxBlocksData hip_occupancy_max_blocks(const void* func, + size_t func_dynamic_shmem_per_block, int func_threads_per_block) { - static constexpr int uninitialized = 0; - static constexpr size_t uninitialized_size_t = std::numeric_limits::max(); - static thread_local HipOccMaxBlocksVariableThreadsData data = { - uninitialized_size_t, uninitialized, uninitialized, uninitialized}; + static thread_local HipOccMaxBlocksData data; - if ( data.prev_shmem_size != shmem_size || - data.prev_num_threads != num_threads ) { + if ( data.func_dynamic_shmem_per_block != func_dynamic_shmem_per_block || + data.func_threads_per_block != func_threads_per_block ) { + + data.func_dynamic_shmem_per_block = func_dynamic_shmem_per_block; + data.func_threads_per_block = func_threads_per_block; #ifdef RAJA_ENABLE_HIP_OCCUPANCY_CALCULATOR hipErrchk(hipOccupancyMaxActiveBlocksPerMultiprocessor( - &data.max_blocks, func, num_threads, shmem_size)); + &data.func_max_blocks_per_sm, func, func_threads_per_block, func_dynamic_shmem_per_block)); #else RAJA_UNUSED_VAR(func); - data.max_blocks = hip::device_prop().maxThreadsPerMultiProcessor/1024; - if (data.max_blocks <= 0) { data.max_blocks = 1 } + data.func_max_blocks_per_sm = hip::device_prop().maxThreadsPerMultiProcessor/1024; + if (data.func_max_blocks_per_sm <= 0) { data.func_max_blocks_per_sm = 1 } #endif - if (data.multiProcessorCount == uninitialized) { - - data.multiProcessorCount = hip::device_prop().multiProcessorCount; - - } - - data.max_blocks *= data.multiProcessorCount; - - data.prev_shmem_size = shmem_size; - data.prev_num_threads = num_threads; - } - max_blocks = data.max_blocks; - + return data; } -struct HipOccupancyDefaults + +/*! + ****************************************************************************** + * + * \brief Concretizer Implementation that chooses block size and/or grid + * size when one or both has not been specified at compile time. + * + * \tparam IdxT Index type to use for integer calculations. + * \tparam Concretizer Class that determines the max number of blocks to use + * when fitting for the device. + * \tparam UniqueMarker A type that is unique to each global function, used to + * help cache the occupancy data for that global function. + * + * The methods come in two flavors: + * - The fit_len methods choose grid and block sizes that result in a total + * number of threads of at least the len given in the constructor or 0 if + * that is not possible. + * - The fit_device methods choose grid and block sizes that best fit the + * occupancy of the global function according to the occupancy calculator and + * the Concretizer class. 
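 *
 * [Editorial illustration, not part of this patch; the numbers below are
 * assumed, not measured.] As a concrete example of the two flavors: take
 * len = 1,000,000 iterates, a compile-time block size of 256, and suppose
 * the occupancy calculator reports 4 blocks per SM on a 108-SM device for
 * this kernel, with a concretizer that caps the grid at that occupancy:
 *
 *   get_grid_size_to_fit_len(256)    -> ceil(1,000,000 / 256) = 3907 blocks
 *   get_grid_size_to_fit_device(256) -> min(3907, 4 * 108)    = 432 blocks
 *
 * The fit_len flavor covers every iterate with its own thread, while the
 * fit_device flavor launches only what the device can run concurrently and
 * relies on the strided loop mapping to cover the remaining iterates.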
+ * + * Common terms: + * - block size - threads per block + * - grid size - blocks per device + * + ****************************************************************************** + */ +template < typename IdxT, typename Concretizer, typename UniqueMarker> +struct ConcretizerImpl { - HipOccupancyDefaults(const void* RAJA_UNUSED_ARG(func)) + ConcretizerImpl(const void* func, size_t func_dynamic_shmem_per_block, IdxT len) + : m_func(func) + , m_func_dynamic_shmem_per_block(func_dynamic_shmem_per_block) + , m_len(len) { } - template < typename IdxT > - inline auto get_max_grid_size(size_t RAJA_UNUSED_ARG(dynamic_shmem_size), - IdxT RAJA_UNUSED_ARG(block_size)) const + IdxT get_max_block_size() const { - return std::numeric_limits::max(); + auto data = hip_occupancy_max_blocks_threads( + m_func, m_func_dynamic_shmem_per_block); + IdxT func_max_threads_per_block = data.func_max_threads_per_block; + return func_max_threads_per_block; } - template < typename IdxT = hip_dim_member_t > - inline auto get_max_block_size_and_grid_size(size_t RAJA_UNUSED_ARG(dynamic_shmem_size)) const + //! Get a block size when grid size is specified + IdxT get_block_size_to_fit_len(IdxT func_blocks_per_device) const { - return std::make_pair(static_cast(::RAJA::policy::hip::MAX_BLOCK_SIZE), - std::numeric_limits::max()); + IdxT func_max_threads_per_block = this->get_max_block_size(); + IdxT func_threads_per_block = RAJA_DIVIDE_CEILING_INT(m_len, func_blocks_per_device); + if (func_threads_per_block <= func_max_threads_per_block) { + return func_threads_per_block; + } else { + return IdxT(0); + } } -}; -template < typename UniqueMarker > -struct HipOccupancyCalculator -{ - HipOccupancyCalculator(const void* func) - : m_func(func) - { } + //! Get a grid size when block size is specified + IdxT get_grid_size_to_fit_len(IdxT func_threads_per_block) const + { + IdxT func_blocks_per_device = RAJA_DIVIDE_CEILING_INT(m_len, func_threads_per_block); + return func_blocks_per_device; + } + + //! Get a block size and grid size when neither is specified + auto get_block_and_grid_size_to_fit_len() const + { + IdxT func_max_threads_per_block = this->get_max_block_size(); + IdxT func_blocks_per_device = RAJA_DIVIDE_CEILING_INT(m_len, func_max_threads_per_block); + return std::make_pair(func_max_threads_per_block, + func_blocks_per_device); + } + + //! Get a block size when grid size is specified + IdxT get_block_size_to_fit_device(IdxT func_blocks_per_device) const + { + IdxT func_max_threads_per_block = this->get_max_block_size(); + IdxT func_threads_per_block = RAJA_DIVIDE_CEILING_INT(m_len, func_blocks_per_device); + return std::min(func_threads_per_block, func_max_threads_per_block); + } - template < typename IdxT > - inline auto get_max_grid_size(size_t dynamic_shmem_size, IdxT block_size) const + //! 
Get a grid size when block size is specified + IdxT get_grid_size_to_fit_device(IdxT func_threads_per_block) const { - int max_grid_size = -1; - ::RAJA::hip::hip_occupancy_max_blocks( - m_func, dynamic_shmem_size, max_grid_size, block_size); - return static_cast(max_grid_size); + auto data = hip_occupancy_max_blocks( + m_func, m_func_dynamic_shmem_per_block, func_threads_per_block); + IdxT func_max_blocks_per_device = Concretizer::template get_max_grid_size(data); + IdxT func_blocks_per_device = RAJA_DIVIDE_CEILING_INT(m_len, func_threads_per_block); + return std::min(func_blocks_per_device, func_max_blocks_per_device); } - template < typename IdxT = hip_dim_member_t > - inline auto get_max_block_size_and_grid_size(size_t dynamic_shmem_size) const + //! Get a block size and grid size when neither is specified + auto get_block_and_grid_size_to_fit_device() const { - int max_block_size = -1; - int max_grid_size = -1; - ::RAJA::hip::hip_occupancy_max_blocks_threads( - m_func, dynamic_shmem_size, max_grid_size, max_block_size); - return std::make_pair(static_cast(max_block_size), - static_cast(max_grid_size)); + IdxT func_max_threads_per_block = this->get_max_block_size(); + IdxT func_blocks_per_device = this->get_grid_size_to_fit_device(func_max_threads_per_block); + return std::make_pair(func_max_threads_per_block, + func_blocks_per_device); } private: const void* m_func; + size_t m_func_dynamic_shmem_per_block; + IdxT m_len; }; } // namespace hip diff --git a/include/RAJA/policy/hip/forall.hpp b/include/RAJA/policy/hip/forall.hpp index b0b86131ef..6fa21f9217 100644 --- a/include/RAJA/policy/hip/forall.hpp +++ b/include/RAJA/policy/hip/forall.hpp @@ -71,16 +71,17 @@ namespace impl * ****************************************************************************** */ -template +template struct ForallDimensionCalculator; // The general cases handle fixed BLOCK_SIZE > 0 and/or GRID_SIZE > 0 // there are specializations for named_usage::unspecified // but named_usage::ignored is not supported so no specializations are provided // and static_asserts in the general case catch unsupported values -template +template struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, ::RAJA::hip::IndexGlobal, + Concretizer, UniqueMarker> { static_assert(BLOCK_SIZE > 0, "block size must be > 0 or named_usage::unspecified with forall"); @@ -92,8 +93,10 @@ struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, static void set_dimensions(internal::HipDims& dims, IdxT len, const void* RAJA_UNUSED_ARG(func), size_t RAJA_UNUSED_ARG(dynamic_shmem_size)) { - if ( len > (static_cast(IndexGetter::block_size) * - static_cast(IndexGetter::grid_size)) ) { + const IdxT block_size = static_cast(IndexGetter::block_size); + const IdxT grid_size = static_cast(IndexGetter::grid_size); + + if ( len > (block_size * grid_size) ) { RAJA_ABORT_OR_THROW("len exceeds the size of the directly mapped index space"); } @@ -102,9 +105,10 @@ struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, } }; -template +template struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, ::RAJA::hip::IndexGlobal, + Concretizer, UniqueMarker> { static_assert(GRID_SIZE > 0, "grid size must be > 0 or named_usage::unspecified with forall"); @@ -113,17 +117,26 @@ struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, template < typename IdxT > static void set_dimensions(internal::HipDims& dims, IdxT len, - const void* RAJA_UNUSED_ARG(func), size_t RAJA_UNUSED_ARG(dynamic_shmem_size)) + const void* func, 
size_t dynamic_shmem_size) { - // BEWARE: if calculated block_size is too high then the kernel launch will fail - internal::set_hip_dim(dims.threads, RAJA_DIVIDE_CEILING_INT(len, static_cast(IndexGetter::grid_size))); - internal::set_hip_dim(dims.blocks, static_cast(IndexGetter::grid_size)); + ::RAJA::hip::ConcretizerImpl concretizer{func, dynamic_shmem_size, len}; + + const IdxT grid_size = static_cast(IndexGetter::grid_size); + const IdxT block_size = concretizer.get_block_size_to_fit_len(grid_size); + + if ( block_size == IdxT(0) ) { + RAJA_ABORT_OR_THROW("len exceeds the size of the directly mapped index space"); + } + + internal::set_hip_dim(dims.threads, block_size); + internal::set_hip_dim(dims.blocks, grid_size); } }; -template +template struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, ::RAJA::hip::IndexGlobal, + Concretizer, UniqueMarker> { static_assert(BLOCK_SIZE > 0, "block size must be > 0 or named_usage::unspecified with forall"); @@ -132,16 +145,22 @@ struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, template < typename IdxT > static void set_dimensions(internal::HipDims& dims, IdxT len, - const void* RAJA_UNUSED_ARG(func), size_t RAJA_UNUSED_ARG(dynamic_shmem_size)) + const void* func, size_t dynamic_shmem_size) { - internal::set_hip_dim(dims.threads, static_cast(IndexGetter::block_size)); - internal::set_hip_dim(dims.blocks, RAJA_DIVIDE_CEILING_INT(len, static_cast(IndexGetter::block_size))); + ::RAJA::hip::ConcretizerImpl concretizer{func, dynamic_shmem_size, len}; + + const IdxT block_size = static_cast(IndexGetter::block_size); + const IdxT grid_size = concretizer.get_grid_size_to_fit_len(block_size); + + internal::set_hip_dim(dims.threads, block_size); + internal::set_hip_dim(dims.blocks, grid_size); } }; -template +template struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, ::RAJA::hip::IndexGlobal, + Concretizer, UniqueMarker> { using IndexGetter = ::RAJA::hip::IndexGlobal; @@ -150,104 +169,104 @@ struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, static void set_dimensions(internal::HipDims& dims, IdxT len, const void* func, size_t dynamic_shmem_size) { - ::RAJA::hip::HipOccupancyCalculator oc(func); - auto max_sizes = oc.get_max_block_size_and_grid_size(dynamic_shmem_size); + ::RAJA::hip::ConcretizerImpl concretizer{func, dynamic_shmem_size, len}; + + const auto sizes = concretizer.get_block_and_grid_size_to_fit_len(); - internal::set_hip_dim(dims.threads, static_cast(max_sizes.first)); - internal::set_hip_dim(dims.blocks, RAJA_DIVIDE_CEILING_INT(len, static_cast(max_sizes.first))); + internal::set_hip_dim(dims.threads, sizes.first); + internal::set_hip_dim(dims.blocks, sizes.second); } }; -template -struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, +template +struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, ::RAJA::hip::IndexGlobal, + Concretizer, UniqueMarker> { static_assert(BLOCK_SIZE > 0, "block size must be > 0 or named_usage::unspecified with forall"); static_assert(GRID_SIZE > 0, "grid size must be > 0 or named_usage::unspecified with forall"); - using IndexMapper = ::RAJA::hip::IndexGlobal; + using IndexGetter = ::RAJA::hip::IndexGlobal; template < typename IdxT > static void set_dimensions(internal::HipDims& dims, IdxT RAJA_UNUSED_ARG(len), const void* RAJA_UNUSED_ARG(func), size_t RAJA_UNUSED_ARG(dynamic_shmem_size)) { - internal::set_hip_dim(dims.threads, static_cast(IndexMapper::block_size)); - internal::set_hip_dim(dims.blocks, 
static_cast(IndexMapper::grid_size)); + const IdxT block_size = static_cast(IndexGetter::block_size); + const IdxT grid_size = static_cast(IndexGetter::grid_size); + + internal::set_hip_dim(dims.threads, block_size); + internal::set_hip_dim(dims.blocks, grid_size); } }; -template -struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, +template +struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, ::RAJA::hip::IndexGlobal, + Concretizer, UniqueMarker> { static_assert(GRID_SIZE > 0, "grid size must be > 0 or named_usage::unspecified with forall"); - using IndexMapper = ::RAJA::hip::IndexGlobal; + using IndexGetter = ::RAJA::hip::IndexGlobal; template < typename IdxT > static void set_dimensions(internal::HipDims& dims, IdxT len, const void* func, size_t dynamic_shmem_size) { - ::RAJA::hip::HipOccupancyCalculator oc(func); - auto max_sizes = oc.get_max_block_size_and_grid_size(dynamic_shmem_size); + ::RAJA::hip::ConcretizerImpl concretizer{func, dynamic_shmem_size, len}; - IdxT calculated_block_size = std::min( - RAJA_DIVIDE_CEILING_INT(len, static_cast(IndexMapper::grid_size)), - static_cast(max_sizes.first)); + const IdxT grid_size = static_cast(IndexGetter::grid_size); + const IdxT block_size = concretizer.get_block_size_to_fit_device(grid_size); - internal::set_hip_dim(dims.threads, calculated_block_size); - internal::set_hip_dim(dims.blocks, static_cast(IndexMapper::grid_size)); + internal::set_hip_dim(dims.threads, block_size); + internal::set_hip_dim(dims.blocks, grid_size); } }; -template -struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, +template +struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, ::RAJA::hip::IndexGlobal, + Concretizer, UniqueMarker> { static_assert(BLOCK_SIZE > 0, "block size must be > 0 or named_usage::unspecified with forall"); - using IndexMapper = ::RAJA::hip::IndexGlobal; + using IndexGetter = ::RAJA::hip::IndexGlobal; template < typename IdxT > static void set_dimensions(internal::HipDims& dims, IdxT len, const void* func, size_t dynamic_shmem_size) { - ::RAJA::hip::HipOccupancyCalculator oc(func); - auto max_grid_size = oc.get_max_grid_size(dynamic_shmem_size, - static_cast(IndexMapper::block_size)); + ::RAJA::hip::ConcretizerImpl concretizer{func, dynamic_shmem_size, len}; - IdxT calculated_grid_size = std::min( - RAJA_DIVIDE_CEILING_INT(len, static_cast(IndexMapper::block_size)), - static_cast(max_grid_size)); + const IdxT block_size = static_cast(IndexGetter::block_size); + const IdxT grid_size = concretizer.get_grid_size_to_fit_device(block_size); - internal::set_hip_dim(dims.threads, static_cast(IndexMapper::block_size)); - internal::set_hip_dim(dims.blocks, calculated_grid_size); + internal::set_hip_dim(dims.threads, block_size); + internal::set_hip_dim(dims.blocks, grid_size); } }; -template -struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, +template +struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, ::RAJA::hip::IndexGlobal, + Concretizer, UniqueMarker> { - using IndexMapper = ::RAJA::hip::IndexGlobal; + using IndexGetter = ::RAJA::hip::IndexGlobal; template < typename IdxT > static void set_dimensions(internal::HipDims& dims, IdxT len, const void* func, size_t dynamic_shmem_size) { - ::RAJA::hip::HipOccupancyCalculator oc(func); - auto max_sizes = oc.get_max_block_size_and_grid_size(dynamic_shmem_size); + ::RAJA::hip::ConcretizerImpl concretizer{func, dynamic_shmem_size, len}; - IdxT calculated_grid_size = std::min( - 
RAJA_DIVIDE_CEILING_INT(len, static_cast(max_sizes.first)), - static_cast(max_sizes.second)); + const auto sizes = concretizer.get_block_and_grid_size_to_fit_device(); - internal::set_hip_dim(dims.threads, static_cast(max_sizes.first)); - internal::set_hip_dim(dims.blocks, calculated_grid_size); + internal::set_hip_dim(dims.threads, sizes.first); + internal::set_hip_dim(dims.blocks, sizes.second); } }; @@ -273,7 +292,7 @@ template ::value && + std::is_base_of::value && (IterationGetter::block_size > 0), size_t > BlockSize = IterationGetter::block_size> __launch_bounds__(BlockSize, 1) __global__ @@ -297,7 +316,7 @@ template ::value && + std::is_base_of::value && (IterationGetter::block_size <= 0), size_t > RAJA_UNUSED_ARG(BlockSize) = 0> __global__ @@ -322,7 +341,7 @@ template ::value && + std::is_base_of::value && (IterationGetter::block_size > 0), size_t > BlockSize = IterationGetter::block_size> __launch_bounds__(BlockSize, 1) __global__ @@ -349,7 +368,7 @@ template ::value && + std::is_base_of::value && (IterationGetter::block_size <= 0), size_t > RAJA_UNUSED_ARG(BlockSize) = 0> __global__ @@ -375,7 +394,8 @@ template ::value && + std::is_base_of::value && + std::is_base_of::value && (IterationGetter::block_size > 0), size_t > BlockSize = IterationGetter::block_size> __launch_bounds__(BlockSize, 1) __global__ @@ -400,7 +420,8 @@ template ::value && + std::is_base_of::value && + std::is_base_of::value && (IterationGetter::block_size <= 0), size_t > RAJA_UNUSED_ARG(BlockSize) = 0> __global__ @@ -427,7 +448,8 @@ template ::value && + std::is_base_of::value && + std::is_base_of::value && (IterationGetter::block_size > 0), size_t > BlockSize = IterationGetter::block_size> __launch_bounds__(BlockSize, 1) __global__ @@ -455,7 +477,8 @@ template ::value && + std::is_base_of::value && + std::is_base_of::value && (IterationGetter::block_size <= 0), size_t > RAJA_UNUSED_ARG(BlockSize) = 0> __global__ @@ -487,7 +510,7 @@ void forallp_hip_kernel(LOOP_BODY loop_body, template RAJA_INLINE concepts::enable_if_t< @@ -495,7 +518,7 @@ concepts::enable_if_t< RAJA::expt::type_traits::is_ForallParamPack, RAJA::expt::type_traits::is_ForallParamPack_empty> forall_impl(resources::Hip hip_res, - ::RAJA::policy::hip::hip_execconst&, + ::RAJA::policy::hip::hip_execconst&, Iterable&& iter, LoopBody&& loop_body, ForallParam) @@ -503,9 +526,9 @@ forall_impl(resources::Hip hip_res, using Iterator = camp::decay; using LOOP_BODY = camp::decay; using IndexType = camp::decay; - using EXEC_POL = ::RAJA::policy::hip::hip_exec; + using EXEC_POL = ::RAJA::policy::hip::hip_exec; using UniqueMarker = ::camp::list; - using DimensionCalculator = impl::ForallDimensionCalculator; + using DimensionCalculator = impl::ForallDimensionCalculator; // // Compute the requested iteration space size @@ -556,7 +579,7 @@ forall_impl(resources::Hip hip_res, template RAJA_INLINE concepts::enable_if_t< @@ -564,7 +587,7 @@ concepts::enable_if_t< RAJA::expt::type_traits::is_ForallParamPack, concepts::negate< RAJA::expt::type_traits::is_ForallParamPack_empty> > forall_impl(resources::Hip hip_res, - ::RAJA::policy::hip::hip_exec const&, + ::RAJA::policy::hip::hip_exec const&, Iterable&& iter, LoopBody&& loop_body, ForallParam f_params) @@ -572,9 +595,9 @@ forall_impl(resources::Hip hip_res, using Iterator = camp::decay; using LOOP_BODY = camp::decay; using IndexType = camp::decay; - using EXEC_POL = ::RAJA::policy::hip::hip_exec; + using EXEC_POL = ::RAJA::policy::hip::hip_exec; using UniqueMarker = ::camp::list; - using DimensionCalculator = 
impl::ForallDimensionCalculator; + using DimensionCalculator = impl::ForallDimensionCalculator; // // Compute the requested iteration space size @@ -652,11 +675,11 @@ forall_impl(resources::Hip hip_res, */ template RAJA_INLINE resources::EventProxy forall_impl(resources::Hip r, - ExecPolicy>, + ExecPolicy>, const TypedIndexSet& iset, LoopBody&& loop_body) { @@ -665,7 +688,7 @@ forall_impl(resources::Hip r, iset.segmentCall(r, isi, detail::CallForall(), - ::RAJA::policy::hip::hip_exec(), + ::RAJA::policy::hip::hip_exec(), loop_body); } // iterate over segments of index set diff --git a/include/RAJA/policy/hip/intrinsics.hpp b/include/RAJA/policy/hip/intrinsics.hpp new file mode 100644 index 0000000000..354e5d7278 --- /dev/null +++ b/include/RAJA/policy/hip/intrinsics.hpp @@ -0,0 +1,362 @@ +/*! + ****************************************************************************** + * + * \file + * + * \brief Header file containing RAJA intrinsics templates for HIP execution. + * + * These methods should work on any platform that supports + * HIP devices. + * + ****************************************************************************** + */ + +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#ifndef RAJA_hip_intrinsics_HPP +#define RAJA_hip_intrinsics_HPP + +#include "RAJA/config.hpp" + +#if defined(RAJA_ENABLE_HIP) + +#include + +#include + +#include "RAJA/util/macros.hpp" +#include "RAJA/util/SoAArray.hpp" +#include "RAJA/util/types.hpp" + +#include "RAJA/policy/hip/policy.hpp" + + +namespace RAJA +{ + +namespace hip +{ + +namespace impl +{ + +/*! + * \brief Abstracts access to memory when coordinating between threads at + * device scope. The fences provided here are to be used with relaxed + * atomics in order to guarantee memory ordering and visibility of the + * accesses done through this class. + * + * \Note This uses device scope fences to ensure ordering and to flush local + * caches so that memory accesses become visible to the whole device. + * \Note This class uses normal memory accesses that are cached in local caches + * so device scope fences are required to make memory accesses visible + * to the whole device. + */ +struct AccessorDeviceScopeUseDeviceFence : RAJA::detail::DefaultAccessor +{ + static RAJA_DEVICE RAJA_INLINE void fence_acquire() + { + __threadfence(); + } + + static RAJA_DEVICE RAJA_INLINE void fence_release() + { + __threadfence(); + } +}; + +/*! + ****************************************************************************** + * + * \brief Abstracts access to memory when coordinating between threads at + * device scope. The fences provided here are to be used with relaxed + * atomics in order to guarantee memory ordering and visibility of the + * accesses done through this class. + * + * \Note This may use block scope fences to ensure ordering and avoid flushing + * local caches so special memory accesses are used to ensure visibility + * to the whole device. + * \Note This class uses device scope atomic memory accesses to bypass local + * caches so memory accesses are visible to the whole device without + * device scope fences. 
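// Hedged sketch of how these accessors pair with relaxed atomics: a producer
// publishes data with set() + fence_release(), and the consumer that observes
// the signal calls fence_acquire() before get(). Everything except the
// Accessor interface (the function name, num_blocks, the one-slot-per-block
// layout) is illustrative, not RAJA's actual reduction kernel.
template < typename Accessor, typename T >
RAJA_DEVICE void publish_and_consume_sketch(T* device_mem,
                                            unsigned int* device_count,
                                            T value, unsigned int num_blocks)
{
  // producer side: write this block's payload, release-fence, then signal
  Accessor::set(device_mem, blockIdx.x, value);
  Accessor::fence_release();
  unsigned int prior = ::atomicInc(device_count, num_blocks - 1u);

  // the last block to signal acquires before reading the other payloads
  if (prior == num_blocks - 1u) {
    Accessor::fence_acquire();
    T first = Accessor::get(device_mem, 0);
    static_cast<void>(first);
  }
}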
+ * \Note A memory access may be split into multiple memory accesses, so + * even though atomic instructions are used concurrent accesses between + * different threads are not thread safe. + * + ****************************************************************************** + */ +struct AccessorDeviceScopeUseBlockFence +{ + // hip has 32 and 64 bit atomics + static constexpr size_t min_atomic_int_type_size = sizeof(unsigned int); + static constexpr size_t max_atomic_int_type_size = sizeof(unsigned long long); + + template < typename T > + static RAJA_DEVICE RAJA_INLINE T get(T* in_ptr, size_t idx) + { + using ArrayType = RAJA::detail::AsIntegerArray; + using integer_type = typename ArrayType::integer_type; + + ArrayType u; + auto ptr = const_cast(reinterpret_cast(in_ptr + idx)); + + for (size_t i = 0; i < u.array_size(); ++i) { +#if defined(RAJA_USE_HIP_INTRINSICS) && RAJA_INTERNAL_CLANG_HAS_BUILTIN(__hip_atomic_load) + u.array[i] = __hip_atomic_load(&ptr[i], __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); +#else + u.array[i] = atomicAdd(&ptr[i], integer_type(0)); +#endif + } + + return u.get_value(); + } + + template < typename T > + static RAJA_DEVICE RAJA_INLINE void set(T* in_ptr, size_t idx, T val) + { + using ArrayType = RAJA::detail::AsIntegerArray; + using integer_type = typename ArrayType::integer_type; + + ArrayType u; + u.set_value(val); + auto ptr = reinterpret_cast(in_ptr + idx); + + for (size_t i = 0; i < u.array_size(); ++i) { +#if defined(RAJA_USE_HIP_INTRINSICS) && RAJA_INTERNAL_CLANG_HAS_BUILTIN(__hip_atomic_store) + __hip_atomic_store(&ptr[i], u.array[i], __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); +#else + atomicExch(&ptr[i], u.array[i]); +#endif + } + } + + static RAJA_DEVICE RAJA_INLINE void fence_acquire() + { +#if defined(RAJA_USE_HIP_INTRINSICS) && RAJA_INTERNAL_CLANG_HAS_BUILTIN(__builtin_amdgcn_fence) + __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "workgroup"); +#else + __threadfence(); +#endif + } + + static RAJA_DEVICE RAJA_INLINE void fence_release() + { +#if defined(RAJA_USE_HIP_INTRINSICS) && RAJA_INTERNAL_CLANG_HAS_BUILTIN(__builtin_amdgcn_fence) && \ + RAJA_INTERNAL_CLANG_HAS_BUILTIN(__builtin_amdgcn_s_waitcnt) + __builtin_amdgcn_fence(__ATOMIC_RELEASE, "workgroup"); + // Wait until all vmem operations complete (s_waitcnt vmcnt(0)) + __builtin_amdgcn_s_waitcnt(/*vmcnt*/ 0 | (/*exp_cnt*/ 0x7 << 4) | (/*lgkmcnt*/ 0xf << 8)); +#else + __threadfence(); +#endif + } +}; + + +// hip only has shfl primitives for 32 bits +constexpr size_t min_shfl_int_type_size = sizeof(unsigned int); +constexpr size_t max_shfl_int_type_size = sizeof(unsigned int); + +/*! + ****************************************************************************** + * + * \brief Method to shuffle 32b registers in sum reduction for arbitrary type. + * + * \Note Returns an undefined value if src lane is inactive (divergence). + * Returns this lane's value if src lane is out of bounds or has exited. 
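// Hedged sketch of the word-splitting idea used below: a type wider than the
// 32-bit shuffle width is moved as an array of 32-bit words, mirroring what
// RAJA::detail::AsIntegerArray does for arbitrary trivially copyable types
// (double and the union layout here are purely illustrative).
RAJA_DEVICE RAJA_INLINE double shfl_xor_double_sketch(double var, int lane_mask)
{
  union { double value; unsigned int words[2]; } u;
  u.value = var;
  u.words[0] = ::__shfl_xor(u.words[0], lane_mask);
  u.words[1] = ::__shfl_xor(u.words[1], lane_mask);
  return u.value;  // reassembled from the two shuffled words
}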
+ * + ****************************************************************************** + */ +template +RAJA_DEVICE RAJA_INLINE T shfl_xor_sync(T var, int laneMask) +{ + RAJA::detail::AsIntegerArray u; + u.set_value(var); + + for (size_t i = 0; i < u.array_size(); ++i) { + u.array[i] = ::__shfl_xor(u.array[i], laneMask); + } + return u.get_value(); +} + +template +RAJA_DEVICE RAJA_INLINE T shfl_sync(T var, int srcLane) +{ + RAJA::detail::AsIntegerArray u; + u.set_value(var); + + for (size_t i = 0; i < u.array_size(); ++i) { + u.array[i] = ::__shfl(u.array[i], srcLane); + } + return u.get_value(); +} + + +template <> +RAJA_DEVICE RAJA_INLINE int shfl_xor_sync(int var, int laneMask) +{ + return ::__shfl_xor(var, laneMask); +} + +template <> +RAJA_DEVICE RAJA_INLINE float shfl_xor_sync(float var, int laneMask) +{ + return ::__shfl_xor(var, laneMask); +} + +template <> +RAJA_DEVICE RAJA_INLINE int shfl_sync(int var, int srcLane) +{ + return ::__shfl(var, srcLane); +} + +template <> +RAJA_DEVICE RAJA_INLINE float shfl_sync(float var, int srcLane) +{ + return ::__shfl(var, srcLane); +} + + +//! reduce values in block into thread 0 +template +RAJA_DEVICE RAJA_INLINE T warp_reduce(T val, T RAJA_UNUSED_ARG(identity)) +{ + int numThreads = blockDim.x * blockDim.y * blockDim.z; + + int threadId = threadIdx.x + blockDim.x * threadIdx.y + + (blockDim.x * blockDim.y) * threadIdx.z; + + T temp = val; + + if (numThreads % policy::hip::WARP_SIZE == 0) { + + // reduce each warp + for (int i = 1; i < policy::hip::WARP_SIZE; i *= 2) { + T rhs = shfl_xor_sync(temp, i); + Combiner{}(temp, rhs); + } + + } else { + + // reduce each warp + for (int i = 1; i < policy::hip::WARP_SIZE; i *= 2) { + int srcLane = threadId ^ i; + T rhs = shfl_sync(temp, srcLane); + // only add from threads that exist (don't double count own value) + if (srcLane < numThreads) { + Combiner{}(temp, rhs); + } + } + } + + return temp; +} + +/*! + * Allreduce values in a warp. + * + * + * This does a butterfly pattern leaving each lane with the full reduction + * + */ +template +RAJA_DEVICE RAJA_INLINE T warp_allreduce(T val) +{ + T temp = val; + + for (int i = 1; i < policy::hip::WARP_SIZE; i *= 2) { + T rhs = shfl_xor_sync(temp, i); + Combiner{}(temp, rhs); + } + + return temp; +} + + +//! 
reduce values in block into thread 0 +template +RAJA_DEVICE RAJA_INLINE T block_reduce(T val, T identity) +{ + int numThreads = blockDim.x * blockDim.y * blockDim.z; + + int threadId = threadIdx.x + blockDim.x * threadIdx.y + + (blockDim.x * blockDim.y) * threadIdx.z; + + int warpId = threadId % policy::hip::WARP_SIZE; + int warpNum = threadId / policy::hip::WARP_SIZE; + + T temp = val; + + if (numThreads % policy::hip::WARP_SIZE == 0) { + + // reduce each warp + for (int i = 1; i < policy::hip::WARP_SIZE; i *= 2) { + T rhs = shfl_xor_sync(temp, i); + Combiner{}(temp, rhs); + } + + } else { + + // reduce each warp + for (int i = 1; i < policy::hip::WARP_SIZE; i *= 2) { + int srcLane = threadId ^ i; + T rhs = shfl_sync(temp, srcLane); + // only add from threads that exist (don't double count own value) + if (srcLane < numThreads) { + Combiner{}(temp, rhs); + } + } + } + + // reduce per warp values + if (numThreads > policy::hip::WARP_SIZE) { + + static_assert(policy::hip::MAX_WARPS <= policy::hip::WARP_SIZE, + "Max Warps must be less than or equal to Warp Size for this algorithm to work"); + + __shared__ unsigned char tmpsd[sizeof(RAJA::detail::SoAArray)]; + RAJA::detail::SoAArray* sd = + reinterpret_cast *>(tmpsd); + + // write per warp values to shared memory + if (warpId == 0) { + sd->set(warpNum, temp); + } + + __syncthreads(); + + if (warpNum == 0) { + + // read per warp values + if (warpId * policy::hip::WARP_SIZE < numThreads) { + temp = sd->get(warpId); + } else { + temp = identity; + } + + for (int i = 1; i < policy::hip::MAX_WARPS; i *= 2) { + T rhs = shfl_xor_sync(temp, i); + Combiner{}(temp, rhs); + } + } + + __syncthreads(); + } + + return temp; +} + +} // end namespace impl + +} // end namespace hip + +} // end namespace RAJA + +#endif // closing endif for RAJA_ENABLE_HIP guard + +#endif // closing endif for header file include guard diff --git a/include/RAJA/policy/hip/kernel.hpp b/include/RAJA/policy/hip/kernel.hpp index 678d48e3c1..4f907f5f5f 100644 --- a/include/RAJA/policy/hip/kernel.hpp +++ b/include/RAJA/policy/hip/kernel.hpp @@ -4,7 +4,7 @@ * \file * * \brief RAJA header file containing constructs used to run kernel::forall - * traversals on GPU with CUDA. + * traversals on GPU with HIP. 
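// Minimal sketch of the xor-shuffle butterfly used by warp_reduce and
// warp_allreduce above, written against the raw HIP intrinsic for a float;
// the warp size is hard-coded to 64 (AMD) purely for illustration, whereas
// the real code reads policy::hip::WARP_SIZE.
RAJA_DEVICE RAJA_INLINE float warp_allreduce_sum_sketch(float val)
{
  // after log2(64) = 6 rounds every lane in the warp holds the full sum
  for (int lane_mask = 1; lane_mask < 64; lane_mask *= 2) {
    val += ::__shfl_xor(val, lane_mask);
  }
  return val;
}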
* ****************************************************************************** */ diff --git a/include/RAJA/policy/hip/kernel/For.hpp b/include/RAJA/policy/hip/kernel/For.hpp index ce8e87d869..848ea42edf 100644 --- a/include/RAJA/policy/hip/kernel/For.hpp +++ b/include/RAJA/policy/hip/kernel/For.hpp @@ -108,7 +108,7 @@ template , + RAJA::policy::hip::hip_indexer, kernel_sync_requirement::sync, IndexMapper>, EnclosedStmts...>, Types> { @@ -123,7 +123,7 @@ struct HipStatementExecutor< using diff_t = segment_diff_type; using DimensionCalculator = RAJA::internal::KernelDimensionCalculator< - RAJA::policy::hip::hip_indexer>; + RAJA::policy::hip::hip_indexer, kernel_sync_requirement::sync, IndexMapper>>; static inline RAJA_DEVICE @@ -180,7 +180,7 @@ template , + RAJA::policy::hip::hip_indexer, kernel_sync_requirement::none, IndexMapper>, EnclosedStmts...>, Types> { @@ -195,7 +195,7 @@ struct HipStatementExecutor< using diff_t = segment_diff_type; using DimensionCalculator = RAJA::internal::KernelDimensionCalculator< - RAJA::policy::hip::hip_indexer>; + RAJA::policy::hip::hip_indexer, kernel_sync_requirement::none, IndexMapper>>; static inline RAJA_DEVICE @@ -246,7 +246,7 @@ struct HipStatementExecutor< statement::For, Types> : HipStatementExecutor, kernel_sync_requirement::none, hip::IndexGlobal>, EnclosedStmts...>, Types> diff --git a/include/RAJA/policy/hip/kernel/ForICount.hpp b/include/RAJA/policy/hip/kernel/ForICount.hpp index 001cc28b77..014b4db3ac 100644 --- a/include/RAJA/policy/hip/kernel/ForICount.hpp +++ b/include/RAJA/policy/hip/kernel/ForICount.hpp @@ -103,20 +103,20 @@ template , + RAJA::policy::hip::hip_indexer, kernel_sync_requirement::sync, IndexMapper>, EnclosedStmts...>, Types> : public HipStatementExecutor< Data, statement::For, + RAJA::policy::hip::hip_indexer, kernel_sync_requirement::sync, IndexMapper>, EnclosedStmts...>, Types> { using Base = HipStatementExecutor< Data, statement::For, + RAJA::policy::hip::hip_indexer, kernel_sync_requirement::sync, IndexMapper>, EnclosedStmts...>, Types>; @@ -166,20 +166,20 @@ template , + RAJA::policy::hip::hip_indexer, kernel_sync_requirement::none, IndexMapper>, EnclosedStmts...>, Types> : public HipStatementExecutor< Data, statement::For, + RAJA::policy::hip::hip_indexer, kernel_sync_requirement::none, IndexMapper>, EnclosedStmts...>, Types> { using Base = HipStatementExecutor< Data, statement::For, + RAJA::policy::hip::hip_indexer, kernel_sync_requirement::none, IndexMapper>, EnclosedStmts...>, Types>; @@ -226,7 +226,7 @@ struct HipStatementExecutor< statement::ForICount, Types> : HipStatementExecutor, kernel_sync_requirement::none, hip::IndexGlobal>, EnclosedStmts...>, Types> diff --git a/include/RAJA/policy/hip/kernel/HipKernel.hpp b/include/RAJA/policy/hip/kernel/HipKernel.hpp index 67bea1299a..68156600b2 100644 --- a/include/RAJA/policy/hip/kernel/HipKernel.hpp +++ b/include/RAJA/policy/hip/kernel/HipKernel.hpp @@ -87,7 +87,7 @@ namespace statement */ template struct HipKernelExt - : public internal::Statement<::RAJA::policy::hip::hip_exec, EnclosedStmts...> { + : public internal::Statement<::RAJA::policy::hip::hip_exec, EnclosedStmts...> { }; @@ -263,7 +263,7 @@ struct HipLaunchHelper,Stmt inline static void recommended_blocks_threads(size_t shmem_size, int &recommended_blocks, int &recommended_threads) { - auto func = kernelGetter_t::get(); + auto func = reinterpret_cast(kernelGetter_t::get()); if (num_blocks <= 0) { @@ -273,8 +273,10 @@ struct HipLaunchHelper,Stmt // determine blocks at runtime // determine threads at 
runtime // - ::RAJA::hip::hip_occupancy_max_blocks_threads( - func, shmem_size, recommended_blocks, recommended_threads); + auto data = ::RAJA::hip::hip_occupancy_max_blocks_threads( + func, shmem_size); + recommended_blocks = data.func_max_blocks_per_device; + recommended_threads = data.func_max_threads_per_block; } else { @@ -284,8 +286,9 @@ struct HipLaunchHelper,Stmt // recommended_threads = num_threads; - ::RAJA::hip::hip_occupancy_max_blocks( - func, shmem_size, recommended_blocks); + auto data = ::RAJA::hip::hip_occupancy_max_blocks( + func, shmem_size); + recommended_blocks = data.func_max_blocks_per_sm * data.device_sm_per_device; } @@ -339,7 +342,7 @@ struct HipLaunchHelper,Stmt inline static void max_blocks(size_t shmem_size, int &max_blocks, int actual_threads) { - auto func = kernelGetter_t::get(); + auto func = reinterpret_cast(kernelGetter_t::get()); if (num_blocks <= 0) { @@ -352,16 +355,18 @@ struct HipLaunchHelper,Stmt // // determine blocks when actual_threads != num_threads // - ::RAJA::hip::hip_occupancy_max_blocks( - func, shmem_size, max_blocks, actual_threads); + auto data = ::RAJA::hip::hip_occupancy_max_blocks( + func, shmem_size, actual_threads); + max_blocks = data.func_max_blocks_per_sm * data.device_sm_per_device; } else { // // determine blocks when actual_threads == num_threads // - ::RAJA::hip::hip_occupancy_max_blocks( - func, shmem_size, max_blocks); + auto data = ::RAJA::hip::hip_occupancy_max_blocks( + func, shmem_size); + max_blocks = data.func_max_blocks_per_sm * data.device_sm_per_device; } diff --git a/include/RAJA/policy/hip/kernel/Tile.hpp b/include/RAJA/policy/hip/kernel/Tile.hpp index 24f38b7647..62dda7f20d 100644 --- a/include/RAJA/policy/hip/kernel/Tile.hpp +++ b/include/RAJA/policy/hip/kernel/Tile.hpp @@ -143,7 +143,7 @@ struct HipStatementExecutor< Data, statement::Tile, - RAJA::policy::hip::hip_indexer, + RAJA::policy::hip::hip_indexer, kernel_sync_requirement::sync, IndexMapper>, EnclosedStmts...>, Types> { @@ -153,7 +153,7 @@ struct HipStatementExecutor< using diff_t = segment_diff_type; - using DimensionCalculator = KernelDimensionCalculator>; + using DimensionCalculator = KernelDimensionCalculator, kernel_sync_requirement::sync, IndexMapper>>; static inline RAJA_DEVICE void exec(Data &data, bool thread_active) @@ -233,7 +233,7 @@ struct HipStatementExecutor< Data, statement::Tile, - RAJA::policy::hip::hip_indexer, + RAJA::policy::hip::hip_indexer, kernel_sync_requirement::none, IndexMapper>, EnclosedStmts...>, Types> { @@ -243,7 +243,7 @@ struct HipStatementExecutor< using diff_t = segment_diff_type; - using DimensionCalculator = KernelDimensionCalculator>; + using DimensionCalculator = KernelDimensionCalculator, kernel_sync_requirement::none, IndexMapper>>; static inline RAJA_DEVICE void exec(Data &data, bool thread_active) @@ -318,7 +318,7 @@ struct HipStatementExecutor< Data, statement::Tile, Types> : HipStatementExecutor, kernel_sync_requirement::none, hip::IndexGlobal>, EnclosedStmts...>, Types> diff --git a/include/RAJA/policy/hip/kernel/TileTCount.hpp b/include/RAJA/policy/hip/kernel/TileTCount.hpp index c92f92fb71..07637fbd8f 100644 --- a/include/RAJA/policy/hip/kernel/TileTCount.hpp +++ b/include/RAJA/policy/hip/kernel/TileTCount.hpp @@ -131,14 +131,14 @@ struct HipStatementExecutor< Data, statement::TileTCount, - RAJA::policy::hip::hip_indexer, + RAJA::policy::hip::hip_indexer, kernel_sync_requirement::sync, IndexMapper>, EnclosedStmts...>, Types> : public HipStatementExecutor< Data, statement::Tile, - 
RAJA::policy::hip::hip_indexer, + RAJA::policy::hip::hip_indexer, kernel_sync_requirement::sync, IndexMapper>, EnclosedStmts...>, Types> { @@ -146,7 +146,7 @@ struct HipStatementExecutor< Data, statement::Tile, - RAJA::policy::hip::hip_indexer, + RAJA::policy::hip::hip_indexer, kernel_sync_requirement::sync, IndexMapper>, EnclosedStmts...>, Types>; @@ -209,14 +209,14 @@ struct HipStatementExecutor< Data, statement::TileTCount, - RAJA::policy::hip::hip_indexer, + RAJA::policy::hip::hip_indexer, kernel_sync_requirement::none, IndexMapper>, EnclosedStmts...>, Types> : public HipStatementExecutor< Data, statement::Tile, - RAJA::policy::hip::hip_indexer, + RAJA::policy::hip::hip_indexer, kernel_sync_requirement::none, IndexMapper>, EnclosedStmts...>, Types> { @@ -224,7 +224,7 @@ struct HipStatementExecutor< Data, statement::Tile, - RAJA::policy::hip::hip_indexer, + RAJA::policy::hip::hip_indexer, kernel_sync_requirement::none, IndexMapper>, EnclosedStmts...>, Types>; @@ -281,7 +281,7 @@ struct HipStatementExecutor< Data, statement::TileTCount, Types> : HipStatementExecutor, kernel_sync_requirement::none, hip::IndexGlobal>, EnclosedStmts...>, Types> diff --git a/include/RAJA/policy/hip/kernel/internal.hpp b/include/RAJA/policy/hip/kernel/internal.hpp index 2c93520b93..aa0610d736 100644 --- a/include/RAJA/policy/hip/kernel/internal.hpp +++ b/include/RAJA/policy/hip/kernel/internal.hpp @@ -388,7 +388,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, sync, hip::IndexGlobal>> { @@ -402,7 +402,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, sync, hip::IndexGlobal>> { @@ -418,7 +418,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, sync, hip::IndexGlobal>> { @@ -436,7 +436,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, sync, hip::IndexGlobal>> { @@ -451,7 +451,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, sync, hip::IndexGlobal>> { @@ -469,7 +469,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, sync, hip::IndexGlobal>> { @@ -488,7 +488,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, sync, hip::IndexGlobal>> { @@ -508,7 +508,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, sync, hip::IndexGlobal>> { @@ -527,7 +527,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, sync, hip::IndexGlobal>> { diff --git a/include/RAJA/policy/hip/launch.hpp b/include/RAJA/policy/hip/launch.hpp index 2e54b16a81..76f592d20b 100644 --- a/include/RAJA/policy/hip/launch.hpp +++ b/include/RAJA/policy/hip/launch.hpp @@ -433,7 +433,7 @@ struct LoopExecute -struct LoopExecute, kernel_sync_requirement::none, IndexMapper>, SEGMENT> { @@ -457,7 +457,7 @@ struct LoopExecute -struct LoopExecute, kernel_sync_requirement::none, IndexMapper0, IndexMapper1>, @@ -493,7 +493,7 @@ struct LoopExecute -struct LoopExecute, kernel_sync_requirement::none, IndexMapper0, IndexMapper1, @@ -625,7 +625,7 @@ struct LoopICountExecute -struct LoopICountExecute, kernel_sync_requirement::none, IndexMapper>, SEGMENT> { @@ -649,7 +649,7 @@ struct LoopICountExecute -struct LoopICountExecute, kernel_sync_requirement::none, IndexMapper0, IndexMapper1>, @@ -686,7 +686,7 @@ struct LoopICountExecute -struct LoopICountExecute, kernel_sync_requirement::none, IndexMapper0, IndexMapper1, @@ -810,18 +810,18 @@ struct LoopExecute -struct LoopExecute, sync, IndexMapper0>, SEGMENT> - : LoopExecute, sync, IndexMapper0>, SEGMENT> {}; template -struct 
LoopExecute, kernel_sync_requirement::none, IndexMapper0, IndexMapper1>, @@ -852,7 +852,7 @@ struct LoopExecute -struct LoopExecute, kernel_sync_requirement::none, IndexMapper0, IndexMapper1, @@ -914,7 +914,7 @@ struct TileExecute -struct TileExecute, kernel_sync_requirement::none, IndexMapper>, SEGMENT> { @@ -964,7 +964,7 @@ struct TileTCountExecute -struct TileTCountExecute, kernel_sync_requirement::none, IndexMapper>, SEGMENT> { diff --git a/include/RAJA/policy/hip/policy.hpp b/include/RAJA/policy/hip/policy.hpp index 75f9abd878..c359a68de0 100644 --- a/include/RAJA/policy/hip/policy.hpp +++ b/include/RAJA/policy/hip/policy.hpp @@ -74,6 +74,110 @@ struct IndexGlobal; template struct IndexFlatten; +/*! + * Use the max occupancy of a kernel on the current device when launch + * parameters are not fully determined. + * Note that the maximum occupancy of the kernel may be less than the maximum + * occupancy of the device in terms of total threads. + */ +struct MaxOccupancyConcretizer +{ + template < typename IdxT, typename Data > + static IdxT get_max_grid_size(Data const& data) + { + IdxT device_sm_per_device = data.device_sm_per_device; + IdxT func_max_blocks_per_sm = data.func_max_blocks_per_sm; + + IdxT func_max_blocks_per_device = func_max_blocks_per_sm * device_sm_per_device; + + return func_max_blocks_per_device; + } +}; + +/*! + * Use a fraction and an offset of the max occupancy of a kernel on the current + * device when launch parameters are not fully determined. + * The following formula is used, with care to avoid zero, to determine the + * maximum grid size: + * (Fraction * kernel_max_blocks_per_sm + BLOCKS_PER_SM_OFFSET) * device_sm + */ +template < typename t_Fraction, std::ptrdiff_t BLOCKS_PER_SM_OFFSET > +struct FractionOffsetOccupancyConcretizer +{ + template < typename IdxT, typename Data > + static IdxT get_max_grid_size(Data const& data) + { + using Fraction = typename t_Fraction::template rebind; + + IdxT device_sm_per_device = data.device_sm_per_device; + IdxT func_max_blocks_per_sm = data.func_max_blocks_per_sm; + + if (Fraction::multiply(func_max_blocks_per_sm) > IdxT(0)) { + func_max_blocks_per_sm = Fraction::multiply(func_max_blocks_per_sm); + } + + if (IdxT(std::ptrdiff_t(func_max_blocks_per_sm) + BLOCKS_PER_SM_OFFSET) > IdxT(0)) { + func_max_blocks_per_sm = IdxT(std::ptrdiff_t(func_max_blocks_per_sm) + BLOCKS_PER_SM_OFFSET); + } + + IdxT func_max_blocks_per_device = func_max_blocks_per_sm * device_sm_per_device; + + return func_max_blocks_per_device; + } +}; + +/*! + * Use an occupancy that is less than the max occupancy of the device when + * launch parameters are not fully determined. + * Use the MaxOccupancyConcretizer if the maximum occupancy of the kernel is + * below the maximum occupancy of the device. + * Otherwise use the given AvoidMaxOccupancyCalculator to determine the + * maximum grid size. 
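// Hedged sketch of the fraction/offset grid-size formula documented for
// FractionOffsetOccupancyConcretizer above, using plain integers in place of
// RAJA's Fraction type; all numbers in the comments are assumed, not measured.
inline int fraction_offset_grid_size_sketch(int func_max_blocks_per_sm,  // from the occupancy calculator
                                            int device_sm_per_device,
                                            int numer, int denom,        // the Fraction
                                            int blocks_per_sm_offset)    // BLOCKS_PER_SM_OFFSET
{
  int blocks_per_sm = func_max_blocks_per_sm;
  // apply the fraction, but never drop to zero
  if ((blocks_per_sm * numer) / denom > 0) {
    blocks_per_sm = (blocks_per_sm * numer) / denom;
  }
  // apply the offset, but never drop to zero
  if (blocks_per_sm + blocks_per_sm_offset > 0) {
    blocks_per_sm = blocks_per_sm + blocks_per_sm_offset;
  }
  // e.g. 8 blocks/SM, fraction 1/2, offset -1, 104 SMs -> (4 - 1) * 104 = 312
  return blocks_per_sm * device_sm_per_device;
}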
+ */ +template < typename AvoidMaxOccupancyConcretizer > +struct AvoidDeviceMaxThreadOccupancyConcretizer +{ + template < typename IdxT, typename Data > + static IdxT get_max_grid_size(Data const& data) + { + IdxT device_max_threads_per_sm = data.device_max_threads_per_sm; + IdxT func_max_blocks_per_sm = data.func_max_blocks_per_sm; + IdxT func_threads_per_block = data.func_threads_per_block; + + IdxT func_max_threads_per_sm = func_threads_per_block * func_max_blocks_per_sm; + + if (func_max_threads_per_sm < device_max_threads_per_sm) { + return MaxOccupancyConcretizer::template get_max_grid_size(data); + } else { + return AvoidMaxOccupancyConcretizer::template get_max_grid_size(data); + } + } +}; + + +enum struct reduce_algorithm : int +{ + combine_last_block, + init_device_combine_atomic_block, + init_host_combine_atomic_block +}; + +enum struct block_communication_mode : int +{ + device_fence, + block_fence +}; + +template < reduce_algorithm t_algorithm, block_communication_mode t_comm_mode, + size_t t_replication, size_t t_atomic_stride > +struct ReduceTuning +{ + static constexpr reduce_algorithm algorithm = t_algorithm; + static constexpr block_communication_mode comm_mode = t_comm_mode; + static constexpr size_t replication = t_replication; + static constexpr size_t atomic_stride = t_atomic_stride; +}; + } // namespace hip namespace policy @@ -93,7 +197,8 @@ struct hip_flatten_indexer : public RAJA::make_policy_pattern_launch_platform_t< using IterationGetter = RAJA::hip::IndexFlatten<_IterationGetters...>; }; -template +template struct hip_exec : public RAJA::make_policy_pattern_launch_platform_t< RAJA::Policy::hip, RAJA::Pattern::forall, @@ -101,6 +206,7 @@ struct hip_exec : public RAJA::make_policy_pattern_launch_platform_t< RAJA::Platform::hip> { using IterationMapping = _IterationMapping; using IterationGetter = _IterationGetter; + using LaunchConcretizer = _LaunchConcretizer; }; template @@ -147,8 +253,9 @@ struct unordered_hip_loop_y_block_iter_x_threadblock_average /////////////////////////////////////////////////////////////////////// /// -template -struct hip_reduce_base + +template < typename tuning > +struct hip_reduce_policy : public RAJA:: make_policy_pattern_launch_platform_t; -using hip_reduce = hip_reduce_base; -using hip_reduce_atomic = hip_reduce_base; +template < RAJA::hip::reduce_algorithm algorithm, + RAJA::hip::block_communication_mode comm_mode, + size_t replication = named_usage::unspecified, + size_t atomic_stride = named_usage::unspecified > +using hip_reduce_tuning = hip_reduce_policy< RAJA::hip::ReduceTuning< + algorithm, comm_mode, replication, atomic_stride> >; + +// Policies for RAJA::Reduce* objects with specific behaviors. +// - *atomic* policies may use atomics to combine partial results and falls back +// on a non-atomic policy when atomics can't be used with the given type. The +// use of atomics leads to order of operation differences which change the +// results of floating point sum reductions run to run. The memory used with +// atomics is initialized on the device which can be expensive on some HW. +// On some HW this is faster overall than the non-atomic policies. +// - *atomic_host* policies are similar to the atomic policies above. However +// the memory used with atomics is initialized on the host which is +// significantly cheaper on some HW. On some HW this is faster overall than +// the non-atomic and atomic policies. +// - *device_fence policies use normal memory accesses with device scope fences +// in the implementation. 
This works on all HW. +// - *block_fence policies use special (atomic) memory accesses that only cache +// in a cache shared by the whole device to avoid having to use +// device scope fences. This improves performance on some HW but +// is more difficult to code correctly. +using hip_reduce_device_fence = hip_reduce_tuning< + RAJA::hip::reduce_algorithm::combine_last_block, + RAJA::hip::block_communication_mode::device_fence, + named_usage::unspecified, named_usage::unspecified>; +/// +using hip_reduce_block_fence = hip_reduce_tuning< + RAJA::hip::reduce_algorithm::combine_last_block, + RAJA::hip::block_communication_mode::block_fence, + named_usage::unspecified, named_usage::unspecified>; +/// +using hip_reduce_atomic_device_init_device_fence = hip_reduce_tuning< + RAJA::hip::reduce_algorithm::init_device_combine_atomic_block, + RAJA::hip::block_communication_mode::device_fence, + named_usage::unspecified, named_usage::unspecified>; +/// +using hip_reduce_atomic_device_init_block_fence = hip_reduce_tuning< + RAJA::hip::reduce_algorithm::init_device_combine_atomic_block, + RAJA::hip::block_communication_mode::block_fence, + named_usage::unspecified, named_usage::unspecified>; +/// +using hip_reduce_atomic_host_init_device_fence = hip_reduce_tuning< + RAJA::hip::reduce_algorithm::init_host_combine_atomic_block, + RAJA::hip::block_communication_mode::device_fence, + named_usage::unspecified, named_usage::unspecified>; +/// +using hip_reduce_atomic_host_init_block_fence = hip_reduce_tuning< + RAJA::hip::reduce_algorithm::init_host_combine_atomic_block, + RAJA::hip::block_communication_mode::block_fence, + named_usage::unspecified, named_usage::unspecified>; + +// Policy for RAJA::Reduce* objects that gives the same answer every time when +// used in the same way +using hip_reduce = hip_reduce_block_fence; + +// Policy for RAJA::Reduce* objects that may use atomics and may not give the +// same answer every time when used in the same way +using hip_reduce_atomic = hip_reduce_atomic_host_init_block_fence; + +// Policy for RAJA::Reduce* objects that lets you select the default atomic or +// non-atomic policy with a bool +template < bool with_atomic > +using hip_reduce_base = std::conditional_t; // Policy for RAJA::statement::Reduce that reduces threads in a block @@ -226,6 +397,7 @@ struct hip_thread_masked_loop {}; // Operations in the included files are parametrized using the following // values for HIP warp size and max block size. 
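// Hedged usage sketch for the policy aliases above: hip_reduce_atomic (host
// init + block fence) for the reducer, and hip_exec_with_reduce for the
// kernel, since the latter selects the concretizer tuned for kernels that
// carry reductions. The block size 256, the data, and sum_example itself are
// illustrative; x is assumed to be device-accessible.
#include "RAJA/RAJA.hpp"

inline void sum_example(const double* x, int n, double* result)
{
  RAJA::ReduceSum<RAJA::hip_reduce_atomic, double> sum(0.0);

  RAJA::forall<RAJA::hip_exec_with_reduce<256>>(
      RAJA::TypedRangeSegment<int>(0, n),
      [=] RAJA_DEVICE (int i) { sum += x[i]; });

  *result = sum.get();
}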
// +constexpr const RAJA::Index_type ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE = 64; // 128 on gfx90a #if defined(__HIP_PLATFORM_AMD__) constexpr const RAJA::Index_type WARP_SIZE = 64; #elif defined(__HIP_PLATFORM_NVIDIA__) @@ -816,6 +988,7 @@ struct IndexFlatten }; + // helper to get just the thread indexing part of IndexGlobal template < typename index_global > struct get_index_thread; @@ -876,30 +1049,100 @@ using global_z = IndexGlobal; } // namespace hip +// contretizers used in forall, scan, and sort policies + +using HipAvoidDeviceMaxThreadOccupancyConcretizer = hip::AvoidDeviceMaxThreadOccupancyConcretizer, -1>>; + +template < typename Fraction, std::ptrdiff_t BLOCKS_PER_SM_OFFSET > +using HipFractionOffsetOccupancyConcretizer = hip::FractionOffsetOccupancyConcretizer; + +using HipMaxOccupancyConcretizer = hip::MaxOccupancyConcretizer; + +using HipReduceDefaultConcretizer = HipFractionOffsetOccupancyConcretizer, 0>; + +using HipDefaultConcretizer = HipAvoidDeviceMaxThreadOccupancyConcretizer; + // policies usable with forall, scan, and sort + template using hip_exec_grid = policy::hip::hip_exec< - iteration_mapping::StridedLoop, hip::global_x, Async>; + iteration_mapping::StridedLoop, hip::global_x, + HipDefaultConcretizer, Async>; template using hip_exec_grid_async = policy::hip::hip_exec< - iteration_mapping::StridedLoop, hip::global_x, true>; + iteration_mapping::StridedLoop, hip::global_x, + HipDefaultConcretizer, true>; template using hip_exec = policy::hip::hip_exec< - iteration_mapping::Direct, hip::global_x, Async>; + iteration_mapping::Direct, hip::global_x, + HipDefaultConcretizer, Async>; template using hip_exec_async = policy::hip::hip_exec< - iteration_mapping::Direct, hip::global_x, true>; + iteration_mapping::Direct, hip::global_x, + HipDefaultConcretizer, true>; template using hip_exec_occ_calc = policy::hip::hip_exec< - iteration_mapping::StridedLoop, hip::global_x, Async>; + iteration_mapping::StridedLoop, hip::global_x, + HipDefaultConcretizer, Async>; template using hip_exec_occ_calc_async = policy::hip::hip_exec< - iteration_mapping::StridedLoop, hip::global_x, true>; + iteration_mapping::StridedLoop, hip::global_x, + HipDefaultConcretizer, true>; + +template +using hip_exec_occ_max = policy::hip::hip_exec< + iteration_mapping::StridedLoop, hip::global_x, + HipMaxOccupancyConcretizer, Async>; + +template +using hip_exec_occ_max_async = policy::hip::hip_exec< + iteration_mapping::StridedLoop, hip::global_x, + HipMaxOccupancyConcretizer, true>; + +template +using hip_exec_occ_fraction = policy::hip::hip_exec< + iteration_mapping::StridedLoop, hip::global_x, + HipFractionOffsetOccupancyConcretizer, Async>; + +template +using hip_exec_occ_fraction_async = policy::hip::hip_exec< + iteration_mapping::StridedLoop, hip::global_x, + HipFractionOffsetOccupancyConcretizer, true>; + +template +using hip_exec_occ_custom = policy::hip::hip_exec< + iteration_mapping::StridedLoop, hip::global_x, + Concretizer, Async>; + +template +using hip_exec_occ_custom_async = policy::hip::hip_exec< + iteration_mapping::StridedLoop, hip::global_x, + Concretizer, true>; + +template +using hip_exec_with_reduce = policy::hip::hip_exec< + iteration_mapping::StridedLoop, hip::global_x, + HipReduceDefaultConcretizer, Async>; + +template +using hip_exec_with_reduce_async = policy::hip::hip_exec< + iteration_mapping::StridedLoop, hip::global_x, + HipReduceDefaultConcretizer, true>; + +template +using hip_exec_base = std::conditional_t, + hip_exec>; + +template +using hip_exec_base_async = 
std::conditional_t, + hip_exec_async>; // policies usable with WorkGroup using policy::hip::hip_work; @@ -914,6 +1157,12 @@ using policy::hip::hip_atomic; using policy::hip::hip_atomic_explicit; // policies usable with reducers +using policy::hip::hip_reduce_device_fence; +using policy::hip::hip_reduce_block_fence; +using policy::hip::hip_reduce_atomic_device_init_device_fence; +using policy::hip::hip_reduce_atomic_device_init_block_fence; +using policy::hip::hip_reduce_atomic_host_init_device_fence; +using policy::hip::hip_reduce_atomic_host_init_block_fence; using policy::hip::hip_reduce_base; using policy::hip::hip_reduce; using policy::hip::hip_reduce_atomic; @@ -927,7 +1176,7 @@ using hip_warp_direct = RAJA::policy::hip::hip_indexer< kernel_sync_requirement::none, hip::thread_x>; using hip_warp_loop = RAJA::policy::hip::hip_indexer< - iteration_mapping::StridedLoop, + iteration_mapping::StridedLoop, kernel_sync_requirement::none, hip::thread_x>; @@ -953,13 +1202,13 @@ using hip_indexer_direct = policy::hip::hip_indexer< template < typename ... indexers > using hip_indexer_loop = policy::hip::hip_indexer< - iteration_mapping::StridedLoop, + iteration_mapping::StridedLoop, kernel_sync_requirement::none, indexers...>; template < typename ... indexers > using hip_indexer_syncable_loop = policy::hip::hip_indexer< - iteration_mapping::StridedLoop, + iteration_mapping::StridedLoop, kernel_sync_requirement::sync, indexers...>; @@ -971,7 +1220,7 @@ using hip_flatten_indexer_direct = policy::hip::hip_flatten_indexer< template < typename ... indexers > using hip_flatten_indexer_loop = policy::hip::hip_flatten_indexer< - iteration_mapping::StridedLoop, + iteration_mapping::StridedLoop, kernel_sync_requirement::none, indexers...>; diff --git a/include/RAJA/policy/hip/reduce.hpp b/include/RAJA/policy/hip/reduce.hpp index df47616cb6..2dbaf9f7e5 100644 --- a/include/RAJA/policy/hip/reduce.hpp +++ b/include/RAJA/policy/hip/reduce.hpp @@ -35,11 +35,13 @@ #include "RAJA/util/basic_mempool.hpp" #include "RAJA/util/mutex.hpp" #include "RAJA/util/types.hpp" +#include "RAJA/util/reduce.hpp" #include "RAJA/pattern/detail/reduce.hpp" #include "RAJA/pattern/reduce.hpp" #include "RAJA/policy/hip/MemUtils_HIP.hpp" +#include "RAJA/policy/hip/intrinsics.hpp" #include "RAJA/policy/hip/atomic.hpp" #include "RAJA/policy/hip/policy.hpp" #include "RAJA/policy/hip/raja_hiperrchk.hpp" @@ -52,6 +54,7 @@ namespace reduce namespace hip { + //! atomic operator version of Combiner object template struct atomic; @@ -60,7 +63,7 @@ template struct atomic> { RAJA_DEVICE RAJA_INLINE void operator()(T& val, const T v) { - RAJA::atomicAdd(RAJA::hip_atomic{}, &val, v); + RAJA::atomicAdd(RAJA::hip_atomic{}, &val, v); } }; @@ -68,7 +71,7 @@ template struct atomic> { RAJA_DEVICE RAJA_INLINE void operator()(T& val, const T v) { - RAJA::atomicMin(RAJA::hip_atomic{}, &val, v); + RAJA::atomicMin(RAJA::hip_atomic{}, &val, v); } }; @@ -76,7 +79,23 @@ template struct atomic> { RAJA_DEVICE RAJA_INLINE void operator()(T& val, const T v) { - RAJA::atomicMax(RAJA::hip_atomic{}, &val, v); + RAJA::atomicMax(RAJA::hip_atomic{}, &val, v); + } +}; + +template +struct atomic> { + RAJA_DEVICE RAJA_INLINE void operator()(T& val, const T v) + { + RAJA::atomicAnd(RAJA::hip_atomic{}, &val, v); + } +}; + +template +struct atomic> { + RAJA_DEVICE RAJA_INLINE void operator()(T& val, const T v) + { + RAJA::atomicOr(RAJA::hip_atomic{}, &val, v); } }; @@ -97,295 +116,69 @@ namespace hip namespace impl { -/*! 
- * \brief Abstracts T into an equal or greater size array of integers whose - * size is between min_integer_type_size and max_interger_type_size inclusive. - */ -template -union AsIntegerArray { - - static_assert(min_integer_type_size <= max_integer_type_size, - "incompatible min and max integer type size"); - using integer_type = typename std::conditional< - ((alignof(T) >= alignof(long long) && - sizeof(long long) <= max_integer_type_size) || - sizeof(long) < min_integer_type_size), - long long, - typename std::conditional< - ((alignof(T) >= alignof(long) && - sizeof(long) <= max_integer_type_size) || - sizeof(int) < min_integer_type_size), - long, - typename std::conditional< - ((alignof(T) >= alignof(int) && - sizeof(int) <= max_integer_type_size) || - sizeof(short) < min_integer_type_size), - int, - typename std::conditional< - ((alignof(T) >= alignof(short) && - sizeof(short) <= max_integer_type_size) || - sizeof(char) < min_integer_type_size), - short, - typename std::conditional< - ((alignof(T) >= alignof(char) && - sizeof(char) <= max_integer_type_size)), - char, - void>::type>::type>::type>::type>::type; - static_assert(!std::is_same::value, - "could not find a compatible integer type"); - static_assert(sizeof(integer_type) >= min_integer_type_size, - "integer_type smaller than min integer type size"); - static_assert(sizeof(integer_type) <= max_integer_type_size, - "integer_type greater than max integer type size"); - - constexpr static size_t num_integer_type = - (sizeof(T) + sizeof(integer_type) - 1) / sizeof(integer_type); - - T value; - integer_type array[num_integer_type]; - - RAJA_HOST_DEVICE constexpr AsIntegerArray(T value_) : value(value_){}; - - RAJA_HOST_DEVICE constexpr size_t array_size() const - { - return num_integer_type; - } -}; - -// hip only has shfl primitives for 32 bits -constexpr const size_t min_shfl_int_type_size = sizeof(int); -constexpr const size_t max_shfl_int_type_size = sizeof(int); - -/*! - ****************************************************************************** - * - * \brief Method to shuffle 32b registers in sum reduction for arbitrary type. - * - * \Note Returns an undefined value if src lane is inactive (divergence). - * Returns this lane's value if src lane is out of bounds or has exited. - * - ****************************************************************************** - */ -template -RAJA_DEVICE RAJA_INLINE T shfl_xor_sync(T var, int laneMask) -{ - AsIntegerArray u(var); - - for (size_t i = 0; i < u.array_size(); ++i) { - u.array[i] = ::__shfl_xor(u.array[i], laneMask); - } - return u.value; -} - -template -RAJA_DEVICE RAJA_INLINE T shfl_sync(T var, int srcLane) -{ - AsIntegerArray u(var); - - for (size_t i = 0; i < u.array_size(); ++i) { - u.array[i] = ::__shfl(u.array[i], srcLane); - } - return u.value; -} - - -template <> -RAJA_DEVICE RAJA_INLINE int shfl_xor_sync(int var, int laneMask) -{ - return ::__shfl_xor(var, laneMask); -} - -template <> -RAJA_DEVICE RAJA_INLINE float shfl_xor_sync(float var, int laneMask) -{ - return ::__shfl_xor(var, laneMask); -} - -template <> -RAJA_DEVICE RAJA_INLINE int shfl_sync(int var, int srcLane) -{ - return ::__shfl(var, srcLane); -} - -template <> -RAJA_DEVICE RAJA_INLINE float shfl_sync(float var, int srcLane) -{ - return ::__shfl(var, srcLane); -} - - -//! reduce values in block into thread 0 -template -RAJA_DEVICE RAJA_INLINE T warp_reduce(T val, T RAJA_UNUSED_ARG(identity)) +//! 
reduce values in grid into thread 0 of last running block +// returns true if put reduced value in val +template +RAJA_DEVICE RAJA_INLINE int grid_reduce_last_block(T& val, + T identity, + TempIterator in_device_mem, + unsigned int* device_count) { - int numThreads = blockDim.x * blockDim.y * blockDim.z; + typename TempIterator::template rebind_accessor device_mem(in_device_mem); int threadId = threadIdx.x + blockDim.x * threadIdx.y + (blockDim.x * blockDim.y) * threadIdx.z; - - T temp = val; - - if (numThreads % policy::hip::WARP_SIZE == 0) { - - // reduce each warp - for (int i = 1; i < policy::hip::WARP_SIZE; i *= 2) { - T rhs = shfl_xor_sync(temp, i); - Combiner{}(temp, rhs); - } - - } else { - - // reduce each warp - for (int i = 1; i < policy::hip::WARP_SIZE; i *= 2) { - int srcLane = threadId ^ i; - T rhs = shfl_sync(temp, srcLane); - // only add from threads that exist (don't double count own value) - if (srcLane < numThreads) { - Combiner{}(temp, rhs); - } - } - } - - return temp; -} - -/*! - * Allreduce values in a warp. - * - * - * This does a butterfly pattern leaving each lane with the full reduction - * - */ -template -RAJA_DEVICE RAJA_INLINE T warp_allreduce(T val) -{ - T temp = val; - - for (int i = 1; i < policy::hip::WARP_SIZE; i *= 2) { - T rhs = shfl_xor_sync(temp, i); - Combiner{}(temp, rhs); - } - - return temp; -} - - -//! reduce values in block into thread 0 -template -RAJA_DEVICE RAJA_INLINE T block_reduce(T val, T identity) -{ int numThreads = blockDim.x * blockDim.y * blockDim.z; - int threadId = threadIdx.x + blockDim.x * threadIdx.y + - (blockDim.x * blockDim.y) * threadIdx.z; - - int warpId = threadId % policy::hip::WARP_SIZE; - int warpNum = threadId / policy::hip::WARP_SIZE; - - T temp = val; - - if (numThreads % policy::hip::WARP_SIZE == 0) { - - // reduce each warp - for (int i = 1; i < policy::hip::WARP_SIZE; i *= 2) { - T rhs = shfl_xor_sync(temp, i); - Combiner{}(temp, rhs); - } - - } else { - - // reduce each warp - for (int i = 1; i < policy::hip::WARP_SIZE; i *= 2) { - int srcLane = threadId ^ i; - T rhs = shfl_sync(temp, srcLane); - // only add from threads that exist (don't double count own value) - if (srcLane < numThreads) { - Combiner{}(temp, rhs); - } - } - } - - // reduce per warp values - if (numThreads > policy::hip::WARP_SIZE) { - - static_assert(policy::hip::MAX_WARPS <= policy::hip::WARP_SIZE, - "Max Warps must be less than or equal to Warp Size for this algorithm to work"); - - __shared__ unsigned char tmpsd[sizeof(RAJA::detail::SoAArray)]; - RAJA::detail::SoAArray* sd = - reinterpret_cast *>(tmpsd); + int blockId = blockIdx.x + gridDim.x * blockIdx.y + + (gridDim.x * gridDim.y) * blockIdx.z; + int numBlocks = gridDim.x * gridDim.y * gridDim.z; - // write per warp values to shared memory - if (warpId == 0) { - sd->set(warpNum, temp); - } + int replicationId = blockId % replication; + int slotId = blockId / replication; - __syncthreads(); + int maxNumSlots = (numBlocks + replication - 1) / replication; + unsigned int numSlots = (numBlocks / replication) + + ((replicationId < (numBlocks % replication)) ? 
1 : 0); - if (warpNum == 0) { + int atomicOffset = replicationId * atomic_stride; + int beginSlots = replicationId * maxNumSlots; + int blockSlot = beginSlots + slotId; - // read per warp values - if (warpId * policy::hip::WARP_SIZE < numThreads) { - temp = sd->get(warpId); - } else { - temp = identity; - } + T temp = block_reduce(val, identity); - for (int i = 1; i < policy::hip::MAX_WARPS; i *= 2) { - T rhs = shfl_xor_sync(temp, i); - Combiner{}(temp, rhs); - } + if (numSlots <= 1u) { + if (threadId == 0) { + val = temp; } - - __syncthreads(); + return (threadId == 0) ? replicationId : replication; } - return temp; -} - - -//! reduce values in grid into thread 0 of last running block -// returns true if put reduced value in val -template -RAJA_DEVICE RAJA_INLINE bool grid_reduce(T& val, - T identity, - TempIterator device_mem, - unsigned int* device_count) -{ - int numBlocks = gridDim.x * gridDim.y * gridDim.z; - int numThreads = blockDim.x * blockDim.y * blockDim.z; - unsigned int wrap_around = numBlocks - 1; - - int blockId = blockIdx.x + gridDim.x * blockIdx.y + - (gridDim.x * gridDim.y) * blockIdx.z; - - int threadId = threadIdx.x + blockDim.x * threadIdx.y + - (blockDim.x * blockDim.y) * threadIdx.z; - - T temp = block_reduce(val, identity); - // one thread per block writes to device_mem - __shared__ bool lastBlock; + __shared__ bool isLastBlock; if (threadId == 0) { - device_mem.set(blockId, temp); + device_mem.set(blockSlot, temp); // ensure write visible to all threadblocks - __threadfence(); - // increment counter, (wraps back to zero if old count == wrap_around) - unsigned int old_count = ::atomicInc(device_count, wrap_around); - lastBlock = (old_count == wrap_around) ? 1: 0; + Accessor::fence_release(); + // increment counter, (wraps back to zero if old count == (numSlots-1)) + unsigned int old_count = ::atomicInc(&device_count[atomicOffset], (numSlots-1)); + isLastBlock = (old_count == (numSlots-1)); } // returns non-zero value if any thread passes in a non-zero value __syncthreads(); // last block accumulates values from device_mem - if (lastBlock) { + if (isLastBlock) { temp = identity; + Accessor::fence_acquire(); - for (int i = threadId; i < numBlocks; i += numThreads) { - Combiner{}(temp, device_mem.get(i)); + for (unsigned int i = threadId; + i < numSlots; + i += numThreads) { + Combiner{}(temp, device_mem.get(beginSlots + i)); } temp = block_reduce(temp, identity); @@ -396,7 +189,7 @@ RAJA_DEVICE RAJA_INLINE bool grid_reduce(T& val, } } - return lastBlock && threadId == 0; + return (isLastBlock && threadId == 0) ? replicationId : replication; } namespace expt { @@ -507,6 +300,7 @@ RAJA_DEVICE RAJA_INLINE void grid_reduce(RAJA::expt::detail::Reducer& red // last block accumulates values from device_mem if (lastBlock) { temp = OP::identity(); + __threadfence(); for (int i = threadId; i < numBlocks; i += numThreads) { temp = OP{}(temp, red.device_mem.get(i)); @@ -526,64 +320,106 @@ RAJA_DEVICE RAJA_INLINE void grid_reduce(RAJA::expt::detail::Reducer& red //! 
reduce values in grid into thread 0 of last running block // returns true if put reduced value in val -template -RAJA_DEVICE RAJA_INLINE bool grid_reduce_atomic(T& val, - T identity, - T* device_mem, - unsigned int* device_count) +template +RAJA_DEVICE RAJA_INLINE int grid_reduce_atomic_device_init(T& val, + T identity, + T* device_mem, + unsigned int* device_count) { - int numBlocks = gridDim.x * gridDim.y * gridDim.z; - unsigned int wrap_around = numBlocks + 1; - int threadId = threadIdx.x + blockDim.x * threadIdx.y + (blockDim.x * blockDim.y) * threadIdx.z; - // one thread in first block initializes device_mem + int blockId = blockIdx.x + gridDim.x * blockIdx.y + + (gridDim.x * gridDim.y) * blockIdx.z; + int numBlocks = gridDim.x * gridDim.y * gridDim.z; + + int replicationId = (blockId%replication); + int atomicOffset = replicationId*atomic_stride; + + unsigned int numSlots = (numBlocks / replication) + + ((replicationId < (numBlocks % replication)) ? 1 : 0); + + if (numSlots <= 1u) { + T temp = block_reduce(val, identity); + if (threadId == 0) { + val = temp; + } + return (threadId == 0) ? replicationId : replication; + } + + // the first block of each replication initializes device_mem if (threadId == 0) { - unsigned int old_val = ::atomicCAS(device_count, 0u, 1u); + unsigned int old_val = ::atomicCAS(&device_count[atomicOffset], 0u, 1u); if (old_val == 0u) { - device_mem[0] = identity; - __threadfence(); - ::atomicAdd(device_count, 1u); + Accessor::set(device_mem, atomicOffset, identity); + Accessor::fence_release(); + ::atomicAdd(&device_count[atomicOffset], 1u); } } T temp = block_reduce(val, identity); - // one thread per block performs atomic on device_mem - bool lastBlock = false; + // one thread per block performs an atomic on device_mem + bool isLastBlock = false; if (threadId == 0) { - // thread waits for device_mem to be initialized - while (static_cast(device_count)[0] < 2u) + // wait for device_mem to be initialized + while (::atomicAdd(&device_count[atomicOffset], 0u) < 2u) ; - __threadfence(); - RAJA::reduce::hip::atomic{}(device_mem[0], temp); - __threadfence(); - // increment counter, (wraps back to zero if old count == wrap_around) - unsigned int old_count = ::atomicInc(device_count, wrap_around); - lastBlock = (old_count == wrap_around); - - // last block gets value from device_mem - if (lastBlock) { - val = device_mem[0]; + Accessor::fence_acquire(); + RAJA::reduce::hip::atomic{}(device_mem[atomicOffset], temp); + Accessor::fence_release(); + // increment counter, (wraps back to zero if old count == (numSlots+1)) + unsigned int old_count = ::atomicInc(&device_count[atomicOffset], (numSlots+1)); + isLastBlock = (old_count == (numSlots+1)); + + // the last block for each replication gets the value from device_mem + if (isLastBlock) { + Accessor::fence_acquire(); + val = Accessor::get(device_mem, atomicOffset); } } - return lastBlock; + return isLastBlock ? replicationId : replication; +} + +//! 
reduce values in block into thread 0 and atomically combines into device_mem +template +RAJA_DEVICE RAJA_INLINE void grid_reduce_atomic_host_init(T& val, + T identity, + T* device_mem) +{ + int threadId = threadIdx.x + blockDim.x * threadIdx.y + + (blockDim.x * blockDim.y) * threadIdx.z; + + int blockId = blockIdx.x + gridDim.x * blockIdx.y + + (gridDim.x * gridDim.y) * blockIdx.z; + + int replicationId = (blockId%replication); + int atomicOffset = replicationId*atomic_stride; + + T temp = block_reduce(val, identity); + + // one thread per block performs an atomic on device_mem + if (threadId == 0 && temp != identity) { + RAJA::reduce::hip::atomic{}(device_mem[atomicOffset], temp); + } + } } // namespace impl //! Object that manages pinned memory buffers for reduction results // use one per reducer object -template +template class PinnedTally { public: //! Object put in Pinned memory with value and pointer to next Node struct Node { Node* next; - T value; + T values[num_slots]; }; //! Object per resource to keep track of pinned memory nodes struct ResourceNode { @@ -658,7 +494,7 @@ class PinnedTally return ret; } - T& operator*() { return m_n->value; } + auto operator*() -> T(&)[num_slots] { return m_n->values; } bool operator==(const ResourceNodeIterator& rhs) const { @@ -695,7 +531,7 @@ class PinnedTally ResourceNodeIterator end() { return {nullptr, nullptr}; } //! get new value for use in resource - T* new_value(::RAJA::resources::Hip res) + auto new_value(::RAJA::resources::Hip res) -> T(&)[num_slots] { #if defined(RAJA_ENABLE_OPENMP) lock_guard lock(m_mutex); @@ -712,10 +548,10 @@ class PinnedTally rn->node_list = nullptr; resource_list = rn; } - Node* n = hip::pinned_mempool_type::getInstance().template malloc(1); + Node* n = mempool::getInstance().template malloc(1); n->next = rn->node_list; rn->node_list = n; - return &n->value; + return n->values; } //! synchronize all resources used @@ -735,7 +571,7 @@ class PinnedTally while (rn->node_list) { Node* n = rn->node_list; rn->node_list = n->next; - hip::pinned_mempool_type::getInstance().free(n); + mempool::getInstance().free(n); } resource_list = rn->next; free(rn); @@ -762,23 +598,30 @@ class PinnedTally //! Reduction data for Hip Offload -- stores value, host pointer, and device //! pointer -template -struct Reduce_Data { +template +struct ReduceLastBlock_Data +{ + using tally_mempool_type = pinned_mempool_type; + using data_mempool_type = device_mempool_type; + using count_mempool_type = device_zeroed_mempool_type; + + static constexpr size_t tally_slots = replication; mutable T value; T identity; unsigned int* device_count; - RAJA::detail::SoAPtr device; + RAJA::detail::SoAPtr device; bool own_device_ptr; - Reduce_Data() : Reduce_Data(T(), T()){}; + ReduceLastBlock_Data() : ReduceLastBlock_Data(T(), T()){}; /*! \brief create from a default value and offload information * * allocates PinnedTally to hold device values */ - Reduce_Data(T initValue, T identity_) + ReduceLastBlock_Data(T initValue, T identity_) : value{initValue}, identity{identity_}, device_count{nullptr}, @@ -788,7 +631,7 @@ struct Reduce_Data { } RAJA_HOST_DEVICE - Reduce_Data(const Reduce_Data& other) + ReduceLastBlock_Data(const ReduceLastBlock_Data& other) : value{other.identity}, identity{other.identity}, device_count{other.device_count}, @@ -797,20 +640,28 @@ struct Reduce_Data { { } - Reduce_Data& operator=(const Reduce_Data&) = default; + ReduceLastBlock_Data& operator=(const ReduceLastBlock_Data&) = default; //! 
initialize output to identity to ensure never read // uninitialized memory - void init_grid_val(T* output) { *output = identity; } + T* init_grid_vals(T(&output)[tally_slots]) + { + for (size_t r = 0; r < tally_slots; ++r) { + output[r] = identity; + } + return &output[0]; + } //! reduce values in grid to single value, store in output RAJA_DEVICE void grid_reduce(T* output) { T temp = value; - - if (impl::grid_reduce(temp, identity, device, device_count)) { - *output = temp; + size_t replicationId = impl::grid_reduce_last_block< + Combiner, Accessor, replication, atomic_stride>( + temp, identity, device, device_count); + if (replicationId != replication) { + output[replicationId] = temp; } } @@ -822,9 +673,10 @@ struct Reduce_Data { if (act) { hip_dim_t gridDim = currentGridDim(); size_t numBlocks = gridDim.x * gridDim.y * gridDim.z; - device.allocate(numBlocks); - device_count = device_zeroed_mempool_type::getInstance() - .template malloc(1); + size_t maxNumSlots = (numBlocks + replication - 1) / replication; + device.allocate(maxNumSlots*replication); + device_count = count_mempool_type::getInstance() + .template malloc(replication*atomic_stride); own_device_ptr = true; } return act; @@ -837,7 +689,7 @@ struct Reduce_Data { bool act = own_device_ptr; if (act) { device.deallocate(); - device_zeroed_mempool_type::getInstance().free(device_count); + count_mempool_type::getInstance().free(device_count); device_count = nullptr; own_device_ptr = false; } @@ -847,8 +699,95 @@ struct Reduce_Data { //! Reduction data for Hip Offload -- stores value, host pointer -template -struct ReduceAtomic_Data { +template +struct ReduceAtomicHostInit_Data +{ + using tally_mempool_type = device_pinned_mempool_type; + + static constexpr size_t tally_slots = replication * atomic_stride; + + mutable T value; + T identity; + bool is_setup; + bool own_device_ptr; + + ReduceAtomicHostInit_Data() : ReduceAtomicHostInit_Data(T(), T()){} + + ReduceAtomicHostInit_Data(T initValue, T identity_) + : value{initValue}, + identity{identity_}, + is_setup{false}, + own_device_ptr{false} + { + } + + RAJA_HOST_DEVICE + ReduceAtomicHostInit_Data(const ReduceAtomicHostInit_Data& other) + : value{other.identity}, + identity{other.identity}, + is_setup{other.is_setup}, + own_device_ptr{false} + { + } + + ReduceAtomicHostInit_Data& operator=(const ReduceAtomicHostInit_Data&) = default; + + //! initialize output to identity to ensure never read + // uninitialized memory + T* init_grid_vals(T(&output)[tally_slots]) + { + for (size_t r = 0; r < tally_slots; ++r) { + output[r] = identity; + } + return &output[0]; + } + + //! reduce values in grid to single value, store in output + RAJA_DEVICE + void grid_reduce(T* output) + { + T temp = value; + + impl::grid_reduce_atomic_host_init( + temp, identity, output); + } + + //! check and setup for device + // allocate device pointers and get a new result buffer from the pinned tally + bool setupForDevice() + { + bool act = !is_setup && setupReducers(); + if (act) { + is_setup = true; + own_device_ptr = true; + } + return act; + } + + //! if own resources teardown device setup + // free device pointers + bool teardownForDevice() + { + bool act = own_device_ptr; + if (act) { + is_setup = false; + own_device_ptr = false; + } + return act; + } +}; + +//! 
Reduction data for Hip Offload -- stores value, host pointer +template +struct ReduceAtomicDeviceInit_Data +{ + using tally_mempool_type = pinned_mempool_type; + using data_mempool_type = device_mempool_type; + using count_mempool_type = device_zeroed_mempool_type; + + static constexpr size_t tally_slots = replication; mutable T value; T identity; @@ -856,9 +795,9 @@ struct ReduceAtomic_Data { T* device; bool own_device_ptr; - ReduceAtomic_Data() : ReduceAtomic_Data(T(), T()){}; + ReduceAtomicDeviceInit_Data() : ReduceAtomicDeviceInit_Data(T(), T()){} - ReduceAtomic_Data(T initValue, T identity_) + ReduceAtomicDeviceInit_Data(T initValue, T identity_) : value{initValue}, identity{identity_}, device_count{nullptr}, @@ -868,7 +807,7 @@ struct ReduceAtomic_Data { } RAJA_HOST_DEVICE - ReduceAtomic_Data(const ReduceAtomic_Data& other) + ReduceAtomicDeviceInit_Data(const ReduceAtomicDeviceInit_Data& other) : value{other.identity}, identity{other.identity}, device_count{other.device_count}, @@ -877,11 +816,17 @@ struct ReduceAtomic_Data { { } - ReduceAtomic_Data& operator=(const ReduceAtomic_Data&) = default; + ReduceAtomicDeviceInit_Data& operator=(const ReduceAtomicDeviceInit_Data&) = default; //! initialize output to identity to ensure never read // uninitialized memory - void init_grid_val(T* output) { *output = identity; } + T* init_grid_vals(T(&output)[tally_slots]) + { + for (size_t r = 0; r < tally_slots; ++r) { + output[r] = identity; + } + return &output[0]; + } //! reduce values in grid to single value, store in output RAJA_DEVICE @@ -889,9 +834,11 @@ struct ReduceAtomic_Data { { T temp = value; - if (impl::grid_reduce_atomic( - temp, identity, device, device_count)) { - *output = temp; + size_t replicationId = impl::grid_reduce_atomic_device_init< + Combiner, Accessor, replication, atomic_stride>( + temp, identity, device, device_count); + if (replicationId != replication) { + output[replicationId] = temp; } } @@ -901,9 +848,9 @@ struct ReduceAtomic_Data { { bool act = !device && setupReducers(); if (act) { - device = device_mempool_type::getInstance().template malloc(1); - device_count = device_zeroed_mempool_type::getInstance() - .template malloc(1); + device = data_mempool_type::getInstance().template malloc(replication*atomic_stride); + device_count = count_mempool_type::getInstance() + .template malloc(replication*atomic_stride); own_device_ptr = true; } return act; @@ -915,9 +862,9 @@ struct ReduceAtomic_Data { { bool act = own_device_ptr; if (act) { - device_mempool_type::getInstance().free(device); + data_mempool_type::getInstance().free(device); device = nullptr; - device_zeroed_mempool_type::getInstance().free(device_count); + count_mempool_type::getInstance().free(device_count); device_count = nullptr; own_device_ptr = false; } @@ -925,10 +872,56 @@ struct ReduceAtomic_Data { } }; + //! Hip Reduction entity -- generalize on reduction, and type -template +template class Reduce { + static constexpr size_t replication = (tuning::replication > 0) + ? tuning::replication + : 32; + static constexpr size_t atomic_stride = (tuning::atomic_stride > 0) + ? tuning::atomic_stride + : ((policy::hip::ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE > sizeof(T)) + ? 
RAJA_DIVIDE_CEILING_INT(policy::hip::ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE, sizeof(T)) + : 1); + + using Accessor = std::conditional_t<(tuning::comm_mode == block_communication_mode::block_fence), + impl::AccessorDeviceScopeUseBlockFence, + std::conditional_t<(tuning::comm_mode == block_communication_mode::device_fence), + impl::AccessorDeviceScopeUseDeviceFence, + void>>; + + static constexpr bool atomic_policy = + (tuning::algorithm == reduce_algorithm::init_device_combine_atomic_block) || + (tuning::algorithm == reduce_algorithm::init_host_combine_atomic_block); + static constexpr bool atomic_available = RAJA::reduce::hip::hip_atomic_available::value; + + //! hip reduction data storage class and folding algorithm + using reduce_data_type = std::conditional_t<(tuning::algorithm == reduce_algorithm::combine_last_block) || + (atomic_policy && !atomic_available), + hip::ReduceLastBlock_Data, + std::conditional_t, + std::conditional_t<(tuning::algorithm == reduce_algorithm::init_host_combine_atomic_block), + hip::ReduceAtomicHostInit_Data, + void>>, + void>>; + + static constexpr size_t tally_slots = reduce_data_type::tally_slots; + + using TallyType = PinnedTally; + + //! union to hold either pointer to PinnedTally or pointer to value + // only use list before setup for device and only use val_ptr after + union tally_u { + TallyType* list; + T* val_ptr; + constexpr tally_u(TallyType* l) : list(l){}; + constexpr tally_u(T* v_ptr) : val_ptr(v_ptr){}; + }; + public: Reduce() : Reduce(T(), Combiner::identity()) {} @@ -936,7 +929,7 @@ class Reduce // the original object's parent is itself explicit Reduce(T init_val, T identity_ = Combiner::identity()) : parent{this}, - tally_or_val_ptr{new PinnedTally}, + tally_or_val_ptr{new TallyType}, val(init_val, identity_) { } @@ -963,9 +956,8 @@ class Reduce #if !defined(RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE) if (parent) { if (val.setupForDevice()) { - tally_or_val_ptr.val_ptr = - tally_or_val_ptr.list->new_value(currentResource()); - val.init_grid_val(tally_or_val_ptr.val_ptr); + tally_or_val_ptr.val_ptr = val.init_grid_vals( + tally_or_val_ptr.list->new_value(currentResource())); parent = nullptr; } } @@ -1009,9 +1001,15 @@ class Reduce auto end = tally_or_val_ptr.list->end(); if (n != end) { tally_or_val_ptr.list->synchronize_resources(); + ::RAJA::detail::HighAccuracyReduce + reducer(std::move(val.value)); for (; n != end; ++n) { - Combiner{}(val.value, *n); + T(&values)[tally_slots] = *n; + for (size_t r = 0; r < tally_slots; ++r) { + reducer.combine(std::move(values[r])); + } } + val.value = reducer.get_and_clear(); tally_or_val_ptr.list->free_list(); } return val.value; @@ -1032,38 +1030,20 @@ class Reduce private: const Reduce* parent; - - //! union to hold either pointer to PinnedTally or poiter to value - // only use list before setup for device and only use val_ptr after - union tally_u { - PinnedTally* list; - T* val_ptr; - constexpr tally_u(PinnedTally* l) : list(l){}; - constexpr tally_u(T* v_ptr) : val_ptr(v_ptr){}; - }; - tally_u tally_or_val_ptr; - - //! hip reduction data storage class and folding algorithm - using reduce_data_type = typename std::conditional< - maybe_atomic && RAJA::reduce::hip::hip_atomic_available::value, - hip::ReduceAtomic_Data, - hip::Reduce_Data>::type; - - //! storage for reduction data reduce_data_type val; }; } // end namespace hip //! 
specialization of ReduceSum for hip_reduce -template -class ReduceSum, T> - : public hip::Reduce, T, maybe_atomic> +template +class ReduceSum, T> + : public hip::Reduce, T, tuning> { public: - using Base = hip::Reduce, T, maybe_atomic>; + using Base = hip::Reduce, T, tuning>; using Base::Base; //! enable operator+= for ReduceSum -- alias for combine() RAJA_HOST_DEVICE @@ -1075,13 +1055,13 @@ class ReduceSum, T> }; //! specialization of ReduceBitOr for hip_reduce -template -class ReduceBitOr, T> - : public hip::Reduce, T, maybe_atomic> +template +class ReduceBitOr, T> + : public hip::Reduce, T, tuning> { public: - using Base = hip::Reduce, T, maybe_atomic>; + using Base = hip::Reduce, T, tuning>; using Base::Base; //! enable operator|= for ReduceBitOr -- alias for combine() RAJA_HOST_DEVICE @@ -1093,13 +1073,13 @@ class ReduceBitOr, T> }; //! specialization of ReduceBitAnd for hip_reduce -template -class ReduceBitAnd, T> - : public hip::Reduce, T, maybe_atomic> +template +class ReduceBitAnd, T> + : public hip::Reduce, T, tuning> { public: - using Base = hip::Reduce, T, maybe_atomic>; + using Base = hip::Reduce, T, tuning>; using Base::Base; //! enable operator&= for ReduceBitAnd -- alias for combine() RAJA_HOST_DEVICE @@ -1111,13 +1091,13 @@ class ReduceBitAnd, T> }; //! specialization of ReduceMin for hip_reduce -template -class ReduceMin, T> - : public hip::Reduce, T, maybe_atomic> +template +class ReduceMin, T> + : public hip::Reduce, T, tuning> { public: - using Base = hip::Reduce, T, maybe_atomic>; + using Base = hip::Reduce, T, tuning>; using Base::Base; //! enable min() for ReduceMin -- alias for combine() RAJA_HOST_DEVICE @@ -1129,13 +1109,13 @@ class ReduceMin, T> }; //! specialization of ReduceMax for hip_reduce -template -class ReduceMax, T> - : public hip::Reduce, T, maybe_atomic> +template +class ReduceMax, T> + : public hip::Reduce, T, tuning> { public: - using Base = hip::Reduce, T, maybe_atomic>; + using Base = hip::Reduce, T, tuning>; using Base::Base; //! enable max() for ReduceMax -- alias for combine() RAJA_HOST_DEVICE @@ -1147,18 +1127,18 @@ class ReduceMax, T> }; //! specialization of ReduceMinLoc for hip_reduce -template -class ReduceMinLoc, T, IndexType> +template +class ReduceMinLoc, T, IndexType> : public hip::Reduce>, RAJA::reduce::detail::ValueLoc, - maybe_atomic> + tuning> { public: using value_type = RAJA::reduce::detail::ValueLoc; using Combiner = RAJA::reduce::min; using NonLocCombiner = RAJA::reduce::min; - using Base = hip::Reduce; + using Base = hip::Reduce; using Base::Base; //! constructor requires a default value for the reducer @@ -1197,18 +1177,18 @@ class ReduceMinLoc, T, IndexType> }; //! specialization of ReduceMaxLoc for hip_reduce -template -class ReduceMaxLoc, T, IndexType> +template +class ReduceMaxLoc, T, IndexType> : public hip:: Reduce>, RAJA::reduce::detail::ValueLoc, - maybe_atomic> + tuning> { public: using value_type = RAJA::reduce::detail::ValueLoc; using Combiner = RAJA::reduce::max; using NonLocCombiner = RAJA::reduce::max; - using Base = hip::Reduce; + using Base = hip::Reduce; using Base::Base; //! 
constructor requires a default value for the reducer diff --git a/include/RAJA/policy/hip/scan.hpp b/include/RAJA/policy/hip/scan.hpp index 40e44c2e19..cdf0a9b82d 100644 --- a/include/RAJA/policy/hip/scan.hpp +++ b/include/RAJA/policy/hip/scan.hpp @@ -49,6 +49,7 @@ namespace scan */ template @@ -56,7 +57,7 @@ RAJA_INLINE resources::EventProxy inclusive_inplace( resources::Hip hip_res, - ::RAJA::policy::hip::hip_exec, + ::RAJA::policy::hip::hip_exec, InputIter begin, InputIter end, Function binary_op) @@ -121,6 +122,7 @@ inclusive_inplace( */ template exclusive_inplace( resources::Hip hip_res, - ::RAJA::policy::hip::hip_exec, + ::RAJA::policy::hip::hip_exec, InputIter begin, InputIter end, Function binary_op, @@ -198,6 +200,7 @@ exclusive_inplace( */ template inclusive( resources::Hip hip_res, - ::RAJA::policy::hip::hip_exec, + ::RAJA::policy::hip::hip_exec, InputIter begin, InputIter end, OutputIter out, @@ -271,6 +274,7 @@ inclusive( */ template exclusive( resources::Hip hip_res, - ::RAJA::policy::hip::hip_exec, + ::RAJA::policy::hip::hip_exec, InputIter begin, InputIter end, OutputIter out, diff --git a/include/RAJA/policy/hip/sort.hpp b/include/RAJA/policy/hip/sort.hpp index a6918968c8..eb16246623 100644 --- a/include/RAJA/policy/hip/sort.hpp +++ b/include/RAJA/policy/hip/sort.hpp @@ -73,7 +73,9 @@ namespace detail /*! \brief static assert unimplemented stable sort */ -template +template concepts::enable_if_t, concepts::negate>, @@ -83,7 +85,7 @@ concepts::enable_if_t, camp::is_same>>>>>> stable( resources::Hip hip_res, - ::RAJA::policy::hip::hip_exec, + ::RAJA::policy::hip::hip_exec, Iter, Iter, Compare) @@ -102,13 +104,15 @@ stable( /*! \brief stable sort given range in ascending order */ -template +template concepts::enable_if_t, type_traits::is_arithmetic>, std::is_pointer> stable( resources::Hip hip_res, - ::RAJA::policy::hip::hip_exec, + ::RAJA::policy::hip::hip_exec, Iter begin, Iter end, operators::less>) @@ -190,13 +194,15 @@ stable( /*! \brief stable sort given range in descending order */ -template +template concepts::enable_if_t, type_traits::is_arithmetic>, std::is_pointer> stable( resources::Hip hip_res, - ::RAJA::policy::hip::hip_exec, + ::RAJA::policy::hip::hip_exec, Iter begin, Iter end, operators::greater>) @@ -279,7 +285,9 @@ stable( /*! \brief static assert unimplemented sort */ -template +template concepts::enable_if_t, concepts::negate>, @@ -289,7 +297,7 @@ concepts::enable_if_t, camp::is_same>>>>>> unstable( resources::Hip hip_res, - ::RAJA::policy::hip::hip_exec, + ::RAJA::policy::hip::hip_exec, Iter, Iter, Compare) @@ -308,13 +316,15 @@ unstable( /*! \brief sort given range in ascending order */ -template +template concepts::enable_if_t, type_traits::is_arithmetic>, std::is_pointer> unstable( resources::Hip hip_res, - ::RAJA::policy::hip::hip_exec p, + ::RAJA::policy::hip::hip_exec p, Iter begin, Iter end, operators::less> comp) @@ -325,13 +335,15 @@ unstable( /*! \brief sort given range in descending order */ -template +template concepts::enable_if_t, type_traits::is_arithmetic>, std::is_pointer> unstable( resources::Hip hip_res, - ::RAJA::policy::hip::hip_exec p, + ::RAJA::policy::hip::hip_exec p, Iter begin, Iter end, operators::greater> comp) @@ -343,7 +355,8 @@ unstable( /*! 
\brief static assert unimplemented stable sort pairs */ -template concepts::enable_if_t, concepts::negate, camp::is_same>>>>>> stable_pairs( resources::Hip hip_res, - ::RAJA::policy::hip::hip_exec, + ::RAJA::policy::hip::hip_exec, KeyIter, KeyIter, ValIter, @@ -379,7 +392,8 @@ stable_pairs( /*! \brief stable sort given range of pairs in ascending order of keys */ -template concepts::enable_if_t, type_traits::is_arithmetic>, @@ -387,7 +401,7 @@ concepts::enable_if_t, std::is_pointer> stable_pairs( resources::Hip hip_res, - ::RAJA::policy::hip::hip_exec, + ::RAJA::policy::hip::hip_exec, KeyIter keys_begin, KeyIter keys_end, ValIter vals_begin, @@ -483,7 +497,8 @@ stable_pairs( /*! \brief stable sort given range of pairs in descending order of keys */ -template concepts::enable_if_t, type_traits::is_arithmetic>, @@ -491,7 +506,7 @@ concepts::enable_if_t, std::is_pointer> stable_pairs( resources::Hip hip_res, - ::RAJA::policy::hip::hip_exec, + ::RAJA::policy::hip::hip_exec, KeyIter keys_begin, KeyIter keys_end, ValIter vals_begin, @@ -588,7 +603,8 @@ stable_pairs( /*! \brief static assert unimplemented sort pairs */ -template concepts::enable_if_t, concepts::negate, camp::is_same>>>>>> unstable_pairs( resources::Hip hip_res, - ::RAJA::policy::hip::hip_exec, + ::RAJA::policy::hip::hip_exec, KeyIter, KeyIter, ValIter, @@ -624,7 +640,8 @@ unstable_pairs( /*! \brief stable sort given range of pairs in ascending order of keys */ -template concepts::enable_if_t, type_traits::is_arithmetic>, @@ -632,7 +649,7 @@ concepts::enable_if_t, std::is_pointer> unstable_pairs( resources::Hip hip_res, - ::RAJA::policy::hip::hip_exec p, + ::RAJA::policy::hip::hip_exec p, KeyIter keys_begin, KeyIter keys_end, ValIter vals_begin, @@ -644,7 +661,8 @@ unstable_pairs( /*! \brief stable sort given range of pairs in descending order of keys */ -template concepts::enable_if_t, type_traits::is_arithmetic>, @@ -652,7 +670,7 @@ concepts::enable_if_t, std::is_pointer> unstable_pairs( resources::Hip hip_res, - ::RAJA::policy::hip::hip_exec p, + ::RAJA::policy::hip::hip_exec p, KeyIter keys_begin, KeyIter keys_end, ValIter vals_begin, diff --git a/include/RAJA/policy/loop.hpp b/include/RAJA/policy/loop.hpp deleted file mode 100644 index 2cd9525dcd..0000000000 --- a/include/RAJA/policy/loop.hpp +++ /dev/null @@ -1,35 +0,0 @@ -/*! -****************************************************************************** -* -* \file -* -* \brief Header file containing RAJA headers for sequential execution. -* -* These methods work on all platforms. -* -****************************************************************************** -*/ - -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC -// and RAJA project contributors. See the RAJA/LICENSE file for details. 
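The file deletions starting here retire the deprecated `loop` policy headers. As the removed policy.hpp further below makes explicit, every `loop_*` name was already a plain alias for its `seq_*` counterpart, so downstream code should only need a rename. A minimal migration sketch, assuming "RAJA/RAJA.hpp" is included and that x, y, a, N are placeholder user data:

    // before this patch: RAJA::forall<RAJA::loop_exec>(RAJA::RangeSegment(0, N), ...);
    RAJA::forall<RAJA::seq_exec>(RAJA::RangeSegment(0, N),
        [=](int i) { y[i] = a * x[i] + y[i]; });  // same behavior: loop_exec was an alias for seq_exec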
-// -// SPDX-License-Identifier: (BSD-3-Clause) -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - -#ifndef RAJA_loop_HPP -#define RAJA_loop_HPP - -#if !defined(RAJA_ENABLE_DESUL_ATOMICS) - #include "RAJA/policy/sequential/atomic.hpp" -#endif - -#include "RAJA/policy/sequential/forall.hpp" -#include "RAJA/policy/sequential/kernel.hpp" -#include "RAJA/policy/loop/policy.hpp" -#include "RAJA/policy/sequential/scan.hpp" -#include "RAJA/policy/sequential/sort.hpp" -#include "RAJA/policy/sequential/launch.hpp" -#include "RAJA/policy/sequential/WorkGroup.hpp" - -#endif // closing endif for header file include guard diff --git a/include/RAJA/policy/loop/policy.hpp b/include/RAJA/policy/loop/policy.hpp deleted file mode 100644 index 1bf34250bb..0000000000 --- a/include/RAJA/policy/loop/policy.hpp +++ /dev/null @@ -1,87 +0,0 @@ -/*! - ****************************************************************************** - * - * \file - * - * \brief Header file containing RAJA sequential policy definitions. - * - ****************************************************************************** - */ - -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC -// and RAJA project contributors. See the RAJA/LICENSE file for details. -// -// SPDX-License-Identifier: (BSD-3-Clause) -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - -#ifndef policy_loop_HPP -#define policy_loop_HPP - -#include "RAJA/policy/PolicyBase.hpp" - -#include "RAJA/policy/sequential/policy.hpp" - -namespace RAJA -{ -namespace policy -{ -namespace loop -{ - -// -////////////////////////////////////////////////////////////////////// -// -// Execution policies -// -////////////////////////////////////////////////////////////////////// -// - -/// -/// Segment execution policies -/// - -using loop_exec = seq_exec; - -/// -/// Index set segment iteration policies -/// -using loop_segit = seq_exec; - -/// -/// WorkGroup execution policies -/// -using loop_work = seq_work; - -/// -/////////////////////////////////////////////////////////////////////// -/// -/// Reduction execution policies -/// -/////////////////////////////////////////////////////////////////////// -/// -using loop_reduce = seq_reduce; - - -/// -/////////////////////////////////////////////////////////////////////// -/// -/// Atomic execution policies -/// -/////////////////////////////////////////////////////////////////////// -/// -using loop_atomic = seq_atomic; - -} // end namespace loop - -} // end namespace policy - -using policy::loop::loop_atomic; -using policy::loop::loop_exec; -using policy::loop::loop_reduce; -using policy::loop::loop_segit; -using policy::loop::loop_work; - -} // namespace RAJA - -#endif diff --git a/include/RAJA/policy/sycl/launch.hpp b/include/RAJA/policy/sycl/launch.hpp index 0dffee6a21..9176444cd4 100644 --- a/include/RAJA/policy/sycl/launch.hpp +++ b/include/RAJA/policy/sycl/launch.hpp @@ -56,13 +56,13 @@ struct LaunchExecute> { // Compute the number of blocks and threads // - const ::sycl::range<3> blockSize(params.threads.value[0], + const ::sycl::range<3> blockSize(params.threads.value[2], params.threads.value[1], - params.threads.value[2]); + params.threads.value[0]); - const ::sycl::range<3> gridSize(params.threads.value[0] * params.teams.value[0], + const ::sycl::range<3> gridSize(params.threads.value[2] * params.teams.value[2], params.threads.value[1] * params.teams.value[1], - 
params.threads.value[2] * params.teams.value[2]); + params.threads.value[0] * params.teams.value[0]); // Only launch kernel if we have something to iterate over constexpr size_t zero = 0; @@ -138,13 +138,13 @@ struct LaunchExecute> { // Compute the number of blocks and threads // - const ::sycl::range<3> blockSize(params.threads.value[0], + const ::sycl::range<3> blockSize(params.threads.value[2], params.threads.value[1], - params.threads.value[2]); + params.threads.value[0]); - const ::sycl::range<3> gridSize(params.threads.value[0] * params.teams.value[0], + const ::sycl::range<3> gridSize(params.threads.value[2] * params.teams.value[2], params.threads.value[1] * params.teams.value[1], - params.threads.value[2] * params.teams.value[2]); + params.threads.value[0] * params.teams.value[0]); // Only launch kernel if we have something to iterate over constexpr size_t zero = 0; diff --git a/include/RAJA/util/Operators.hpp b/include/RAJA/util/Operators.hpp index d76b862c22..b4249e7182 100644 --- a/include/RAJA/util/Operators.hpp +++ b/include/RAJA/util/Operators.hpp @@ -42,9 +42,20 @@ namespace operators namespace detail { +// truly associative (does not include fp add/multiply) struct associative_tag { }; +// associative up to floating point rounding differences +struct fp_associative_tag : associative_tag { +}; + +// get associativity tag appropriate for the type +template < typename T > +using associative_or_fp_associative_tag = + std::conditional_t>::value, + fp_associative_tag, associative_tag>; + template struct binary_function { using first_argument_type = Arg1; @@ -327,7 +338,7 @@ static_assert(check(), template struct plus : public detail::binary_function, - detail::associative_tag { + detail::associative_or_fp_associative_tag { RAJA_HOST_DEVICE constexpr Ret operator()(const Arg1& lhs, const Arg2& rhs) const { @@ -347,7 +358,7 @@ struct minus : public detail::binary_function { template struct multiplies : public detail::binary_function, - detail::associative_tag { + detail::associative_or_fp_associative_tag { RAJA_HOST_DEVICE constexpr Ret operator()(const Arg1& lhs, const Arg2& rhs) const @@ -569,6 +580,12 @@ struct is_associative { std::is_base_of::value; }; +template +struct is_fp_associative { + static constexpr const bool value = + std::is_base_of::value; +}; + template struct safe_plus : public plus + // for RAJA::reduce::detail::ValueLoc #include "RAJA/pattern/detail/reduce.hpp" +#include "RAJA/util/types.hpp" namespace RAJA { @@ -38,18 +41,37 @@ namespace detail */ template > + RAJA::basic_mempool::generic_allocator>, + typename accessor = DefaultAccessor > class SoAPtr { - using value_type = T; + template < typename, typename, typename > + friend class SoAPtr; // friend other instantiations of this class public: + using value_type = T; + + template < typename rhs_accessor > + using rebind_accessor = SoAPtr; + SoAPtr() = default; + SoAPtr(SoAPtr const&) = default; + SoAPtr(SoAPtr &&) = default; + SoAPtr& operator=(SoAPtr const&) = default; + SoAPtr& operator=(SoAPtr &&) = default; + explicit SoAPtr(size_t size) : mem(mempool::getInstance().template malloc(size)) { } + template < typename rhs_accessor, + std::enable_if_t::value>* = nullptr > + RAJA_HOST_DEVICE + explicit SoAPtr(SoAPtr const& rhs) + : mem(rhs.mem) + { } + SoAPtr& allocate(size_t size) { mem = mempool::getInstance().template malloc(size); @@ -65,8 +87,8 @@ class SoAPtr RAJA_HOST_DEVICE bool allocated() const { return mem != nullptr; } - RAJA_HOST_DEVICE value_type get(size_t i) const { return mem[i]; } - 
RAJA_HOST_DEVICE void set(size_t i, value_type val) { mem[i] = val; } + RAJA_HOST_DEVICE value_type get(size_t i) const { return accessor::get(mem, i); } + RAJA_HOST_DEVICE void set(size_t i, value_type val) { accessor::set(mem, i, val); } private: value_type* mem = nullptr; @@ -75,21 +97,41 @@ class SoAPtr /*! * @brief Specialization for RAJA::reduce::detail::ValueLoc. */ -template -class SoAPtr, mempool> +template +class SoAPtr, mempool, accessor> { - using value_type = RAJA::reduce::detail::ValueLoc; using first_type = T; using second_type = IndexType; + template < typename, typename, typename > + friend class SoAPtr; // fiend other instantiations of this class + public: + using value_type = RAJA::reduce::detail::ValueLoc; + + template < typename rhs_accessor > + using rebind_accessor = SoAPtr; + SoAPtr() = default; + SoAPtr(SoAPtr const&) = default; + SoAPtr(SoAPtr &&) = default; + SoAPtr& operator=(SoAPtr const&) = default; + SoAPtr& operator=(SoAPtr &&) = default; + explicit SoAPtr(size_t size) : mem(mempool::getInstance().template malloc(size)), mem_idx(mempool::getInstance().template malloc(size)) { } + template < typename rhs_accessor, + std::enable_if_t::value>* = nullptr > + RAJA_HOST_DEVICE + explicit SoAPtr(SoAPtr const& rhs) + : mem(rhs.mem) + , mem_idx(rhs.mem_idx) + { } + SoAPtr& allocate(size_t size) { mem = mempool::getInstance().template malloc(size); @@ -110,12 +152,12 @@ class SoAPtr, mempool> RAJA_HOST_DEVICE value_type get(size_t i) const { - return value_type(mem[i], mem_idx[i]); + return value_type(accessor::get(mem, i), accessor::get(mem_idx, i)); } RAJA_HOST_DEVICE void set(size_t i, value_type val) { - mem[i] = val; - mem_idx[i] = val.getLoc(); + accessor::set(mem, i, first_type(val)); + accessor::set(mem_idx, i, val.getLoc()); } private: diff --git a/include/RAJA/util/basic_mempool.hpp b/include/RAJA/util/basic_mempool.hpp index 61624e0725..f0208ccbd3 100644 --- a/include/RAJA/util/basic_mempool.hpp +++ b/include/RAJA/util/basic_mempool.hpp @@ -309,6 +309,7 @@ class MemPool } + /// Free all backing allocations, even if they are currently in use void free_chunks() { #if defined(RAJA_ENABLE_OPENMP) diff --git a/include/RAJA/util/for_each.hpp b/include/RAJA/util/for_each.hpp new file mode 100644 index 0000000000..b279ec29ff --- /dev/null +++ b/include/RAJA/util/for_each.hpp @@ -0,0 +1,95 @@ +/*! +****************************************************************************** +* +* \file +* +* \brief Header file providing RAJA for_each templates. +* +****************************************************************************** +*/ + +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#ifndef RAJA_util_for_each_HPP +#define RAJA_util_for_each_HPP + +#include "RAJA/config.hpp" + +#include +#include + +#include "camp/list.hpp" + +#include "RAJA/pattern/detail/algorithm.hpp" + +#include "RAJA/util/macros.hpp" +#include "RAJA/util/types.hpp" + +namespace RAJA +{ + +namespace detail +{ + +// runtime loop applying func to each element in the range in order +template +RAJA_HOST_DEVICE RAJA_INLINE +UnaryFunc for_each(Iter begin, Iter end, UnaryFunc func) +{ + for (; begin != end; ++begin) { + func(*begin); + } + + return func; +} + +// compile time expansion applying func to a each type in the list in order +template +RAJA_HOST_DEVICE RAJA_INLINE +UnaryFunc for_each_type(camp::list const&, UnaryFunc func) +{ + // braced init lists are evaluated in order + int seq_unused_array[] = {0, (func(Ts{}), 0)...}; + RAJA_UNUSED_VAR(seq_unused_array); + + return func; +} + +} // namespace detail + + +/*! + \brief Apply func to all the elements in the given range in order + using a sequential for loop in O(N) operations and O(1) extra memory + see https://en.cppreference.com/w/cpp/algorithm/for_each +*/ +template +RAJA_HOST_DEVICE RAJA_INLINE +concepts::enable_if_t> + for_each(Container&& c, UnaryFunc func) +{ + using std::begin; + using std::end; + + return detail::for_each(begin(c), end(c), std::move(func)); +} + +/*! + \brief Apply func to each type in the given list in order + using a compile-time expansion in O(N) operations and O(1) extra memory +*/ +template +RAJA_HOST_DEVICE RAJA_INLINE +UnaryFunc for_each_type(camp::list const& c, UnaryFunc func) +{ + return detail::for_each_type(c, std::move(func)); +} + +} // namespace RAJA + +#endif diff --git a/include/RAJA/util/macros.hpp b/include/RAJA/util/macros.hpp index fc83f8999b..55e90010d8 100644 --- a/include/RAJA/util/macros.hpp +++ b/include/RAJA/util/macros.hpp @@ -56,6 +56,8 @@ #define RAJA_HOST __host__ #define RAJA_SUPPRESS_HD_WARN +#define RAJA_USE_HIP_INTRINSICS + #else #define RAJA_HOST_DEVICE @@ -64,6 +66,13 @@ #define RAJA_SUPPRESS_HD_WARN #endif + +#if defined(__has_builtin) +#define RAJA_INTERNAL_CLANG_HAS_BUILTIN(x) __has_builtin(x) +#else +#define RAJA_INTERNAL_CLANG_HAS_BUILTIN(x) 0 +#endif + /*! ******************************************************************************* * \def RAJA_USED_ARG(x) diff --git a/include/RAJA/util/math.hpp b/include/RAJA/util/math.hpp new file mode 100644 index 0000000000..36c7cca1a0 --- /dev/null +++ b/include/RAJA/util/math.hpp @@ -0,0 +1,75 @@ +/*! +****************************************************************************** +* +* \file +* +* \brief Header file providing RAJA math templates. +* +****************************************************************************** +*/ + +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#ifndef RAJA_util_math_HPP +#define RAJA_util_math_HPP + +#include "RAJA/config.hpp" + +#include +#include + +namespace RAJA +{ + +/*! + \brief evaluate log base 2 of n + + For positive n calculate log base 2 of n, and round the result down to the + nearest integer. 
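For reference, the two helpers added in for_each.hpp above can be used as sketched below; the std::array, the camp::list contents, and the lambdas are illustrative rather than taken from this patch:

    #include "RAJA/util/for_each.hpp"
    #include "camp/list.hpp"
    #include <array>
    #include <iostream>

    void for_each_demo()
    {
      std::array<int, 3> a{{1, 2, 3}};
      // runtime loop: applies the lambda to each element in order
      RAJA::for_each(a, [](int x) { std::cout << x << "\n"; });

      // compile-time expansion: the lambda is called with a value-initialized
      // object of each type in the list, in order
      RAJA::for_each_type(camp::list<int, long, double>{}, [](auto t) {
        std::cout << sizeof(t) << "\n";
      });
    }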
+ For zero or negative n return 0 + +*/ +template < typename T, + std::enable_if_t::value>* = nullptr > +RAJA_HOST_DEVICE RAJA_INLINE +constexpr T log2(T n) noexcept +{ + T result = 0; + if (n > 0) { + while(n >>= 1) { + ++result; + } + } + return result; +} + +/*! + \brief "round up" to the next greatest power of 2 + + For a integer n, + if n is non-negative, + if n is a power of 2, return n + if n is not a power of 2, return the next greater power of 2 + if n is negative, return 0 +*/ +template < typename T, + std::enable_if_t::value>* = nullptr > +RAJA_HOST_DEVICE +constexpr T next_pow2(T n) noexcept +{ + --n; + for (size_t s = 1; s < CHAR_BIT*sizeof(T); s *= 2) { + n |= n >> s; + } + ++n; + return n; +} + +} // namespace RAJA + +#endif diff --git a/include/RAJA/util/reduce.hpp b/include/RAJA/util/reduce.hpp new file mode 100644 index 0000000000..6d0c28f861 --- /dev/null +++ b/include/RAJA/util/reduce.hpp @@ -0,0 +1,400 @@ +/*! +****************************************************************************** +* +* \file +* +* \brief Header file providing RAJA sort templates. +* +****************************************************************************** +*/ + +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#ifndef RAJA_util_reduce_HPP +#define RAJA_util_reduce_HPP + +#include "RAJA/config.hpp" + +#include +#include +#include +#include + +#include "RAJA/pattern/detail/algorithm.hpp" + +#include "RAJA/util/macros.hpp" +#include "RAJA/util/concepts.hpp" +#include "RAJA/util/math.hpp" +#include "RAJA/util/Operators.hpp" + +namespace RAJA +{ + +namespace detail +{ + +/*! + \brief Reduce class that does a reduction with a left fold. +*/ +template +struct LeftFoldReduce +{ + RAJA_HOST_DEVICE RAJA_INLINE + constexpr explicit LeftFoldReduce(T init = BinaryOp::identity(), + BinaryOp op = BinaryOp{}) noexcept + : m_op(std::move(op)) + , m_accumulated_value(std::move(init)) + { + + } + + LeftFoldReduce(LeftFoldReduce const&) = delete; + LeftFoldReduce& operator=(LeftFoldReduce const&) = delete; + LeftFoldReduce(LeftFoldReduce &&) = delete; + LeftFoldReduce& operator=(LeftFoldReduce &&) = delete; + + ~LeftFoldReduce() = default; + + + /*! + \brief reset the combined value of the reducer to the identity + */ + RAJA_HOST_DEVICE RAJA_INLINE + void clear() noexcept + { + m_accumulated_value = BinaryOp::identity(); + } + + /*! + \brief return the combined value and clear the reducer + */ + RAJA_HOST_DEVICE RAJA_INLINE + T get_and_clear() + { + T accumulated_value = std::move(m_accumulated_value); + + clear(); + + return accumulated_value; + } + + /*! + \brief return the combined value + */ + RAJA_HOST_DEVICE RAJA_INLINE + T get() + { + return m_accumulated_value; + } + + /*! + \brief combine a value into the reducer + */ + RAJA_HOST_DEVICE RAJA_INLINE + void combine(T val) + { + m_accumulated_value = m_op(std::move(m_accumulated_value), std::move(val)); + } + +private: + BinaryOp m_op; + T m_accumulated_value; +}; + +/*! + \brief Reduce class that does a reduction with a binary tree. 
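A few compile-time checks make the behavior of the math.hpp helpers above concrete; these asserts are a sketch, not part of the patch, and assume an integral argument type:

    #include "RAJA/util/math.hpp"

    static_assert(RAJA::log2(10) == 3, "rounds down to the nearest integer");
    static_assert(RAJA::log2(0) == 0, "zero and negative inputs give 0");
    static_assert(RAJA::next_pow2(10) == 16, "rounds up to the next power of 2");
    static_assert(RAJA::next_pow2(8) == 8, "powers of 2 are returned unchanged");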
+*/ +template +struct BinaryTreeReduce +{ + static_assert(std::is_unsigned::value, "SizeType must be unsigned"); + static_assert(t_num_levels <= CHAR_BIT*sizeof(SizeType), "SizeType must be large enough to act at a bitset for num_levels"); + + static constexpr SizeType num_levels = t_num_levels; + + RAJA_HOST_DEVICE RAJA_INLINE + constexpr explicit BinaryTreeReduce(T init = BinaryOp::identity(), + BinaryOp op = BinaryOp{}) noexcept + : m_op(std::move(op)) + { + combine(std::move(init)); + } + + BinaryTreeReduce(BinaryTreeReduce const&) = delete; + BinaryTreeReduce& operator=(BinaryTreeReduce const&) = delete; + BinaryTreeReduce(BinaryTreeReduce &&) = delete; + BinaryTreeReduce& operator=(BinaryTreeReduce &&) = delete; + + RAJA_HOST_DEVICE RAJA_INLINE + ~BinaryTreeReduce() + { + clear(); + } + + + /*! + \brief reset the combined value of the reducer to the identity + */ + RAJA_HOST_DEVICE RAJA_INLINE + void clear() noexcept + { + // destroy all values on the tree stack and reset count to 0 + for (SizeType level = 0, mask = 1; m_count; ++level, mask <<= 1) { + + if (m_count & mask) { + + get_value(level)->~T(); + + m_count ^= mask; + + } + } + } + + /*! + \brief return the combined value and clear the reducer + */ + RAJA_HOST_DEVICE RAJA_INLINE + T get_and_clear() + { + // accumulate all values + T value = BinaryOp::identity(); + + for (SizeType level = 0, mask = 1; m_count; ++level, mask <<= 1) { + + if (m_count & mask) { + + value = m_op(std::move(value), std::move(*get_value(level))); + get_value(level)->~T(); + + m_count ^= mask; + } + } + + return value; + } + + /*! + \brief return the combined value + */ + RAJA_HOST_DEVICE RAJA_INLINE + T get() + { + // accumulate all values + T value = BinaryOp::identity(); + + for (SizeType count = m_count, level = 0, mask = 1; count; ++level, mask <<= 1) { + + if (count & mask) { + + value = m_op(std::move(value), *get_value(level)); + + count ^= mask; + } + } + + return value; + } + + /*! + \brief combine a value into the reducer + */ + RAJA_HOST_DEVICE RAJA_INLINE + void combine(T value) + { + // accumulate values and store in the first unused level found + // clear values from used levels along the way + SizeType level = 0; + for (SizeType mask = 1; m_count & mask; ++level, mask <<= 1) { + + value = m_op(std::move(*get_value(level)), std::move(value)); + get_value(level)->~T(); + + } + + new(get_storage(level)) T(std::move(value)); + + ++m_count; + } + +private: + BinaryOp m_op; + + // A counter of the number of inputs combined. + // The bits of count indicate which levels of tree stack have a value + SizeType m_count = 0; + + // Each level in tree stack has a value that holds the accumulation of 2^level + // values or is unused and has no value. + std::aligned_storage_t m_tree_stack[num_levels]; + + RAJA_HOST_DEVICE RAJA_INLINE + void* get_storage(SizeType level) + { + return &m_tree_stack[level]; + } + + RAJA_HOST_DEVICE RAJA_INLINE + T* get_value(SizeType level) + { +#if __cplusplus >= 201703L && !defined(RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE) + // TODO: check that launder is supported in device code + return std::launder(reinterpret_cast(&m_tree_stack[level])); +#else + return reinterpret_cast(&m_tree_stack[level]); +#endif + } +}; + + +template +using HighAccuracyReduce = std::conditional_t< + RAJA::operators::is_fp_associative::value, + BinaryTreeReduce, + LeftFoldReduce>; + + +/*! 
+ \brief Combine into a single value using a left fold with the given + operation using O(N) operations and O(1) memory +*/ +template +RAJA_HOST_DEVICE RAJA_INLINE +T left_fold_reduce(Iter begin, + Iter end, + T init, + BinaryOp op) +{ + LeftFoldReduce reducer(std::move(init), std::move(op)); + + for (; begin != end; ++begin) { + + reducer.combine(*begin); + + } + + return reducer.get_and_clear(); +} + +/*! + \brief reduce using a binary tree with the given operation + and using O(N) operations and O(lg(n)) memory + + This is more accurate than sequentially adding into a single value for + floating point types. +*/ +template +RAJA_HOST_DEVICE RAJA_INLINE +T binary_tree_reduce(Iter begin, + Iter end, + T init, + BinaryOp op) +{ + using std::distance; + using SizeType = std::make_unsigned_t; + BinaryTreeReduce reducer(std::move(init), std::move(op)); + + for (; begin != end; ++begin) { + + reducer.combine(*begin); + + } + + return reducer.get_and_clear(); +} + + +/*! + \brief reducer that uses a high accuracy implementation when round-off error + is a concern, or a faster algorithm with it is not a concern +*/ +template +RAJA_HOST_DEVICE RAJA_INLINE +T high_accuracy_reduce(Iter begin, + Iter end, + T init, + BinaryOp op) +{ + HighAccuracyReduce reducer(std::move(init), std::move(op)); + + for (; begin != end; ++begin) { + + reducer.combine(*begin); + + } + + return reducer.get_and_clear(); +} + +} // namespace detail + +/*! + \brief Accumulate given range to a single value + using a left fold algorithm in O(N) operations and O(1) extra memory + see https://en.cppreference.com/w/cpp/algorithm/accumulate +*/ +template , + typename BinaryOp = operators::plus> +RAJA_HOST_DEVICE RAJA_INLINE +concepts::enable_if_t> + accumulate(Container&& c, T init = BinaryOp::identity(), BinaryOp op = BinaryOp{}) +{ + using std::begin; + using std::end; + static_assert(type_traits::is_binary_function::value, + "BinaryOp must model BinaryFunction"); + + return detail::left_fold_reduce(begin(c), end(c), std::move(init), std::move(op)); +} + +/*! + \brief Reduce given range to a single value + using a binary tree algorithm in O(N) operations and O(lg(N)) extra memory + see https://en.cppreference.com/w/cpp/algorithm/reduce +*/ +template , + typename BinaryOp = operators::plus> +RAJA_HOST_DEVICE RAJA_INLINE +concepts::enable_if_t> + binary_tree_reduce(Container&& c, T init = BinaryOp::identity(), BinaryOp op = BinaryOp{}) +{ + using std::begin; + using std::end; + static_assert(type_traits::is_binary_function::value, + "BinaryOp must model BinaryFunction"); + + return detail::binary_tree_reduce(begin(c), end(c), std::move(init), std::move(op)); +} + +/*! 
+ \brief Reduce given range to a single value + using an algorithm with high accuracy when floating point round off is a + concern + see https://en.cppreference.com/w/cpp/algorithm/reduce +*/ +template , + typename BinaryOp = operators::plus> +RAJA_HOST_DEVICE RAJA_INLINE +concepts::enable_if_t> + high_accuracy_reduce(Container&& c, T init = BinaryOp::identity(), BinaryOp op = BinaryOp{}) +{ + using std::begin; + using std::end; + static_assert(type_traits::is_binary_function::value, + "BinaryOp must model BinaryFunction"); + + return detail::high_accuracy_reduce(begin(c), end(c), std::move(init), std::move(op)); +} + +} // namespace RAJA + +#endif diff --git a/include/RAJA/util/resource.hpp b/include/RAJA/util/resource.hpp index a54ce434a2..28a476d951 100644 --- a/include/RAJA/util/resource.hpp +++ b/include/RAJA/util/resource.hpp @@ -65,8 +65,9 @@ namespace RAJA using type = camp::resources::Cuda; }; - template - struct get_resource<::RAJA::policy::cuda::cuda_exec_explicit>{ + template + struct get_resource<::RAJA::policy::cuda::cuda_exec_explicit>{ using type = camp::resources::Cuda; }; @@ -75,8 +76,9 @@ namespace RAJA using type = camp::resources::Cuda; }; - template - struct get_resource>>{ + template + struct get_resource>>{ using type = camp::resources::Cuda; }; #endif @@ -87,8 +89,9 @@ namespace RAJA using type = camp::resources::Hip; }; - template - struct get_resource<::RAJA::policy::hip::hip_exec>{ + template + struct get_resource<::RAJA::policy::hip::hip_exec>{ using type = camp::resources::Hip; }; @@ -97,8 +100,9 @@ namespace RAJA using type = camp::resources::Hip; }; - template - struct get_resource>>{ + template + struct get_resource>>{ using type = camp::resources::Hip; }; #endif diff --git a/include/RAJA/util/sort.hpp b/include/RAJA/util/sort.hpp index f1eebfc282..bbec03dfe1 100644 --- a/include/RAJA/util/sort.hpp +++ b/include/RAJA/util/sort.hpp @@ -26,8 +26,8 @@ #include "RAJA/pattern/detail/algorithm.hpp" #include "RAJA/util/macros.hpp" - #include "RAJA/util/concepts.hpp" +#include "RAJA/util/math.hpp" namespace RAJA { @@ -35,23 +35,6 @@ namespace RAJA namespace detail { -/*! - \brief evaluate log base 2 of N rounded down to the nearest integer >= 0 -*/ -RAJA_HOST_DEVICE RAJA_INLINE -unsigned -ulog2(size_t N) -{ - unsigned val = 0; - - while (N > 1) { - val += 1; - N >>= 1; - } - - return val; -} - /*! 
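The three container-level entry points added in util/reduce.hpp above differ mainly in how they trade speed for floating-point round-off. A host-side usage sketch; the vector, its contents, and reduce_demo are illustrative only:

    #include "RAJA/util/reduce.hpp"
    #include <vector>

    double reduce_demo()
    {
      std::vector<double> x(1000, 0.1);          // hypothetical data
      double a = RAJA::accumulate(x);            // left fold, O(1) extra memory
      double b = RAJA::binary_tree_reduce(x);    // pairwise tree, O(lg N) extra memory, less round-off
      double c = RAJA::high_accuracy_reduce(x);  // tree reduce for FP '+', left fold otherwise
      return a + b + c;
    }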
\brief unstable partition given range inplace using predicate function and using O(N) predicate evaluations and O(1) memory @@ -426,7 +409,7 @@ intro_sort(Iter begin, auto N = end - begin; // set max depth to 2*lg(N) - unsigned max_depth = 2*detail::ulog2(N); + unsigned max_depth = 2*RAJA::log2(N); #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) // limit max_depth statically in device code to allow compiler to remove recursion diff --git a/include/RAJA/util/types.hpp b/include/RAJA/util/types.hpp index 811f681b9b..7e331ef00e 100644 --- a/include/RAJA/util/types.hpp +++ b/include/RAJA/util/types.hpp @@ -30,6 +30,9 @@ #include "camp/helpers.hpp" +#include "RAJA/util/macros.hpp" + + namespace RAJA { @@ -67,6 +70,18 @@ enum struct kernel_sync_requirement : int namespace iteration_mapping { +struct DirectBase {}; +struct LoopBase {}; +struct ContiguousLoopBase : LoopBase {}; +struct StridedLoopBase : LoopBase {}; +struct UnsizedLoopBase {}; +struct SizedLoopBase {}; +template < size_t t_max_iterations > +struct SizedLoopSpecifyingBase : SizedLoopBase +{ + static constexpr size_t max_iterations = t_max_iterations; +}; + /// /// Direct assumes the loop has enough iterations for all of the indices and /// maps directly from an iteration to an index. @@ -88,7 +103,7 @@ namespace iteration_mapping /// // 3 -> {3} /// // 4 -> {} /// -struct Direct {}; +struct Direct : DirectBase {}; /// /// Contiguousloop assumes the loop has fewer iterations than indices and @@ -115,7 +130,10 @@ struct Direct {}; /// // 1 -> {3, 4, 5} /// // 2 -> {6, 7} /// -struct Contiguousloop {}; +template < size_t max_iterations > +struct Contiguousloop : ContiguousLoopBase, + std::conditional_t<(max_iterations != named_usage::unspecified), + SizedLoopSpecifyingBase, UnsizedLoopBase> {}; /// /// StridedLoop assumes the loop has fewer iterations than indices and @@ -142,7 +160,10 @@ struct Contiguousloop {}; /// // 1 -> {1, 4, 7} /// // 2 -> {2, 5} /// -struct StridedLoop {}; +template < size_t max_iterations > +struct StridedLoop : StridedLoopBase, + std::conditional_t<(max_iterations != named_usage::unspecified), + SizedLoopSpecifyingBase, UnsizedLoopBase> {}; } // namespace iteration_mapping @@ -172,6 +193,28 @@ struct SizeList { }; +/// +/// Compile time fraction for use with integral types +/// +template +struct Fraction +{ + static_assert(denominator != int_t(0), "denominator must not be zero"); + + using inverse = Fraction; + + template < typename new_int_t > + using rebind = Fraction; + + static constexpr int_t multiply(int_t val) noexcept + { + return (val / denominator) * numerator + + (val % denominator) * numerator / denominator; + } + +}; + + /*! ****************************************************************************** * @@ -823,6 +866,98 @@ using const_UnalignedReal_ptr = ConstRestrictRealPtr; #endif + +namespace detail { + +/*! + * \brief Abstracts access to memory using normal memory accesses. + */ +struct DefaultAccessor +{ + template < typename T > + static RAJA_HOST_DEVICE RAJA_INLINE T get(T* ptr, size_t i) + { + return ptr[i]; + } + + template < typename T > + static RAJA_HOST_DEVICE RAJA_INLINE void set(T* ptr, size_t i, T val) + { + ptr[i] = val; + } +}; + + +/*! + * \brief Abstracts T into an equal or greater size array of integers whose + * size is between min_integer_type_size and max_interger_type_size inclusive. 
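RAJA::Fraction, added to types.hpp above, is the compile-time ratio that the new *_exec_occ_fraction test policies later in this patch are parameterized with; multiply() scales an integer without the overflow risk of computing val*numerator/denominator directly. A small sketch with assumed template arguments:

    #include <type_traits>
    #include "RAJA/util/types.hpp"

    using Half          = RAJA::Fraction<size_t, 1, 2>;
    using ThreeQuarters = RAJA::Fraction<size_t, 3, 4>;

    static_assert(Half::multiply(11) == 5, "(11/2)*1 + (11%2)*1/2");
    static_assert(ThreeQuarters::multiply(10) == 7, "7.5 truncated toward zero");
    static_assert(std::is_same<Half::inverse, RAJA::Fraction<size_t, 2, 1>>::value,
                  "inverse swaps numerator and denominator");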
+ */ +template +struct AsIntegerArray +{ + static_assert(min_integer_type_size <= max_integer_type_size, + "incompatible min and max integer type size"); + using integer_type = std::conditional_t< + ((alignof(T) >= alignof(unsigned long long) && + sizeof(unsigned long long) <= max_integer_type_size) || + sizeof(unsigned long) < min_integer_type_size), + unsigned long long, + std::conditional_t< + ((alignof(T) >= alignof(unsigned long) && + sizeof(unsigned long) <= max_integer_type_size) || + sizeof(unsigned int) < min_integer_type_size), + unsigned long, + std::conditional_t< + ((alignof(T) >= alignof(unsigned int) && + sizeof(unsigned int) <= max_integer_type_size) || + sizeof(unsigned short) < min_integer_type_size), + unsigned int, + std::conditional_t< + ((alignof(T) >= alignof(unsigned short) && + sizeof(unsigned short) <= max_integer_type_size) || + sizeof(unsigned char) < min_integer_type_size), + unsigned short, + std::conditional_t< + ((alignof(T) >= alignof(unsigned char) && + sizeof(unsigned char) <= max_integer_type_size)), + unsigned char, + void>>>>>; + static_assert(!std::is_same::value, + "could not find a compatible integer type"); + static_assert(sizeof(integer_type) >= min_integer_type_size, + "integer_type smaller than min integer type size"); + static_assert(sizeof(integer_type) <= max_integer_type_size, + "integer_type greater than max integer type size"); + + static constexpr size_t num_integer_type = + (sizeof(T) + sizeof(integer_type) - 1) / sizeof(integer_type); + + integer_type array[num_integer_type] = {0}; + + AsIntegerArray() = default; + + RAJA_HOST_DEVICE constexpr size_t array_size() const + { + return num_integer_type; + } + + RAJA_HOST_DEVICE constexpr T get_value() const + { + T value; + memcpy(&value, &array[0], sizeof(T)); + return value; + } + + RAJA_HOST_DEVICE constexpr void set_value(T value) + { + memcpy(&array[0], &value, sizeof(T)); + } +}; + +} // namespace detail + } // namespace RAJA #endif // closing endif for header file include guard diff --git a/scripts/radiuss-spack-configs b/scripts/radiuss-spack-configs index af75606a7f..a8d22367e0 160000 --- a/scripts/radiuss-spack-configs +++ b/scripts/radiuss-spack-configs @@ -1 +1 @@ -Subproject commit af75606a7fc0492e35cdd3860337c4e873f43124 +Subproject commit a8d22367e03d4c9c180a11886414430bdf6428a8 diff --git a/scripts/uberenv b/scripts/uberenv index 4941c237ee..cf91883ef0 160000 --- a/scripts/uberenv +++ b/scripts/uberenv @@ -1 +1 @@ -Subproject commit 4941c237eec514d6d68872243efb9f4af8843f4d +Subproject commit cf91883ef0500a808338ad6c8b56647da15fa5f3 diff --git a/test/functional/launch/shared_mem/tests/test-launch-DynamicMem.hpp b/test/functional/launch/shared_mem/tests/test-launch-DynamicMem.hpp index cdb8940256..8da7b81eb7 100644 --- a/test/functional/launch/shared_mem/tests/test-launch-DynamicMem.hpp +++ b/test/functional/launch/shared_mem/tests/test-launch-DynamicMem.hpp @@ -36,12 +36,16 @@ void LaunchDynamicMemTestImpl(INDEX_TYPE block_range, INDEX_TYPE thread_range) for(s_type b=0; b (RAJA::LaunchParams(RAJA::Teams(RAJA::stripIndexType(block_range)), RAJA::Threads(RAJA::stripIndexType(thread_range)), shared_mem_size), @@ -52,7 +56,11 @@ void LaunchDynamicMemTestImpl(INDEX_TYPE block_range, INDEX_TYPE thread_range) INDEX_TYPE * tile_ptr = ctx.getSharedMemory(RAJA::stripIndexType(thread_range)); RAJA::View> Tile(tile_ptr, RAJA::stripIndexType(thread_range)); + int * int_tile_ptr = ctx.getSharedMemory(RAJA::stripIndexType(thread_range)); + RAJA::View> Int_Tile(int_tile_ptr, 
RAJA::stripIndexType(thread_range)); + RAJA::loop(ctx, inner_range, [&](INDEX_TYPE tid) { + Int_Tile(RAJA::stripIndexType(tid)) = RAJA::stripIndexType(tid); Tile(RAJA::stripIndexType(thread_range)-RAJA::stripIndexType(tid)-1) = thread_range-tid-1 + thread_range*bid; }); @@ -60,7 +68,7 @@ void LaunchDynamicMemTestImpl(INDEX_TYPE block_range, INDEX_TYPE thread_range) RAJA::loop(ctx, inner_range, [&](INDEX_TYPE tid) { INDEX_TYPE idx = tid + thread_range * bid; - working_array[RAJA::stripIndexType(idx)] = Tile(RAJA::stripIndexType(tid)); + working_array[RAJA::stripIndexType(idx)] = Tile(RAJA::stripIndexType(tid)) + Int_Tile(RAJA::stripIndexType(tid)); }); ctx.releaseSharedMemory(); diff --git a/test/include/RAJA_test-forall-execpol.hpp b/test/include/RAJA_test-forall-execpol.hpp index 2fe790ff93..40adaccc8c 100644 --- a/test/include/RAJA_test-forall-execpol.hpp +++ b/test/include/RAJA_test-forall-execpol.hpp @@ -108,7 +108,9 @@ using OpenMPTargetForallAtomicExecPols = OpenMPTargetForallExecPols; using CudaForallExecPols = camp::list< RAJA::cuda_exec<128>, RAJA::cuda_exec_occ_calc<256>, RAJA::cuda_exec_grid<256, 64>, - RAJA::cuda_exec_explicit<256,2> >; + RAJA::cuda_exec_explicit<256,2>, + RAJA::cuda_exec_occ_fraction<256, RAJA::Fraction>, + RAJA::cuda_exec_occ_custom<256, RAJA::CudaAvoidDeviceMaxThreadOccupancyConcretizer> >; using CudaForallReduceExecPols = CudaForallExecPols; @@ -119,7 +121,9 @@ using CudaForallAtomicExecPols = CudaForallExecPols; #if defined(RAJA_ENABLE_HIP) using HipForallExecPols = camp::list< RAJA::hip_exec<128>, RAJA::hip_exec_occ_calc<256>, - RAJA::hip_exec_grid<256, 64> >; + RAJA::hip_exec_grid<256, 64>, + RAJA::hip_exec_occ_fraction<256, RAJA::Fraction>, + RAJA::hip_exec_occ_custom<256, RAJA::HipAvoidDeviceMaxThreadOccupancyConcretizer> >; using HipForallReduceExecPols = HipForallExecPols; diff --git a/test/include/RAJA_test-launch-direct-teams-threads-1D-execpol.hpp b/test/include/RAJA_test-launch-direct-teams-threads-1D-execpol.hpp index 5b5dfdbebf..7179e48fdc 100644 --- a/test/include/RAJA_test-launch-direct-teams-threads-1D-execpol.hpp +++ b/test/include/RAJA_test-launch-direct-teams-threads-1D-execpol.hpp @@ -81,8 +81,8 @@ using Hip_launch_policies = camp::list; using sycl_direct_policies = camp::list< RAJA::LaunchPolicy>, - RAJA::LoopPolicy, - RAJA::LoopPolicy + RAJA::LoopPolicy, + RAJA::LoopPolicy >; using Sycl_launch_policies = camp::list; diff --git a/test/include/RAJA_test-launch-direct-teams-threads-3D-execpol.hpp b/test/include/RAJA_test-launch-direct-teams-threads-3D-execpol.hpp index 38bc4c8bb0..f84823e414 100644 --- a/test/include/RAJA_test-launch-direct-teams-threads-3D-execpol.hpp +++ b/test/include/RAJA_test-launch-direct-teams-threads-3D-execpol.hpp @@ -100,12 +100,12 @@ using Hip_launch_policies = camp::list; using sycl_direct_policies = camp::list< RAJA::LaunchPolicy>, - RAJA::LoopPolicy, + RAJA::LoopPolicy, //slowest RAJA::LoopPolicy, - RAJA::LoopPolicy, - RAJA::LoopPolicy, + RAJA::LoopPolicy, //fastest + RAJA::LoopPolicy, RAJA::LoopPolicy, - RAJA::LoopPolicy + RAJA::LoopPolicy >; using Sycl_launch_policies = camp::list; diff --git a/test/include/RAJA_test-launch-execpol.hpp b/test/include/RAJA_test-launch-execpol.hpp index 9961cd0741..fea90a8305 100644 --- a/test/include/RAJA_test-launch-execpol.hpp +++ b/test/include/RAJA_test-launch-execpol.hpp @@ -68,7 +68,7 @@ using Hip_launch_policies = camp::list< using sycl_policies = camp::list< RAJA::LaunchPolicy>, - RAJA::LoopPolicy>; + RAJA::LoopPolicy>; using Sycl_launch_policies = camp::list< 
sycl_policies diff --git a/test/include/RAJA_test-launch-loop-teams-threads-1D-execpol.hpp b/test/include/RAJA_test-launch-loop-teams-threads-1D-execpol.hpp index 9e5779853c..6173fc6ffa 100644 --- a/test/include/RAJA_test-launch-loop-teams-threads-1D-execpol.hpp +++ b/test/include/RAJA_test-launch-loop-teams-threads-1D-execpol.hpp @@ -75,8 +75,8 @@ using Hip_launch_policies = camp::list< #if defined(RAJA_ENABLE_SYCL) using sycl_loop_policies = camp::list< RAJA::LaunchPolicy>, - RAJA::LoopPolicy, - RAJA::LoopPolicy + RAJA::LoopPolicy, + RAJA::LoopPolicy >; using Sycl_launch_policies = camp::list< diff --git a/test/include/RAJA_test-launch-loop-teams-threads-3D-execpol.hpp b/test/include/RAJA_test-launch-loop-teams-threads-3D-execpol.hpp index 9d217757b2..d703216a13 100644 --- a/test/include/RAJA_test-launch-loop-teams-threads-3D-execpol.hpp +++ b/test/include/RAJA_test-launch-loop-teams-threads-3D-execpol.hpp @@ -95,12 +95,12 @@ using Hip_launch_policies = camp::list< #if defined(RAJA_ENABLE_SYCL) using sycl_loop_policies = camp::list< RAJA::LaunchPolicy>, - RAJA::LoopPolicy, + RAJA::LoopPolicy, //slowest index RAJA::LoopPolicy, - RAJA::LoopPolicy, - RAJA::LoopPolicy, + RAJA::LoopPolicy, //fastest index + RAJA::LoopPolicy, RAJA::LoopPolicy, - RAJA::LoopPolicy + RAJA::LoopPolicy >; using Sycl_launch_policies = camp::list< diff --git a/test/include/RAJA_test-launch-runtime-execpol.hpp b/test/include/RAJA_test-launch-runtime-execpol.hpp index bec07358e6..fa2b39f761 100644 --- a/test/include/RAJA_test-launch-runtime-execpol.hpp +++ b/test/include/RAJA_test-launch-runtime-execpol.hpp @@ -52,8 +52,8 @@ using Sequential_launch_policies = camp::list; using seq_sycl_policies = camp::list< RAJA::LaunchPolicy>, - RAJA::LoopPolicy, - RAJA::LoopPolicy + RAJA::LoopPolicy, + RAJA::LoopPolicy >; using Sequential_launch_policies = camp::list; @@ -110,8 +110,8 @@ using OpenMP_launch_policies = camp::list; using omp_sycl_policies = camp::list< RAJA::LaunchPolicy>, - RAJA::LoopPolicy, - RAJA::LoopPolicy + RAJA::LoopPolicy, + RAJA::LoopPolicy >; using OpenMP_launch_policies = camp::list; diff --git a/test/include/RAJA_test-reducepol.hpp b/test/include/RAJA_test-reducepol.hpp index d8d5fc670b..e9e075b287 100644 --- a/test/include/RAJA_test-reducepol.hpp +++ b/test/include/RAJA_test-reducepol.hpp @@ -34,11 +34,21 @@ using OpenMPTargetReducePols = #endif #if defined(RAJA_ENABLE_CUDA) -using CudaReducePols = camp::list< RAJA::cuda_reduce >; +using CudaReducePols = camp::list< RAJA::cuda_reduce_device_fence, + RAJA::cuda_reduce_block_fence, + RAJA::cuda_reduce_atomic_device_init_device_fence, + RAJA::cuda_reduce_atomic_device_init_block_fence, + RAJA::cuda_reduce_atomic_host_init_device_fence, + RAJA::cuda_reduce_atomic_host_init_block_fence >; #endif #if defined(RAJA_ENABLE_HIP) -using HipReducePols = camp::list< RAJA::hip_reduce >; +using HipReducePols = camp::list< RAJA::hip_reduce_device_fence, + RAJA::hip_reduce_block_fence, + RAJA::hip_reduce_atomic_device_init_device_fence, + RAJA::hip_reduce_atomic_device_init_block_fence, + RAJA::hip_reduce_atomic_host_init_device_fence, + RAJA::hip_reduce_atomic_host_init_block_fence >; #endif #if defined(RAJA_ENABLE_SYCL) diff --git a/test/unit/algorithm/CMakeLists.txt b/test/unit/algorithm/CMakeLists.txt index 856e4519b6..ea93727d59 100644 --- a/test/unit/algorithm/CMakeLists.txt +++ b/test/unit/algorithm/CMakeLists.txt @@ -48,43 +48,64 @@ foreach( SORT_BACKEND ${SORT_BACKENDS} ) endforeach() -set( SEQUENTIAL_UTIL_SORTS Shell Heap Intro Merge ) -set( CUDA_UTIL_SORTS 
Shell Heap Intro ) -set( HIP_UTIL_SORTS Shell Heap Intro ) -macro(RAJA_GENERATE_ALGORITHM_UTIL_SORT_TESTS SORT_BACKEND_in SORT_SIZE_in UTIL_SORTS) - set( SORT_BACKEND ${SORT_BACKEND_in} ) - set( SORT_SIZE ${SORT_SIZE_in} ) - foreach( UTIL_SORT ${UTIL_SORTS} ) - configure_file( test-algorithm-util-sort.cpp.in - test-algorithm-util-sort-${UTIL_SORT}-${SORT_BACKEND}.cpp ) +macro(RAJA_GENERATE_ALGORITHM_UTIL_TESTS ALG ALG_BACKEND_in ALG_SIZE_in UTIL_ALGS) + set( ALG_BACKEND ${ALG_BACKEND_in} ) + set( ALG_SIZE ${ALG_SIZE_in} ) + foreach( UTIL_ALG ${UTIL_ALGS} ) + configure_file( test-algorithm-util-${ALG}.cpp.in + test-algorithm-util-${ALG}-${UTIL_ALG}-${ALG_BACKEND}.cpp ) - raja_add_test( NAME test-algorithm-util-sort-${UTIL_SORT}-${SORT_BACKEND} - SOURCES ${CMAKE_CURRENT_BINARY_DIR}/test-algorithm-util-sort-${UTIL_SORT}-${SORT_BACKEND}.cpp ) + raja_add_test( NAME test-algorithm-util-${ALG}-${UTIL_ALG}-${ALG_BACKEND} + SOURCES ${CMAKE_CURRENT_BINARY_DIR}/test-algorithm-util-${ALG}-${UTIL_ALG}-${ALG_BACKEND}.cpp ) - target_include_directories(test-algorithm-util-sort-${UTIL_SORT}-${SORT_BACKEND}.exe + target_include_directories(test-algorithm-util-${ALG}-${UTIL_ALG}-${ALG_BACKEND}.exe PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/tests) endforeach() - unset( SORT_SIZE ) - unset( SORT_BACKEND ) + unset( ALG_SIZE ) + unset( ALG_BACKEND ) endmacro() -RAJA_GENERATE_ALGORITHM_UTIL_SORT_TESTS( Sequential Default "${SEQUENTIAL_UTIL_SORTS}" ) -RAJA_GENERATE_ALGORITHM_UTIL_SORT_TESTS( Sequential Small "Insertion" ) +set( SEQUENTIAL_UTIL_SORTS Shell Heap Intro Merge ) +set( CUDA_UTIL_SORTS Shell Heap Intro ) +set( HIP_UTIL_SORTS Shell Heap Intro ) + +RAJA_GENERATE_ALGORITHM_UTIL_TESTS( sort Sequential Default "${SEQUENTIAL_UTIL_SORTS}" ) +RAJA_GENERATE_ALGORITHM_UTIL_TESTS( sort Sequential Small "Insertion" ) if(RAJA_ENABLE_CUDA) - RAJA_GENERATE_ALGORITHM_UTIL_SORT_TESTS( Cuda Small "${CUDA_UTIL_SORTS}" ) - RAJA_GENERATE_ALGORITHM_UTIL_SORT_TESTS( Cuda Tiny "Insertion" ) + RAJA_GENERATE_ALGORITHM_UTIL_TESTS( sort Cuda Small "${CUDA_UTIL_SORTS}" ) + RAJA_GENERATE_ALGORITHM_UTIL_TESTS( sort Cuda Tiny "Insertion" ) endif() if(RAJA_ENABLE_HIP) - RAJA_GENERATE_ALGORITHM_UTIL_SORT_TESTS( Hip Small "${HIP_UTIL_SORTS}" ) - RAJA_GENERATE_ALGORITHM_UTIL_SORT_TESTS( Hip Tiny "Insertion" ) + RAJA_GENERATE_ALGORITHM_UTIL_TESTS( sort Hip Small "${HIP_UTIL_SORTS}" ) + RAJA_GENERATE_ALGORITHM_UTIL_TESTS( sort Hip Tiny "Insertion" ) endif() + +set( UTIL_REDUCES BinaryTree Accumulate ) + +RAJA_GENERATE_ALGORITHM_UTIL_TESTS( reduce Sequential Default "${UTIL_REDUCES}" ) + +if(RAJA_ENABLE_CUDA) + RAJA_GENERATE_ALGORITHM_UTIL_TESTS( reduce Cuda Small "${UTIL_REDUCES}" ) +endif() + +if(RAJA_ENABLE_HIP) + RAJA_GENERATE_ALGORITHM_UTIL_TESTS( reduce Hip Small "${UTIL_REDUCES}" ) +endif() + + unset( SORT_BACKENDS ) unset( SEQUENTIAL_UTIL_SORTS ) unset( CUDA_UTIL_SORTS ) unset( HIP_UTIL_SORTS ) +unset( UTIL_REDUCES ) + + +raja_add_test( + NAME test-algorithm-util-for_each + SOURCES test-algorithm-util-for_each.cpp) diff --git a/test/unit/algorithm/test-algorithm-util-for_each.cpp b/test/unit/algorithm/test-algorithm-util-for_each.cpp new file mode 100644 index 0000000000..db918ad234 --- /dev/null +++ b/test/unit/algorithm/test-algorithm-util-for_each.cpp @@ -0,0 +1,150 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +/// +/// Source file containing unit tests for for_each +/// + +#include "RAJA_test-base.hpp" + +#include "RAJA_unit-test-types.hpp" + +#include "camp/resource.hpp" + +#include +#include +#include + +template +class ForEachUnitTest : public ::testing::Test {}; + +TYPED_TEST_SUITE(ForEachUnitTest, UnitIndexTypes); + + +TYPED_TEST(ForEachUnitTest, EmptyRange) +{ + std::vector numbers; + + std::vector copies; + RAJA::for_each(numbers, [&](TypeParam& number) { + number += 1; + copies.push_back(number); + }); + + ASSERT_EQ(copies.size(), 0); + ASSERT_EQ(numbers.size(), 0); +} + +TYPED_TEST(ForEachUnitTest, VectorRange) +{ + std::vector numbers; + for (TypeParam i = 0; i < 13; ++i) { + numbers.push_back(i); + } + + std::vector copies; + RAJA::for_each(numbers, [&](TypeParam& number) { + copies.push_back(number); + number += 1; + }); + + ASSERT_EQ(copies.size(), 13); + for (TypeParam i = 0; i < 13; ++i) { + ASSERT_EQ(numbers[i], copies[i]+1); + } +} + +TYPED_TEST(ForEachUnitTest, RajaSpanRange) +{ + std::vector numbers; + for (TypeParam i = 0; i < 11; ++i) { + numbers.push_back(i); + } + + std::vector copies; + RAJA::for_each(RAJA::make_span(numbers.data(), 11), [&](TypeParam& number) { + copies.push_back(number); + number += 1; + }); + + ASSERT_EQ(copies.size(), 11); + for (TypeParam i = 0; i < 11; ++i) { + ASSERT_EQ(numbers[i], copies[i]+1); + } +} + +TYPED_TEST(ForEachUnitTest, SetRange) +{ + std::set numbers; + for (TypeParam i = 0; i < 6; ++i) { + numbers.insert(i); + } + + std::vector copies; + RAJA::for_each(numbers, [&](TypeParam const& number) { + copies.push_back(number); + }); + + ASSERT_EQ(copies.size(), 6); + for (TypeParam i = 0; i < 6; ++i) { + ASSERT_EQ(i, copies[i]); + ASSERT_EQ(numbers.count(i), 1); + } +} + + +TYPED_TEST(ForEachUnitTest, EmptyTypeList) +{ + using numbers = camp::list<>; + + std::vector copies; + RAJA::for_each_type(numbers{}, [&](auto number) { + copies.push_back(number); + }); + + ASSERT_EQ(copies.size(), 0); +} + + +template < typename T, T val > +T get_num(std::integral_constant) +{ + return val; +} + +template < typename TypeParam, + std::enable_if_t::value>* = nullptr > +void run_int_type_test() +{ + using numbers = camp::list, + std::integral_constant, + std::integral_constant, + std::integral_constant, + std::integral_constant>; + + std::vector copies; + RAJA::for_each_type(numbers{}, [&](auto number) { + copies.push_back(get_num(number)); + }); + + ASSERT_EQ(copies.size(), 5); + for (TypeParam i = 0; i < 5; ++i) { + ASSERT_EQ(i, copies[i]); + } +} +/// +template < typename TypeParam, + std::enable_if_t::value>* = nullptr > +void run_int_type_test() +{ + // ignore non-ints +} + +TYPED_TEST(ForEachUnitTest, IntTypeList) +{ + run_int_type_test(); +} diff --git a/test/unit/algorithm/test-algorithm-util-reduce.cpp.in b/test/unit/algorithm/test-algorithm-util-reduce.cpp.in new file mode 100644 index 0000000000..d7dd20bcd2 --- /dev/null +++ b/test/unit/algorithm/test-algorithm-util-reduce.cpp.in @@ -0,0 +1,36 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +// +// test/include headers +// +#include "RAJA_test-base.hpp" +#include "RAJA_test-camp.hpp" + +// +// Header for tests in ./tests directory +// +// Note: CMake adds ./tests as an include dir for these tests. +// +#include "test-algorithm-util-reduce.hpp" + + +// +// Cartesian product of types used in parameterized tests +// +using @ALG_BACKEND@@UTIL_ALG@ReduceTypes = + Test< camp::cartesian_product<@ALG_BACKEND@@UTIL_ALG@ReduceReducers, + @ALG_BACKEND@ResourceList, + ReduceValTypeList, + ReduceMaxNList@ALG_SIZE@ > >::Types; + +// +// Instantiate parameterized test +// +INSTANTIATE_TYPED_TEST_SUITE_P( @ALG_BACKEND@, + ReduceUnitTest, + @ALG_BACKEND@@UTIL_ALG@ReduceTypes ); diff --git a/test/unit/algorithm/test-algorithm-util-sort.cpp.in b/test/unit/algorithm/test-algorithm-util-sort.cpp.in index 7dbb0dcd93..0555a9e9f0 100644 --- a/test/unit/algorithm/test-algorithm-util-sort.cpp.in +++ b/test/unit/algorithm/test-algorithm-util-sort.cpp.in @@ -22,15 +22,15 @@ // // Cartesian product of types used in parameterized tests // -using @SORT_BACKEND@@UTIL_SORT@SortTypes = - Test< camp::cartesian_product<@SORT_BACKEND@@UTIL_SORT@SortSorters, - @SORT_BACKEND@ResourceList, +using @ALG_BACKEND@@UTIL_ALG@SortTypes = + Test< camp::cartesian_product<@ALG_BACKEND@@UTIL_ALG@SortSorters, + @ALG_BACKEND@ResourceList, SortKeyTypeList, - SortMaxNList@SORT_SIZE@ > >::Types; + SortMaxNList@ALG_SIZE@ > >::Types; // // Instantiate parameterized test // -INSTANTIATE_TYPED_TEST_SUITE_P( @SORT_BACKEND@, +INSTANTIATE_TYPED_TEST_SUITE_P( @ALG_BACKEND@, SortUnitTest, - @SORT_BACKEND@@UTIL_SORT@SortTypes ); + @ALG_BACKEND@@UTIL_ALG@SortTypes ); diff --git a/test/unit/algorithm/tests/test-algorithm-reduce-utils.hpp b/test/unit/algorithm/tests/test-algorithm-reduce-utils.hpp new file mode 100644 index 0000000000..4e3f9fb795 --- /dev/null +++ b/test/unit/algorithm/tests/test-algorithm-reduce-utils.hpp @@ -0,0 +1,350 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC. +// +// Produced at the Lawrence Livermore National Laboratory +// +// LLNL-CODE-689114 +// +// All rights reserved. +// +// This file is part of RAJA. +// +// For details about use and distribution, please read RAJA/LICENSE. 
+// +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +/// +/// Header file containing test infrastructure for reduce tests +/// + +#ifndef __TEST_ALGORITHM_REDUCE_UTILS_HPP__ +#define __TEST_ALGORITHM_REDUCE_UTILS_HPP__ + +#include "RAJA_test-base.hpp" +#include "RAJA_test-camp.hpp" +#include "RAJA_test-forall-data.hpp" +#include "type_helper.hpp" +#include "RAJA_unit-test-forone.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include + + +// tag classes to differentiate reduce by attributes and apply correct testing +struct left_fold_reduce_tag { }; +struct unordered_reduce_tag { }; + +struct reduce_interface_tag { }; + +struct reduce_default_interface_tag { }; +struct reduce_init_interface_tag { }; +struct reduce_init_op_interface_tag { }; + + +// synchronize based on a RAJA execution policy +template < typename policy > +struct PolicySynchronize +{ + void synchronize() + { + // no synchronization needed + } +}; + +#if defined(RAJA_ENABLE_CUDA) +// partial specialization for cuda_exec +template < size_t BLOCK_SIZE, bool Async > +struct PolicySynchronize> +{ + void synchronize() + { + if (Async) { RAJA::synchronize(); } + } +}; +#endif + +#if defined(RAJA_ENABLE_HIP) +// partial specialization for hip_exec +template < size_t BLOCK_SIZE, bool Async > +struct PolicySynchronize> +{ + void synchronize() + { + if (Async) { RAJA::synchronize(); } + } +}; +#endif + + +template +struct ReduceData; + +template +struct ReduceData +{ + ValType* values = nullptr; + ValType* reduced_value = nullptr; + Res m_res; + + template < typename RandomGenerator > + ReduceData(size_t N, Res res, RandomGenerator gen_random) + : m_res(res) + { + if (N > 0) { + values = m_res.template allocate(N, camp::resources::MemoryAccess::Managed); + } + reduced_value = m_res.template allocate(1, camp::resources::MemoryAccess::Managed); + + for (size_t i = 0; i < N; i++) { + values[i] = gen_random(); + } + } + + void copy_data(size_t N) + { + if ( N == 0 ) return; + } + + Res resource() + { + return m_res; + } + + ReduceData(ReduceData const&) = delete; + ReduceData& operator=(ReduceData const&) = delete; + + ~ReduceData() + { + if (values != nullptr) { + m_res.deallocate(values, camp::resources::MemoryAccess::Managed); + m_res.deallocate(reduced_value, camp::resources::MemoryAccess::Managed); + } + } +}; + + +template +void doReduce(ReduceData & data, + RAJA::Index_type N, + T, + BinaryOp, + Reducer reducer, reduce_interface_tag, reduce_default_interface_tag) +{ + data.copy_data(N); + data.resource().wait(); + reducer(data.reduced_value, RAJA::make_span(data.values, N)); + reducer.synchronize(); +} + +template +void doReduce(ReduceData & data, + RAJA::Index_type N, + T init, + BinaryOp, + Reducer reducer, reduce_interface_tag, reduce_init_interface_tag) +{ + data.copy_data(N); + data.resource().wait(); + reducer(data.reduced_value, RAJA::make_span(data.values, N), init); + reducer.synchronize(); +} + +template +void doReduce(ReduceData & data, + RAJA::Index_type N, + T init, + BinaryOp op, + Reducer reducer, reduce_interface_tag, reduce_init_op_interface_tag) +{ + data.copy_data(N); + data.resource().wait(); + reducer(data.reduced_value, RAJA::make_span(data.values, N), init, op); + reducer.synchronize(); +} + + +template +::testing::AssertionResult testReduce( + const char* test_name, + const unsigned seed, + ReduceData & data, + RAJA::Index_type N, + T init, + BinaryOp op, + TestReducer test_reducer, left_fold_reduce_tag, reduce_interface_tag si, 
BinaryOpInterface ci) +{ + doReduce(data, N, init, op, test_reducer, si, ci); + + T reduced_check_value = init; + for (RAJA::Index_type i = 0; i < N; i++) { + reduced_check_value = op(std::move(reduced_check_value), data.values[i]); + } + + if (reduced_check_value != *data.reduced_value) { + return ::testing::AssertionFailure() + << test_reducer.name() << " (left fold reduce) " << test_name + << " (with N " << N << " with seed " << seed << ")" + << " incorrect " << *data.reduced_value + << ", expected " << reduced_check_value; + } + + return ::testing::AssertionSuccess(); +} + +template +::testing::AssertionResult testReduce( + const char* test_name, + const unsigned seed, + ReduceData & data, + RAJA::Index_type N, + T init, + BinaryOp op, + TestReducer test_reducer, unordered_reduce_tag, reduce_interface_tag si, BinaryOpInterface ci) +{ + doReduce(data, N, init, op, test_reducer, si, ci); + + T reduced_check_value = init; + for (RAJA::Index_type i = 0; i < N; i++) { + reduced_check_value = op(std::move(reduced_check_value), data.values[i]); + } + + if (reduced_check_value != *data.reduced_value) { + return ::testing::AssertionFailure() + << test_reducer.name() << " (unordered reduce) " << test_name + << " (with N " << N << " with seed " << seed << ")" + << " incorrect " << *data.reduced_value + << ", expected " << reduced_check_value; + } + + return ::testing::AssertionSuccess(); +} + + +template +void testReducerInterfaces(unsigned seed, RAJA::Index_type MaxN, Reducer reducer, Res res) +{ + using reduce_category = typename Reducer::reduce_category ; + using interface_category = typename Reducer::reduce_interface ; + using no_init_operator = reduce_default_interface_tag; + using init_no_operator = reduce_init_interface_tag; + using init_operator = reduce_init_op_interface_tag; + + std::mt19937 rng(seed); + RAJA::Index_type N = std::uniform_int_distribution((MaxN+1)/2, MaxN)(rng); + std::uniform_int_distribution dist(-N, N); + + ReduceData data(N, res, [&](){ return dist(rng); }); + + ASSERT_TRUE(testReduce("default", seed, data, N, RAJA::operators::plus::identity(), RAJA::operators::plus{}, + reducer, reduce_category{}, interface_category{}, no_init_operator{})); + ASSERT_TRUE(testReduce("init", seed, data, N, ValType(N), RAJA::operators::plus{}, + reducer, reduce_category{}, interface_category{}, init_no_operator{})); + ASSERT_TRUE(testReduce("minimum", seed, data, N, ValType(0), RAJA::operators::minimum{}, + reducer, reduce_category{}, interface_category{}, init_operator{})); + ASSERT_TRUE(testReduce("Maximum", seed, data, N, ValType(0), RAJA::operators::maximum{}, + reducer, reduce_category{}, interface_category{}, init_operator{})); +} + +template +void testReducer(unsigned seed, RAJA::Index_type MaxN, Reducer reducer, Res res) +{ + testReducerInterfaces(seed, 0, reducer, res); + for (RAJA::Index_type n = 1; n <= MaxN; n *= 10) { + testReducerInterfaces(seed, n, reducer, res); + } +} + +inline unsigned get_random_seed() +{ + static unsigned seed = std::random_device{}(); + return seed; +} + + +TYPED_TEST_SUITE_P(ReduceUnitTest); + +template < typename T > +class ReduceUnitTest : public ::testing::Test +{ }; + +TYPED_TEST_P(ReduceUnitTest, UnitReduce) +{ + using Reducer = typename camp::at>::type; + using ResType = typename camp::at>::type; + using ValType = typename camp::at>::type; + using MaxNType = typename camp::at>::type; + + unsigned seed = get_random_seed(); + RAJA::Index_type MaxN = MaxNType::value; + Reducer reducer{}; + ResType res = ResType::get_default(); + + 
testReducer(seed, MaxN, reducer, res); +} + +REGISTER_TYPED_TEST_SUITE_P(ReduceUnitTest, UnitReduce); + + +// +// Key types for reduce tests +// +using ReduceValTypeList = + camp::list< + RAJA::Index_type, + int, +#if defined(RAJA_TEST_EXHAUSTIVE) + unsigned, + long long, + unsigned long long, + float, +#endif + double + >; + +// Max test lengths for reduce tests +using ReduceMaxNListDefault = + camp::list< + camp::num<10000> + >; + +using ReduceMaxNListSmall = + camp::list< + camp::num<1000> + >; + +using ReduceMaxNListTiny = + camp::list< + camp::num<100> + >; + +#endif //__TEST_ALGORITHM_REDUCE_UTILS_HPP__ + diff --git a/test/unit/algorithm/tests/test-algorithm-util-reduce.hpp b/test/unit/algorithm/tests/test-algorithm-util-reduce.hpp new file mode 100644 index 0000000000..062e0f9b91 --- /dev/null +++ b/test/unit/algorithm/tests/test-algorithm-util-reduce.hpp @@ -0,0 +1,205 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC. +// +// Produced at the Lawrence Livermore National Laboratory +// +// LLNL-CODE-689114 +// +// All rights reserved. +// +// This file is part of RAJA. +// +// For details about use and distribution, please read RAJA/LICENSE. +// +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +/// +/// Header file containing Reducer classes for util reduce tests +/// + +#ifndef __TEST_ALGORITHM_UTIL_REDUCE_HPP__ +#define __TEST_ALGORITHM_UTIL_REDUCE_HPP__ + +#include "test-algorithm-reduce-utils.hpp" + + +template < typename test_policy > +using ForoneSynchronize = PolicySynchronize>; + + +template < typename test_policy, typename platform = test_platform > +struct BinaryTreeReduce; + +template < typename test_policy, typename platform = test_platform > +struct Accumulate; + + +template < typename test_policy > +struct BinaryTreeReduce + : ForoneSynchronize +{ + using reduce_category = unordered_reduce_tag; + using reduce_interface = reduce_interface_tag; + + const char* name() + { + return "RAJA::binary_tree_reduce"; + } + + template < typename T, typename... Args > + void operator()(T* reduced_value, Args&&... args) + { + *reduced_value = RAJA::binary_tree_reduce(std::forward(args)...); + } +}; + +template < typename test_policy > +struct Accumulate + : ForoneSynchronize +{ + using reduce_category = left_fold_reduce_tag; + using reduce_interface = reduce_interface_tag; + + const char* name() + { + return "RAJA::accumulate"; + } + + template < typename T, typename... Args > + void operator()(T* reduced_value, Args&&... 
args) + { + *reduced_value = RAJA::accumulate(std::forward(args)...); + } +}; + +#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) + +template < typename test_policy > +struct BinaryTreeReduce + : ForoneSynchronize +{ + using reduce_category = unordered_reduce_tag; + using reduce_interface = reduce_interface_tag; + + std::string m_name; + + BinaryTreeReduce() + : m_name(std::string("RAJA::binary_tree_reduce<") + test_policy_info::name() + std::string(">")) + { } + + const char* name() + { + return m_name.c_str(); + } + + template < typename T, typename Container > + void operator()(T* reduced_value, Container&& c) + { + forone( [=] RAJA_DEVICE() { + *reduced_value = RAJA::binary_tree_reduce(c); + }); + } + + template < typename T, typename Container > + void operator()(T* reduced_value, Container&& c, RAJA::detail::ContainerVal init) + { + forone( [=] RAJA_DEVICE() { + *reduced_value = RAJA::binary_tree_reduce(c, init); + }); + } + + template < typename T, typename Container, typename BinaryOp > + void operator()(T* reduced_value, Container&& c, RAJA::detail::ContainerVal init, BinaryOp op) + { + forone( [=] RAJA_DEVICE() { + *reduced_value = RAJA::binary_tree_reduce(c, init, op); + }); + } +}; + +template < typename test_policy > +struct Accumulate + : ForoneSynchronize +{ + using reduce_category = left_fold_reduce_tag; + using reduce_interface = reduce_interface_tag; + + std::string m_name; + + Accumulate() + : m_name(std::string("RAJA::accumulate<") + test_policy_info::name() + std::string(">")) + { } + + const char* name() + { + return m_name.c_str(); + } + + template < typename T, typename Container > + void operator()(T* reduced_value, Container&& c) + { + forone( [=] RAJA_DEVICE() { + *reduced_value = RAJA::accumulate(c); + }); + } + + template < typename T, typename Container > + void operator()(T* reduced_value, Container&& c, RAJA::detail::ContainerVal init) + { + forone( [=] RAJA_DEVICE() { + *reduced_value = RAJA::accumulate(c, init); + }); + } + + template < typename T, typename Container, typename BinaryOp > + void operator()(T* reduced_value, Container&& c, RAJA::detail::ContainerVal init, BinaryOp op) + { + forone( [=] RAJA_DEVICE() { + *reduced_value = RAJA::accumulate(c, init, op); + }); + } +}; + +#endif + + +using SequentialBinaryTreeReduceReducers = + camp::list< + BinaryTreeReduce + >; + +using SequentialAccumulateReduceReducers = + camp::list< + Accumulate + >; + +#if defined(RAJA_ENABLE_CUDA) + +using CudaBinaryTreeReduceReducers = + camp::list< + BinaryTreeReduce + >; + +using CudaAccumulateReduceReducers = + camp::list< + Accumulate + >; + +#endif + +#if defined(RAJA_ENABLE_HIP) + +using HipBinaryTreeReduceReducers = + camp::list< + BinaryTreeReduce + >; + +using HipAccumulateReduceReducers = + camp::list< + Accumulate + >; + +#endif + +#endif //__TEST_ALGORITHM_UTIL_REDUCE_HPP__ + diff --git a/test/unit/util/CMakeLists.txt b/test/unit/util/CMakeLists.txt index fdec220da9..869b897714 100644 --- a/test/unit/util/CMakeLists.txt +++ b/test/unit/util/CMakeLists.txt @@ -21,4 +21,8 @@ raja_add_test( NAME test-span SOURCES test-span.cpp) +raja_add_test( + NAME test-fraction + SOURCES test-fraction.cpp) + add_subdirectory(operator) diff --git a/test/unit/util/test-fraction.cpp b/test/unit/util/test-fraction.cpp new file mode 100644 index 0000000000..5161b2bb3a --- /dev/null +++ b/test/unit/util/test-fraction.cpp @@ -0,0 +1,64 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National 
Security, LLC
+// and RAJA project contributors. See the RAJA/LICENSE file for details.
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+///
+/// Source file containing tests for Fraction
+///
+
+#include
+#include "RAJA_gtest.hpp"
+#include
+
+template < typename IntegerType, IntegerType numerator, IntegerType denominator >
+void testFractionMultiplyTypesValues()
+{
+  using Frac = RAJA::Fraction<IntegerType, numerator, denominator>;
+
+  ASSERT_EQ(Frac::multiply(IntegerType(0)), IntegerType(0));
+
+  ASSERT_EQ(Frac::multiply(IntegerType(1)),
+            IntegerType(double(numerator) / double(denominator)));
+
+  ASSERT_EQ(Frac::multiply(IntegerType(100)),
+            IntegerType(double(numerator) / double(denominator) * double(100)));
+
+  ASSERT_EQ(Frac::multiply(IntegerType(101)),
+            IntegerType(double(numerator) / double(denominator) * double(101)));
+
+  // Test where the naive algorithm causes overflow, when within the precision of double
+  if /*constexpr*/ (sizeof(IntegerType) < sizeof(double)) {
+
+    static constexpr IntegerType max = std::numeric_limits<IntegerType>::max();
+    static constexpr IntegerType val = (numerator > denominator) ?
+                                       (max / numerator * denominator) : max;
+
+    ASSERT_EQ(Frac::multiply(IntegerType(val)),
+              IntegerType(double(numerator) / double(denominator) * double(val)));
+  }
+
+}
+
+template < typename IntegerType >
+void testFractionMultiplyTypes()
+{
+  testFractionMultiplyTypesValues();
+  testFractionMultiplyTypesValues();
+  testFractionMultiplyTypesValues();
+  testFractionMultiplyTypesValues();
+  testFractionMultiplyTypesValues();
+  testFractionMultiplyTypesValues();
+}
+
+
+#define RAJA_FRACTION_RUN_TEST(test) \
+  test(); \
+  test();
+
+TEST(Fraction, basic_multiply_Fraction)
+{
+  RAJA_FRACTION_RUN_TEST(testFractionMultiplyTypes)
+}
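
Note (illustrative, not part of the patch): the test headers above only name the new execution and reduction policies added in this release. The sketch below shows one way those policies might be combined in user code, assuming a CUDA-enabled RAJA build. Only the policy names (cuda_exec_occ_fraction, RAJA::Fraction, cuda_reduce_atomic_host_init_block_fence) come from this patch; the Fraction arguments <size_t, 1, 2>, the function name, and the kernel body are hypothetical.

#include "RAJA/RAJA.hpp"

// Hypothetical example: sum a device-accessible array using the new
// occupancy-fraction execution policy and one of the new reduction
// policy variants exercised by the tests in this patch.
double sum_with_new_policies(const double* x, int N)
{
  // Run at an assumed one-half of the occupancy reported by the occupancy
  // calculator; the Fraction<size_t, 1, 2> arguments are an assumption.
  using exec_pol =
      RAJA::cuda_exec_occ_fraction<256, RAJA::Fraction<size_t, 1, 2>>;

  // Initialize reduction data on the host and use block-scope fences instead
  // of device-scope fences (name taken from RAJA_test-reducepol.hpp above).
  using reduce_pol = RAJA::cuda_reduce_atomic_host_init_block_fence;

  RAJA::ReduceSum<reduce_pol, double> sum(0.0);

  RAJA::forall<exec_pol>(RAJA::TypedRangeSegment<int>(0, N),
    [=] RAJA_DEVICE (int i) {
      sum += x[i];
    });

  return sum.get();
}

The HIP analogues listed in the same headers (hip_exec_occ_fraction, hip_reduce_atomic_host_init_block_fence) would follow the same pattern.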