diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 8c1f7a472e..fb6bc7055c 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -75,7 +75,7 @@ stages: include: - local: '.gitlab/custom-jobs-and-variables.yml' - project: 'radiuss/radiuss-shared-ci' - ref: 'v2023.12.3' + ref: 'v2024.04.0' file: 'pipelines/${CI_MACHINE}.yml' - artifact: '${CI_MACHINE}-jobs.yml' job: 'generate-job-lists' @@ -100,9 +100,11 @@ trigger-rajaperf: strategy: depend include: + - project: 'lc-templates/id_tokens' + file: 'id_tokens.yml' # [Optional] checks preliminary to running the actual CI test - project: 'radiuss/radiuss-shared-ci' - ref: 'v2023.12.3' + ref: 'v2024.04.0' file: 'utilities/preliminary-ignore-draft-pr.yml' # pipelines subscribed by the project - local: '.gitlab/subscribed-pipelines.yml' diff --git a/.gitlab/custom-jobs-and-variables.yml b/.gitlab/custom-jobs-and-variables.yml index eb7011b78a..62d7908945 100644 --- a/.gitlab/custom-jobs-and-variables.yml +++ b/.gitlab/custom-jobs-and-variables.yml @@ -19,17 +19,17 @@ variables: # Note: We repeat the reservation, necessary when jobs are manually re-triggered. RUBY_JOB_ALLOC: "--reservation=ci --nodes=1" # Project specific variants for ruby - PROJECT_RUBY_VARIANTS: "~shared +openmp +tests" + PROJECT_RUBY_VARIANTS: "~shared +openmp +vectorization +tests" # Project specific deps for ruby PROJECT_RUBY_DEPS: "" # Poodle # Arguments for top level allocation - POODLE_SHARED_ALLOC: "--exclusive --time=60 --nodes=1" + POODLE_SHARED_ALLOC: "--exclusive --time=120 --nodes=1" # Arguments for job level allocation POODLE_JOB_ALLOC: "--nodes=1" # Project specific variants for poodle - PROJECT_POODLE_VARIANTS: "~shared +openmp +tests" + PROJECT_POODLE_VARIANTS: "~shared +openmp +vectorization +tests" # Project specific deps for poodle PROJECT_POODLE_DEPS: "" @@ -39,26 +39,26 @@ variables: # Arguments for job level allocation CORONA_JOB_ALLOC: "--nodes=1 --begin-time=+5s" # Project specific variants for corona - PROJECT_CORONA_VARIANTS: "~shared ~openmp +tests" + PROJECT_CORONA_VARIANTS: "~shared ~openmp +vectorization +tests" # Project specific deps for corona PROJECT_CORONA_DEPS: "^blt@develop " # Tioga # Arguments for top level allocation - TIOGA_SHARED_ALLOC: "--exclusive --time-limit=60m --nodes=1 -o per-resource.count=2" + TIOGA_SHARED_ALLOC: "--exclusive --queue=pci --time-limit=60m --nodes=1 -o per-resource.count=2" # Arguments for job level allocation TIOGA_JOB_ALLOC: "--nodes=1 --begin-time=+5s" # Project specific variants for corona - PROJECT_TIOGA_VARIANTS: "~shared ~openmp +tests" + PROJECT_TIOGA_VARIANTS: "~shared +openmp +vectorization +tests" # Project specific deps for corona PROJECT_TIOGA_DEPS: "^blt@develop " # Lassen and Butte use a different job scheduler (spectrum lsf) that does not # allow pre-allocation the same way slurm does. 
  # Arguments for job level allocation
-  LASSEN_JOB_ALLOC: "1 -W 30 -q pci"
+  LASSEN_JOB_ALLOC: "1 -W 40 -q pci"
  # Project specific variants for lassen
-  PROJECT_LASSEN_VARIANTS: "~shared +openmp +tests cuda_arch=70"
+  PROJECT_LASSEN_VARIANTS: "~shared +openmp +vectorization +tests cuda_arch=70"
  # Project specific deps for lassen
  PROJECT_LASSEN_DEPS: "^blt@develop "
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1e4823564b..9e5ecec0b7 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -16,7 +16,7 @@ include(CMakeDependentOption)
 # Set version number
 set(RAJA_VERSION_MAJOR 2024)
 set(RAJA_VERSION_MINOR 02)
-set(RAJA_VERSION_PATCHLEVEL 1)
+set(RAJA_VERSION_PATCHLEVEL 2)

 if (RAJA_LOADED AND (NOT RAJA_LOADED STREQUAL "${RAJA_VERSION_MAJOR}.${RAJA_VERSION_MINOR}.${RAJA_VERSION_PATCHLEVEL}"))
   message(FATAL_ERROR "You are mixing RAJA versions. Loaded is ${RAJA_LOADED}, expected ${RAJA_VERSION_MAJOR}.${RAJA_VERSION_MINOR}.${RAJA_VERSION_PATCHLEVEL}")
diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md
index 2e26861191..c2df2a03ea 100644
--- a/RELEASE_NOTES.md
+++ b/RELEASE_NOTES.md
@@ -20,6 +20,39 @@ Notable changes include:

   * Bug fixes/improvements:

+Version 2024.02.2 -- Release date 2024-05-08
+============================================
+
+This release contains a bugfix and new execution policies that improve
+performance for GPU kernels with reductions.
+
+Notable changes include:
+
+  * New features / API changes:
+    * New GPU execution policies for CUDA and HIP added which provide
+      improved performance for GPU kernels with reductions. Please see the
+      RAJA User Guide for more information. Short summary:
+      * Option added to change max grid size in policies that use the
+        occupancy calculator.
+      * Policies added to run with max occupancy, a fraction of the
+        max occupancy, and to run with a "concretizer" which allows a
+        user to determine how to run based on what the occupancy
+        calculator determines about a kernel.
+      * Additional options to tune kernels containing reductions, such as
+        * an option to initialize data on host for reductions that use
+          atomic operations
+        * an option to avoid device scope memory fences
+    * Changed the SYCL thread index ordering in RAJA::launch to
+      follow the SYCL "row-major" convention. Please see the RAJA User
+      Guide for more information.
+
+  * Build changes/improvements:
+    * NONE.
+
+  * Bug fixes/improvements:
+    * Fixed issue in bump-style allocator used internally in RAJA::launch.
+
+
 Version 2024.02.1 -- Release date 2024-04-03
 ============================================
diff --git a/docs/Licenses/rocprim-license.txt b/docs/Licenses/rocprim-license.txt
new file mode 100644
index 0000000000..976ca2abb3
--- /dev/null
+++ b/docs/Licenses/rocprim-license.txt
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/docs/conf.py b/docs/conf.py
index 1570ed2888..3212170b30 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -88,7 +88,7 @@
 # The short X.Y version.
 version = u'2024.02'
 # The full version, including alpha/beta/rc tags.
-release = u'2024.02.1'
+release = u'2024.02.2'

 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.
diff --git a/docs/sphinx/user_guide/cook_book.rst b/docs/sphinx/user_guide/cook_book.rst
new file mode 100644
index 0000000000..91494f3674
--- /dev/null
+++ b/docs/sphinx/user_guide/cook_book.rst
@@ -0,0 +1,23 @@
+.. ##
+.. ## Copyright (c) 2016-24, Lawrence Livermore National Security, LLC
+.. ## and RAJA project contributors. See the RAJA/LICENSE file
+.. ## for details.
+.. ##
+.. ## SPDX-License-Identifier: (BSD-3-Clause)
+.. ##
+
+.. _cook-book-label:
+
+************************
+RAJA Cook Book
+************************
+
+The following sections show common use case patterns and the recommended
+RAJA features and policies to use with them. They are intended to provide
+users with complete usage examples beyond what can be found in other parts
+of the RAJA User Guide. In particular, the examples and discussion provide
+guidance on RAJA execution policy selection to improve the performance of
+user application codes.
+
+.. toctree::
+   :maxdepth: 2
+
+   cook_book/reduction
+
diff --git a/docs/sphinx/user_guide/cook_book/reduction.rst b/docs/sphinx/user_guide/cook_book/reduction.rst
new file mode 100644
index 0000000000..73843ebb40
--- /dev/null
+++ b/docs/sphinx/user_guide/cook_book/reduction.rst
@@ -0,0 +1,110 @@
+.. ##
+.. ## Copyright (c) 2016-24, Lawrence Livermore National Security, LLC
+.. ## and other RAJA project contributors. See the RAJA/LICENSE file
+.. ## for details.
+.. ##
+.. ## SPDX-License-Identifier: (BSD-3-Clause)
+.. ##
+
+.. _cook-book-reductions-label:
+
+=======================
+Cooking with Reductions
+=======================
+
+Please see the following section for overview discussion about RAJA reductions:
+
+ * :ref:`feat-reductions-label`.
+
+
+----------------------------
+Reductions with RAJA::forall
+----------------------------
+
+Here is the setup for a simple reduction example::
+
+  const int N = 1000;
+
+  int vec[N];
+
+  for (int i = 0; i < N; ++i) {
+
+    vec[i] = 1;
+
+  }
+
+Here a simple sum reduction is performed in a for loop::
+
+  int vsum = 0;
+
+  // Run a kernel using the reduction objects
+  for (int i = 0; i < N; ++i) {
+
+    vsum += vec[i];
+
+  }
+
+The results of these operations will yield the following values:
+
+ * ``vsum == 1000``
+
+RAJA uses policy types to specify how things are implemented.
+
+The forall *execution policy* specifies how the loop is run by the
+``RAJA::forall`` method. The following discussion includes examples of
+several other RAJA execution policies that could be applied.
+For example ``RAJA::seq_exec`` runs a C-style for loop sequentially on a CPU.
The
+``RAJA::cuda_exec_with_reduce<256>`` runs the loop as a CUDA GPU kernel with
+256 threads per block and other CUDA kernel launch parameters, like the
+number of blocks, optimized for performance with reducers::
+
+  using exec_policy = RAJA::seq_exec;
+  // using exec_policy = RAJA::omp_parallel_for_exec;
+  // using exec_policy = RAJA::omp_target_parallel_for_exec<256>;
+  // using exec_policy = RAJA::cuda_exec_with_reduce<256>;
+  // using exec_policy = RAJA::hip_exec_with_reduce<256>;
+  // using exec_policy = RAJA::sycl_exec<256>;
+
+The reduction policy specifies how the reduction is done and must match the
+execution policy. For example ``RAJA::seq_reduce`` does a sequential reduction
+and can only be used with sequential execution policies. The
+``RAJA::cuda_reduce_atomic`` policy uses atomics, if possible with the given
+data type, and can only be used with CUDA execution policies. The same pattern
+holds for other RAJA back-ends, such as HIP and OpenMP. Here are example RAJA
+reduction policies whose names indicate which execution policies they work
+with::
+
+  using reduce_policy = RAJA::seq_reduce;
+  // using reduce_policy = RAJA::omp_reduce;
+  // using reduce_policy = RAJA::omp_target_reduce;
+  // using reduce_policy = RAJA::cuda_reduce_atomic;
+  // using reduce_policy = RAJA::hip_reduce_atomic;
+  // using reduce_policy = RAJA::sycl_reduce;
+
+
+Here a simple sum reduction is performed using RAJA::
+
+  RAJA::ReduceSum<reduce_policy, int> vsum(0);
+
+  RAJA::forall<exec_policy>( RAJA::RangeSegment(0, N),
+    [=](RAJA::Index_type i) {
+
+    vsum += vec[i];
+
+  });
+
+The results of these operations will yield the following values:
+
+ * ``vsum.get() == 1000``
+
+
+Another option for the execution policy when using the CUDA or HIP back-ends is
+to use the base policies, which have a boolean parameter to choose between the
+general use ``cuda/hip_exec`` policy and the ``cuda/hip_exec_with_reduce``
+policy::
+
+  // static constexpr bool with_reduce = ...;
+  // using exec_policy = RAJA::cuda_exec_base<with_reduce, 256>;
+  // using exec_policy = RAJA::hip_exec_base<with_reduce, 256>;
+
+Another option for the reduction policy when using the CUDA or HIP back-ends is
+to use the base policies, which have a boolean parameter to choose between the
+atomic ``cuda/hip_reduce_atomic`` policy and the non-atomic ``cuda/hip_reduce``
+policy::
+
+  // static constexpr bool with_atomic = ...;
+  // using reduce_policy = RAJA::cuda_reduce_base<with_atomic>;
+  // using reduce_policy = RAJA::hip_reduce_base<with_atomic>;
diff --git a/docs/sphinx/user_guide/feature/policies.rst b/docs/sphinx/user_guide/feature/policies.rst
index e61be4e598..facde1da5d 100644
--- a/docs/sphinx/user_guide/feature/policies.rst
+++ b/docs/sphinx/user_guide/feature/policies.rst
@@ -236,180 +236,264 @@ RAJA policies for GPU execution using CUDA or HIP are essentially identical.
The only difference is that CUDA policies have the prefix ``cuda_`` and
HIP policies have the prefix ``hip_``.
- ========================================= ============= =======================================
- CUDA/HIP Execution Policies Works with Brief description
- ========================================= ============= =======================================
- cuda/hip_exec forall, Execute loop iterations
- scan, directly mapped to global threads
- sort in a GPU kernel launched
- with given thread-block
- size and unbounded grid size.
- Note that the thread-block
- size must be provided,
- there is no default.
- cuda/hip_exec_grid forall, Execute loop iterations - mapped to global threads via - grid striding with multiple - iterations per global thread - in a GPU kernel launched - with given thread-block - size and grid size. - Note that the thread-block - size and grid size must be - provided, there is no default. - cuda/hip_exec_occ_calc forall Execute loop iterations - mapped to global threads via - grid striding with multiple - iterations per global thread - in a GPU kernel launched - with given thread-block - size and grid size bounded - by the maximum occupancy of - the kernel. Note that the - thread-block size must - be provided, there is no - default. Note this can improve - reducer performance in kernels - with large iteration counts. - cuda/hip_launch_t launch Launches a device kernel, - any code expressed within - the lambda is executed - on the device. - cuda/hip_thread_x_direct kernel (For) Map loop iterates - launch (loop) directly to GPU threads - in x-dimension, one - iterate per thread - (see note below about - limitations) - cuda/hip_thread_y_direct kernel (For) Same as above, but map - launch (loop) to threads in y-dim - cuda/hip_thread_z_direct kernel (For) Same as above, but map - launch (loop) to threads in z-dim - cuda/hip_thread_x_loop kernel (For) Similar to - launch (loop) thread-x-direct - policy, but use a - block-stride loop which - doesn't limit number of - loop iterates - cuda/hip_thread_y_loop kernel (For) Same as above, but for - launch (loop) threads in y-dimension - cuda/hip_thread_z_loop kernel (For) Same as above, but for - launch (loop) threads in z-dimension - cuda/hip_thread_syncable_loop kernel (For) Similar to thread-loop - launch (loop) policy, but safe to use - with Cuda/HipSyncThreads - cuda/hip_thread_size_x_direct kernel (For) Same as thread_x_direct - launch (loop) policy above but with - a compile time number of - threads - cuda/hip_thread_size_y_direct kernel (For) Same as above, but map - launch (loop) to threads in y-dim - cuda/hip_thread_size_z_direct kernel (For) Same as above, but map - launch (loop) to threads in z-dim - cuda/hip_flatten_threads_{xyz}_direct launch (loop) Reshapes threads in a - multi-dimensional thread - team into one-dimension, - accepts any permutation - of dimensions - cuda/hip_block_x_direct kernel (For) Map loop iterates - launch (loop) directly to GPU thread - blocks in x-dimension, - one iterate per block - cuda/hip_block_y_direct kernel (For) Same as above, but map - launch (loop) to blocks in y-dimension - cuda/hip_block_z_direct kernel (For) Same as above, but map - launch (loop) to blocks in z-dimension - cuda/hip_block_x_loop kernel (For) Similar to - launch (loop) block-x-direct policy, - but use a grid-stride - loop. - cuda/hip_block_y_loop kernel (For) Same as above, but use - launch (loop) blocks in y-dimension - cuda/hip_block_z_loop kernel (For) Same as above, but use - launch (loop) blocks in z-dimension - cuda/hip_block_size_x_direct kernel (For) Same as block_x_direct - launch (loop) policy above but with - a compile time number of - blocks - cuda/hip_block_size_y_direct kernel (For) Same as above, but map - launch (loop) to blocks in y-dim - cuda/hip_block_size_z_direct kernel (For) Same as above, but map - launch (loop) to blocks in z-dim - cuda/hip_global_x_direct kernel (For) Creates a unique thread - launch (loop) id for each thread on - x-dimension of the grid. - Same as computing - threadIdx.x + - threadDim.x * blockIdx.x. 
- cuda/hip_global_y_direct kernel (For) Same as above, but uses - launch (loop) globals in y-dimension. - cuda/hip_global_z_direct kernel (For) Same as above, but uses - launch (loop) globals in z-dimension. - cuda/hip_global_x_loop kernel (For) Similar to - launch (loop) global-x-direct policy, - but use a grid-stride - loop. - cuda/hip_global_y_loop kernel (For) Same as above, but use - launch (loop) globals in y-dimension - cuda/hip_global_z_loop kernel (For) Same as above, but use - launch (loop) globals in z-dimension - cuda/hip_global_size_x_direct kernel (For) Same as global_x_direct - launch (loop) policy above but with - a compile time block - size - cuda/hip_global_size_y_direct kernel (For) Same as above, but map - launch (loop) to globals in y-dim - cuda/hip_global_size_z_direct kernel (For) Same as above, but map - launch (loop) to globals in z-dim - cuda/hip_warp_direct kernel (For) Map work to threads - in a warp directly. - Cannot be used in - conjunction with - cuda/hip_thread_x_* - policies. - Multiple warps can be - created by using - cuda/hip_thread_y/z_* - policies. - cuda/hip_warp_loop kernel (For) Policy to map work to - threads in a warp using - a warp-stride loop. - Cannot be used in - conjunction with - cuda/hip_thread_x_* - policies. - Multiple warps can be - created by using - cuda/hip_thread_y/z_* - policies. - cuda/hip_warp_masked_direct> kernel (For) Policy to map work - directly to threads in a - warp using a bit mask. - Cannot be used in - conjunction with - cuda/hip_thread_x_* - policies. - Multiple warps can - be created by using - cuda/hip_thread_y/z_* - policies. - cuda/hip_warp_masked_loop> kernel (For) Policy to map work to - threads in a warp using - a bit mask and a - warp-stride loop. Cannot - be used in conjunction - with cuda/hip_thread_x_* - policies. Multiple warps - can be created by using - cuda/hip_thread_y/z_* - policies. - cuda/hip_block_reduce kernel Perform a reduction - (Reduce) across a single GPU - thread block. - cuda/hip_warp_reduce kernel Perform a reduction - (Reduce) across a single GPU - thread warp. - ========================================= ============= ======================================= ++----------------------------------------------------+---------------+---------------------------------+ +| CUDA/HIP Execution Policies | Works with | Brief description | ++====================================================+===============+=================================+ +| cuda/hip_exec | forall, | Execute loop iterations | +| | scan, | directly mapped to global | +| | sort | threads in a GPU kernel | +| | | launched with given threadblock | +| | | size and unbounded grid size. | +| | | Note that the threadblock | +| | | size must be provided. | +| | | There is no default. | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_exec_with_reduce | forall | The cuda/hip exec policy | +| | | recommended for use with | +| | | kernels containing reductions. | +| | | In general, using the occupancy | +| | | calculator policies improves | +| | | performance of kernels with | +| | | reductions. Exactly how much | +| | | occupancy to use differs by | +| | | platform. This policy provides | +| | | a simple way to get what works | +| | | well for a platform without | +| | | having to know the details. 
| ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_exec_base | forall | Choose between cuda/hip_exec | +| | | and cuda/hip_exec_with_reduce | +| | | policies based on the boolean | +| | | template parameter 'with_reduce'| ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_exec_grid | forall | Execute loop iterations | +| | | mapped to global threads via | +| | | grid striding with multiple | +| | | iterations per global thread | +| | | in a GPU kernel launched | +| | | with given thread-block | +| | | size and grid size. | +| | | Note that the thread-block | +| | | size and grid size must be | +| | | provided, there is no default. | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_exec_occ_max | forall | Execute loop iterations | +| | | mapped to global threads via | +| | | grid striding with multiple | +| | | iterations per global thread | +| | | in a GPU kernel launched | +| | | with given thread-block | +| | | size and grid size bounded | +| | | by the maximum occupancy of | +| | | the kernel. | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_exec_occ_calc | forall | Similar to the occ_max | +| | | policy but may use less | +| | | than the maximum occupancy | +| | | determined by the occupancy | +| | | calculator of the kernel for | +| | | performance reasons. | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_exec_occ_fraction> | | but use a fraction of the | +| | | maximum occupancy of the kernel.| +| | | | +| | | | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_exec_occ_custom | forall | Similar to the occ_max policy | +| | | policy but the grid size is | +| | | is determined by concretizer. | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_launch_t | launch | Launches a device kernel, any | +| | | code inside the lambda | +| | | expression is executed | +| | | on the device. | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_thread_x_direct | kernel (For) | Map loop iterates directly to | +| | launch (loop) | GPU threads in x-dimension, one | +| | | iterate per thread. See note | +| | | below about limitations. | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_thread_y_direct | kernel (For) | Same as above, but map | +| | launch (loop) | to threads in y-dimension. | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_thread_z_direct | kernel (For) | Same as above, but map | +| | launch (loop) | to threads in z-dimension. | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_thread_x_loop | kernel (For) | Similar to thread-x-direct | +| | launch (loop) | policy, but use a block-stride | +| | | loop which doesn't limit total | +| | | number of loop iterates. 
| ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_thread_y_loop | kernel (For) | Same as above, but for | +| | launch (loop) | threads in y-dimension. | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_thread_z_loop | kernel (For) | Same as above, but for | +| | launch (loop) | threads in z-dimension. | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_thread_syncable_loop | kernel (For) | Similar to thread-loop | +| | launch (loop) | policy, but safe to use | +| | | with Cuda/HipSyncThreads. | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_thread_size_x_direct | kernel (For) | Same as thread_x_direct | +| | launch (loop) | policy above but with | +| | | a compile time number of | +| | | threads. | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_thread_size_y_direct | kernel (For) | Same as above, but map | +| | launch (loop) | to threads in y-dimension | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_thread_size_z_direct | kernel (For) | Same as above, but map | +| | launch (loop) | to threads in z-dimension. | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_flatten_threads_{xyz}_direct | launch (loop) | Reshapes threads in a | +| | | multi-dimensional thread | +| | | team into one-dimension, | +| | | accepts any permutation | +| | | of dimensions | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_block_x_direct | kernel (For) | Map loop iterates | +| | launch (loop) | directly to GPU thread | +| | | blocks in x-dimension, | +| | | one iterate per block | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_block_y_direct | kernel (For) | Same as above, but map | +| | launch (loop) | to blocks in y-dimension | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_block_z_direct | kernel (For) | Same as above, but map | +| | launch (loop) | to blocks in z-dimension | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_block_x_loop | kernel (For) | Similar to | +| | launch (loop) | block-x-direct policy, | +| | | but use a grid-stride | +| | | loop. 
| ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_block_y_loop | kernel (For) | Same as above, but use | +| | launch (loop) | blocks in y-dimension | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_block_z_loop | kernel (For) | Same as above, but use | +| | launch (loop) | blocks in z-dimension | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_block_size_x_direct | kernel (For) | Same as block_x_direct | +| | launch (loop) | policy above but with | +| | | a compile time number of | +| | | blocks | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_block_size_y_direct | kernel (For) | Same as above, but map | +| | launch (loop) | to blocks in y-dim | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_block_size_z_direct | kernel (For) | Same as above, but map | +| | launch (loop) | to blocks in z-dim | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_global_x_direct | kernel (For) | Creates a unique thread | +| | launch (loop) | id for each thread on | +| | | x-dimension of the grid. | +| | | Same as computing | +| | | threadIdx.x + | +| | | threadDim.x * blockIdx.x. | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_global_y_direct | kernel (For) | Same as above, but uses | +| | launch (loop) | globals in y-dimension. | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_global_z_direct | kernel (For) | Same as above, but uses | +| | launch (loop) | globals in z-dimension. | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_global_x_loop | kernel (For) | Similar to | +| | launch (loop) | global-x-direct policy, | +| | | but use a grid-stride | +| | | loop. 
| ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_global_y_loop | kernel (For) | Same as above, but use | +| | launch (loop) | globals in y-dimension | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_global_z_loop | kernel (For) | Same as above, but use | +| | launch (loop) | globals in z-dimension | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_global_size_x_direct | kernel (For) | Same as global_x_direct | +| | launch (loop) | policy above but with | +| | | a compile time block | +| | | size | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_global_size_y_direct | kernel (For) | Same as above, but map | +| | launch (loop) | to globals in y-dim | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_global_size_z_direct | kernel (For) | Same as above, but map | +| | launch (loop) | to globals in z-dim | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_warp_direct | kernel (For) | Map work to threads | +| | | in a warp directly. | +| | | Cannot be used in | +| | | conjunction with | +| | | cuda/hip_thread_x_* | +| | | policies. | +| | | Multiple warps can be | +| | | created by using | +| | | cuda/hip_thread_y/z_* | +| | | policies. | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_warp_loop | kernel (For) | Map work to threads in a warp | +| | | using a warp-stride loop. | +| | | Cannot be used with | +| | | cuda/hip_thread_x_* policies. | +| | | Multiple warps can be created | +| | | by using cuda/hip_thread_y/z_* | +| | | policies. | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_warp_masked_direct> | kernel | Mmap work directly to threads | +| | (For) | in a warp using a bit mask. | +| | | Cannot be used with | +| | | cuda/hip_thread_x_* policies. | +| | | Multiple warps can be created | +| | | by using cuda/hip_thread_y/z_* | +| | | policies. | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_warp_masked_loop> | kernel | Map work to threads in a warp | +| | (For) | using a bit mask and a warp- | +| | | stride loop. | +| | | Cannot be used with | +| | | cuda/hip_thread_x_* policies. | +| | | Multiple warps can be created | +| | | by using cuda/hip_thread_y/z_* | +| | | policies. | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_block_reduce | kernel | Perform a reduction across a | +| | (Reduce) | single GPU thread block. | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_warp_reduce | kernel | Perform a reduction across a | +| | (Reduce) | single GPU thread warp. | +| | | thread warp. | ++----------------------------------------------------+---------------+---------------------------------+ + +When a CUDA or HIP policy leaves parameters like the block size and/or grid size +unspecified a concretizer object is used to decide those parameters. 
The +following concretizers are available to use in the ``cuda/hip_exec_occ_custom`` +policies: + ++----------------------------------------------------+-----------------------------------------+ +| Execution Policy | Brief description | ++====================================================+=========================================+ +| Cuda/HipDefaultConcretizer | The default concretizer, expected to | +| | provide good performance in general. | +| | Note that it may not use max occupancy. | ++----------------------------------------------------+-----------------------------------------+ +| Cuda/HipRecForReduceConcretizer | Expected to provide good performance | +| | in loops with reducers. | +| | Note that it may not use max occupancy. | ++----------------------------------------------------+-----------------------------------------+ +| Cuda/HipMaxOccupancyConcretizer | Uses max occupancy. | ++----------------------------------------------------+-----------------------------------------+ +| Cuda/HipAvoidDeviceMaxThreadOccupancyConcretizer | Avoids using the max occupancy of the | +| | device in terms of threads. | +| | Note that it may use the max occupancy | +| | of the kernel if that is below the max | +| | occupancy of the device. | ++----------------------------------------------------+-----------------------------------------+ +| Cuda/HipFractionOffsetOccupancyConcretizer< | Uses a fraction and offset to choose an | +| Fraction, | occupancy based on the max occupancy | +| BLOCKS_PER_SM_OFFSET> | Using the following formula: | +| | (Fraction * kernel_max_blocks_per_sm + | +| | BLOCKS_PER_SM_OFFSET) * sm_per_device | ++----------------------------------------------------+-----------------------------------------+ Several notable constraints apply to RAJA CUDA/HIP *direct* policies. @@ -473,99 +557,133 @@ write more explicit policies. ignored. For example in cuda_thread_x_direct block_size is unspecified so a runtime number of threads is used, but grid_size is ignored so blocks are ignored when getting indices. - + GPU Policies for SYCL ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - ======================================== ============= ============================== - SYCL Execution Policies Works with Brief description - ======================================== ============= ============================== - sycl_exec forall, Execute loop iterations - in a GPU kernel launched - with given work group - size. - sycl_launch_t launch Launches a sycl kernel, - any code express within - the lambda is executed - on the device. - sycl_global_0 kernel (For) Map loop iterates - directly to GPU global - ids in first - dimension, one iterate - per work item. Group - execution into work - groups of given size. - sycl_global_1 kernel (For) Same as above, but map - to global ids in second - dim - sycl_global_2 kernel (For) Same as above, but map - to global ids in third - dim - sycl_global_item_0 launch (loop) Creates a unique thread - id for each thread for - dimension 0 of the grid. - Same as computing - itm.get_group(0) * - itm.get_local_range(0) + - itm.get_local_id(0). - sycl_global_item_1 launch (loop) Same as above, but uses - threads in dimension 1 - Same as computing - itm.get_group(1) + - itm.get_local_range(1) * - itm.get_local_id(1). - sycl_global_item_2 launch (loop) Same as above, but uses - threads in dimension 2 - Same as computing - itm.get_group(2) + - itm.get_local_range(2) * - itm.get_local_id(2). 
- sycl_local_0_direct kernel (For) Map loop iterates - launch (loop) directly to GPU work - items in first - dimension, one iterate - per work item (see note - below about limitations) - sycl_local_1_direct kernel (For) Same as above, but map - launch (loop) to work items in second - dim - sycl_local_2_direct kernel (For) Same as above, but map - launch (loop) to work items in third - dim - sycl_local_0_loop kernel (For) Similar to - launch (loop) local-1-direct policy, - but use a work - group-stride loop which - doesn't limit number of - loop iterates - sycl_local_1_loop kernel (For) Same as above, but for - launch (loop) work items in second - dimension - sycl_local_2_loop kernel (For) Same as above, but for - launch (loop) work items in third - dimension - sycl_group_0_direct kernel (For) Map loop iterates - launch (loop) directly to GPU group - ids in first dimension, - one iterate per group - sycl_group_1_direct kernel (For) Same as above, but map - launch (loop) to groups in second - dimension - sycl_group_2_direct kernel (For) Same as above, but map - launch (loop) to groups in third - dimension - sycl_group_0_loop kernel (For) Similar to - launch (loop) group-1-direct policy, - but use a group-stride - loop. - sycl_group_1_loop kernel (For) Same as above, but use - launch (loop) groups in second - dimension - sycl_group_2_loop kernel (For) Same as above, but use - launch (loop) groups in third - dimension - - ======================================== ============= ============================== +.. note:: SYCL uses C++-style ordering for its work group and global thread + dimension/indexing types. This is due, in part, to SYCL's closer + alignment with C++ multi-dimensional indexing, which is "row-major". + This is the reverse of the thread indexing used in CUDA or HIP, + which is "column-major". For example, suppose we have a thread-block + or work-group where we specify the shape as (nx, ny, nz). Consider + an element in the thread-block or work-group with id (x, y, z). + In CUDA or HIP, the element index is x + y * nx + z * nx * ny. In + SYCL, the element index is z + y * nz + x * nz * ny. + + In terms of the CUDA or HIP built-in variables to support threads, + we have:: + + Thread ID: threadIdx.x/y/z + Block ID: blockIdx.x/y/z + Block dimension: blockDim.x/y/z + Grid dimension: gridDim.x/y/z + + The analogues in SYCL are:: + + Thread ID: sycl::nd_item.get_local_id(2/1/0) + Work-group ID: sycl::nd_item.get_group(2/1/0) + Work-group dimensions: sycl::nd_item.get_local_range().get(2/1/0) + ND-range dimensions: sycl::nd_item.get_group_range(2/1/0) + + When using ``RAJA::launch``, thread and block configuration + follows CUDA and HIP programming models and is always + configured in three-dimensions. This means that SYCL dimension + 2 always exists and should be used as one would use the + x dimension for CUDA and HIP. + + Similarly, ``RAJA::kernel`` uses a three-dimensional work-group + configuration. SYCL imension 2 always exists and should be used as + one would use the x dimension in CUDA and HIP. + +======================================== ============= ============================== +SYCL Execution Policies Works with Brief description +======================================== ============= ============================== +sycl_exec forall, Execute loop iterations + in a GPU kernel launched + with given work group + size. +sycl_launch_t launch Launches a sycl kernel, + any code express within + the lambda is executed + on the device. 
+sycl_global_0 kernel (For) Map loop iterates + directly to GPU global + ids in first + dimension, one iterate + per work item. Group + execution into work + groups of given size. +sycl_global_1 kernel (For) Same as above, but map + to global ids in second + dim +sycl_global_2 kernel (For) Same as above, but map + to global ids in third + dim +sycl_global_item_0 launch (loop) Creates a unique thread + id for each thread for + dimension 0 of the grid. + Same as computing + itm.get_group(0) * + itm.get_local_range(0) + + itm.get_local_id(0). +sycl_global_item_1 launch (loop) Same as above, but uses + threads in dimension 1 + Same as computing + itm.get_group(1) + + itm.get_local_range(1) * + itm.get_local_id(1). +sycl_global_item_2 launch (loop) Same as above, but uses + threads in dimension 2 + Same as computing + itm.get_group(2) + + itm.get_local_range(2) * + itm.get_local_id(2). +sycl_local_0_direct kernel (For) Map loop iterates + launch (loop) directly to GPU work + items in first + dimension, one iterate + per work item (see note + below about limitations) +sycl_local_1_direct kernel (For) Same as above, but map + launch (loop) to work items in second + dim +sycl_local_2_direct kernel (For) Same as above, but map + launch (loop) to work items in third + dim +sycl_local_0_loop kernel (For) Similar to + launch (loop) local-1-direct policy, + but use a work + group-stride loop which + doesn't limit number of + loop iterates +sycl_local_1_loop kernel (For) Same as above, but for + launch (loop) work items in second + dimension +sycl_local_2_loop kernel (For) Same as above, but for + launch (loop) work items in third + dimension +sycl_group_0_direct kernel (For) Map loop iterates + launch (loop) directly to GPU group + ids in first dimension, + one iterate per group +sycl_group_1_direct kernel (For) Same as above, but map + launch (loop) to groups in second + dimension +sycl_group_2_direct kernel (For) Same as above, but map + launch (loop) to groups in third + dimension +sycl_group_0_loop kernel (For) Similar to + launch (loop) group-1-direct policy, + but use a group-stride + loop. +sycl_group_1_loop kernel (For) Same as above, but use + launch (loop) groups in second + dimension +sycl_group_2_loop kernel (For) Same as above, but use + launch (loop) groups in third + dimension +======================================== ============= ============================== OpenMP Target Offload Policies ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -676,26 +794,57 @@ It is important to note the following constraints about RAJA reduction usage: The following table summarizes RAJA reduction policy types: -======================= ============= ========================================== -Reduction Policy Loop Policies Brief description - to Use With -======================= ============= ========================================== -seq_reduce seq_exec, Non-parallel (sequential) reduction. -omp_reduce any OpenMP OpenMP parallel reduction. - policy -omp_reduce_ordered any OpenMP OpenMP parallel reduction with result - policy guaranteed to be reproducible. -omp_target_reduce any OpenMP OpenMP parallel target offload reduction. - target policy -cuda/hip_reduce any CUDA/HIP Parallel reduction in a CUDA/HIP kernel - policy (device synchronization will occur when - reduction value is finalized). -cuda/hip_reduce_atomic any CUDA/HIP Same as above, but reduction may use CUDA - policy atomic operations. 
-sycl_reduce any SYCL Reduction in a SYCL kernel (device - policy synchronization will occur when the - reduction value is finalized). -======================= ============= ========================================== +================================================= ============= ========================================== +Reduction Policy Loop Policies Brief description + to Use With +================================================= ============= ========================================== +seq_reduce seq_exec, Non-parallel (sequential) reduction. +omp_reduce any OpenMP OpenMP parallel reduction. + policy +omp_reduce_ordered any OpenMP OpenMP parallel reduction with result + policy guaranteed to be reproducible. +omp_target_reduce any OpenMP OpenMP parallel target offload reduction. + target policy +cuda/hip_reduce any CUDA/HIP Parallel reduction in a CUDA/HIP kernel + policy (device synchronization will occur when + reduction value is finalized). +cuda/hip_reduce_atomic any CUDA/HIP Same as above, but reduction may use + policy atomic operations leading to run to run + variability in the results. +cuda/hip_reduce_base any CUDA/HIP Choose between cuda/hip_reduce and + policy cuda/hip_reduce_atomic policies based on + the with_atomic boolean. +cuda/hip_reduce_device_fence any CUDA/HIP Same as above, and reduction uses normal + policy memory accesses that are not visible across + the whole device and device scope fences + to ensure visibility and ordering. + This works on all architectures but + incurs higher overheads on some architectures. +cuda/hip_reduce_block_fence any CUDA/HIP Same as above, and reduction uses special + policy memory accesses to a level of cache + visible to the whole device and block scope + fences to ensure ordering. This improves + performance on some architectures. +cuda/hip_reduce_atomic_host_init_device_fence any CUDA/HIP Same as above with device fence, but + policy initializes the memory used for atomics + on the host. This works well on recent + architectures and incurs lower overheads. +cuda/hip_reduce_atomic_host_init_block_fence any CUDA/HIP Same as above with block fence, but + policy initializes the memory used for atomics + on the host. This works well on recent + architectures and incurs lower overheads. +cuda/hip_reduce_atomic_device_init_device_fence any CUDA/HIP Same as above with device fence, but + policy initializes the memory used for atomics + on the device. This works on all architectures + but incurs higher overheads. +cuda/hip_reduce_atomic_device_init_block_fence any CUDA/HIP Same as above with block fence, but + policy initializes the memory used for atomics + on the device. This works on all architectures + but incurs higher overheads. +sycl_reduce any SYCL Reduction in a SYCL kernel (device + policy synchronization will occur when the + reduction value is finalized). +================================================= ============= ========================================== .. note:: RAJA reductions used with SIMD execution policies are not guaranteed to generate correct results. So they should not be used diff --git a/docs/sphinx/user_guide/feature/reduction.rst b/docs/sphinx/user_guide/feature/reduction.rst index 8643e4a225..5f2f09afad 100644 --- a/docs/sphinx/user_guide/feature/reduction.rst +++ b/docs/sphinx/user_guide/feature/reduction.rst @@ -39,6 +39,10 @@ RAJA reductions: * :ref:`tut-reduction-label`. +Please see the following cook book sections for guidance on policy usage: + + * :ref:`cook-book-reductions-label`. 
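+
+For example, pairing an execution policy with its matching reduction policy
+and a ``RAJA::ReduceSum`` object gives a complete sum reduction. The minimal
+sketch below uses the sequential policies; the cook book lists the CUDA, HIP,
+OpenMP, and SYCL policy pairs to substitute for other back-ends::
+
+    const int N = 1000;
+    int vec[N];
+    for (int i = 0; i < N; ++i) { vec[i] = 1; }
+
+    using exec_policy   = RAJA::seq_exec;
+    using reduce_policy = RAJA::seq_reduce;
+
+    RAJA::ReduceSum<reduce_policy, int> vsum(0);
+
+    RAJA::forall<exec_policy>(RAJA::RangeSegment(0, N),
+      [=](RAJA::Index_type i) {
+        vsum += vec[i];
+    });
+
+    // vsum.get() == 1000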
+ ---------------- Reduction Types diff --git a/docs/sphinx/user_guide/index.rst b/docs/sphinx/user_guide/index.rst index f2fb6ca46d..f73f4d9449 100644 --- a/docs/sphinx/user_guide/index.rst +++ b/docs/sphinx/user_guide/index.rst @@ -32,5 +32,6 @@ to use RAJA in an application can be found in :ref:`app-considerations-label`. using_raja config_options features + cook_book app_considerations tutorial diff --git a/include/RAJA/RAJA.hpp b/include/RAJA/RAJA.hpp index f41aad477b..c37ac997a4 100644 --- a/include/RAJA/RAJA.hpp +++ b/include/RAJA/RAJA.hpp @@ -33,8 +33,10 @@ #include "RAJA/util/camp_aliases.hpp" #include "RAJA/util/macros.hpp" #include "RAJA/util/types.hpp" +#include "RAJA/util/math.hpp" #include "RAJA/util/plugins.hpp" #include "RAJA/util/Registry.hpp" +#include "RAJA/util/for_each.hpp" // @@ -57,13 +59,6 @@ // #include "RAJA/policy/sequential.hpp" -// -// NOTE: LOOP POLCIES WERE DEPRECATED IN 2023.03.0 RELEASE. -// THEY ARE RE-ADDED HERE AT REQUEST OF USERS. -// THEY WILL BE REMOVED AGAIN IN THE FUTURE. -// -#include "RAJA/policy/loop.hpp" - // // All platforms should support simd and vector execution. // @@ -155,6 +150,11 @@ // #include "RAJA/util/sort.hpp" +// +// reduce algorithms +// +#include "RAJA/util/reduce.hpp" + // // WorkPool, WorkGroup, WorkSite objects // diff --git a/include/RAJA/pattern/launch/launch_core.hpp b/include/RAJA/pattern/launch/launch_core.hpp index 6f56f4ed65..213c435236 100644 --- a/include/RAJA/pattern/launch/launch_core.hpp +++ b/include/RAJA/pattern/launch/launch_core.hpp @@ -174,10 +174,14 @@ class LaunchContext template RAJA_HOST_DEVICE T* getSharedMemory(size_t bytes) { - T * mem_ptr = &((T*) shared_mem_ptr)[shared_mem_offset]; + + //Calculate offset in bytes with a char pointer + void* mem_ptr = static_cast(shared_mem_ptr) + shared_mem_offset; shared_mem_offset += bytes*sizeof(T); - return mem_ptr; + + //convert to desired type + return static_cast(mem_ptr); } /* diff --git a/include/RAJA/policy/cuda/MemUtils_CUDA.hpp b/include/RAJA/policy/cuda/MemUtils_CUDA.hpp index 2a8f848825..43d927acab 100644 --- a/include/RAJA/policy/cuda/MemUtils_CUDA.hpp +++ b/include/RAJA/policy/cuda/MemUtils_CUDA.hpp @@ -61,7 +61,7 @@ struct PinnedAllocator { return ptr; } - // returns true on success, false on failure + // returns true on success, throws a run time error exception on failure bool free(void* ptr) { cudaErrchk(cudaFreeHost(ptr)); @@ -80,7 +80,7 @@ struct DeviceAllocator { return ptr; } - // returns true on success, false on failure + // returns true on success, throws a run time error exception on failure bool free(void* ptr) { cudaErrchk(cudaFree(ptr)); @@ -103,7 +103,31 @@ struct DeviceZeroedAllocator { return ptr; } - // returns true on success, false on failure + // returns true on success, throws a run time error exception on failure + bool free(void* ptr) + { + cudaErrchk(cudaFree(ptr)); + return true; + } +}; + +//! 
Allocator for device pinned memory for use in basic_mempool +struct DevicePinnedAllocator { + + // returns a valid pointer on success, nullptr on failure + void* malloc(size_t nbytes) + { + int device; + cudaErrchk(cudaGetDevice(&device)); + void* ptr; + cudaErrchk(cudaMallocManaged(&ptr, nbytes, cudaMemAttachGlobal)); + cudaErrchk(cudaMemAdvise(ptr, nbytes, cudaMemAdviseSetPreferredLocation, device)); + cudaErrchk(cudaMemAdvise(ptr, nbytes, cudaMemAdviseSetAccessedBy, cudaCpuDeviceId)); + + return ptr; + } + + // returns true on success, throws a run time error exception on failure bool free(void* ptr) { cudaErrchk(cudaFree(ptr)); @@ -114,6 +138,7 @@ struct DeviceZeroedAllocator { using device_mempool_type = basic_mempool::MemPool; using device_zeroed_mempool_type = basic_mempool::MemPool; +using device_pinned_mempool_type = basic_mempool::MemPool; using pinned_mempool_type = basic_mempool::MemPool; namespace detail @@ -279,6 +304,7 @@ RAJA_INLINE typename std::remove_reference::type make_launch_body( return return_type(std::forward(loop_body)); } +//! Get the properties of the current device RAJA_INLINE cudaDeviceProp get_device_prop() { @@ -289,194 +315,217 @@ cudaDeviceProp get_device_prop() return prop; } +//! Get a copy of the device properties, this copy is cached on first use to speedup later calls RAJA_INLINE cudaDeviceProp& device_prop() { - static cudaDeviceProp prop = get_device_prop(); + static thread_local cudaDeviceProp prop = get_device_prop(); return prop; } +static constexpr int cuda_occupancy_uninitialized_int = -1; +static constexpr size_t cuda_occupancy_uninitialized_size_t = + std::numeric_limits::max(); + +//! Struct with the maximum theoretical occupancy of the device struct CudaFixedMaxBlocksData { - int multiProcessorCount; - int maxThreadsPerMultiProcessor; + int device_sm_per_device = cuda::device_prop().multiProcessorCount; + int device_max_threads_per_sm = cuda::device_prop().maxThreadsPerMultiProcessor; }; +//! Get the maximum theoretical occupancy of the device RAJA_INLINE -size_t cuda_max_blocks(size_t block_size) +CudaFixedMaxBlocksData cuda_max_blocks() { - static CudaFixedMaxBlocksData data = []() { - cudaDeviceProp& prop = cuda::device_prop(); - return CudaFixedMaxBlocksData{prop.multiProcessorCount, - prop.maxThreadsPerMultiProcessor}; - }(); - - size_t max_blocks = data.multiProcessorCount * - (data.maxThreadsPerMultiProcessor / block_size); + static thread_local CudaFixedMaxBlocksData data; - return max_blocks; + return data; } +//! Struct with the maximum occupancy of a kernel in simple terms struct CudaOccMaxBlocksThreadsData { - size_t prev_shmem_size; - int max_blocks; - int max_threads; + size_t func_dynamic_shmem_per_block = cuda_occupancy_uninitialized_size_t; + int func_max_blocks_per_device = cuda_occupancy_uninitialized_int; + int func_max_threads_per_block = cuda_occupancy_uninitialized_int; }; -template < typename RAJA_UNUSED_ARG(UniqueMarker), typename Func > +//! 
Get the maximum occupancy of a kernel with unknown threads per block +template < typename RAJA_UNUSED_ARG(UniqueMarker) > RAJA_INLINE -void cuda_occupancy_max_blocks_threads(Func&& func, size_t shmem_size, - int &max_blocks, int &max_threads) +CudaOccMaxBlocksThreadsData cuda_occupancy_max_blocks_threads(const void* func, + size_t func_dynamic_shmem_per_block) { - static constexpr int uninitialized = -1; - static constexpr size_t uninitialized_size_t = std::numeric_limits::max(); - static thread_local CudaOccMaxBlocksThreadsData data = { - uninitialized_size_t, uninitialized, uninitialized}; + static thread_local CudaOccMaxBlocksThreadsData data; - if (data.prev_shmem_size != shmem_size) { + if (data.func_dynamic_shmem_per_block != func_dynamic_shmem_per_block) { - cudaErrchk(cudaOccupancyMaxPotentialBlockSize( - &data.max_blocks, &data.max_threads, func, shmem_size)); + data.func_dynamic_shmem_per_block = func_dynamic_shmem_per_block; - data.prev_shmem_size = shmem_size; + cudaErrchk(cudaOccupancyMaxPotentialBlockSize( + &data.func_max_blocks_per_device, &data.func_max_threads_per_block, func, func_dynamic_shmem_per_block)); } - max_blocks = data.max_blocks; - max_threads = data.max_threads; - + return data; } -struct CudaOccMaxBlocksFixedThreadsData +//! Struct with the maximum occupancy of a kernel in specific terms +struct CudaOccMaxBlocksData : CudaFixedMaxBlocksData { - size_t prev_shmem_size; - int max_blocks; - int multiProcessorCount; + size_t func_dynamic_shmem_per_block = cuda_occupancy_uninitialized_size_t; + int func_threads_per_block = cuda_occupancy_uninitialized_int; + int func_max_blocks_per_sm = cuda_occupancy_uninitialized_int; }; -template < typename RAJA_UNUSED_ARG(UniqueMarker), int num_threads, typename Func > +//! Get the maximum occupancy of a kernel with compile time threads per block +template < typename RAJA_UNUSED_ARG(UniqueMarker), int func_threads_per_block > RAJA_INLINE -void cuda_occupancy_max_blocks(Func&& func, size_t shmem_size, - int &max_blocks) +CudaOccMaxBlocksData cuda_occupancy_max_blocks(const void* func, + size_t func_dynamic_shmem_per_block) { - static constexpr int uninitialized = -1; - static constexpr size_t uninitialized_size_t = std::numeric_limits::max(); - static thread_local CudaOccMaxBlocksFixedThreadsData data = { - uninitialized_size_t, uninitialized, uninitialized}; - - if (data.prev_shmem_size != shmem_size) { - - cudaErrchk(cudaOccupancyMaxActiveBlocksPerMultiprocessor( - &data.max_blocks, func, num_threads, shmem_size)); + static thread_local CudaOccMaxBlocksData data; - if (data.multiProcessorCount == uninitialized) { + if (data.func_dynamic_shmem_per_block != func_dynamic_shmem_per_block) { - data.multiProcessorCount = cuda::device_prop().multiProcessorCount; + data.func_dynamic_shmem_per_block = func_dynamic_shmem_per_block; + data.func_threads_per_block = func_threads_per_block; - } - - data.max_blocks *= data.multiProcessorCount; - - data.prev_shmem_size = shmem_size; + cudaErrchk(cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &data.func_max_blocks_per_sm, func, func_threads_per_block, func_dynamic_shmem_per_block)); } - max_blocks = data.max_blocks; - + return data; } -struct CudaOccMaxBlocksVariableThreadsData -{ - size_t prev_shmem_size; - int prev_num_threads; - int max_blocks; - int multiProcessorCount; -}; - -template < typename RAJA_UNUSED_ARG(UniqueMarker), typename Func > +//! 
Get the maximum occupancy of a kernel with runtime threads per block +template < typename RAJA_UNUSED_ARG(UniqueMarker) > RAJA_INLINE -void cuda_occupancy_max_blocks(Func&& func, size_t shmem_size, - int &max_blocks, int num_threads) +CudaOccMaxBlocksData cuda_occupancy_max_blocks(const void* func, + size_t func_dynamic_shmem_per_block, int func_threads_per_block) { - static constexpr int uninitialized = 0; - static constexpr size_t uninitialized_size_t = std::numeric_limits::max(); - static thread_local CudaOccMaxBlocksVariableThreadsData data = { - uninitialized_size_t, uninitialized, uninitialized, uninitialized}; + static thread_local CudaOccMaxBlocksData data; - if ( data.prev_shmem_size != shmem_size || - data.prev_num_threads != num_threads ) { + if ( data.func_dynamic_shmem_per_block != func_dynamic_shmem_per_block || + data.func_threads_per_block != func_threads_per_block ) { - cudaErrchk(cudaOccupancyMaxActiveBlocksPerMultiprocessor( - &data.max_blocks, func, num_threads, shmem_size)); - - if (data.multiProcessorCount == uninitialized) { - - data.multiProcessorCount = cuda::device_prop().multiProcessorCount; - - } - - data.max_blocks *= data.multiProcessorCount; + data.func_dynamic_shmem_per_block = func_dynamic_shmem_per_block; + data.func_threads_per_block = func_threads_per_block; - data.prev_shmem_size = shmem_size; - data.prev_num_threads = num_threads; + cudaErrchk(cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &data.func_max_blocks_per_sm, func, func_threads_per_block, func_dynamic_shmem_per_block)); } - max_blocks = data.max_blocks; - + return data; } -struct CudaOccupancyDefaults + +/*! + ****************************************************************************** + * + * \brief Concretizer Implementation that chooses block size and/or grid + * size when one or both has not been specified at compile time. + * + * \tparam IdxT Index type to use for integer calculations. + * \tparam Concretizer Class that determines the max number of blocks to use + * when fitting for the device. + * \tparam UniqueMarker A type that is unique to each global function, used to + * help cache the occupancy data for that global function. + * + * The methods come in two flavors: + * - The fit_len methods choose grid and block sizes that result in a total + * number of threads of at least the len given in the constructor or 0 if + * that is not possible. + * - The fit_device methods choose grid and block sizes that best fit the + * occupancy of the global function according to the occupancy calculator and + * the Concretizer class. 
+ * + * Common terms: + * - block size - threads per block + * - grid size - blocks per device + * + ****************************************************************************** + */ +template < typename IdxT, typename Concretizer, typename UniqueMarker> +struct ConcretizerImpl { - CudaOccupancyDefaults(const void* RAJA_UNUSED_ARG(func)) + ConcretizerImpl(const void* func, size_t func_dynamic_shmem_per_block, IdxT len) + : m_func(func) + , m_func_dynamic_shmem_per_block(func_dynamic_shmem_per_block) + , m_len(len) { } - template < typename IdxT > - inline auto get_max_grid_size(size_t RAJA_UNUSED_ARG(dynamic_shmem_size), - IdxT RAJA_UNUSED_ARG(block_size)) const + IdxT get_max_block_size() const { - return std::numeric_limits::max(); + auto data = cuda_occupancy_max_blocks_threads( + m_func, m_func_dynamic_shmem_per_block); + IdxT func_max_threads_per_block = data.func_max_threads_per_block; + return func_max_threads_per_block; } - template < typename IdxT = cuda_dim_member_t > - inline auto get_max_block_size_and_grid_size(size_t RAJA_UNUSED_ARG(dynamic_shmem_size)) const + //! Get a block size when grid size is specified + IdxT get_block_size_to_fit_len(IdxT func_blocks_per_device) const { - return std::make_pair(static_cast(::RAJA::policy::cuda::MAX_BLOCK_SIZE), - std::numeric_limits::max()); + IdxT func_max_threads_per_block = this->get_max_block_size(); + IdxT func_threads_per_block = RAJA_DIVIDE_CEILING_INT(m_len, func_blocks_per_device); + if (func_threads_per_block <= func_max_threads_per_block) { + return func_threads_per_block; + } else { + return IdxT(0); + } } -}; -template < typename UniqueMarker > -struct CudaOccupancyCalculator -{ - CudaOccupancyCalculator(const void* func) - : m_func(func) - { } + //! Get a grid size when block size is specified + IdxT get_grid_size_to_fit_len(IdxT func_threads_per_block) const + { + IdxT func_blocks_per_device = RAJA_DIVIDE_CEILING_INT(m_len, func_threads_per_block); + return func_blocks_per_device; + } + + //! Get a block size and grid size when neither is specified + auto get_block_and_grid_size_to_fit_len() const + { + IdxT func_max_threads_per_block = this->get_max_block_size(); + IdxT func_blocks_per_device = RAJA_DIVIDE_CEILING_INT(m_len, func_max_threads_per_block); + return std::make_pair(func_max_threads_per_block, + func_blocks_per_device); + } + + //! Get a block size when grid size is specified + IdxT get_block_size_to_fit_device(IdxT func_blocks_per_device) const + { + IdxT func_max_threads_per_block = this->get_max_block_size(); + IdxT func_threads_per_block = RAJA_DIVIDE_CEILING_INT(m_len, func_blocks_per_device); + return std::min(func_threads_per_block, func_max_threads_per_block); + } - template < typename IdxT > - inline auto get_max_grid_size(size_t dynamic_shmem_size, IdxT block_size) const + //! 
Get a grid size when block size is specified + IdxT get_grid_size_to_fit_device(IdxT func_threads_per_block) const { - int max_grid_size = -1; - ::RAJA::cuda::cuda_occupancy_max_blocks( - m_func, dynamic_shmem_size, max_grid_size, block_size); - return static_cast(max_grid_size); + auto data = cuda_occupancy_max_blocks( + m_func, m_func_dynamic_shmem_per_block, func_threads_per_block); + IdxT func_max_blocks_per_device = Concretizer::template get_max_grid_size(data); + IdxT func_blocks_per_device = RAJA_DIVIDE_CEILING_INT(m_len, func_threads_per_block); + return std::min(func_blocks_per_device, func_max_blocks_per_device); } - template < typename IdxT = cuda_dim_member_t > - inline auto get_max_block_size_and_grid_size(size_t dynamic_shmem_size) const + //! Get a block size and grid size when neither is specified + auto get_block_and_grid_size_to_fit_device() const { - int max_block_size = -1; - int max_grid_size = -1; - ::RAJA::cuda::cuda_occupancy_max_blocks_threads( - m_func, dynamic_shmem_size, max_grid_size, max_block_size); - return std::make_pair(static_cast(max_block_size), - static_cast(max_grid_size)); + IdxT func_max_threads_per_block = this->get_max_block_size(); + IdxT func_blocks_per_device = this->get_grid_size_to_fit_device(func_max_threads_per_block); + return std::make_pair(func_max_threads_per_block, + func_blocks_per_device); } private: const void* m_func; + size_t m_func_dynamic_shmem_per_block; + IdxT m_len; }; } // namespace cuda diff --git a/include/RAJA/policy/cuda/forall.hpp b/include/RAJA/policy/cuda/forall.hpp index 3837a8b062..333f0f90e8 100644 --- a/include/RAJA/policy/cuda/forall.hpp +++ b/include/RAJA/policy/cuda/forall.hpp @@ -70,16 +70,17 @@ namespace impl * ****************************************************************************** */ -template +template struct ForallDimensionCalculator; // The general cases handle fixed BLOCK_SIZE > 0 and/or GRID_SIZE > 0 // there are specializations for named_usage::unspecified // but named_usage::ignored is not supported so no specializations are provided // and static_asserts in the general case catch unsupported values -template +template struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, ::RAJA::cuda::IndexGlobal, + Concretizer, UniqueMarker> { static_assert(BLOCK_SIZE > 0, "block size must be > 0 or named_usage::unspecified with forall"); @@ -91,8 +92,10 @@ struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, static void set_dimensions(internal::CudaDims& dims, IdxT len, const void* RAJA_UNUSED_ARG(func), size_t RAJA_UNUSED_ARG(dynamic_shmem_size)) { - if ( len > (static_cast(IndexGetter::block_size) * - static_cast(IndexGetter::grid_size)) ) { + const IdxT block_size = static_cast(IndexGetter::block_size); + const IdxT grid_size = static_cast(IndexGetter::grid_size); + + if ( len > (block_size * grid_size) ) { RAJA_ABORT_OR_THROW("len exceeds the size of the directly mapped index space"); } @@ -101,9 +104,10 @@ struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, } }; -template +template struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, ::RAJA::cuda::IndexGlobal, + Concretizer, UniqueMarker> { static_assert(GRID_SIZE > 0, "grid size must be > 0 or named_usage::unspecified with forall"); @@ -112,17 +116,26 @@ struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, template < typename IdxT > static void set_dimensions(internal::CudaDims& dims, IdxT len, - const void* RAJA_UNUSED_ARG(func), size_t RAJA_UNUSED_ARG(dynamic_shmem_size)) + 
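As a rough standalone sketch (not part of the patch), the "fit_len" sizing that `ConcretizerImpl` performs reduces to a ceiling divide plus a cap at the occupancy calculator's maximum block size; the function and parameter names below are illustrative only.

```cpp
#include <cstddef>

// Same rounding as RAJA_DIVIDE_CEILING_INT.
constexpr std::size_t divide_ceiling(std::size_t n, std::size_t d)
{
  return (n + d - 1) / d;
}

// Grid size when the block size is fixed (the get_grid_size_to_fit_len case).
std::size_t grid_to_fit_len(std::size_t len, std::size_t block_size)
{
  return divide_ceiling(len, block_size);
}

// Block size when the grid size is fixed (the get_block_size_to_fit_len case):
// returns 0 when even the kernel's occupancy-calculator maximum block size
// cannot cover len with that many blocks.
std::size_t block_to_fit_len(std::size_t len, std::size_t grid_size,
                             std::size_t occ_max_threads_per_block)
{
  const std::size_t block_size = divide_ceiling(len, grid_size);
  return (block_size <= occ_max_threads_per_block) ? block_size : 0;
}
```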
const void* func, size_t dynamic_shmem_size) { - // BEWARE: if calculated block_size is too high then the kernel launch will fail - internal::set_cuda_dim(dims.threads, RAJA_DIVIDE_CEILING_INT(len, static_cast(IndexGetter::grid_size))); - internal::set_cuda_dim(dims.blocks, static_cast(IndexGetter::grid_size)); + ::RAJA::cuda::ConcretizerImpl concretizer{func, dynamic_shmem_size, len}; + + const IdxT grid_size = static_cast(IndexGetter::grid_size); + const IdxT block_size = concretizer.get_block_size_to_fit_len(grid_size); + + if ( block_size == IdxT(0) ) { + RAJA_ABORT_OR_THROW("len exceeds the size of the directly mapped index space"); + } + + internal::set_cuda_dim(dims.threads, block_size); + internal::set_cuda_dim(dims.blocks, grid_size); } }; -template +template struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, ::RAJA::cuda::IndexGlobal, + Concretizer, UniqueMarker> { static_assert(BLOCK_SIZE > 0, "block size must be > 0 or named_usage::unspecified with forall"); @@ -131,16 +144,22 @@ struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, template < typename IdxT > static void set_dimensions(internal::CudaDims& dims, IdxT len, - const void* RAJA_UNUSED_ARG(func), size_t RAJA_UNUSED_ARG(dynamic_shmem_size)) + const void* func, size_t dynamic_shmem_size) { - internal::set_cuda_dim(dims.threads, static_cast(IndexGetter::block_size)); - internal::set_cuda_dim(dims.blocks, RAJA_DIVIDE_CEILING_INT(len, static_cast(IndexGetter::block_size))); + ::RAJA::cuda::ConcretizerImpl concretizer{func, dynamic_shmem_size, len}; + + const IdxT block_size = static_cast(IndexGetter::block_size); + const IdxT grid_size = concretizer.get_grid_size_to_fit_len(block_size); + + internal::set_cuda_dim(dims.threads, block_size); + internal::set_cuda_dim(dims.blocks, grid_size); } }; -template +template struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, ::RAJA::cuda::IndexGlobal, + Concretizer, UniqueMarker> { using IndexGetter = ::RAJA::cuda::IndexGlobal; @@ -149,104 +168,104 @@ struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, static void set_dimensions(internal::CudaDims& dims, IdxT len, const void* func, size_t dynamic_shmem_size) { - ::RAJA::cuda::CudaOccupancyCalculator oc(func); - auto max_sizes = oc.get_max_block_size_and_grid_size(dynamic_shmem_size); + ::RAJA::cuda::ConcretizerImpl concretizer{func, dynamic_shmem_size, len}; + + const auto sizes = concretizer.get_block_and_grid_size_to_fit_len(); - internal::set_cuda_dim(dims.threads, static_cast(max_sizes.first)); - internal::set_cuda_dim(dims.blocks, RAJA_DIVIDE_CEILING_INT(len, static_cast(max_sizes.first))); + internal::set_cuda_dim(dims.threads, sizes.first); + internal::set_cuda_dim(dims.blocks, sizes.second); } }; -template -struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, +template +struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, ::RAJA::cuda::IndexGlobal, + Concretizer, UniqueMarker> { static_assert(BLOCK_SIZE > 0, "block size must be > 0 or named_usage::unspecified with forall"); static_assert(GRID_SIZE > 0, "grid size must be > 0 or named_usage::unspecified with forall"); - using IndexMapper = ::RAJA::cuda::IndexGlobal; + using IndexGetter = ::RAJA::cuda::IndexGlobal; template < typename IdxT > static void set_dimensions(internal::CudaDims& dims, IdxT RAJA_UNUSED_ARG(len), const void* RAJA_UNUSED_ARG(func), size_t RAJA_UNUSED_ARG(dynamic_shmem_size)) { - internal::set_cuda_dim(dims.threads, static_cast(IndexMapper::block_size)); 
- internal::set_cuda_dim(dims.blocks, static_cast(IndexMapper::grid_size)); + const IdxT block_size = static_cast(IndexGetter::block_size); + const IdxT grid_size = static_cast(IndexGetter::grid_size); + + internal::set_cuda_dim(dims.threads, block_size); + internal::set_cuda_dim(dims.blocks, grid_size); } }; -template -struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, +template +struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, ::RAJA::cuda::IndexGlobal, + Concretizer, UniqueMarker> { static_assert(GRID_SIZE > 0, "grid size must be > 0 or named_usage::unspecified with forall"); - using IndexMapper = ::RAJA::cuda::IndexGlobal; + using IndexGetter = ::RAJA::cuda::IndexGlobal; template < typename IdxT > static void set_dimensions(internal::CudaDims& dims, IdxT len, const void* func, size_t dynamic_shmem_size) { - ::RAJA::cuda::CudaOccupancyCalculator oc(func); - auto max_sizes = oc.get_max_block_size_and_grid_size(dynamic_shmem_size); + ::RAJA::cuda::ConcretizerImpl concretizer{func, dynamic_shmem_size, len}; - IdxT calculated_block_size = std::min( - RAJA_DIVIDE_CEILING_INT(len, static_cast(IndexMapper::grid_size)), - static_cast(max_sizes.first)); + const IdxT grid_size = static_cast(IndexGetter::grid_size); + const IdxT block_size = concretizer.get_block_size_to_fit_device(grid_size); - internal::set_cuda_dim(dims.threads, calculated_block_size); - internal::set_cuda_dim(dims.blocks, static_cast(IndexMapper::grid_size)); + internal::set_cuda_dim(dims.threads, block_size); + internal::set_cuda_dim(dims.blocks, grid_size); } }; -template -struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, +template +struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, ::RAJA::cuda::IndexGlobal, + Concretizer, UniqueMarker> { static_assert(BLOCK_SIZE > 0, "block size must be > 0 or named_usage::unspecified with forall"); - using IndexMapper = ::RAJA::cuda::IndexGlobal; + using IndexGetter = ::RAJA::cuda::IndexGlobal; template < typename IdxT > static void set_dimensions(internal::CudaDims& dims, IdxT len, const void* func, size_t dynamic_shmem_size) { - ::RAJA::cuda::CudaOccupancyCalculator oc(func); - auto max_grid_size = oc.get_max_grid_size(dynamic_shmem_size, - static_cast(IndexMapper::block_size)); + ::RAJA::cuda::ConcretizerImpl concretizer{func, dynamic_shmem_size, len}; - IdxT calculated_grid_size = std::min( - RAJA_DIVIDE_CEILING_INT(len, static_cast(IndexMapper::block_size)), - static_cast(max_grid_size)); + const IdxT block_size = static_cast(IndexGetter::block_size); + const IdxT grid_size = concretizer.get_grid_size_to_fit_device(block_size); - internal::set_cuda_dim(dims.threads, static_cast(IndexMapper::block_size)); - internal::set_cuda_dim(dims.blocks, calculated_grid_size); + internal::set_cuda_dim(dims.threads, block_size); + internal::set_cuda_dim(dims.blocks, grid_size); } }; -template -struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, +template +struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, ::RAJA::cuda::IndexGlobal, + Concretizer, UniqueMarker> { - using IndexMapper = ::RAJA::cuda::IndexGlobal; + using IndexGetter = ::RAJA::cuda::IndexGlobal; template < typename IdxT > static void set_dimensions(internal::CudaDims& dims, IdxT len, const void* func, size_t dynamic_shmem_size) { - ::RAJA::cuda::CudaOccupancyCalculator oc(func); - auto max_sizes = oc.get_max_block_size_and_grid_size(dynamic_shmem_size); + ::RAJA::cuda::ConcretizerImpl concretizer{func, 
dynamic_shmem_size, len}; - IdxT calculated_grid_size = std::min( - RAJA_DIVIDE_CEILING_INT(len, static_cast(max_sizes.first)), - static_cast(max_sizes.second)); + const auto sizes = concretizer.get_block_and_grid_size_to_fit_device(); - internal::set_cuda_dim(dims.threads, static_cast(max_sizes.first)); - internal::set_cuda_dim(dims.blocks, calculated_grid_size); + internal::set_cuda_dim(dims.threads, sizes.first); + internal::set_cuda_dim(dims.blocks, sizes.second); } }; @@ -273,7 +292,7 @@ template ::value && + std::is_base_of::value && (IterationGetter::block_size > 0), size_t > BlockSize = IterationGetter::block_size> __launch_bounds__(BlockSize, BlocksPerSM) __global__ @@ -298,7 +317,7 @@ template ::value && + std::is_base_of::value && (IterationGetter::block_size <= 0), size_t > RAJA_UNUSED_ARG(BlockSize) = 0> __global__ @@ -324,7 +343,7 @@ template ::value && + std::is_base_of::value && (IterationGetter::block_size > 0), size_t > BlockSize = IterationGetter::block_size> __launch_bounds__(BlockSize, BlocksPerSM) __global__ @@ -352,7 +371,7 @@ template ::value && + std::is_base_of::value && (IterationGetter::block_size <= 0), size_t > RAJA_UNUSED_ARG(BlockSize) = 0> __global__ @@ -379,7 +398,8 @@ template ::value && + std::is_base_of::value && + std::is_base_of::value && (IterationGetter::block_size > 0), size_t > BlockSize = IterationGetter::block_size> __launch_bounds__(BlockSize, BlocksPerSM) __global__ @@ -405,7 +425,8 @@ template ::value && + std::is_base_of::value && + std::is_base_of::value && (IterationGetter::block_size <= 0), size_t > RAJA_UNUSED_ARG(BlockSize) = 0> __global__ @@ -433,7 +454,8 @@ template ::value && + std::is_base_of::value && + std::is_base_of::value && (IterationGetter::block_size > 0), size_t > BlockSize = IterationGetter::block_size> __launch_bounds__(BlockSize, BlocksPerSM) __global__ @@ -462,7 +484,8 @@ template ::value && + std::is_base_of::value && + std::is_base_of::value && (IterationGetter::block_size <= 0), size_t > RAJA_UNUSED_ARG(BlockSize) = 0> __global__ @@ -494,7 +517,7 @@ void forallp_cuda_kernel(LOOP_BODY loop_body, template RAJA_INLINE concepts::enable_if_t< @@ -502,7 +525,7 @@ concepts::enable_if_t< RAJA::expt::type_traits::is_ForallParamPack, RAJA::expt::type_traits::is_ForallParamPack_empty> forall_impl(resources::Cuda cuda_res, - ::RAJA::policy::cuda::cuda_exec_explicitconst&, + ::RAJA::policy::cuda::cuda_exec_explicitconst&, Iterable&& iter, LoopBody&& loop_body, ForallParam) @@ -510,9 +533,9 @@ forall_impl(resources::Cuda cuda_res, using Iterator = camp::decay; using LOOP_BODY = camp::decay; using IndexType = camp::decay; - using EXEC_POL = ::RAJA::policy::cuda::cuda_exec_explicit; + using EXEC_POL = ::RAJA::policy::cuda::cuda_exec_explicit; using UniqueMarker = ::camp::list; - using DimensionCalculator = impl::ForallDimensionCalculator; + using DimensionCalculator = impl::ForallDimensionCalculator; // // Compute the requested iteration space size @@ -563,7 +586,7 @@ forall_impl(resources::Cuda cuda_res, template RAJA_INLINE concepts::enable_if_t< @@ -571,7 +594,7 @@ concepts::enable_if_t< RAJA::expt::type_traits::is_ForallParamPack, concepts::negate< RAJA::expt::type_traits::is_ForallParamPack_empty> > forall_impl(resources::Cuda cuda_res, - ::RAJA::policy::cuda::cuda_exec_explicit const&, + ::RAJA::policy::cuda::cuda_exec_explicit const&, Iterable&& iter, LoopBody&& loop_body, ForallParam f_params) @@ -579,9 +602,9 @@ forall_impl(resources::Cuda cuda_res, using Iterator = camp::decay; using LOOP_BODY = camp::decay; using 
IndexType = camp::decay; - using EXEC_POL = ::RAJA::policy::cuda::cuda_exec_explicit; + using EXEC_POL = ::RAJA::policy::cuda::cuda_exec_explicit; using UniqueMarker = ::camp::list, LOOP_BODY, Iterator, ForallParam>; - using DimensionCalculator = impl::ForallDimensionCalculator; + using DimensionCalculator = impl::ForallDimensionCalculator; // // Compute the requested iteration space size @@ -659,11 +682,11 @@ forall_impl(resources::Cuda cuda_res, */ template RAJA_INLINE resources::EventProxy forall_impl(resources::Cuda r, - ExecPolicy>, + ExecPolicy>, const TypedIndexSet& iset, LoopBody&& loop_body) { @@ -672,7 +695,7 @@ forall_impl(resources::Cuda r, iset.segmentCall(r, isi, detail::CallForall(), - ::RAJA::policy::cuda::cuda_exec_explicit(), + ::RAJA::policy::cuda::cuda_exec_explicit(), loop_body); } // iterate over segments of index set diff --git a/include/RAJA/policy/cuda/intrinsics.hpp b/include/RAJA/policy/cuda/intrinsics.hpp new file mode 100644 index 0000000000..b0d2ea7cf1 --- /dev/null +++ b/include/RAJA/policy/cuda/intrinsics.hpp @@ -0,0 +1,467 @@ +/*! + ****************************************************************************** + * + * \file + * + * \brief Header file containing RAJA intrinsics templates for CUDA execution. + * + * These methods should work on any platform that supports + * CUDA devices. + * + ****************************************************************************** + */ + +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#ifndef RAJA_cuda_intrinsics_HPP +#define RAJA_cuda_intrinsics_HPP + +#include "RAJA/config.hpp" + +#if defined(RAJA_ENABLE_CUDA) + +#include + +#include + +#include "RAJA/util/macros.hpp" +#include "RAJA/util/SoAArray.hpp" +#include "RAJA/util/types.hpp" + +#include "RAJA/policy/cuda/policy.hpp" + + +namespace RAJA +{ + +namespace cuda +{ + +namespace impl +{ + +/*! + * \brief Abstracts access to memory when coordinating between threads at + * device scope. The fences provided here are to be used with relaxed + * atomics in order to guarantee memory ordering and visibility of the + * accesses done through this class. + * + * \Note This uses device scope fences to ensure ordering and to flush local + * caches so that memory accesses become visible to the whole device. + * \Note This class uses normal memory accesses that are cached in local caches + * so device scope fences are required to make memory accesses visible + * to the whole device. + */ +struct AccessorDeviceScopeUseDeviceFence : RAJA::detail::DefaultAccessor +{ + static RAJA_DEVICE RAJA_INLINE void fence_acquire() + { + __threadfence(); + } + + static RAJA_DEVICE RAJA_INLINE void fence_release() + { + __threadfence(); + } +}; + +/*! + ****************************************************************************** + * + * \brief Abstracts access to memory when coordinating between threads at + * device scope. The fences provided here are to be used with relaxed + * atomics in order to guarantee memory ordering and visibility of the + * accesses done through this class. + * + * \Note This may use block scope fences to ensure ordering and avoid flushing + * local caches so special memory accesses are used to ensure visibility + * to the whole device. 
+ * \Note This class uses device scope atomic memory accesses to bypass local + * caches so memory accesses are visible to the whole device without + * device scope fences. + * \Note A memory access may be split into multiple memory accesses, so + * even though atomic instructions are used concurrent accesses between + * different threads are not thread safe. + * + ****************************************************************************** + */ +struct AccessorDeviceScopeUseBlockFence +{ + // cuda has 32 and 64 bit atomics + static constexpr size_t min_atomic_int_type_size = sizeof(unsigned int); + static constexpr size_t max_atomic_int_type_size = sizeof(unsigned long long); + + template < typename T > + static RAJA_DEVICE RAJA_INLINE T get(T* in_ptr, size_t idx) + { + using ArrayType = RAJA::detail::AsIntegerArray; + using integer_type = typename ArrayType::integer_type; + + ArrayType u; + auto ptr = const_cast(reinterpret_cast(in_ptr + idx)); + + for (size_t i = 0; i < u.array_size(); ++i) { + u.array[i] = atomicAdd(&ptr[i], integer_type(0)); + } + + return u.get_value(); + } + + template < typename T > + static RAJA_DEVICE RAJA_INLINE void set(T* in_ptr, size_t idx, T val) + { + using ArrayType = RAJA::detail::AsIntegerArray; + using integer_type = typename ArrayType::integer_type; + + ArrayType u; + u.set_value(val); + auto ptr = reinterpret_cast(in_ptr + idx); + + for (size_t i = 0; i < u.array_size(); ++i) { + atomicExch(&ptr[i], u.array[i]); + } + } + + static RAJA_DEVICE RAJA_INLINE void fence_acquire() + { + __threadfence(); + } + + static RAJA_DEVICE RAJA_INLINE void fence_release() + { + __threadfence(); + } +}; + + +// cuda 8 only has shfl primitives for 32 bits while cuda 9 has 32 and 64 bits +constexpr size_t min_shfl_int_type_size = sizeof(unsigned int); +#if defined(CUDART_VERSION) && CUDART_VERSION >= 9000 +constexpr size_t max_shfl_int_type_size = sizeof(unsigned long long); +#else +constexpr size_t max_shfl_int_type_size = sizeof(unsigned int); +#endif + +/*! + ****************************************************************************** + * + * \brief Method to shuffle 32b registers in sum reduction for arbitrary type. + * + * \Note Returns an undefined value if src lane is inactive (divergence). + * Returns this lane's value if src lane is out of bounds or has exited. 
+ * + ****************************************************************************** + */ +template +RAJA_DEVICE RAJA_INLINE T shfl_xor_sync(T var, int laneMask) +{ + RAJA::detail::AsIntegerArray u; + u.set_value(var); + + for (size_t i = 0; i < u.array_size(); ++i) { +#if defined(CUDART_VERSION) && CUDART_VERSION >= 9000 + u.array[i] = ::__shfl_xor_sync(0xffffffffu, u.array[i], laneMask); +#else + u.array[i] = ::__shfl_xor(u.array[i], laneMask); +#endif + } + return u.get_value(); +} + +template +RAJA_DEVICE RAJA_INLINE T shfl_sync(T var, int srcLane) +{ + RAJA::detail::AsIntegerArray u; + u.set_value(var); + + for (size_t i = 0; i < u.array_size(); ++i) { +#if defined(CUDART_VERSION) && CUDART_VERSION >= 9000 + u.array[i] = ::__shfl_sync(0xffffffffu, u.array[i], srcLane); +#else + u.array[i] = ::__shfl(u.array[i], srcLane); +#endif + } + return u.get_value(); +} + +#if defined(CUDART_VERSION) && CUDART_VERSION >= 9000 + +template <> +RAJA_DEVICE RAJA_INLINE int shfl_xor_sync(int var, int laneMask) +{ + return ::__shfl_xor_sync(0xffffffffu, var, laneMask); +} + +template <> +RAJA_DEVICE RAJA_INLINE unsigned int shfl_xor_sync(unsigned int var, int laneMask) +{ + return ::__shfl_xor_sync(0xffffffffu, var, laneMask); +} + +template <> +RAJA_DEVICE RAJA_INLINE long shfl_xor_sync(long var, int laneMask) +{ + return ::__shfl_xor_sync(0xffffffffu, var, laneMask); +} + +template <> +RAJA_DEVICE RAJA_INLINE unsigned long shfl_xor_sync(unsigned long var, int laneMask) +{ + return ::__shfl_xor_sync(0xffffffffu, var, laneMask); +} + +template <> +RAJA_DEVICE RAJA_INLINE long long shfl_xor_sync(long long var, int laneMask) +{ + return ::__shfl_xor_sync(0xffffffffu, var, laneMask); +} + +template <> +RAJA_DEVICE RAJA_INLINE unsigned long long shfl_xor_sync(unsigned long long var, int laneMask) +{ + return ::__shfl_xor_sync(0xffffffffu, var, laneMask); +} + +template <> +RAJA_DEVICE RAJA_INLINE float shfl_xor_sync(float var, int laneMask) +{ + return ::__shfl_xor_sync(0xffffffffu, var, laneMask); +} + +template <> +RAJA_DEVICE RAJA_INLINE double shfl_xor_sync(double var, int laneMask) +{ + return ::__shfl_xor_sync(0xffffffffu, var, laneMask); +} + +#else + +template <> +RAJA_DEVICE RAJA_INLINE int shfl_xor_sync(int var, int laneMask) +{ + return ::__shfl_xor(var, laneMask); +} + +template <> +RAJA_DEVICE RAJA_INLINE float shfl_xor_sync(float var, int laneMask) +{ + return ::__shfl_xor(var, laneMask); +} + +#endif + + +#if defined(CUDART_VERSION) && CUDART_VERSION >= 9000 + +template <> +RAJA_DEVICE RAJA_INLINE int shfl_sync(int var, int srcLane) +{ + return ::__shfl_sync(0xffffffffu, var, srcLane); +} + +template <> +RAJA_DEVICE RAJA_INLINE unsigned int shfl_sync(unsigned int var, int srcLane) +{ + return ::__shfl_sync(0xffffffffu, var, srcLane); +} + +template <> +RAJA_DEVICE RAJA_INLINE long shfl_sync(long var, int srcLane) +{ + return ::__shfl_sync(0xffffffffu, var, srcLane); +} + +template <> +RAJA_DEVICE RAJA_INLINE unsigned long shfl_sync(unsigned long var, int srcLane) +{ + return ::__shfl_sync(0xffffffffu, var, srcLane); +} + +template <> +RAJA_DEVICE RAJA_INLINE long long shfl_sync(long long var, int srcLane) +{ + return ::__shfl_sync(0xffffffffu, var, srcLane); +} + +template <> +RAJA_DEVICE RAJA_INLINE unsigned long long shfl_sync(unsigned long long var, int srcLane) +{ + return ::__shfl_sync(0xffffffffu, var, srcLane); +} + +template <> +RAJA_DEVICE RAJA_INLINE float shfl_sync(float var, int srcLane) +{ + return ::__shfl_sync(0xffffffffu, var, srcLane); +} + +template <> +RAJA_DEVICE 
RAJA_INLINE double shfl_sync(double var, int srcLane) +{ + return ::__shfl_sync(0xffffffffu, var, srcLane); +} + +#else + +template <> +RAJA_DEVICE RAJA_INLINE int shfl_sync(int var, int srcLane) +{ + return ::__shfl(var, srcLane); +} + +template <> +RAJA_DEVICE RAJA_INLINE float shfl_sync(float var, int srcLane) +{ + return ::__shfl(var, srcLane); +} + +#endif + + +//! reduce values in block into thread 0 +template +RAJA_DEVICE RAJA_INLINE T warp_reduce(T val, T RAJA_UNUSED_ARG(identity)) +{ + int numThreads = blockDim.x * blockDim.y * blockDim.z; + + int threadId = threadIdx.x + blockDim.x * threadIdx.y + + (blockDim.x * blockDim.y) * threadIdx.z; + + T temp = val; + + if (numThreads % policy::cuda::WARP_SIZE == 0) { + + // reduce each warp + for (int i = 1; i < policy::cuda::WARP_SIZE; i *= 2) { + T rhs = shfl_xor_sync(temp, i); + Combiner{}(temp, rhs); + } + + } else { + + // reduce each warp + for (int i = 1; i < policy::cuda::WARP_SIZE; i *= 2) { + int srcLane = threadId ^ i; + T rhs = shfl_sync(temp, srcLane); + // only add from threads that exist (don't double count own value) + if (srcLane < numThreads) { + Combiner{}(temp, rhs); + } + } + } + + return temp; +} + +/*! + * Allreduce values in a warp. + * + * + * This does a butterfly pattern leaving each lane with the full reduction + * + */ +template +RAJA_DEVICE RAJA_INLINE T warp_allreduce(T val) +{ + T temp = val; + + for (int i = 1; i < policy::cuda::WARP_SIZE; i *= 2) { + T rhs = __shfl_xor_sync(0xffffffff, temp, i); + Combiner{}(temp, rhs); + } + + return temp; +} + + +//! reduce values in block into thread 0 +template +RAJA_DEVICE RAJA_INLINE T block_reduce(T val, T identity) +{ + int numThreads = blockDim.x * blockDim.y * blockDim.z; + + int threadId = threadIdx.x + blockDim.x * threadIdx.y + + (blockDim.x * blockDim.y) * threadIdx.z; + + int warpId = threadId % policy::cuda::WARP_SIZE; + int warpNum = threadId / policy::cuda::WARP_SIZE; + + T temp = val; + + if (numThreads % policy::cuda::WARP_SIZE == 0) { + + // reduce each warp + for (int i = 1; i < policy::cuda::WARP_SIZE; i *= 2) { + T rhs = shfl_xor_sync(temp, i); + Combiner{}(temp, rhs); + } + + } else { + + // reduce each warp + for (int i = 1; i < policy::cuda::WARP_SIZE; i *= 2) { + int srcLane = threadId ^ i; + T rhs = shfl_sync(temp, srcLane); + // only add from threads that exist (don't double count own value) + if (srcLane < numThreads) { + Combiner{}(temp, rhs); + } + } + } + + // reduce per warp values + if (numThreads > policy::cuda::WARP_SIZE) { + + static_assert(policy::cuda::MAX_WARPS <= policy::cuda::WARP_SIZE, + "Max Warps must be less than or equal to Warp Size for this algorithm to work"); + + // Need to separate declaration and initialization for clang-cuda + __shared__ unsigned char tmpsd[sizeof(RAJA::detail::SoAArray)]; + + // Partial placement new: Should call new(tmpsd) here but recasting memory + // to avoid calling constructor/destructor in shared memory. 
+ RAJA::detail::SoAArray* sd = + reinterpret_cast *>(tmpsd); + + // write per warp values to shared memory + if (warpId == 0) { + sd->set(warpNum, temp); + } + + __syncthreads(); + + if (warpNum == 0) { + + // read per warp values + if (warpId * policy::cuda::WARP_SIZE < numThreads) { + temp = sd->get(warpId); + } else { + temp = identity; + } + + for (int i = 1; i < policy::cuda::MAX_WARPS; i *= 2) { + T rhs = shfl_xor_sync(temp, i); + Combiner{}(temp, rhs); + } + } + + __syncthreads(); + } + + return temp; +} + +} // end namespace impl + +} // end namespace cuda + +} // end namespace RAJA + +#endif // closing endif for RAJA_ENABLE_CUDA guard + +#endif // closing endif for header file include guard diff --git a/include/RAJA/policy/cuda/kernel/CudaKernel.hpp b/include/RAJA/policy/cuda/kernel/CudaKernel.hpp index 6497a64f42..c070d618ea 100644 --- a/include/RAJA/policy/cuda/kernel/CudaKernel.hpp +++ b/include/RAJA/policy/cuda/kernel/CudaKernel.hpp @@ -87,7 +87,7 @@ namespace statement */ template struct CudaKernelExt - : public internal::Statement<::RAJA::policy::cuda::cuda_exec_explicit, EnclosedStmts...> { + : public internal::Statement<::RAJA::policy::cuda::cuda_exec_explicit, EnclosedStmts...> { }; @@ -284,7 +284,7 @@ struct CudaLaunchHelper(kernelGetter_t::get()); if (num_blocks <= 0) { @@ -294,8 +294,10 @@ struct CudaLaunchHelper( - func, shmem_size, recommended_blocks, recommended_threads); + auto data = ::RAJA::cuda::cuda_occupancy_max_blocks_threads( + func, shmem_size); + recommended_blocks = data.func_max_blocks_per_device; + recommended_threads = data.func_max_threads_per_block; } else { @@ -305,8 +307,9 @@ struct CudaLaunchHelper( - func, shmem_size, recommended_blocks); + auto data = ::RAJA::cuda::cuda_occupancy_max_blocks( + func, shmem_size); + recommended_blocks = data.func_max_blocks_per_sm * data.device_sm_per_device; } @@ -360,7 +363,7 @@ struct CudaLaunchHelper(kernelGetter_t::get()); if (num_blocks <= 0) { @@ -373,16 +376,18 @@ struct CudaLaunchHelper( - func, shmem_size, max_blocks, actual_threads); + auto data = ::RAJA::cuda::cuda_occupancy_max_blocks( + func, shmem_size, actual_threads); + max_blocks = data.func_max_blocks_per_sm * data.device_sm_per_device; } else { // // determine blocks when actual_threads == num_threads // - ::RAJA::cuda::cuda_occupancy_max_blocks( - func, shmem_size, max_blocks); + auto data = ::RAJA::cuda::cuda_occupancy_max_blocks( + func, shmem_size); + max_blocks = data.func_max_blocks_per_sm * data.device_sm_per_device; } diff --git a/include/RAJA/policy/cuda/kernel/For.hpp b/include/RAJA/policy/cuda/kernel/For.hpp index 11870f13b0..9de20c7b4b 100644 --- a/include/RAJA/policy/cuda/kernel/For.hpp +++ b/include/RAJA/policy/cuda/kernel/For.hpp @@ -108,7 +108,7 @@ template , + RAJA::policy::cuda::cuda_indexer, kernel_sync_requirement::sync, IndexMapper>, EnclosedStmts...>, Types> { @@ -123,7 +123,7 @@ struct CudaStatementExecutor< using diff_t = segment_diff_type; using DimensionCalculator = RAJA::internal::KernelDimensionCalculator< - RAJA::policy::cuda::cuda_indexer>; + RAJA::policy::cuda::cuda_indexer, kernel_sync_requirement::sync, IndexMapper>>; static inline RAJA_DEVICE @@ -180,7 +180,7 @@ template , + RAJA::policy::cuda::cuda_indexer, kernel_sync_requirement::none, IndexMapper>, EnclosedStmts...>, Types> { @@ -195,7 +195,7 @@ struct CudaStatementExecutor< using diff_t = segment_diff_type; using DimensionCalculator = RAJA::internal::KernelDimensionCalculator< - RAJA::policy::cuda::cuda_indexer>; + RAJA::policy::cuda::cuda_indexer, 
kernel_sync_requirement::none, IndexMapper>>; static inline RAJA_DEVICE @@ -246,7 +246,7 @@ struct CudaStatementExecutor< statement::For, Types> : CudaStatementExecutor, kernel_sync_requirement::none, cuda::IndexGlobal>, EnclosedStmts...>, Types> diff --git a/include/RAJA/policy/cuda/kernel/ForICount.hpp b/include/RAJA/policy/cuda/kernel/ForICount.hpp index dd7c4c4ffe..8486abaa2c 100644 --- a/include/RAJA/policy/cuda/kernel/ForICount.hpp +++ b/include/RAJA/policy/cuda/kernel/ForICount.hpp @@ -103,20 +103,20 @@ template , + RAJA::policy::cuda::cuda_indexer, kernel_sync_requirement::sync, IndexMapper>, EnclosedStmts...>, Types> : public CudaStatementExecutor< Data, statement::For, + RAJA::policy::cuda::cuda_indexer, kernel_sync_requirement::sync, IndexMapper>, EnclosedStmts...>, Types> { using Base = CudaStatementExecutor< Data, statement::For, + RAJA::policy::cuda::cuda_indexer, kernel_sync_requirement::sync, IndexMapper>, EnclosedStmts...>, Types>; @@ -166,20 +166,20 @@ template , + RAJA::policy::cuda::cuda_indexer, kernel_sync_requirement::none, IndexMapper>, EnclosedStmts...>, Types> : public CudaStatementExecutor< Data, statement::For, + RAJA::policy::cuda::cuda_indexer, kernel_sync_requirement::none, IndexMapper>, EnclosedStmts...>, Types> { using Base = CudaStatementExecutor< Data, statement::For, + RAJA::policy::cuda::cuda_indexer, kernel_sync_requirement::none, IndexMapper>, EnclosedStmts...>, Types>; @@ -226,7 +226,7 @@ struct CudaStatementExecutor< statement::ForICount, Types> : CudaStatementExecutor, kernel_sync_requirement::none, cuda::IndexGlobal>, EnclosedStmts...>, Types> diff --git a/include/RAJA/policy/cuda/kernel/Tile.hpp b/include/RAJA/policy/cuda/kernel/Tile.hpp index ad54c86a54..ad901f6b02 100644 --- a/include/RAJA/policy/cuda/kernel/Tile.hpp +++ b/include/RAJA/policy/cuda/kernel/Tile.hpp @@ -143,7 +143,7 @@ struct CudaStatementExecutor< Data, statement::Tile, - RAJA::policy::cuda::cuda_indexer, + RAJA::policy::cuda::cuda_indexer, kernel_sync_requirement::sync, IndexMapper>, EnclosedStmts...>, Types> { @@ -153,7 +153,7 @@ struct CudaStatementExecutor< using diff_t = segment_diff_type; - using DimensionCalculator = KernelDimensionCalculator>; + using DimensionCalculator = KernelDimensionCalculator, kernel_sync_requirement::sync, IndexMapper>>; static inline RAJA_DEVICE void exec(Data &data, bool thread_active) @@ -233,7 +233,7 @@ struct CudaStatementExecutor< Data, statement::Tile, - RAJA::policy::cuda::cuda_indexer, + RAJA::policy::cuda::cuda_indexer, kernel_sync_requirement::none, IndexMapper>, EnclosedStmts...>, Types> { @@ -243,7 +243,7 @@ struct CudaStatementExecutor< using diff_t = segment_diff_type; - using DimensionCalculator = KernelDimensionCalculator>; + using DimensionCalculator = KernelDimensionCalculator, kernel_sync_requirement::none, IndexMapper>>; static inline RAJA_DEVICE void exec(Data &data, bool thread_active) @@ -318,7 +318,7 @@ struct CudaStatementExecutor< Data, statement::Tile, Types> : CudaStatementExecutor, kernel_sync_requirement::none, cuda::IndexGlobal>, EnclosedStmts...>, Types> diff --git a/include/RAJA/policy/cuda/kernel/TileTCount.hpp b/include/RAJA/policy/cuda/kernel/TileTCount.hpp index 84a0bec412..c611346d46 100644 --- a/include/RAJA/policy/cuda/kernel/TileTCount.hpp +++ b/include/RAJA/policy/cuda/kernel/TileTCount.hpp @@ -131,14 +131,14 @@ struct CudaStatementExecutor< Data, statement::TileTCount, - RAJA::policy::cuda::cuda_indexer, + RAJA::policy::cuda::cuda_indexer, kernel_sync_requirement::sync, IndexMapper>, EnclosedStmts...>, 
Types> : public CudaStatementExecutor< Data, statement::Tile, - RAJA::policy::cuda::cuda_indexer, + RAJA::policy::cuda::cuda_indexer, kernel_sync_requirement::sync, IndexMapper>, EnclosedStmts...>, Types> { @@ -146,7 +146,7 @@ struct CudaStatementExecutor< Data, statement::Tile, - RAJA::policy::cuda::cuda_indexer, + RAJA::policy::cuda::cuda_indexer, kernel_sync_requirement::sync, IndexMapper>, EnclosedStmts...>, Types>; @@ -209,14 +209,14 @@ struct CudaStatementExecutor< Data, statement::TileTCount, - RAJA::policy::cuda::cuda_indexer, + RAJA::policy::cuda::cuda_indexer, kernel_sync_requirement::none, IndexMapper>, EnclosedStmts...>, Types> : public CudaStatementExecutor< Data, statement::Tile, - RAJA::policy::cuda::cuda_indexer, + RAJA::policy::cuda::cuda_indexer, kernel_sync_requirement::none, IndexMapper>, EnclosedStmts...>, Types> { @@ -224,7 +224,7 @@ struct CudaStatementExecutor< Data, statement::Tile, - RAJA::policy::cuda::cuda_indexer, + RAJA::policy::cuda::cuda_indexer, kernel_sync_requirement::none, IndexMapper>, EnclosedStmts...>, Types>; @@ -281,7 +281,7 @@ struct CudaStatementExecutor< Data, statement::TileTCount, Types> : CudaStatementExecutor, kernel_sync_requirement::none, cuda::IndexGlobal>, EnclosedStmts...>, Types> diff --git a/include/RAJA/policy/cuda/kernel/internal.hpp b/include/RAJA/policy/cuda/kernel/internal.hpp index a33b564309..9c904ea45a 100644 --- a/include/RAJA/policy/cuda/kernel/internal.hpp +++ b/include/RAJA/policy/cuda/kernel/internal.hpp @@ -388,7 +388,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, sync, cuda::IndexGlobal>> { @@ -402,7 +402,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, sync, cuda::IndexGlobal>> { @@ -418,7 +418,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, sync, cuda::IndexGlobal>> { @@ -436,7 +436,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, sync, cuda::IndexGlobal>> { @@ -451,7 +451,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, sync, cuda::IndexGlobal>> { @@ -469,7 +469,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, sync, cuda::IndexGlobal>> { @@ -488,7 +488,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, sync, cuda::IndexGlobal>> { @@ -508,7 +508,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, sync, cuda::IndexGlobal>> { @@ -527,7 +527,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, sync, cuda::IndexGlobal>> { diff --git a/include/RAJA/policy/cuda/launch.hpp b/include/RAJA/policy/cuda/launch.hpp index 26e56e5cda..602221e58a 100644 --- a/include/RAJA/policy/cuda/launch.hpp +++ b/include/RAJA/policy/cuda/launch.hpp @@ -433,7 +433,7 @@ struct LoopExecute -struct LoopExecute, kernel_sync_requirement::none, IndexMapper>, SEGMENT> { @@ -457,7 +457,7 @@ struct LoopExecute -struct LoopExecute, kernel_sync_requirement::none, IndexMapper0, IndexMapper1>, @@ -493,7 +493,7 @@ struct LoopExecute -struct LoopExecute, kernel_sync_requirement::none, IndexMapper0, IndexMapper1, @@ -625,7 +625,7 @@ struct LoopICountExecute -struct LoopICountExecute, kernel_sync_requirement::none, IndexMapper>, SEGMENT> { @@ -649,7 +649,7 @@ struct LoopICountExecute -struct LoopICountExecute, kernel_sync_requirement::none, IndexMapper0, IndexMapper1>, @@ -686,7 +686,7 @@ struct LoopICountExecute -struct LoopICountExecute, kernel_sync_requirement::none, IndexMapper0, IndexMapper1, @@ -810,18 +810,18 @@ struct LoopExecute -struct LoopExecute, 
sync, IndexMapper0>, SEGMENT> - : LoopExecute, sync, IndexMapper0>, SEGMENT> {}; template -struct LoopExecute, kernel_sync_requirement::none, IndexMapper0, IndexMapper1>, @@ -852,7 +852,7 @@ struct LoopExecute -struct LoopExecute, kernel_sync_requirement::none, IndexMapper0, IndexMapper1, @@ -914,7 +914,7 @@ struct TileExecute -struct TileExecute, kernel_sync_requirement::none, IndexMapper>, SEGMENT> { @@ -964,7 +964,7 @@ struct TileTCountExecute -struct TileTCountExecute, kernel_sync_requirement::none, IndexMapper>, SEGMENT> { diff --git a/include/RAJA/policy/cuda/policy.hpp b/include/RAJA/policy/cuda/policy.hpp index 92c1f1c701..84cd8a301c 100644 --- a/include/RAJA/policy/cuda/policy.hpp +++ b/include/RAJA/policy/cuda/policy.hpp @@ -22,6 +22,7 @@ #if defined(RAJA_CUDA_ACTIVE) +#include #include #include "RAJA/pattern/reduce.hpp" @@ -78,6 +79,110 @@ struct IndexGlobal; template struct IndexFlatten; +/*! + * Use the max occupancy of a kernel on the current device when launch + * parameters are not fully determined. + * Note that the maximum occupancy of the kernel may be less than the maximum + * occupancy of the device in terms of total threads. + */ +struct MaxOccupancyConcretizer +{ + template < typename IdxT, typename Data > + static IdxT get_max_grid_size(Data const& data) + { + IdxT device_sm_per_device = data.device_sm_per_device; + IdxT func_max_blocks_per_sm = data.func_max_blocks_per_sm; + + IdxT func_max_blocks_per_device = func_max_blocks_per_sm * device_sm_per_device; + + return func_max_blocks_per_device; + } +}; + +/*! + * Use a fraction and an offset of the max occupancy of a kernel on the current + * device when launch parameters are not fully determined. + * The following formula is used, with care to avoid zero, to determine the + * maximum grid size: + * (Fraction * kernel_max_blocks_per_sm + BLOCKS_PER_SM_OFFSET) * device_sm + */ +template < typename t_Fraction, std::ptrdiff_t BLOCKS_PER_SM_OFFSET > +struct FractionOffsetOccupancyConcretizer +{ + template < typename IdxT, typename Data > + static IdxT get_max_grid_size(Data const& data) + { + using Fraction = typename t_Fraction::template rebind; + + IdxT device_sm_per_device = data.device_sm_per_device; + IdxT func_max_blocks_per_sm = data.func_max_blocks_per_sm; + + if (Fraction::multiply(func_max_blocks_per_sm) > IdxT(0)) { + func_max_blocks_per_sm = Fraction::multiply(func_max_blocks_per_sm); + } + + if (IdxT(std::ptrdiff_t(func_max_blocks_per_sm) + BLOCKS_PER_SM_OFFSET) > IdxT(0)) { + func_max_blocks_per_sm = IdxT(std::ptrdiff_t(func_max_blocks_per_sm) + BLOCKS_PER_SM_OFFSET); + } + + IdxT func_max_blocks_per_device = func_max_blocks_per_sm * device_sm_per_device; + + return func_max_blocks_per_device; + } +}; + +/*! + * Use an occupancy that is less than the max occupancy of the device when + * launch parameters are not fully determined. + * Use the MaxOccupancyConcretizer if the maximum occupancy of the kernel is + * below the maximum occupancy of the device. + * Otherwise use the given AvoidMaxOccupancyCalculator to determine the + * maximum grid size. 
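The concretizers described above boil down to simple integer arithmetic on the occupancy-calculator results. A hedged standalone sketch follows; the fraction, offset, and device numbers are purely illustrative and do not claim to be the defaults chosen by this patch.

```cpp
#include <cstddef>

// FractionOffsetOccupancyConcretizer-style sizing:
// (Fraction * func_max_blocks_per_sm + offset) * device_sm_per_device,
// taking care never to drop to zero.
std::size_t fraction_offset_grid_size(std::size_t func_max_blocks_per_sm,
                                      std::size_t frac_num, std::size_t frac_den,
                                      std::ptrdiff_t blocks_per_sm_offset,
                                      std::size_t device_sm_per_device)
{
  std::size_t blocks_per_sm = func_max_blocks_per_sm;
  if ((blocks_per_sm * frac_num) / frac_den > 0) {
    blocks_per_sm = (blocks_per_sm * frac_num) / frac_den;
  }
  const std::ptrdiff_t shifted =
      static_cast<std::ptrdiff_t>(blocks_per_sm) + blocks_per_sm_offset;
  if (shifted > 0) {
    blocks_per_sm = static_cast<std::size_t>(shifted);
  }
  return blocks_per_sm * device_sm_per_device;
}

// AvoidDeviceMaxThreadOccupancyConcretizer-style selection: back off from max
// occupancy only when the kernel would otherwise saturate the SM thread limit.
std::size_t avoid_max_thread_occupancy_grid_size(
    std::size_t func_threads_per_block, std::size_t func_max_blocks_per_sm,
    std::size_t device_max_threads_per_sm, std::size_t device_sm_per_device)
{
  if (func_threads_per_block * func_max_blocks_per_sm < device_max_threads_per_sm) {
    return func_max_blocks_per_sm * device_sm_per_device;  // max occupancy
  }
  // Illustrative back-off: three quarters of max occupancy, minus one block/SM.
  return fraction_offset_grid_size(func_max_blocks_per_sm, 3, 4, -1,
                                   device_sm_per_device);
}
```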
+ */ +template < typename AvoidMaxOccupancyConcretizer > +struct AvoidDeviceMaxThreadOccupancyConcretizer +{ + template < typename IdxT, typename Data > + static IdxT get_max_grid_size(Data const& data) + { + IdxT device_max_threads_per_sm = data.device_max_threads_per_sm; + IdxT func_max_blocks_per_sm = data.func_max_blocks_per_sm; + IdxT func_threads_per_block = data.func_threads_per_block; + + IdxT func_max_threads_per_sm = func_threads_per_block * func_max_blocks_per_sm; + + if (func_max_threads_per_sm < device_max_threads_per_sm) { + return MaxOccupancyConcretizer::template get_max_grid_size(data); + } else { + return AvoidMaxOccupancyConcretizer::template get_max_grid_size(data); + } + } +}; + + +enum struct reduce_algorithm : int +{ + combine_last_block, + init_device_combine_atomic_block, + init_host_combine_atomic_block +}; + +enum struct block_communication_mode : int +{ + device_fence, + block_fence +}; + +template < reduce_algorithm t_algorithm, block_communication_mode t_comm_mode, + size_t t_replication, size_t t_atomic_stride > +struct ReduceTuning +{ + static constexpr reduce_algorithm algorithm = t_algorithm; + static constexpr block_communication_mode comm_mode = t_comm_mode; + static constexpr size_t replication = t_replication; + static constexpr size_t atomic_stride = t_atomic_stride; +}; + } // namespace cuda namespace policy @@ -100,7 +205,8 @@ struct cuda_flatten_indexer : public RAJA::make_policy_pattern_launch_platform_t using IterationGetter = RAJA::cuda::IndexFlatten<_IterationGetters...>; }; -template +template struct cuda_exec_explicit : public RAJA::make_policy_pattern_launch_platform_t< RAJA::Policy::cuda, RAJA::Pattern::forall, @@ -108,9 +214,11 @@ struct cuda_exec_explicit : public RAJA::make_policy_pattern_launch_platform_t< RAJA::Platform::cuda> { using IterationMapping = _IterationMapping; using IterationGetter = _IterationGetter; + using LaunchConcretizer = _LaunchConcretizer; }; -template +template struct cuda_launch_explicit_t : public RAJA::make_policy_pattern_launch_platform_t< RAJA::Policy::cuda, RAJA::Pattern::region, @@ -119,8 +227,6 @@ struct cuda_launch_explicit_t : public RAJA::make_policy_pattern_launch_platform }; - - // // NOTE: There is no Index set segment iteration policy for CUDA // @@ -156,8 +262,8 @@ struct unordered_cuda_loop_y_block_iter_x_threadblock_average /////////////////////////////////////////////////////////////////////// /// -template -struct cuda_reduce_base +template < typename tuning > +struct cuda_reduce_policy : public RAJA:: make_policy_pattern_launch_platform_t; -using cuda_reduce = cuda_reduce_base; -using cuda_reduce_atomic = cuda_reduce_base; +template < RAJA::cuda::reduce_algorithm algorithm, + RAJA::cuda::block_communication_mode comm_mode, + size_t replication = named_usage::unspecified, + size_t atomic_stride = named_usage::unspecified > +using cuda_reduce_tuning = cuda_reduce_policy< RAJA::cuda::ReduceTuning< + algorithm, comm_mode, replication, atomic_stride> >; + +// Policies for RAJA::Reduce* objects with specific behaviors. +// - *atomic* policies may use atomics to combine partial results and falls back +// on a non-atomic policy when atomics can't be used with the given type. The +// use of atomics leads to order of operation differences which change the +// results of floating point sum reductions run to run. The memory used with +// atomics is initialized on the device which can be expensive on some HW. +// On some HW this is faster overall than the non-atomic policies. 
+// - *atomic_host* policies are similar to the atomic policies above. However +// the memory used with atomics is initialized on the host which is +// significantly cheaper on some HW. On some HW this is faster overall than +// the non-atomic and atomic policies. +// - *device_fence policies use normal memory accesses with device scope fences +// in the implementation. This works on all HW. +// - *block_fence policies use special (atomic) memory accesses that only cache +// in a cache shared by the whole device to avoid having to use +// device scope fences. This improves performance on some HW but +// is more difficult to code correctly. +using cuda_reduce_device_fence = cuda_reduce_tuning< + RAJA::cuda::reduce_algorithm::combine_last_block, + RAJA::cuda::block_communication_mode::device_fence, + named_usage::unspecified, named_usage::unspecified>; +/// +using cuda_reduce_block_fence = cuda_reduce_tuning< + RAJA::cuda::reduce_algorithm::combine_last_block, + RAJA::cuda::block_communication_mode::block_fence, + named_usage::unspecified, named_usage::unspecified>; +/// +using cuda_reduce_atomic_device_init_device_fence = cuda_reduce_tuning< + RAJA::cuda::reduce_algorithm::init_device_combine_atomic_block, + RAJA::cuda::block_communication_mode::device_fence, + named_usage::unspecified, named_usage::unspecified>; +/// +using cuda_reduce_atomic_device_init_block_fence = cuda_reduce_tuning< + RAJA::cuda::reduce_algorithm::init_device_combine_atomic_block, + RAJA::cuda::block_communication_mode::block_fence, + named_usage::unspecified, named_usage::unspecified>; +/// +using cuda_reduce_atomic_host_init_device_fence = cuda_reduce_tuning< + RAJA::cuda::reduce_algorithm::init_host_combine_atomic_block, + RAJA::cuda::block_communication_mode::device_fence, + named_usage::unspecified, named_usage::unspecified>; +/// +using cuda_reduce_atomic_host_init_block_fence = cuda_reduce_tuning< + RAJA::cuda::reduce_algorithm::init_host_combine_atomic_block, + RAJA::cuda::block_communication_mode::block_fence, + named_usage::unspecified, named_usage::unspecified>; + +// Policy for RAJA::Reduce* objects that gives the same answer every time when +// used in the same way +using cuda_reduce = cuda_reduce_device_fence; + +// Policy for RAJA::Reduce* objects that may use atomics and may not give the +// same answer every time when used in the same way +using cuda_reduce_atomic = cuda_reduce_atomic_host_init_device_fence; + +// Policy for RAJA::Reduce* objects that lets you select the default atomic or +// non-atomic policy with a bool +template < bool with_atomic > +using cuda_reduce_base = std::conditional_t; // Policy for RAJA::statement::Reduce that reduces threads in a block @@ -235,6 +405,7 @@ struct cuda_thread_masked_loop {}; // Operations in the included files are parametrized using the following // values for CUDA warp size and max block size. 
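A hedged usage sketch of how the new reducer policies are meant to be selected in user code; the array `a`, its length `N`, the block size of 256, and the exact template arity of `cuda_exec_with_reduce` are assumptions for illustration rather than details taken from this patch.

```cpp
#include "RAJA/RAJA.hpp"

// Sum a device-accessible array two ways: once with the run-to-run
// reproducible policy and once with the host-initialized atomic policy.
double sum_example(const double* a, int N)
{
  // Same-answer-every-time behavior (what plain cuda_reduce aliases).
  RAJA::ReduceSum<RAJA::cuda_reduce_device_fence, double> ordered_sum(0.0);

  // Atomic combination with host-side initialization of the atomic memory;
  // floating-point additions may be reordered from run to run.
  RAJA::ReduceSum<RAJA::cuda_reduce_atomic_host_init_device_fence, double>
      atomic_sum(0.0);

  // cuda_exec_with_reduce picks the grid-sizing concretizer intended for
  // kernels that carry reducers; plain cuda_exec<256> would also work here.
  RAJA::forall<RAJA::cuda_exec_with_reduce<256>>(
      RAJA::TypedRangeSegment<int>(0, N),
      [=] RAJA_DEVICE (int i) {
        ordered_sum += a[i];
        atomic_sum += a[i];
      });

  // Both reducers hold the same mathematical result; only the floating-point
  // rounding of atomic_sum may differ between runs.
  return ordered_sum.get();
}
```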
// +constexpr const RAJA::Index_type ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE = 32; constexpr const RAJA::Index_type WARP_SIZE = 32; constexpr const RAJA::Index_type MAX_BLOCK_SIZE = 1024; constexpr const RAJA::Index_type MAX_WARPS = MAX_BLOCK_SIZE / WARP_SIZE; @@ -882,54 +1053,181 @@ using global_z = IndexGlobal; } // namespace cuda +// contretizers used in forall, scan, and sort policies + +using CudaAvoidDeviceMaxThreadOccupancyConcretizer = cuda::AvoidDeviceMaxThreadOccupancyConcretizer, -1>>; + +template < typename Fraction, std::ptrdiff_t BLOCKS_PER_SM_OFFSET > +using CudaFractionOffsetOccupancyConcretizer = cuda::FractionOffsetOccupancyConcretizer; + +using CudaMaxOccupancyConcretizer = cuda::MaxOccupancyConcretizer; + +using CudaReduceDefaultConcretizer = CudaMaxOccupancyConcretizer; + +using CudaDefaultConcretizer = CudaMaxOccupancyConcretizer; + // policies usable with forall, scan, and sort + template using cuda_exec_grid_explicit = policy::cuda::cuda_exec_explicit< - iteration_mapping::StridedLoop, cuda::global_x, BLOCKS_PER_SM, Async>; + iteration_mapping::StridedLoop, cuda::global_x, + CudaDefaultConcretizer, BLOCKS_PER_SM, Async>; template using cuda_exec_grid_explicit_async = policy::cuda::cuda_exec_explicit< - iteration_mapping::StridedLoop, cuda::global_x, BLOCKS_PER_SM, true>; + iteration_mapping::StridedLoop, cuda::global_x, + CudaDefaultConcretizer, BLOCKS_PER_SM, true>; template using cuda_exec_grid = policy::cuda::cuda_exec_explicit< - iteration_mapping::StridedLoop, cuda::global_x, policy::cuda::MIN_BLOCKS_PER_SM, Async>; + iteration_mapping::StridedLoop, cuda::global_x, + CudaDefaultConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, Async>; template using cuda_exec_grid_async = policy::cuda::cuda_exec_explicit< - iteration_mapping::StridedLoop, cuda::global_x, policy::cuda::MIN_BLOCKS_PER_SM, true>; + iteration_mapping::StridedLoop, cuda::global_x, + CudaDefaultConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, true>; template using cuda_exec_explicit = policy::cuda::cuda_exec_explicit< - iteration_mapping::Direct, cuda::global_x, BLOCKS_PER_SM, Async>; + iteration_mapping::Direct, cuda::global_x, + CudaDefaultConcretizer, BLOCKS_PER_SM, Async>; template using cuda_exec_explicit_async = policy::cuda::cuda_exec_explicit< - iteration_mapping::Direct, cuda::global_x, BLOCKS_PER_SM, true>; + iteration_mapping::Direct, cuda::global_x, + CudaDefaultConcretizer, BLOCKS_PER_SM, true>; template using cuda_exec = policy::cuda::cuda_exec_explicit< - iteration_mapping::Direct, cuda::global_x, policy::cuda::MIN_BLOCKS_PER_SM, Async>; + iteration_mapping::Direct, cuda::global_x, + CudaDefaultConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, Async>; template using cuda_exec_async = policy::cuda::cuda_exec_explicit< - iteration_mapping::Direct, cuda::global_x, policy::cuda::MIN_BLOCKS_PER_SM, true>; + iteration_mapping::Direct, cuda::global_x, + CudaDefaultConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, true>; template using cuda_exec_occ_calc_explicit = policy::cuda::cuda_exec_explicit< - iteration_mapping::StridedLoop, cuda::global_x, BLOCKS_PER_SM, Async>; + iteration_mapping::StridedLoop, cuda::global_x, + CudaDefaultConcretizer, BLOCKS_PER_SM, Async>; template using cuda_exec_occ_calc_explicit_async = policy::cuda::cuda_exec_explicit< - iteration_mapping::StridedLoop, cuda::global_x, BLOCKS_PER_SM, true>; + iteration_mapping::StridedLoop, cuda::global_x, + CudaDefaultConcretizer, BLOCKS_PER_SM, true>; template using cuda_exec_occ_calc = policy::cuda::cuda_exec_explicit< - 
iteration_mapping::StridedLoop, cuda::global_x, policy::cuda::MIN_BLOCKS_PER_SM, Async>; + iteration_mapping::StridedLoop, cuda::global_x, + CudaDefaultConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, Async>; template using cuda_exec_occ_calc_async = policy::cuda::cuda_exec_explicit< - iteration_mapping::StridedLoop, cuda::global_x, policy::cuda::MIN_BLOCKS_PER_SM, true>; + iteration_mapping::StridedLoop, cuda::global_x, + CudaDefaultConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, true>; + +template +using cuda_exec_occ_max_explicit = policy::cuda::cuda_exec_explicit< + iteration_mapping::StridedLoop, cuda::global_x, + CudaMaxOccupancyConcretizer, BLOCKS_PER_SM, Async>; + +template +using cuda_exec_occ_max_explicit_async = policy::cuda::cuda_exec_explicit< + iteration_mapping::StridedLoop, cuda::global_x, + CudaMaxOccupancyConcretizer, BLOCKS_PER_SM, true>; + +template +using cuda_exec_occ_max = policy::cuda::cuda_exec_explicit< + iteration_mapping::StridedLoop, cuda::global_x, + CudaMaxOccupancyConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, Async>; + +template +using cuda_exec_occ_max_async = policy::cuda::cuda_exec_explicit< + iteration_mapping::StridedLoop, cuda::global_x, + CudaMaxOccupancyConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, true>; + +template +using cuda_exec_occ_fraction_explicit = policy::cuda::cuda_exec_explicit< + iteration_mapping::StridedLoop, cuda::global_x, + CudaFractionOffsetOccupancyConcretizer, BLOCKS_PER_SM, Async>; + +template +using cuda_exec_occ_fraction_explicit_async = policy::cuda::cuda_exec_explicit< + iteration_mapping::StridedLoop, cuda::global_x, + CudaFractionOffsetOccupancyConcretizer, BLOCKS_PER_SM, true>; + +template +using cuda_exec_occ_fraction = policy::cuda::cuda_exec_explicit< + iteration_mapping::StridedLoop, cuda::global_x, + CudaFractionOffsetOccupancyConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, Async>; + +template +using cuda_exec_occ_fraction_async = policy::cuda::cuda_exec_explicit< + iteration_mapping::StridedLoop, cuda::global_x, + CudaFractionOffsetOccupancyConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, true>; + +template +using cuda_exec_occ_custom_explicit = policy::cuda::cuda_exec_explicit< + iteration_mapping::StridedLoop, cuda::global_x, + Concretizer, BLOCKS_PER_SM, Async>; + +template +using cuda_exec_occ_custom_explicit_async = policy::cuda::cuda_exec_explicit< + iteration_mapping::StridedLoop, cuda::global_x, + Concretizer, BLOCKS_PER_SM, true>; + +template +using cuda_exec_occ_custom = policy::cuda::cuda_exec_explicit< + iteration_mapping::StridedLoop, cuda::global_x, + Concretizer, policy::cuda::MIN_BLOCKS_PER_SM, Async>; + +template +using cuda_exec_occ_custom_async = policy::cuda::cuda_exec_explicit< + iteration_mapping::StridedLoop, cuda::global_x, + Concretizer, policy::cuda::MIN_BLOCKS_PER_SM, true>; + +template +using cuda_exec_with_reduce_explicit = policy::cuda::cuda_exec_explicit< + iteration_mapping::StridedLoop, cuda::global_x, + CudaReduceDefaultConcretizer, BLOCKS_PER_SM, Async>; + +template +using cuda_exec_with_reduce_explicit_async = policy::cuda::cuda_exec_explicit< + iteration_mapping::StridedLoop, cuda::global_x, + CudaReduceDefaultConcretizer, BLOCKS_PER_SM, true>; + +template +using cuda_exec_with_reduce = policy::cuda::cuda_exec_explicit< + iteration_mapping::StridedLoop, cuda::global_x, + CudaReduceDefaultConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, Async>; + +template +using cuda_exec_with_reduce_async = policy::cuda::cuda_exec_explicit< + iteration_mapping::StridedLoop, cuda::global_x, + 
CudaReduceDefaultConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, true>; + +template +using cuda_exec_base_explicit = std::conditional_t, + cuda_exec_explicit>; + +template +using cuda_exec_base_explicit_async = std::conditional_t, + cuda_exec_explicit_async>; + +template +using cuda_exec_base = std::conditional_t, + cuda_exec>; + +template +using cuda_exec_base_async = std::conditional_t, + cuda_exec_async>; + // policies usable with WorkGroup template @@ -951,6 +1249,12 @@ using policy::cuda::cuda_atomic; using policy::cuda::cuda_atomic_explicit; // policies usable with reducers +using policy::cuda::cuda_reduce_device_fence; +using policy::cuda::cuda_reduce_block_fence; +using policy::cuda::cuda_reduce_atomic_device_init_device_fence; +using policy::cuda::cuda_reduce_atomic_device_init_block_fence; +using policy::cuda::cuda_reduce_atomic_host_init_device_fence; +using policy::cuda::cuda_reduce_atomic_host_init_block_fence; using policy::cuda::cuda_reduce_base; using policy::cuda::cuda_reduce; using policy::cuda::cuda_reduce_atomic; @@ -964,7 +1268,7 @@ using cuda_warp_direct = RAJA::policy::cuda::cuda_indexer< kernel_sync_requirement::none, cuda::thread_x>; using cuda_warp_loop = RAJA::policy::cuda::cuda_indexer< - iteration_mapping::StridedLoop, + iteration_mapping::StridedLoop, kernel_sync_requirement::none, cuda::thread_x>; @@ -996,13 +1300,13 @@ using cuda_indexer_direct = policy::cuda::cuda_indexer< template < typename ... indexers > using cuda_indexer_loop = policy::cuda::cuda_indexer< - iteration_mapping::StridedLoop, + iteration_mapping::StridedLoop, kernel_sync_requirement::none, indexers...>; template < typename ... indexers > using cuda_indexer_syncable_loop = policy::cuda::cuda_indexer< - iteration_mapping::StridedLoop, + iteration_mapping::StridedLoop, kernel_sync_requirement::sync, indexers...>; @@ -1014,7 +1318,7 @@ using cuda_flatten_indexer_direct = policy::cuda::cuda_flatten_indexer< template < typename ... indexers > using cuda_flatten_indexer_loop = policy::cuda::cuda_flatten_indexer< - iteration_mapping::StridedLoop, + iteration_mapping::StridedLoop, kernel_sync_requirement::none, indexers...>; diff --git a/include/RAJA/policy/cuda/reduce.hpp b/include/RAJA/policy/cuda/reduce.hpp index 115f652e11..516b02383c 100644 --- a/include/RAJA/policy/cuda/reduce.hpp +++ b/include/RAJA/policy/cuda/reduce.hpp @@ -25,6 +25,8 @@ #if defined(RAJA_ENABLE_CUDA) +#include + #include #include "RAJA/util/macros.hpp" @@ -33,11 +35,13 @@ #include "RAJA/util/basic_mempool.hpp" #include "RAJA/util/mutex.hpp" #include "RAJA/util/types.hpp" +#include "RAJA/util/reduce.hpp" #include "RAJA/pattern/detail/reduce.hpp" #include "RAJA/pattern/reduce.hpp" #include "RAJA/policy/cuda/MemUtils_CUDA.hpp" +#include "RAJA/policy/cuda/intrinsics.hpp" #if defined(RAJA_ENABLE_DESUL_ATOMICS) #include "RAJA/policy/desul/atomic.hpp" @@ -56,6 +60,7 @@ namespace reduce namespace cuda { + //! 
atomic operator version of Combiner object template struct atomic; @@ -64,7 +69,7 @@ template struct atomic> { RAJA_DEVICE RAJA_INLINE void operator()(T& val, const T v) { - RAJA::atomicAdd(RAJA::cuda_atomic{}, &val, v); + RAJA::atomicAdd(RAJA::cuda_atomic{}, &val, v); } }; @@ -72,7 +77,7 @@ template struct atomic> { RAJA_DEVICE RAJA_INLINE void operator()(T& val, const T v) { - RAJA::atomicMin(RAJA::cuda_atomic{}, &val, v); + RAJA::atomicMin(RAJA::cuda_atomic{}, &val, v); } }; @@ -80,7 +85,23 @@ template struct atomic> { RAJA_DEVICE RAJA_INLINE void operator()(T& val, const T v) { - RAJA::atomicMax(RAJA::cuda_atomic{}, &val, v); + RAJA::atomicMax(RAJA::cuda_atomic{}, &val, v); + } +}; + +template +struct atomic> { + RAJA_DEVICE RAJA_INLINE void operator()(T& val, const T v) + { + RAJA::atomicAnd(RAJA::cuda_atomic{}, &val, v); + } +}; + +template +struct atomic> { + RAJA_DEVICE RAJA_INLINE void operator()(T& val, const T v) + { + RAJA::atomicOr(RAJA::cuda_atomic{}, &val, v); } }; @@ -101,418 +122,69 @@ namespace cuda namespace impl { -/*! - * \brief Abstracts T into an equal or greater size array of integers whose - * size is between min_integer_type_size and max_interger_type_size inclusive. - */ -template -union AsIntegerArray { - - static_assert(min_integer_type_size <= max_integer_type_size, - "incompatible min and max integer type size"); - using integer_type = typename std::conditional< - ((alignof(T) >= alignof(long long) && - sizeof(long long) <= max_integer_type_size) || - sizeof(long) < min_integer_type_size), - long long, - typename std::conditional< - ((alignof(T) >= alignof(long) && - sizeof(long) <= max_integer_type_size) || - sizeof(int) < min_integer_type_size), - long, - typename std::conditional< - ((alignof(T) >= alignof(int) && - sizeof(int) <= max_integer_type_size) || - sizeof(short) < min_integer_type_size), - int, - typename std::conditional< - ((alignof(T) >= alignof(short) && - sizeof(short) <= max_integer_type_size) || - sizeof(char) < min_integer_type_size), - short, - typename std::conditional< - ((alignof(T) >= alignof(char) && - sizeof(char) <= max_integer_type_size)), - char, - void>::type>::type>::type>::type>::type; - static_assert(!std::is_same::value, - "could not find a compatible integer type"); - static_assert(sizeof(integer_type) >= min_integer_type_size, - "integer_type smaller than min integer type size"); - static_assert(sizeof(integer_type) <= max_integer_type_size, - "integer_type greater than max integer type size"); - - static constexpr size_t num_integer_type = - (sizeof(T) + sizeof(integer_type) - 1) / sizeof(integer_type); - - T value; - integer_type array[num_integer_type]; - - RAJA_HOST_DEVICE constexpr AsIntegerArray(T value_) : value(value_){}; - - RAJA_HOST_DEVICE constexpr size_t array_size() const - { - return num_integer_type; - } -}; - -// cuda 8 only has shfl primitives for 32 bits while cuda 9 has 32 and 64 bits -constexpr const size_t min_shfl_int_type_size = sizeof(int); -#if defined(CUDART_VERSION) && CUDART_VERSION >= 9000 -constexpr const size_t max_shfl_int_type_size = sizeof(long long); -#else -constexpr const size_t max_shfl_int_type_size = sizeof(int); -#endif - -/*! - ****************************************************************************** - * - * \brief Method to shuffle 32b registers in sum reduction for arbitrary type. - * - * \Note Returns an undefined value if src lane is inactive (divergence). - * Returns this lane's value if src lane is out of bounds or has exited. 
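(Editorial aside, not part of the patch.) The shuffle helpers documented here implement a butterfly exchange: in each round every lane swaps values with the lane whose index differs in one bit, so after log2(warp size) rounds every lane holds the full combination. A minimal sketch of that pattern, assuming CUDA 9 or newer (__shfl_xor_sync with a full-warp mask), all 32 lanes active, and a type T that the shuffle intrinsics accept directly; warp_allreduce_sketch is a hypothetical name, not a RAJA API:

    // Hedged sketch: butterfly all-reduce across one warp; every lane ends up
    // holding the combination of all 32 lane values.
    template <typename Combiner, typename T>
    __device__ T warp_allreduce_sketch(T val)
    {
      for (int mask = 1; mask < 32; mask *= 2) {
        // exchange with the lane whose id differs in bit 'mask'
        T rhs = __shfl_xor_sync(0xffffffffu, val, mask);
        Combiner{}(val, rhs);  // same Combiner{}(lhs, rhs) convention used in this file
      }
      return val;
    }

The guarded variants in the surrounding code exist for blocks whose size is not a multiple of the warp size: they only combine values from lanes that map to real threads, so partial warps do not double-count.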
- * - ****************************************************************************** - */ -template -RAJA_DEVICE RAJA_INLINE T shfl_xor_sync(T var, int laneMask) -{ - AsIntegerArray u(var); - - for (size_t i = 0; i < u.array_size(); ++i) { -#if defined(CUDART_VERSION) && CUDART_VERSION >= 9000 - u.array[i] = ::__shfl_xor_sync(0xffffffffu, u.array[i], laneMask); -#else - u.array[i] = ::__shfl_xor(u.array[i], laneMask); -#endif - } - return u.value; -} - -template -RAJA_DEVICE RAJA_INLINE T shfl_sync(T var, int srcLane) -{ - AsIntegerArray u(var); - - for (size_t i = 0; i < u.array_size(); ++i) { -#if defined(CUDART_VERSION) && CUDART_VERSION >= 9000 - u.array[i] = ::__shfl_sync(0xffffffffu, u.array[i], srcLane); -#else - u.array[i] = ::__shfl(u.array[i], srcLane); -#endif - } - return u.value; -} - -#if defined(CUDART_VERSION) && CUDART_VERSION >= 9000 - -template <> -RAJA_DEVICE RAJA_INLINE int shfl_xor_sync(int var, int laneMask) -{ - return ::__shfl_xor_sync(0xffffffffu, var, laneMask); -} - -template <> -RAJA_DEVICE RAJA_INLINE unsigned int shfl_xor_sync(unsigned int var, int laneMask) -{ - return ::__shfl_xor_sync(0xffffffffu, var, laneMask); -} - -template <> -RAJA_DEVICE RAJA_INLINE long shfl_xor_sync(long var, int laneMask) -{ - return ::__shfl_xor_sync(0xffffffffu, var, laneMask); -} - -template <> -RAJA_DEVICE RAJA_INLINE unsigned long shfl_xor_sync(unsigned long var, int laneMask) -{ - return ::__shfl_xor_sync(0xffffffffu, var, laneMask); -} - -template <> -RAJA_DEVICE RAJA_INLINE long long shfl_xor_sync(long long var, int laneMask) -{ - return ::__shfl_xor_sync(0xffffffffu, var, laneMask); -} - -template <> -RAJA_DEVICE RAJA_INLINE unsigned long long shfl_xor_sync(unsigned long long var, int laneMask) -{ - return ::__shfl_xor_sync(0xffffffffu, var, laneMask); -} - -template <> -RAJA_DEVICE RAJA_INLINE float shfl_xor_sync(float var, int laneMask) -{ - return ::__shfl_xor_sync(0xffffffffu, var, laneMask); -} - -template <> -RAJA_DEVICE RAJA_INLINE double shfl_xor_sync(double var, int laneMask) -{ - return ::__shfl_xor_sync(0xffffffffu, var, laneMask); -} - -#else - -template <> -RAJA_DEVICE RAJA_INLINE int shfl_xor_sync(int var, int laneMask) -{ - return ::__shfl_xor(var, laneMask); -} - -template <> -RAJA_DEVICE RAJA_INLINE float shfl_xor_sync(float var, int laneMask) -{ - return ::__shfl_xor(var, laneMask); -} - -#endif - - -#if defined(CUDART_VERSION) && CUDART_VERSION >= 9000 - -template <> -RAJA_DEVICE RAJA_INLINE int shfl_sync(int var, int srcLane) -{ - return ::__shfl_sync(0xffffffffu, var, srcLane); -} - -template <> -RAJA_DEVICE RAJA_INLINE unsigned int shfl_sync(unsigned int var, int srcLane) -{ - return ::__shfl_sync(0xffffffffu, var, srcLane); -} - -template <> -RAJA_DEVICE RAJA_INLINE long shfl_sync(long var, int srcLane) -{ - return ::__shfl_sync(0xffffffffu, var, srcLane); -} - -template <> -RAJA_DEVICE RAJA_INLINE unsigned long shfl_sync(unsigned long var, int srcLane) -{ - return ::__shfl_sync(0xffffffffu, var, srcLane); -} - -template <> -RAJA_DEVICE RAJA_INLINE long long shfl_sync(long long var, int srcLane) -{ - return ::__shfl_sync(0xffffffffu, var, srcLane); -} - -template <> -RAJA_DEVICE RAJA_INLINE unsigned long long shfl_sync(unsigned long long var, int srcLane) -{ - return ::__shfl_sync(0xffffffffu, var, srcLane); -} - -template <> -RAJA_DEVICE RAJA_INLINE float shfl_sync(float var, int srcLane) -{ - return ::__shfl_sync(0xffffffffu, var, srcLane); -} - -template <> -RAJA_DEVICE RAJA_INLINE double shfl_sync(double var, int srcLane) -{ - return 
::__shfl_sync(0xffffffffu, var, srcLane); -} - -#else - -template <> -RAJA_DEVICE RAJA_INLINE int shfl_sync(int var, int srcLane) +//! reduce values in grid into thread 0 of last running block +// returns true if put reduced value in val +template +RAJA_DEVICE RAJA_INLINE int grid_reduce_last_block(T& val, + T identity, + TempIterator in_device_mem, + unsigned int* device_count) { - return ::__shfl(var, srcLane); -} - -template <> -RAJA_DEVICE RAJA_INLINE float shfl_sync(float var, int srcLane) -{ - return ::__shfl(var, srcLane); -} - -#endif - -//! reduce values in block into thread 0 -template -RAJA_DEVICE RAJA_INLINE T warp_reduce(T val, T RAJA_UNUSED_ARG(identity)) -{ - int numThreads = blockDim.x * blockDim.y * blockDim.z; + typename TempIterator::template rebind_accessor device_mem(in_device_mem); int threadId = threadIdx.x + blockDim.x * threadIdx.y + (blockDim.x * blockDim.y) * threadIdx.z; - - T temp = val; - - if (numThreads % policy::cuda::WARP_SIZE == 0) { - - // reduce each warp - for (int i = 1; i < policy::cuda::WARP_SIZE; i *= 2) { - T rhs = shfl_xor_sync(temp, i); - Combiner{}(temp, rhs); - } - - } else { - - // reduce each warp - for (int i = 1; i < policy::cuda::WARP_SIZE; i *= 2) { - int srcLane = threadId ^ i; - T rhs = shfl_sync(temp, srcLane); - // only add from threads that exist (don't double count own value) - if (srcLane < numThreads) { - Combiner{}(temp, rhs); - } - } - } - - return temp; -} - -/*! - * Allreduce values in a warp. - * - * - * This does a butterfly pattern leaving each lane with the full reduction - * - */ -template -RAJA_DEVICE RAJA_INLINE T warp_allreduce(T val) -{ - T temp = val; - - for (int i = 1; i < policy::cuda::WARP_SIZE; i *= 2) { - T rhs = __shfl_xor_sync(0xffffffff, temp, i); - Combiner{}(temp, rhs); - } - - return temp; -} - - -//! reduce values in block into thread 0 -template -RAJA_DEVICE RAJA_INLINE T block_reduce(T val, T identity) -{ int numThreads = blockDim.x * blockDim.y * blockDim.z; - int threadId = threadIdx.x + blockDim.x * threadIdx.y + - (blockDim.x * blockDim.y) * threadIdx.z; - - int warpId = threadId % policy::cuda::WARP_SIZE; - int warpNum = threadId / policy::cuda::WARP_SIZE; - - T temp = val; - - if (numThreads % policy::cuda::WARP_SIZE == 0) { - - // reduce each warp - for (int i = 1; i < policy::cuda::WARP_SIZE; i *= 2) { - T rhs = shfl_xor_sync(temp, i); - Combiner{}(temp, rhs); - } - - } else { - - // reduce each warp - for (int i = 1; i < policy::cuda::WARP_SIZE; i *= 2) { - int srcLane = threadId ^ i; - T rhs = shfl_sync(temp, srcLane); - // only add from threads that exist (don't double count own value) - if (srcLane < numThreads) { - Combiner{}(temp, rhs); - } - } - } - - // reduce per warp values - if (numThreads > policy::cuda::WARP_SIZE) { - - static_assert(policy::cuda::MAX_WARPS <= policy::cuda::WARP_SIZE, - "Max Warps must be less than or equal to Warp Size for this algorithm to work"); - - // Need to separate declaration and initialization for clang-cuda - __shared__ unsigned char tmpsd[sizeof(RAJA::detail::SoAArray)]; - - // Partial placement new: Should call new(tmpsd) here but recasting memory - // to avoid calling constructor/destructor in shared memory. 
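(Editorial aside, not part of the patch.) The block-level reduction around this point follows a common two-stage pattern: reduce within each warp using shuffles, publish one partial result per warp to shared memory, then let the first warp combine those partials so thread 0 holds the block-wide value. The real code avoids constructing T objects in shared memory (hence the placement-new note above); the simplified sketch below assumes a trivially constructible, shuffle-compatible T, blockDim.x a multiple of 32, and at most 32 warps per block; block_reduce_sketch is a hypothetical name, not a RAJA API:

    // Hedged sketch of a two-stage block reduction.
    template <typename Combiner, typename T>
    __device__ T block_reduce_sketch(T val, T identity)
    {
      __shared__ T warp_vals[32];          // one slot per warp (<= 32 warps assumed)

      const int tid     = threadIdx.x;
      const int lane    = tid % 32;
      const int warp_id = tid / 32;
      const int nwarps  = blockDim.x / 32;

      // stage 1: butterfly reduce within each warp
      for (int mask = 1; mask < 32; mask *= 2) {
        T rhs = __shfl_xor_sync(0xffffffffu, val, mask);
        Combiner{}(val, rhs);
      }

      // stage 2: lane 0 of each warp publishes its partial result
      if (lane == 0) { warp_vals[warp_id] = val; }
      __syncthreads();

      // stage 3: the first warp combines the per-warp partials;
      // thread 0 ends up with the block-wide result
      if (warp_id == 0) {
        val = (lane < nwarps) ? warp_vals[lane] : identity;
        for (int mask = 1; mask < 32; mask *= 2) {
          T rhs = __shfl_xor_sync(0xffffffffu, val, mask);
          Combiner{}(val, rhs);
        }
      }
      return val;
    }

The grid-level routines introduced in this hunk build on the same block reduction and then combine per-block results either in the last block to finish or with atomics, replicated across several memory slots to reduce contention.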
- RAJA::detail::SoAArray* sd = - reinterpret_cast *>(tmpsd); + int blockId = blockIdx.x + gridDim.x * blockIdx.y + + (gridDim.x * gridDim.y) * blockIdx.z; + int numBlocks = gridDim.x * gridDim.y * gridDim.z; - // write per warp values to shared memory - if (warpId == 0) { - sd->set(warpNum, temp); - } + int replicationId = blockId % replication; + int slotId = blockId / replication; - __syncthreads(); + int maxNumSlots = (numBlocks + replication - 1) / replication; + unsigned int numSlots = (numBlocks / replication) + + ((replicationId < (numBlocks % replication)) ? 1 : 0); - if (warpNum == 0) { + int atomicOffset = replicationId * atomic_stride; + int beginSlots = replicationId * maxNumSlots; + int blockSlot = beginSlots + slotId; - // read per warp values - if (warpId * policy::cuda::WARP_SIZE < numThreads) { - temp = sd->get(warpId); - } else { - temp = identity; - } + T temp = block_reduce(val, identity); - for (int i = 1; i < policy::cuda::MAX_WARPS; i *= 2) { - T rhs = shfl_xor_sync(temp, i); - Combiner{}(temp, rhs); - } + if (numSlots <= 1u) { + if (threadId == 0) { + val = temp; } - - __syncthreads(); + return (threadId == 0) ? replicationId : replication; } - return temp; -} - - -//! reduce values in grid into thread 0 of last running block -// returns true if put reduced value in val -template -RAJA_DEVICE RAJA_INLINE bool grid_reduce(T& val, - T identity, - TempIterator device_mem, - unsigned int* device_count) -{ - int numBlocks = gridDim.x * gridDim.y * gridDim.z; - int numThreads = blockDim.x * blockDim.y * blockDim.z; - unsigned int wrap_around = numBlocks - 1; - - int blockId = blockIdx.x + gridDim.x * blockIdx.y + - (gridDim.x * gridDim.y) * blockIdx.z; - - int threadId = threadIdx.x + blockDim.x * threadIdx.y + - (blockDim.x * blockDim.y) * threadIdx.z; - - T temp = block_reduce(val, identity); - // one thread per block writes to device_mem - bool lastBlock = false; + bool isLastBlock = false; if (threadId == 0) { - device_mem.set(blockId, temp); + device_mem.set(blockSlot, temp); // ensure write visible to all threadblocks - __threadfence(); - // increment counter, (wraps back to zero if old count == wrap_around) - unsigned int old_count = ::atomicInc(device_count, wrap_around); - lastBlock = (old_count == wrap_around); + Accessor::fence_release(); + // increment counter, (wraps back to zero if old count == (numSlots-1)) + unsigned int old_count = ::atomicInc(&device_count[atomicOffset], (numSlots-1)); + isLastBlock = (old_count == (numSlots-1)); } // returns non-zero value if any thread passes in a non-zero value - lastBlock = __syncthreads_or(lastBlock); + isLastBlock = __syncthreads_or(isLastBlock); // last block accumulates values from device_mem - if (lastBlock) { + if (isLastBlock) { temp = identity; + Accessor::fence_acquire(); - for (int i = threadId; i < numBlocks; i += numThreads) { - Combiner{}(temp, device_mem.get(i)); + for (unsigned int i = threadId; + i < numSlots; + i += numThreads) { + Combiner{}(temp, device_mem.get(beginSlots + i)); } temp = block_reduce(temp, identity); @@ -523,7 +195,7 @@ RAJA_DEVICE RAJA_INLINE bool grid_reduce(T& val, } } - return lastBlock && threadId == 0; + return (isLastBlock && threadId == 0) ? 
replicationId : replication; } namespace expt { @@ -634,6 +306,7 @@ RAJA_DEVICE RAJA_INLINE void grid_reduce(RAJA::expt::detail::Reducer& red // last block accumulates values from device_mem if (lastBlock) { temp = OP::identity(); + __threadfence(); for (int i = threadId; i < numBlocks; i += numThreads) { temp = OP{}(temp, red.device_mem.get(i)); @@ -653,64 +326,104 @@ RAJA_DEVICE RAJA_INLINE void grid_reduce(RAJA::expt::detail::Reducer& red //! reduce values in grid into thread 0 of last running block // returns true if put reduced value in val -template -RAJA_DEVICE RAJA_INLINE bool grid_reduce_atomic(T& val, - T identity, - T* device_mem, - unsigned int* device_count) +template +RAJA_DEVICE RAJA_INLINE int grid_reduce_atomic_device_init(T& val, + T identity, + T* device_mem, + unsigned int* device_count) { - int numBlocks = gridDim.x * gridDim.y * gridDim.z; - unsigned int wrap_around = numBlocks + 1; - int threadId = threadIdx.x + blockDim.x * threadIdx.y + (blockDim.x * blockDim.y) * threadIdx.z; - // one thread in first block initializes device_mem + int blockId = blockIdx.x + gridDim.x * blockIdx.y + + (gridDim.x * gridDim.y) * blockIdx.z; + int numBlocks = gridDim.x * gridDim.y * gridDim.z; + + int replicationId = (blockId%replication); + int atomicOffset = replicationId*atomic_stride; + + unsigned int numSlots = (numBlocks / replication) + + ((replicationId < (numBlocks % replication)) ? 1 : 0); + + if (numSlots <= 1u) { + T temp = block_reduce(val, identity); + if (threadId == 0) { + val = temp; + } + return (threadId == 0) ? replicationId : replication; + } + + // the first block of each replication initializes device_mem if (threadId == 0) { - unsigned int old_val = ::atomicCAS(device_count, 0u, 1u); + unsigned int old_val = ::atomicCAS(&device_count[atomicOffset], 0u, 1u); if (old_val == 0u) { - device_mem[0] = identity; - __threadfence(); - ::atomicAdd(device_count, 1u); + Accessor::set(device_mem, atomicOffset, identity); + Accessor::fence_release(); + ::atomicAdd(&device_count[atomicOffset], 1u); } } T temp = block_reduce(val, identity); - // one thread per block performs atomic on device_mem - bool lastBlock = false; + // one thread per block performs an atomic on device_mem + bool isLastBlock = false; if (threadId == 0) { - // thread waits for device_mem to be initialized - while (static_cast(device_count)[0] < 2u) + // wait for device_mem to be initialized + while (::atomicAdd(&device_count[atomicOffset], 0u) < 2u) ; - __threadfence(); - RAJA::reduce::cuda::atomic{}(device_mem[0], temp); - __threadfence(); - // increment counter, (wraps back to zero if old count == wrap_around) - unsigned int old_count = ::atomicInc(device_count, wrap_around); - lastBlock = (old_count == wrap_around); - - // last block gets value from device_mem - if (lastBlock) { - val = device_mem[0]; + Accessor::fence_acquire(); + RAJA::reduce::cuda::atomic{}(device_mem[atomicOffset], temp); + Accessor::fence_release(); + // increment counter, (wraps back to zero if old count == (numSlots+1)) + unsigned int old_count = ::atomicInc(&device_count[atomicOffset], (numSlots+1)); + isLastBlock = (old_count == (numSlots+1)); + + // the last block for each replication gets the value from device_mem + if (isLastBlock) { + Accessor::fence_acquire(); + val = Accessor::get(device_mem, atomicOffset); } } - return lastBlock; + return isLastBlock ? replicationId : replication; +} + +//! 
reduce values in block into thread 0 and atomically combines into device_mem +template +RAJA_DEVICE RAJA_INLINE void grid_reduce_atomic_host_init(T& val, + T identity, + T* device_mem) +{ + int threadId = threadIdx.x + blockDim.x * threadIdx.y + + (blockDim.x * blockDim.y) * threadIdx.z; + + int blockId = blockIdx.x + gridDim.x * blockIdx.y + + (gridDim.x * gridDim.y) * blockIdx.z; + + int replicationId = (blockId%replication); + int atomicOffset = replicationId*atomic_stride; + + T temp = block_reduce(val, identity); + + // one thread per block performs an atomic on device_mem + if (threadId == 0 && temp != identity) { + RAJA::reduce::cuda::atomic{}(device_mem[atomicOffset], temp); + } } } // namespace impl //! Object that manages pinned memory buffers for reduction results // use one per reducer object -template +template class PinnedTally { public: //! Object put in Pinned memory with value and pointer to next Node struct Node { Node* next; - T value; + T values[num_slots]; }; //! Object per resource to keep track of pinned memory nodes struct ResourceNode { @@ -785,7 +498,7 @@ class PinnedTally return ret; } - T& operator*() { return m_n->value; } + auto operator*() -> T(&)[num_slots] { return m_n->values; } bool operator==(const ResourceNodeIterator& rhs) const { @@ -822,7 +535,7 @@ class PinnedTally ResourceNodeIterator end() { return {nullptr, nullptr}; } //! get new value for use in resource - T* new_value(::RAJA::resources::Cuda res) + auto new_value(::RAJA::resources::Cuda res) -> T(&)[num_slots] { #if defined(RAJA_ENABLE_OPENMP) lock_guard lock(m_mutex); @@ -839,10 +552,10 @@ class PinnedTally rn->node_list = nullptr; resource_list = rn; } - Node* n = cuda::pinned_mempool_type::getInstance().template malloc(1); + Node* n = mempool::getInstance().template malloc(1); n->next = rn->node_list; rn->node_list = n; - return &n->value; + return n->values; } //! synchronize all resources used @@ -862,7 +575,7 @@ class PinnedTally while (rn->node_list) { Node* n = rn->node_list; rn->node_list = n->next; - cuda::pinned_mempool_type::getInstance().free(n); + mempool::getInstance().free(n); } resource_list = rn->next; free(rn); @@ -889,46 +602,59 @@ class PinnedTally //! Reduction data for Cuda Offload -- stores value, host pointer, and device //! pointer -template -struct Reduce_Data { +template +struct ReduceLastBlock_Data +{ + using tally_mempool_type = pinned_mempool_type; + using data_mempool_type = device_mempool_type; + using count_mempool_type = device_zeroed_mempool_type; + + static constexpr size_t tally_slots = replication; mutable T value; T identity; unsigned int* device_count; - RAJA::detail::SoAPtr device; - bool own_device_ptr; + RAJA::detail::SoAPtr device; + bool owns_device_pointer; - Reduce_Data() : Reduce_Data(T(), T()){}; + ReduceLastBlock_Data() : ReduceLastBlock_Data(T(), T()){} /*! 
\brief create from a default value and offload information * * allocates PinnedTally to hold device values */ - Reduce_Data(T initValue, T identity_) + ReduceLastBlock_Data(T initValue, T identity_) : value{initValue}, identity{identity_}, device_count{nullptr}, device{}, - own_device_ptr{false} + owns_device_pointer{false} { } RAJA_HOST_DEVICE - Reduce_Data(const Reduce_Data& other) + ReduceLastBlock_Data(const ReduceLastBlock_Data& other) : value{other.identity}, identity{other.identity}, device_count{other.device_count}, device{other.device}, - own_device_ptr{false} + owns_device_pointer{false} { } - Reduce_Data& operator=(const Reduce_Data&) = default; + ReduceLastBlock_Data& operator=(const ReduceLastBlock_Data&) = default; //! initialize output to identity to ensure never read // uninitialized memory - void init_grid_val(T* output) { *output = identity; } + T* init_grid_vals(T(&output)[tally_slots]) + { + for (size_t r = 0; r < tally_slots; ++r) { + output[r] = identity; + } + return &output[0]; + } //! reduce values in grid to single value, store in output RAJA_DEVICE @@ -936,8 +662,11 @@ struct Reduce_Data { { T temp = value; - if (impl::grid_reduce(temp, identity, device, device_count)) { - *output = temp; + size_t replicationId = impl::grid_reduce_last_block< + Combiner, Accessor, replication, atomic_stride>( + temp, identity, device, device_count); + if (replicationId != replication) { + output[replicationId] = temp; } } @@ -949,10 +678,11 @@ struct Reduce_Data { if (act) { cuda_dim_t gridDim = currentGridDim(); size_t numBlocks = gridDim.x * gridDim.y * gridDim.z; - device.allocate(numBlocks); - device_count = device_zeroed_mempool_type::getInstance() - .template malloc(1); - own_device_ptr = true; + size_t maxNumSlots = (numBlocks + replication - 1) / replication; + device.allocate(maxNumSlots*replication); + device_count = count_mempool_type::getInstance() + .template malloc(replication*atomic_stride); + owns_device_pointer = true; } return act; } @@ -961,54 +691,147 @@ struct Reduce_Data { // free device pointers bool teardownForDevice() { - bool act = own_device_ptr; + bool act = owns_device_pointer; if (act) { device.deallocate(); - device_zeroed_mempool_type::getInstance().free(device_count); + count_mempool_type::getInstance().free(device_count); device_count = nullptr; - own_device_ptr = false; + owns_device_pointer = false; } return act; } }; +//! Reduction data for Cuda Offload -- stores value, host pointer +template +struct ReduceAtomicHostInit_Data +{ + using tally_mempool_type = device_pinned_mempool_type; + + static constexpr size_t tally_slots = replication * atomic_stride; + + mutable T value; + T identity; + bool is_setup; + bool owns_device_pointer; + + ReduceAtomicHostInit_Data() : ReduceAtomicHostInit_Data(T(), T()){}; + + ReduceAtomicHostInit_Data(T initValue, T identity_) + : value{initValue}, + identity{identity_}, + is_setup{false}, + owns_device_pointer{false} + { + } + + RAJA_HOST_DEVICE + ReduceAtomicHostInit_Data(const ReduceAtomicHostInit_Data& other) + : value{other.identity}, + identity{other.identity}, + is_setup{other.is_setup}, + owns_device_pointer{false} + { + } + + ReduceAtomicHostInit_Data& operator=(const ReduceAtomicHostInit_Data&) = default; + + //! initialize output to identity to ensure never read + // uninitialized memory + T* init_grid_vals(T(&output)[tally_slots]) + { + for (size_t r = 0; r < tally_slots; ++r) { + output[r] = identity; + } + return &output[0]; + } + + //! 
reduce values in grid to single value, store in output + RAJA_DEVICE + void grid_reduce(T* output) + { + T temp = value; + + impl::grid_reduce_atomic_host_init( + temp, identity, output); + } + + //! check and setup for device + // allocate device pointers and get a new result buffer from the pinned tally + bool setupForDevice() + { + bool act = !is_setup && setupReducers(); + if (act) { + is_setup = true; + owns_device_pointer = true; + } + return act; + } + + //! if own resources teardown device setup + // free device pointers + bool teardownForDevice() + { + bool act = owns_device_pointer; + if (act) { + is_setup = false; + owns_device_pointer = false; + } + return act; + } +}; //! Reduction data for Cuda Offload -- stores value, host pointer -template -struct ReduceAtomic_Data { +template +struct ReduceAtomicDeviceInit_Data +{ + using tally_mempool_type = pinned_mempool_type; + using data_mempool_type = device_mempool_type; + using count_mempool_type = device_zeroed_mempool_type; + + static constexpr size_t tally_slots = replication; mutable T value; T identity; unsigned int* device_count; T* device; - bool own_device_ptr; + bool owns_device_pointer; - ReduceAtomic_Data() : ReduceAtomic_Data(T(), T()){}; + ReduceAtomicDeviceInit_Data() : ReduceAtomicDeviceInit_Data(T(), T()){}; - ReduceAtomic_Data(T initValue, T identity_) + ReduceAtomicDeviceInit_Data(T initValue, T identity_) : value{initValue}, identity{identity_}, device_count{nullptr}, device{nullptr}, - own_device_ptr{false} + owns_device_pointer{false} { } RAJA_HOST_DEVICE - ReduceAtomic_Data(const ReduceAtomic_Data& other) + ReduceAtomicDeviceInit_Data(const ReduceAtomicDeviceInit_Data& other) : value{other.identity}, identity{other.identity}, device_count{other.device_count}, device{other.device}, - own_device_ptr{false} + owns_device_pointer{false} { } - ReduceAtomic_Data& operator=(const ReduceAtomic_Data&) = default; + ReduceAtomicDeviceInit_Data& operator=(const ReduceAtomicDeviceInit_Data&) = default; //! initialize output to identity to ensure never read // uninitialized memory - void init_grid_val(T* output) { *output = identity; } + T* init_grid_vals(T(&output)[tally_slots]) + { + for (size_t r = 0; r < tally_slots; ++r) { + output[r] = identity; + } + return &output[0]; + } //! 
reduce values in grid to single value, store in output RAJA_DEVICE @@ -1016,9 +839,11 @@ struct ReduceAtomic_Data { { T temp = value; - if (impl::grid_reduce_atomic( - temp, identity, device, device_count)) { - *output = temp; + size_t replicationId = impl::grid_reduce_atomic_device_init< + Combiner, Accessor, replication, atomic_stride>( + temp, identity, device, device_count); + if (replicationId != replication) { + output[replicationId] = temp; } } @@ -1028,10 +853,10 @@ struct ReduceAtomic_Data { { bool act = !device && setupReducers(); if (act) { - device = device_mempool_type::getInstance().template malloc(1); - device_count = device_zeroed_mempool_type::getInstance() - .template malloc(1); - own_device_ptr = true; + device = data_mempool_type::getInstance().template malloc(replication*atomic_stride); + device_count = count_mempool_type::getInstance() + .template malloc(replication*atomic_stride); + owns_device_pointer = true; } return act; } @@ -1040,22 +865,68 @@ struct ReduceAtomic_Data { // free device pointers bool teardownForDevice() { - bool act = own_device_ptr; + bool act = owns_device_pointer; if (act) { - device_mempool_type::getInstance().free(device); + data_mempool_type::getInstance().free(device); device = nullptr; - device_zeroed_mempool_type::getInstance().free(device_count); + count_mempool_type::getInstance().free(device_count); device_count = nullptr; - own_device_ptr = false; + owns_device_pointer = false; } return act; } }; + //! Cuda Reduction entity -- generalize on reduction, and type -template +template class Reduce { + static constexpr size_t replication = (tuning::replication > 0) + ? tuning::replication + : 1; + static constexpr size_t atomic_stride = (tuning::atomic_stride > 0) + ? tuning::atomic_stride + : ((policy::cuda::ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE > sizeof(T)) + ? RAJA_DIVIDE_CEILING_INT(policy::cuda::ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE, sizeof(T)) + : 1); + + using Accessor = std::conditional_t<(tuning::comm_mode == block_communication_mode::block_fence), + impl::AccessorDeviceScopeUseBlockFence, + std::conditional_t<(tuning::comm_mode == block_communication_mode::device_fence), + impl::AccessorDeviceScopeUseDeviceFence, + void>>; + + static constexpr bool atomic_policy = + (tuning::algorithm == reduce_algorithm::init_device_combine_atomic_block) || + (tuning::algorithm == reduce_algorithm::init_host_combine_atomic_block); + static constexpr bool atomic_available = RAJA::reduce::cuda::cuda_atomic_available::value; + + //! cuda reduction data storage class and folding algorithm + using reduce_data_type = std::conditional_t<(tuning::algorithm == reduce_algorithm::combine_last_block) || + (atomic_policy && !atomic_available), + cuda::ReduceLastBlock_Data, + std::conditional_t, + std::conditional_t<(tuning::algorithm == reduce_algorithm::init_host_combine_atomic_block), + cuda::ReduceAtomicHostInit_Data, + void>>, + void>>; + + static constexpr size_t tally_slots = reduce_data_type::tally_slots; + + using TallyType = PinnedTally; + + //! 
union to hold either pointer to PinnedTally or pointer to value + // only use list before setup for device and only use val_ptr after + union tally_u { + TallyType* list; + T* val_ptr; + constexpr tally_u(TallyType* l) : list(l){}; + constexpr tally_u(T* v_ptr) : val_ptr(v_ptr){}; + }; + public: Reduce() : Reduce(T(), Combiner::identity()) {} @@ -1063,7 +934,7 @@ class Reduce // the original object's parent is itself explicit Reduce(T init_val, T identity_ = Combiner::identity()) : parent{this}, - tally_or_val_ptr{new PinnedTally}, + tally_or_val_ptr{new TallyType}, val(init_val, identity_) { } @@ -1090,9 +961,8 @@ class Reduce #if !defined(RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE) if (parent) { if (val.setupForDevice()) { - tally_or_val_ptr.val_ptr = - tally_or_val_ptr.list->new_value(currentResource()); - val.init_grid_val(tally_or_val_ptr.val_ptr); + tally_or_val_ptr.val_ptr = val.init_grid_vals( + tally_or_val_ptr.list->new_value(currentResource())); parent = nullptr; } } @@ -1136,9 +1006,15 @@ class Reduce auto end = tally_or_val_ptr.list->end(); if (n != end) { tally_or_val_ptr.list->synchronize_resources(); + ::RAJA::detail::HighAccuracyReduce + reducer(std::move(val.value)); for (; n != end; ++n) { - Combiner{}(val.value, *n); + T(&values)[tally_slots] = *n; + for (size_t r = 0; r < tally_slots; ++r) { + reducer.combine(std::move(values[r])); + } } + val.value = reducer.get_and_clear(); tally_or_val_ptr.list->free_list(); } return val.value; @@ -1159,38 +1035,20 @@ class Reduce private: const Reduce* parent; - - //! union to hold either pointer to PinnedTally or poiter to value - // only use list before setup for device and only use val_ptr after - union tally_u { - PinnedTally* list; - T* val_ptr; - constexpr tally_u(PinnedTally* l) : list(l){}; - constexpr tally_u(T* v_ptr) : val_ptr(v_ptr){}; - }; - tally_u tally_or_val_ptr; - - //! cuda reduction data storage class and folding algorithm - using reduce_data_type = typename std::conditional< - maybe_atomic && RAJA::reduce::cuda::cuda_atomic_available::value, - cuda::ReduceAtomic_Data, - cuda::Reduce_Data>::type; - - //! storage for reduction data reduce_data_type val; }; } // end namespace cuda //! specialization of ReduceSum for cuda_reduce -template -class ReduceSum, T> - : public cuda::Reduce, T, maybe_atomic> +template +class ReduceSum, T> + : public cuda::Reduce, T, tuning> { public: - using Base = cuda::Reduce, T, maybe_atomic>; + using Base = cuda::Reduce, T, tuning>; using Base::Base; //! enable operator+= for ReduceSum -- alias for combine() RAJA_HOST_DEVICE @@ -1202,13 +1060,13 @@ class ReduceSum, T> }; //! specialization of ReduceBitOr for cuda_reduce -template -class ReduceBitOr, T> - : public cuda::Reduce, T, maybe_atomic> +template +class ReduceBitOr, T> + : public cuda::Reduce, T, tuning> { public: - using Base = cuda::Reduce, T, maybe_atomic>; + using Base = cuda::Reduce, T, tuning>; using Base::Base; //! enable operator|= for ReduceBitOr -- alias for combine() RAJA_HOST_DEVICE @@ -1220,13 +1078,13 @@ class ReduceBitOr, T> }; //! specialization of ReduceBitAnd for cuda_reduce -template -class ReduceBitAnd, T> - : public cuda::Reduce, T, maybe_atomic> +template +class ReduceBitAnd, T> + : public cuda::Reduce, T, tuning> { public: - using Base = cuda::Reduce, T, maybe_atomic>; + using Base = cuda::Reduce, T, tuning>; using Base::Base; //! enable operator&= for ReduceBitAnd -- alias for combine() RAJA_HOST_DEVICE @@ -1238,13 +1096,13 @@ class ReduceBitAnd, T> }; //! 
specialization of ReduceMin for cuda_reduce -template -class ReduceMin, T> - : public cuda::Reduce, T, maybe_atomic> +template +class ReduceMin, T> + : public cuda::Reduce, T, tuning> { public: - using Base = cuda::Reduce, T, maybe_atomic>; + using Base = cuda::Reduce, T, tuning>; using Base::Base; //! enable min() for ReduceMin -- alias for combine() RAJA_HOST_DEVICE @@ -1256,13 +1114,13 @@ class ReduceMin, T> }; //! specialization of ReduceMax for cuda_reduce -template -class ReduceMax, T> - : public cuda::Reduce, T, maybe_atomic> +template +class ReduceMax, T> + : public cuda::Reduce, T, tuning> { public: - using Base = cuda::Reduce, T, maybe_atomic>; + using Base = cuda::Reduce, T, tuning>; using Base::Base; //! enable max() for ReduceMax -- alias for combine() RAJA_HOST_DEVICE @@ -1274,18 +1132,18 @@ class ReduceMax, T> }; //! specialization of ReduceMinLoc for cuda_reduce -template -class ReduceMinLoc, T, IndexType> +template +class ReduceMinLoc, T, IndexType> : public cuda::Reduce>, RAJA::reduce::detail::ValueLoc, - maybe_atomic> + tuning> { public: using value_type = RAJA::reduce::detail::ValueLoc; using Combiner = RAJA::reduce::min; using NonLocCombiner = RAJA::reduce::min; - using Base = cuda::Reduce; + using Base = cuda::Reduce; using Base::Base; //! constructor requires a default value for the reducer @@ -1324,18 +1182,18 @@ class ReduceMinLoc, T, IndexType> }; //! specialization of ReduceMaxLoc for cuda_reduce -template -class ReduceMaxLoc, T, IndexType> +template +class ReduceMaxLoc, T, IndexType> : public cuda:: Reduce>, RAJA::reduce::detail::ValueLoc, - maybe_atomic> + tuning> { public: using value_type = RAJA::reduce::detail::ValueLoc; using Combiner = RAJA::reduce::max; using NonLocCombiner = RAJA::reduce::max; - using Base = cuda::Reduce; + using Base = cuda::Reduce; using Base::Base; //! constructor requires a default value for the reducer diff --git a/include/RAJA/policy/cuda/scan.hpp b/include/RAJA/policy/cuda/scan.hpp index 5d89844e3c..0a9b0bf305 100644 --- a/include/RAJA/policy/cuda/scan.hpp +++ b/include/RAJA/policy/cuda/scan.hpp @@ -44,6 +44,7 @@ namespace scan */ template inclusive_inplace( resources::Cuda cuda_res, - ::RAJA::policy::cuda::cuda_exec_explicit, + ::RAJA::policy::cuda::cuda_exec_explicit, InputIter begin, InputIter end, Function binary_op) @@ -96,6 +97,7 @@ inclusive_inplace( */ template exclusive_inplace( resources::Cuda cuda_res, - ::RAJA::policy::cuda::cuda_exec_explicit, + ::RAJA::policy::cuda::cuda_exec_explicit, InputIter begin, InputIter end, Function binary_op, @@ -152,6 +154,7 @@ exclusive_inplace( */ template inclusive( resources::Cuda cuda_res, - ::RAJA::policy::cuda::cuda_exec_explicit, + ::RAJA::policy::cuda::cuda_exec_explicit, InputIter begin, InputIter end, OutputIter out, @@ -206,6 +209,7 @@ inclusive( */ template exclusive( resources::Cuda cuda_res, - ::RAJA::policy::cuda::cuda_exec_explicit, + ::RAJA::policy::cuda::cuda_exec_explicit, InputIter begin, InputIter end, OutputIter out, diff --git a/include/RAJA/policy/cuda/sort.hpp b/include/RAJA/policy/cuda/sort.hpp index 6e6e4c5696..c5a353b704 100644 --- a/include/RAJA/policy/cuda/sort.hpp +++ b/include/RAJA/policy/cuda/sort.hpp @@ -44,7 +44,9 @@ namespace sort /*! 
\brief static assert unimplemented stable sort */ -template +template concepts::enable_if_t, concepts::negate>, @@ -54,7 +56,7 @@ concepts::enable_if_t, camp::is_same>>>>>> stable( resources::Cuda cuda_res, - ::RAJA::policy::cuda::cuda_exec_explicit, + ::RAJA::policy::cuda::cuda_exec_explicit, Iter, Iter, Compare) @@ -75,13 +77,15 @@ stable( /*! \brief stable sort given range in ascending order */ -template +template concepts::enable_if_t, type_traits::is_arithmetic>, std::is_pointer> stable( resources::Cuda cuda_res, - ::RAJA::policy::cuda::cuda_exec_explicit, + ::RAJA::policy::cuda::cuda_exec_explicit, Iter begin, Iter end, operators::less>) @@ -143,13 +147,15 @@ stable( /*! \brief stable sort given range in descending order */ -template +template concepts::enable_if_t, type_traits::is_arithmetic>, std::is_pointer> stable( resources::Cuda cuda_res, - ::RAJA::policy::cuda::cuda_exec_explicit, + ::RAJA::policy::cuda::cuda_exec_explicit, Iter begin, Iter end, operators::greater>) @@ -212,7 +218,9 @@ stable( /*! \brief static assert unimplemented sort */ -template +template concepts::enable_if_t, concepts::negate>, @@ -222,7 +230,7 @@ concepts::enable_if_t, camp::is_same>>>>>> unstable( resources::Cuda cuda_res, - ::RAJA::policy::cuda::cuda_exec_explicit, + ::RAJA::policy::cuda::cuda_exec_explicit, Iter, Iter, Compare) @@ -243,13 +251,15 @@ unstable( /*! \brief sort given range in ascending order */ -template +template concepts::enable_if_t, type_traits::is_arithmetic>, std::is_pointer> unstable( resources::Cuda cuda_res, - ::RAJA::policy::cuda::cuda_exec_explicit p, + ::RAJA::policy::cuda::cuda_exec_explicit p, Iter begin, Iter end, operators::less> comp) @@ -260,13 +270,15 @@ unstable( /*! \brief sort given range in descending order */ -template +template concepts::enable_if_t, type_traits::is_arithmetic>, std::is_pointer> unstable( resources::Cuda cuda_res, - ::RAJA::policy::cuda::cuda_exec_explicit p, + ::RAJA::policy::cuda::cuda_exec_explicit p, Iter begin, Iter end, operators::greater> comp) @@ -278,7 +290,8 @@ unstable( /*! \brief static assert unimplemented stable sort pairs */ -template concepts::enable_if_t, concepts::negate, camp::is_same>>>>>> stable_pairs( resources::Cuda cuda_res, - ::RAJA::policy::cuda::cuda_exec_explicit, + ::RAJA::policy::cuda::cuda_exec_explicit, KeyIter, KeyIter, ValIter, @@ -314,7 +327,8 @@ stable_pairs( /*! \brief stable sort given range of pairs in ascending order of keys */ -template concepts::enable_if_t, type_traits::is_arithmetic>, @@ -322,7 +336,7 @@ concepts::enable_if_t, std::is_pointer> stable_pairs( resources::Cuda cuda_res, - ::RAJA::policy::cuda::cuda_exec_explicit, + ::RAJA::policy::cuda::cuda_exec_explicit, KeyIter keys_begin, KeyIter keys_end, ValIter vals_begin, @@ -396,7 +410,8 @@ stable_pairs( /*! \brief stable sort given range of pairs in descending order of keys */ -template concepts::enable_if_t, type_traits::is_arithmetic>, @@ -404,7 +419,7 @@ concepts::enable_if_t, std::is_pointer> stable_pairs( resources::Cuda cuda_res, - ::RAJA::policy::cuda::cuda_exec_explicit, + ::RAJA::policy::cuda::cuda_exec_explicit, KeyIter keys_begin, KeyIter keys_end, ValIter vals_begin, @@ -479,7 +494,8 @@ stable_pairs( /*! \brief static assert unimplemented sort pairs */ -template concepts::enable_if_t, concepts::negate, camp::is_same>>>>>> unstable_pairs( resources::Cuda cuda_res, - ::RAJA::policy::cuda::cuda_exec_explicit, + ::RAJA::policy::cuda::cuda_exec_explicit, KeyIter, KeyIter, ValIter, @@ -515,7 +531,8 @@ unstable_pairs( /*! 
\brief stable sort given range of pairs in ascending order of keys */ -template concepts::enable_if_t, type_traits::is_arithmetic>, @@ -523,7 +540,7 @@ concepts::enable_if_t, std::is_pointer> unstable_pairs( resources::Cuda cuda_res, - ::RAJA::policy::cuda::cuda_exec_explicit p, + ::RAJA::policy::cuda::cuda_exec_explicit p, KeyIter keys_begin, KeyIter keys_end, ValIter vals_begin, @@ -535,7 +552,8 @@ unstable_pairs( /*! \brief stable sort given range of pairs in descending order of keys */ -template concepts::enable_if_t, type_traits::is_arithmetic>, @@ -543,7 +561,7 @@ concepts::enable_if_t, std::is_pointer> unstable_pairs( resources::Cuda cuda_res, - ::RAJA::policy::cuda::cuda_exec_explicit p, + ::RAJA::policy::cuda::cuda_exec_explicit p, KeyIter keys_begin, KeyIter keys_end, ValIter vals_begin, diff --git a/include/RAJA/policy/hip/MemUtils_HIP.hpp b/include/RAJA/policy/hip/MemUtils_HIP.hpp index e45d3a6aff..84c6d1fa38 100644 --- a/include/RAJA/policy/hip/MemUtils_HIP.hpp +++ b/include/RAJA/policy/hip/MemUtils_HIP.hpp @@ -63,7 +63,7 @@ struct PinnedAllocator { return ptr; } - // returns true on success, false on failure + // returns true on success, throws a run time error exception on failure bool free(void* ptr) { hipErrchk(hipHostFree(ptr)); @@ -82,7 +82,7 @@ struct DeviceAllocator { return ptr; } - // returns true on success, false on failure + // returns true on success, throws a run time error exception on failure bool free(void* ptr) { hipErrchk(hipFree(ptr)); @@ -105,7 +105,26 @@ struct DeviceZeroedAllocator { return ptr; } - // returns true on success, false on failure + // returns true on success, throws a run time error exception on failure + bool free(void* ptr) + { + hipErrchk(hipFree(ptr)); + return true; + } +}; + +//! Allocator for device pinned memory for use in basic_mempool +struct DevicePinnedAllocator { + + // returns a valid pointer on success, nullptr on failure + void* malloc(size_t nbytes) + { + void* ptr; + hipErrchk(hipMalloc(&ptr, nbytes)); + return ptr; + } + + // returns true on success, throws a run time error exception on failure bool free(void* ptr) { hipErrchk(hipFree(ptr)); @@ -116,6 +135,7 @@ struct DeviceZeroedAllocator { using device_mempool_type = basic_mempool::MemPool; using device_zeroed_mempool_type = basic_mempool::MemPool; +using device_pinned_mempool_type = basic_mempool::MemPool; using pinned_mempool_type = basic_mempool::MemPool; namespace detail @@ -281,6 +301,7 @@ RAJA_INLINE typename std::remove_reference::type make_launch_body( return return_type(std::forward(loop_body)); } +//! Get the properties of the current device RAJA_INLINE hipDeviceProp_t get_device_prop() { @@ -291,213 +312,236 @@ hipDeviceProp_t get_device_prop() return prop; } +//! Get a copy of the device properties, this copy is cached on first use to speedup later calls RAJA_INLINE hipDeviceProp_t& device_prop() { - static hipDeviceProp_t prop = get_device_prop(); + static thread_local hipDeviceProp_t prop = get_device_prop(); return prop; } +static constexpr int hip_occupancy_uninitialized_int = -1; +static constexpr size_t hip_occupancy_uninitialized_size_t = + std::numeric_limits::max(); + +//! Struct with the maximum theoretical occupancy of the device struct HipFixedMaxBlocksData { - int multiProcessorCount; - int maxThreadsPerMultiProcessor; + int device_sm_per_device = hip::device_prop().multiProcessorCount; + int device_max_threads_per_sm = hip::device_prop().maxThreadsPerMultiProcessor; }; +//! 
Get the maximum theoretical occupancy of the device RAJA_INLINE -int hip_max_blocks(int block_size) +HipFixedMaxBlocksData hip_max_blocks() { - static HipFixedMaxBlocksData data = []() { - hipDeviceProp_t& prop = hip::device_prop(); - return HipFixedMaxBlocksData{prop.multiProcessorCount, - prop.maxThreadsPerMultiProcessor}; - }(); - - int max_blocks = data.multiProcessorCount * - (data.maxThreadsPerMultiProcessor / block_size); + static thread_local HipFixedMaxBlocksData data; - return max_blocks; + return data; } +//! Struct with the maximum occupancy of a kernel in simple terms struct HipOccMaxBlocksThreadsData { - size_t prev_shmem_size; - int max_blocks; - int max_threads; + size_t func_dynamic_shmem_per_block = hip_occupancy_uninitialized_size_t; + int func_max_blocks_per_device = hip_occupancy_uninitialized_int; + int func_max_threads_per_block = hip_occupancy_uninitialized_int; }; -template < typename RAJA_UNUSED_ARG(UniqueMarker), typename Func > +//! Get the maximum occupancy of a kernel with unknown threads per block +template < typename RAJA_UNUSED_ARG(UniqueMarker) > RAJA_INLINE -void hip_occupancy_max_blocks_threads(Func&& func, size_t shmem_size, - int &max_blocks, int &max_threads) +HipOccMaxBlocksThreadsData hip_occupancy_max_blocks_threads(const void* func, + size_t func_dynamic_shmem_per_block) { - static constexpr int uninitialized = -1; - static constexpr size_t uninitialized_size_t = std::numeric_limits::max(); - static thread_local HipOccMaxBlocksThreadsData data = { - uninitialized_size_t, uninitialized, uninitialized}; + static thread_local HipOccMaxBlocksThreadsData data; - if (data.prev_shmem_size != shmem_size) { + if (data.func_dynamic_shmem_per_block != func_dynamic_shmem_per_block) { + + data.func_dynamic_shmem_per_block = func_dynamic_shmem_per_block; #ifdef RAJA_ENABLE_HIP_OCCUPANCY_CALCULATOR hipErrchk(hipOccupancyMaxPotentialBlockSize( - &data.max_blocks, &data.max_threads, func, shmem_size)); + &data.func_max_blocks_per_device, &data.func_max_threads_per_block, func, func_dynamic_shmem_per_block)); #else RAJA_UNUSED_VAR(func); hipDeviceProp_t& prop = hip::device_prop(); - data.max_blocks = prop.multiProcessorCount; - data.max_threads = 1024; + data.func_max_blocks_per_device = prop.multiProcessorCount; + data.func_max_threads_per_block = 1024; #endif - data.prev_shmem_size = shmem_size; - } - max_blocks = data.max_blocks; - max_threads = data.max_threads; - + return data; } -struct HipOccMaxBlocksFixedThreadsData +//! Struct with the maximum occupancy of a kernel in specific terms +struct HipOccMaxBlocksData : HipFixedMaxBlocksData { - size_t prev_shmem_size; - int max_blocks; - int multiProcessorCount; + size_t func_dynamic_shmem_per_block = hip_occupancy_uninitialized_size_t; + int func_threads_per_block = hip_occupancy_uninitialized_int; + int func_max_blocks_per_sm = hip_occupancy_uninitialized_int; }; -template < typename RAJA_UNUSED_ARG(UniqueMarker), int num_threads, typename Func > +//! 
Get the maximum occupancy of a kernel with compile time threads per block +template < typename RAJA_UNUSED_ARG(UniqueMarker), int func_threads_per_block > RAJA_INLINE -void hip_occupancy_max_blocks(Func&& func, size_t shmem_size, - int &max_blocks) +HipOccMaxBlocksData hip_occupancy_max_blocks(const void* func, + size_t func_dynamic_shmem_per_block) { - static constexpr int uninitialized = -1; - static constexpr size_t uninitialized_size_t = std::numeric_limits::max(); - static thread_local HipOccMaxBlocksFixedThreadsData data = { - uninitialized_size_t, uninitialized, uninitialized}; + static thread_local HipOccMaxBlocksData data; + + if (data.func_dynamic_shmem_per_block != func_dynamic_shmem_per_block) { - if (data.prev_shmem_size != shmem_size) { + data.func_dynamic_shmem_per_block = func_dynamic_shmem_per_block; + data.func_threads_per_block = func_threads_per_block; #ifdef RAJA_ENABLE_HIP_OCCUPANCY_CALCULATOR hipErrchk(hipOccupancyMaxActiveBlocksPerMultiprocessor( - &data.max_blocks, func, num_threads, shmem_size)); + &data.func_max_blocks_per_sm, func, func_threads_per_block, func_dynamic_shmem_per_block)); #else RAJA_UNUSED_VAR(func); - data.max_blocks = hip::device_prop().maxThreadsPerMultiProcessor/1024; - if (data.max_blocks <= 0) { data.max_blocks = 1 } + data.func_max_blocks_per_sm = hip::device_prop().maxThreadsPerMultiProcessor/1024; + if (data.func_max_blocks_per_sm <= 0) { data.func_max_blocks_per_sm = 1 } #endif - if (data.multiProcessorCount == uninitialized) { - - data.multiProcessorCount = hip::device_prop().multiProcessorCount; - - } - - data.max_blocks *= data.multiProcessorCount; - - data.prev_shmem_size = shmem_size; - } - max_blocks = data.max_blocks; - + return data; } -struct HipOccMaxBlocksVariableThreadsData -{ - size_t prev_shmem_size; - int prev_num_threads; - int max_blocks; - int multiProcessorCount; -}; - -template < typename RAJA_UNUSED_ARG(UniqueMarker), typename Func > +//! 
Get the maximum occupancy of a kernel with runtime threads per block +template < typename RAJA_UNUSED_ARG(UniqueMarker) > RAJA_INLINE -void hip_occupancy_max_blocks(Func&& func, size_t shmem_size, - int &max_blocks, int num_threads) +HipOccMaxBlocksData hip_occupancy_max_blocks(const void* func, + size_t func_dynamic_shmem_per_block, int func_threads_per_block) { - static constexpr int uninitialized = 0; - static constexpr size_t uninitialized_size_t = std::numeric_limits::max(); - static thread_local HipOccMaxBlocksVariableThreadsData data = { - uninitialized_size_t, uninitialized, uninitialized, uninitialized}; + static thread_local HipOccMaxBlocksData data; - if ( data.prev_shmem_size != shmem_size || - data.prev_num_threads != num_threads ) { + if ( data.func_dynamic_shmem_per_block != func_dynamic_shmem_per_block || + data.func_threads_per_block != func_threads_per_block ) { + + data.func_dynamic_shmem_per_block = func_dynamic_shmem_per_block; + data.func_threads_per_block = func_threads_per_block; #ifdef RAJA_ENABLE_HIP_OCCUPANCY_CALCULATOR hipErrchk(hipOccupancyMaxActiveBlocksPerMultiprocessor( - &data.max_blocks, func, num_threads, shmem_size)); + &data.func_max_blocks_per_sm, func, func_threads_per_block, func_dynamic_shmem_per_block)); #else RAJA_UNUSED_VAR(func); - data.max_blocks = hip::device_prop().maxThreadsPerMultiProcessor/1024; - if (data.max_blocks <= 0) { data.max_blocks = 1 } + data.func_max_blocks_per_sm = hip::device_prop().maxThreadsPerMultiProcessor/1024; + if (data.func_max_blocks_per_sm <= 0) { data.func_max_blocks_per_sm = 1 } #endif - if (data.multiProcessorCount == uninitialized) { - - data.multiProcessorCount = hip::device_prop().multiProcessorCount; - - } - - data.max_blocks *= data.multiProcessorCount; - - data.prev_shmem_size = shmem_size; - data.prev_num_threads = num_threads; - } - max_blocks = data.max_blocks; - + return data; } -struct HipOccupancyDefaults + +/*! + ****************************************************************************** + * + * \brief Concretizer Implementation that chooses block size and/or grid + * size when one or both has not been specified at compile time. + * + * \tparam IdxT Index type to use for integer calculations. + * \tparam Concretizer Class that determines the max number of blocks to use + * when fitting for the device. + * \tparam UniqueMarker A type that is unique to each global function, used to + * help cache the occupancy data for that global function. + * + * The methods come in two flavors: + * - The fit_len methods choose grid and block sizes that result in a total + * number of threads of at least the len given in the constructor or 0 if + * that is not possible. + * - The fit_device methods choose grid and block sizes that best fit the + * occupancy of the global function according to the occupancy calculator and + * the Concretizer class. 
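 *
 * [Editorial illustration, not part of this patch; the numbers below are
 * assumed, not measured.] As a concrete example of the two flavors: take
 * len = 1,000,000 iterates, a compile-time block size of 256, and suppose
 * the occupancy calculator reports 4 blocks per SM on a 108-SM device for
 * this kernel, with a concretizer that caps the grid at that occupancy:
 *
 *   get_grid_size_to_fit_len(256)    -> ceil(1,000,000 / 256) = 3907 blocks
 *   get_grid_size_to_fit_device(256) -> min(3907, 4 * 108)    = 432 blocks
 *
 * The fit_len flavor covers every iterate with its own thread, while the
 * fit_device flavor launches only what the device can run concurrently and
 * relies on the strided loop mapping to cover the remaining iterates.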
+ * + * Common terms: + * - block size - threads per block + * - grid size - blocks per device + * + ****************************************************************************** + */ +template < typename IdxT, typename Concretizer, typename UniqueMarker> +struct ConcretizerImpl { - HipOccupancyDefaults(const void* RAJA_UNUSED_ARG(func)) + ConcretizerImpl(const void* func, size_t func_dynamic_shmem_per_block, IdxT len) + : m_func(func) + , m_func_dynamic_shmem_per_block(func_dynamic_shmem_per_block) + , m_len(len) { } - template < typename IdxT > - inline auto get_max_grid_size(size_t RAJA_UNUSED_ARG(dynamic_shmem_size), - IdxT RAJA_UNUSED_ARG(block_size)) const + IdxT get_max_block_size() const { - return std::numeric_limits::max(); + auto data = hip_occupancy_max_blocks_threads( + m_func, m_func_dynamic_shmem_per_block); + IdxT func_max_threads_per_block = data.func_max_threads_per_block; + return func_max_threads_per_block; } - template < typename IdxT = hip_dim_member_t > - inline auto get_max_block_size_and_grid_size(size_t RAJA_UNUSED_ARG(dynamic_shmem_size)) const + //! Get a block size when grid size is specified + IdxT get_block_size_to_fit_len(IdxT func_blocks_per_device) const { - return std::make_pair(static_cast(::RAJA::policy::hip::MAX_BLOCK_SIZE), - std::numeric_limits::max()); + IdxT func_max_threads_per_block = this->get_max_block_size(); + IdxT func_threads_per_block = RAJA_DIVIDE_CEILING_INT(m_len, func_blocks_per_device); + if (func_threads_per_block <= func_max_threads_per_block) { + return func_threads_per_block; + } else { + return IdxT(0); + } } -}; -template < typename UniqueMarker > -struct HipOccupancyCalculator -{ - HipOccupancyCalculator(const void* func) - : m_func(func) - { } + //! Get a grid size when block size is specified + IdxT get_grid_size_to_fit_len(IdxT func_threads_per_block) const + { + IdxT func_blocks_per_device = RAJA_DIVIDE_CEILING_INT(m_len, func_threads_per_block); + return func_blocks_per_device; + } + + //! Get a block size and grid size when neither is specified + auto get_block_and_grid_size_to_fit_len() const + { + IdxT func_max_threads_per_block = this->get_max_block_size(); + IdxT func_blocks_per_device = RAJA_DIVIDE_CEILING_INT(m_len, func_max_threads_per_block); + return std::make_pair(func_max_threads_per_block, + func_blocks_per_device); + } + + //! Get a block size when grid size is specified + IdxT get_block_size_to_fit_device(IdxT func_blocks_per_device) const + { + IdxT func_max_threads_per_block = this->get_max_block_size(); + IdxT func_threads_per_block = RAJA_DIVIDE_CEILING_INT(m_len, func_blocks_per_device); + return std::min(func_threads_per_block, func_max_threads_per_block); + } - template < typename IdxT > - inline auto get_max_grid_size(size_t dynamic_shmem_size, IdxT block_size) const + //! 
Get a grid size when block size is specified + IdxT get_grid_size_to_fit_device(IdxT func_threads_per_block) const { - int max_grid_size = -1; - ::RAJA::hip::hip_occupancy_max_blocks( - m_func, dynamic_shmem_size, max_grid_size, block_size); - return static_cast(max_grid_size); + auto data = hip_occupancy_max_blocks( + m_func, m_func_dynamic_shmem_per_block, func_threads_per_block); + IdxT func_max_blocks_per_device = Concretizer::template get_max_grid_size(data); + IdxT func_blocks_per_device = RAJA_DIVIDE_CEILING_INT(m_len, func_threads_per_block); + return std::min(func_blocks_per_device, func_max_blocks_per_device); } - template < typename IdxT = hip_dim_member_t > - inline auto get_max_block_size_and_grid_size(size_t dynamic_shmem_size) const + //! Get a block size and grid size when neither is specified + auto get_block_and_grid_size_to_fit_device() const { - int max_block_size = -1; - int max_grid_size = -1; - ::RAJA::hip::hip_occupancy_max_blocks_threads( - m_func, dynamic_shmem_size, max_grid_size, max_block_size); - return std::make_pair(static_cast(max_block_size), - static_cast(max_grid_size)); + IdxT func_max_threads_per_block = this->get_max_block_size(); + IdxT func_blocks_per_device = this->get_grid_size_to_fit_device(func_max_threads_per_block); + return std::make_pair(func_max_threads_per_block, + func_blocks_per_device); } private: const void* m_func; + size_t m_func_dynamic_shmem_per_block; + IdxT m_len; }; } // namespace hip diff --git a/include/RAJA/policy/hip/forall.hpp b/include/RAJA/policy/hip/forall.hpp index b0b86131ef..6fa21f9217 100644 --- a/include/RAJA/policy/hip/forall.hpp +++ b/include/RAJA/policy/hip/forall.hpp @@ -71,16 +71,17 @@ namespace impl * ****************************************************************************** */ -template +template struct ForallDimensionCalculator; // The general cases handle fixed BLOCK_SIZE > 0 and/or GRID_SIZE > 0 // there are specializations for named_usage::unspecified // but named_usage::ignored is not supported so no specializations are provided // and static_asserts in the general case catch unsupported values -template +template struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, ::RAJA::hip::IndexGlobal, + Concretizer, UniqueMarker> { static_assert(BLOCK_SIZE > 0, "block size must be > 0 or named_usage::unspecified with forall"); @@ -92,8 +93,10 @@ struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, static void set_dimensions(internal::HipDims& dims, IdxT len, const void* RAJA_UNUSED_ARG(func), size_t RAJA_UNUSED_ARG(dynamic_shmem_size)) { - if ( len > (static_cast(IndexGetter::block_size) * - static_cast(IndexGetter::grid_size)) ) { + const IdxT block_size = static_cast(IndexGetter::block_size); + const IdxT grid_size = static_cast(IndexGetter::grid_size); + + if ( len > (block_size * grid_size) ) { RAJA_ABORT_OR_THROW("len exceeds the size of the directly mapped index space"); } @@ -102,9 +105,10 @@ struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, } }; -template +template struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, ::RAJA::hip::IndexGlobal, + Concretizer, UniqueMarker> { static_assert(GRID_SIZE > 0, "grid size must be > 0 or named_usage::unspecified with forall"); @@ -113,17 +117,26 @@ struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, template < typename IdxT > static void set_dimensions(internal::HipDims& dims, IdxT len, - const void* RAJA_UNUSED_ARG(func), size_t RAJA_UNUSED_ARG(dynamic_shmem_size)) + const void* func, 
size_t dynamic_shmem_size) { - // BEWARE: if calculated block_size is too high then the kernel launch will fail - internal::set_hip_dim(dims.threads, RAJA_DIVIDE_CEILING_INT(len, static_cast(IndexGetter::grid_size))); - internal::set_hip_dim(dims.blocks, static_cast(IndexGetter::grid_size)); + ::RAJA::hip::ConcretizerImpl concretizer{func, dynamic_shmem_size, len}; + + const IdxT grid_size = static_cast(IndexGetter::grid_size); + const IdxT block_size = concretizer.get_block_size_to_fit_len(grid_size); + + if ( block_size == IdxT(0) ) { + RAJA_ABORT_OR_THROW("len exceeds the size of the directly mapped index space"); + } + + internal::set_hip_dim(dims.threads, block_size); + internal::set_hip_dim(dims.blocks, grid_size); } }; -template +template struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, ::RAJA::hip::IndexGlobal, + Concretizer, UniqueMarker> { static_assert(BLOCK_SIZE > 0, "block size must be > 0 or named_usage::unspecified with forall"); @@ -132,16 +145,22 @@ struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, template < typename IdxT > static void set_dimensions(internal::HipDims& dims, IdxT len, - const void* RAJA_UNUSED_ARG(func), size_t RAJA_UNUSED_ARG(dynamic_shmem_size)) + const void* func, size_t dynamic_shmem_size) { - internal::set_hip_dim(dims.threads, static_cast(IndexGetter::block_size)); - internal::set_hip_dim(dims.blocks, RAJA_DIVIDE_CEILING_INT(len, static_cast(IndexGetter::block_size))); + ::RAJA::hip::ConcretizerImpl concretizer{func, dynamic_shmem_size, len}; + + const IdxT block_size = static_cast(IndexGetter::block_size); + const IdxT grid_size = concretizer.get_grid_size_to_fit_len(block_size); + + internal::set_hip_dim(dims.threads, block_size); + internal::set_hip_dim(dims.blocks, grid_size); } }; -template +template struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, ::RAJA::hip::IndexGlobal, + Concretizer, UniqueMarker> { using IndexGetter = ::RAJA::hip::IndexGlobal; @@ -150,104 +169,104 @@ struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, static void set_dimensions(internal::HipDims& dims, IdxT len, const void* func, size_t dynamic_shmem_size) { - ::RAJA::hip::HipOccupancyCalculator oc(func); - auto max_sizes = oc.get_max_block_size_and_grid_size(dynamic_shmem_size); + ::RAJA::hip::ConcretizerImpl concretizer{func, dynamic_shmem_size, len}; + + const auto sizes = concretizer.get_block_and_grid_size_to_fit_len(); - internal::set_hip_dim(dims.threads, static_cast(max_sizes.first)); - internal::set_hip_dim(dims.blocks, RAJA_DIVIDE_CEILING_INT(len, static_cast(max_sizes.first))); + internal::set_hip_dim(dims.threads, sizes.first); + internal::set_hip_dim(dims.blocks, sizes.second); } }; -template -struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, +template +struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, ::RAJA::hip::IndexGlobal, + Concretizer, UniqueMarker> { static_assert(BLOCK_SIZE > 0, "block size must be > 0 or named_usage::unspecified with forall"); static_assert(GRID_SIZE > 0, "grid size must be > 0 or named_usage::unspecified with forall"); - using IndexMapper = ::RAJA::hip::IndexGlobal; + using IndexGetter = ::RAJA::hip::IndexGlobal; template < typename IdxT > static void set_dimensions(internal::HipDims& dims, IdxT RAJA_UNUSED_ARG(len), const void* RAJA_UNUSED_ARG(func), size_t RAJA_UNUSED_ARG(dynamic_shmem_size)) { - internal::set_hip_dim(dims.threads, static_cast(IndexMapper::block_size)); - internal::set_hip_dim(dims.blocks, 
static_cast(IndexMapper::grid_size)); + const IdxT block_size = static_cast(IndexGetter::block_size); + const IdxT grid_size = static_cast(IndexGetter::grid_size); + + internal::set_hip_dim(dims.threads, block_size); + internal::set_hip_dim(dims.blocks, grid_size); } }; -template -struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, +template +struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, ::RAJA::hip::IndexGlobal, + Concretizer, UniqueMarker> { static_assert(GRID_SIZE > 0, "grid size must be > 0 or named_usage::unspecified with forall"); - using IndexMapper = ::RAJA::hip::IndexGlobal; + using IndexGetter = ::RAJA::hip::IndexGlobal; template < typename IdxT > static void set_dimensions(internal::HipDims& dims, IdxT len, const void* func, size_t dynamic_shmem_size) { - ::RAJA::hip::HipOccupancyCalculator oc(func); - auto max_sizes = oc.get_max_block_size_and_grid_size(dynamic_shmem_size); + ::RAJA::hip::ConcretizerImpl concretizer{func, dynamic_shmem_size, len}; - IdxT calculated_block_size = std::min( - RAJA_DIVIDE_CEILING_INT(len, static_cast(IndexMapper::grid_size)), - static_cast(max_sizes.first)); + const IdxT grid_size = static_cast(IndexGetter::grid_size); + const IdxT block_size = concretizer.get_block_size_to_fit_device(grid_size); - internal::set_hip_dim(dims.threads, calculated_block_size); - internal::set_hip_dim(dims.blocks, static_cast(IndexMapper::grid_size)); + internal::set_hip_dim(dims.threads, block_size); + internal::set_hip_dim(dims.blocks, grid_size); } }; -template -struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, +template +struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, ::RAJA::hip::IndexGlobal, + Concretizer, UniqueMarker> { static_assert(BLOCK_SIZE > 0, "block size must be > 0 or named_usage::unspecified with forall"); - using IndexMapper = ::RAJA::hip::IndexGlobal; + using IndexGetter = ::RAJA::hip::IndexGlobal; template < typename IdxT > static void set_dimensions(internal::HipDims& dims, IdxT len, const void* func, size_t dynamic_shmem_size) { - ::RAJA::hip::HipOccupancyCalculator oc(func); - auto max_grid_size = oc.get_max_grid_size(dynamic_shmem_size, - static_cast(IndexMapper::block_size)); + ::RAJA::hip::ConcretizerImpl concretizer{func, dynamic_shmem_size, len}; - IdxT calculated_grid_size = std::min( - RAJA_DIVIDE_CEILING_INT(len, static_cast(IndexMapper::block_size)), - static_cast(max_grid_size)); + const IdxT block_size = static_cast(IndexGetter::block_size); + const IdxT grid_size = concretizer.get_grid_size_to_fit_device(block_size); - internal::set_hip_dim(dims.threads, static_cast(IndexMapper::block_size)); - internal::set_hip_dim(dims.blocks, calculated_grid_size); + internal::set_hip_dim(dims.threads, block_size); + internal::set_hip_dim(dims.blocks, grid_size); } }; -template -struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, +template +struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, ::RAJA::hip::IndexGlobal, + Concretizer, UniqueMarker> { - using IndexMapper = ::RAJA::hip::IndexGlobal; + using IndexGetter = ::RAJA::hip::IndexGlobal; template < typename IdxT > static void set_dimensions(internal::HipDims& dims, IdxT len, const void* func, size_t dynamic_shmem_size) { - ::RAJA::hip::HipOccupancyCalculator oc(func); - auto max_sizes = oc.get_max_block_size_and_grid_size(dynamic_shmem_size); + ::RAJA::hip::ConcretizerImpl concretizer{func, dynamic_shmem_size, len}; - IdxT calculated_grid_size = std::min( - 
RAJA_DIVIDE_CEILING_INT(len, static_cast(max_sizes.first)), - static_cast(max_sizes.second)); + const auto sizes = concretizer.get_block_and_grid_size_to_fit_device(); - internal::set_hip_dim(dims.threads, static_cast(max_sizes.first)); - internal::set_hip_dim(dims.blocks, calculated_grid_size); + internal::set_hip_dim(dims.threads, sizes.first); + internal::set_hip_dim(dims.blocks, sizes.second); } }; @@ -273,7 +292,7 @@ template ::value && + std::is_base_of::value && (IterationGetter::block_size > 0), size_t > BlockSize = IterationGetter::block_size> __launch_bounds__(BlockSize, 1) __global__ @@ -297,7 +316,7 @@ template ::value && + std::is_base_of::value && (IterationGetter::block_size <= 0), size_t > RAJA_UNUSED_ARG(BlockSize) = 0> __global__ @@ -322,7 +341,7 @@ template ::value && + std::is_base_of::value && (IterationGetter::block_size > 0), size_t > BlockSize = IterationGetter::block_size> __launch_bounds__(BlockSize, 1) __global__ @@ -349,7 +368,7 @@ template ::value && + std::is_base_of::value && (IterationGetter::block_size <= 0), size_t > RAJA_UNUSED_ARG(BlockSize) = 0> __global__ @@ -375,7 +394,8 @@ template ::value && + std::is_base_of::value && + std::is_base_of::value && (IterationGetter::block_size > 0), size_t > BlockSize = IterationGetter::block_size> __launch_bounds__(BlockSize, 1) __global__ @@ -400,7 +420,8 @@ template ::value && + std::is_base_of::value && + std::is_base_of::value && (IterationGetter::block_size <= 0), size_t > RAJA_UNUSED_ARG(BlockSize) = 0> __global__ @@ -427,7 +448,8 @@ template ::value && + std::is_base_of::value && + std::is_base_of::value && (IterationGetter::block_size > 0), size_t > BlockSize = IterationGetter::block_size> __launch_bounds__(BlockSize, 1) __global__ @@ -455,7 +477,8 @@ template ::value && + std::is_base_of::value && + std::is_base_of::value && (IterationGetter::block_size <= 0), size_t > RAJA_UNUSED_ARG(BlockSize) = 0> __global__ @@ -487,7 +510,7 @@ void forallp_hip_kernel(LOOP_BODY loop_body, template RAJA_INLINE concepts::enable_if_t< @@ -495,7 +518,7 @@ concepts::enable_if_t< RAJA::expt::type_traits::is_ForallParamPack, RAJA::expt::type_traits::is_ForallParamPack_empty> forall_impl(resources::Hip hip_res, - ::RAJA::policy::hip::hip_execconst&, + ::RAJA::policy::hip::hip_execconst&, Iterable&& iter, LoopBody&& loop_body, ForallParam) @@ -503,9 +526,9 @@ forall_impl(resources::Hip hip_res, using Iterator = camp::decay; using LOOP_BODY = camp::decay; using IndexType = camp::decay; - using EXEC_POL = ::RAJA::policy::hip::hip_exec; + using EXEC_POL = ::RAJA::policy::hip::hip_exec; using UniqueMarker = ::camp::list; - using DimensionCalculator = impl::ForallDimensionCalculator; + using DimensionCalculator = impl::ForallDimensionCalculator; // // Compute the requested iteration space size @@ -556,7 +579,7 @@ forall_impl(resources::Hip hip_res, template RAJA_INLINE concepts::enable_if_t< @@ -564,7 +587,7 @@ concepts::enable_if_t< RAJA::expt::type_traits::is_ForallParamPack, concepts::negate< RAJA::expt::type_traits::is_ForallParamPack_empty> > forall_impl(resources::Hip hip_res, - ::RAJA::policy::hip::hip_exec const&, + ::RAJA::policy::hip::hip_exec const&, Iterable&& iter, LoopBody&& loop_body, ForallParam f_params) @@ -572,9 +595,9 @@ forall_impl(resources::Hip hip_res, using Iterator = camp::decay; using LOOP_BODY = camp::decay; using IndexType = camp::decay; - using EXEC_POL = ::RAJA::policy::hip::hip_exec; + using EXEC_POL = ::RAJA::policy::hip::hip_exec; using UniqueMarker = ::camp::list; - using DimensionCalculator = 
impl::ForallDimensionCalculator; + using DimensionCalculator = impl::ForallDimensionCalculator; // // Compute the requested iteration space size @@ -652,11 +675,11 @@ forall_impl(resources::Hip hip_res, */ template RAJA_INLINE resources::EventProxy forall_impl(resources::Hip r, - ExecPolicy>, + ExecPolicy>, const TypedIndexSet& iset, LoopBody&& loop_body) { @@ -665,7 +688,7 @@ forall_impl(resources::Hip r, iset.segmentCall(r, isi, detail::CallForall(), - ::RAJA::policy::hip::hip_exec(), + ::RAJA::policy::hip::hip_exec(), loop_body); } // iterate over segments of index set diff --git a/include/RAJA/policy/hip/intrinsics.hpp b/include/RAJA/policy/hip/intrinsics.hpp new file mode 100644 index 0000000000..354e5d7278 --- /dev/null +++ b/include/RAJA/policy/hip/intrinsics.hpp @@ -0,0 +1,362 @@ +/*! + ****************************************************************************** + * + * \file + * + * \brief Header file containing RAJA intrinsics templates for HIP execution. + * + * These methods should work on any platform that supports + * HIP devices. + * + ****************************************************************************** + */ + +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#ifndef RAJA_hip_intrinsics_HPP +#define RAJA_hip_intrinsics_HPP + +#include "RAJA/config.hpp" + +#if defined(RAJA_ENABLE_HIP) + +#include + +#include + +#include "RAJA/util/macros.hpp" +#include "RAJA/util/SoAArray.hpp" +#include "RAJA/util/types.hpp" + +#include "RAJA/policy/hip/policy.hpp" + + +namespace RAJA +{ + +namespace hip +{ + +namespace impl +{ + +/*! + * \brief Abstracts access to memory when coordinating between threads at + * device scope. The fences provided here are to be used with relaxed + * atomics in order to guarantee memory ordering and visibility of the + * accesses done through this class. + * + * \Note This uses device scope fences to ensure ordering and to flush local + * caches so that memory accesses become visible to the whole device. + * \Note This class uses normal memory accesses that are cached in local caches + * so device scope fences are required to make memory accesses visible + * to the whole device. + */ +struct AccessorDeviceScopeUseDeviceFence : RAJA::detail::DefaultAccessor +{ + static RAJA_DEVICE RAJA_INLINE void fence_acquire() + { + __threadfence(); + } + + static RAJA_DEVICE RAJA_INLINE void fence_release() + { + __threadfence(); + } +}; + +/*! + ****************************************************************************** + * + * \brief Abstracts access to memory when coordinating between threads at + * device scope. The fences provided here are to be used with relaxed + * atomics in order to guarantee memory ordering and visibility of the + * accesses done through this class. + * + * \Note This may use block scope fences to ensure ordering and avoid flushing + * local caches so special memory accesses are used to ensure visibility + * to the whole device. + * \Note This class uses device scope atomic memory accesses to bypass local + * caches so memory accesses are visible to the whole device without + * device scope fences. 
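// Hedged sketch of how these accessors pair with relaxed atomics: a producer
// publishes data with set() + fence_release(), and the consumer that observes
// the signal calls fence_acquire() before get(). Everything except the
// Accessor interface (the function name, num_blocks, the one-slot-per-block
// layout) is illustrative, not RAJA's actual reduction kernel.
template < typename Accessor, typename T >
RAJA_DEVICE void publish_and_consume_sketch(T* device_mem,
                                            unsigned int* device_count,
                                            T value, unsigned int num_blocks)
{
  // producer side: write this block's payload, release-fence, then signal
  Accessor::set(device_mem, blockIdx.x, value);
  Accessor::fence_release();
  unsigned int prior = ::atomicInc(device_count, num_blocks - 1u);

  // the last block to signal acquires before reading the other payloads
  if (prior == num_blocks - 1u) {
    Accessor::fence_acquire();
    T first = Accessor::get(device_mem, 0);
    static_cast<void>(first);
  }
}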
+ * \Note A memory access may be split into multiple memory accesses, so + * even though atomic instructions are used concurrent accesses between + * different threads are not thread safe. + * + ****************************************************************************** + */ +struct AccessorDeviceScopeUseBlockFence +{ + // hip has 32 and 64 bit atomics + static constexpr size_t min_atomic_int_type_size = sizeof(unsigned int); + static constexpr size_t max_atomic_int_type_size = sizeof(unsigned long long); + + template < typename T > + static RAJA_DEVICE RAJA_INLINE T get(T* in_ptr, size_t idx) + { + using ArrayType = RAJA::detail::AsIntegerArray; + using integer_type = typename ArrayType::integer_type; + + ArrayType u; + auto ptr = const_cast(reinterpret_cast(in_ptr + idx)); + + for (size_t i = 0; i < u.array_size(); ++i) { +#if defined(RAJA_USE_HIP_INTRINSICS) && RAJA_INTERNAL_CLANG_HAS_BUILTIN(__hip_atomic_load) + u.array[i] = __hip_atomic_load(&ptr[i], __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); +#else + u.array[i] = atomicAdd(&ptr[i], integer_type(0)); +#endif + } + + return u.get_value(); + } + + template < typename T > + static RAJA_DEVICE RAJA_INLINE void set(T* in_ptr, size_t idx, T val) + { + using ArrayType = RAJA::detail::AsIntegerArray; + using integer_type = typename ArrayType::integer_type; + + ArrayType u; + u.set_value(val); + auto ptr = reinterpret_cast(in_ptr + idx); + + for (size_t i = 0; i < u.array_size(); ++i) { +#if defined(RAJA_USE_HIP_INTRINSICS) && RAJA_INTERNAL_CLANG_HAS_BUILTIN(__hip_atomic_store) + __hip_atomic_store(&ptr[i], u.array[i], __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); +#else + atomicExch(&ptr[i], u.array[i]); +#endif + } + } + + static RAJA_DEVICE RAJA_INLINE void fence_acquire() + { +#if defined(RAJA_USE_HIP_INTRINSICS) && RAJA_INTERNAL_CLANG_HAS_BUILTIN(__builtin_amdgcn_fence) + __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "workgroup"); +#else + __threadfence(); +#endif + } + + static RAJA_DEVICE RAJA_INLINE void fence_release() + { +#if defined(RAJA_USE_HIP_INTRINSICS) && RAJA_INTERNAL_CLANG_HAS_BUILTIN(__builtin_amdgcn_fence) && \ + RAJA_INTERNAL_CLANG_HAS_BUILTIN(__builtin_amdgcn_s_waitcnt) + __builtin_amdgcn_fence(__ATOMIC_RELEASE, "workgroup"); + // Wait until all vmem operations complete (s_waitcnt vmcnt(0)) + __builtin_amdgcn_s_waitcnt(/*vmcnt*/ 0 | (/*exp_cnt*/ 0x7 << 4) | (/*lgkmcnt*/ 0xf << 8)); +#else + __threadfence(); +#endif + } +}; + + +// hip only has shfl primitives for 32 bits +constexpr size_t min_shfl_int_type_size = sizeof(unsigned int); +constexpr size_t max_shfl_int_type_size = sizeof(unsigned int); + +/*! + ****************************************************************************** + * + * \brief Method to shuffle 32b registers in sum reduction for arbitrary type. + * + * \Note Returns an undefined value if src lane is inactive (divergence). + * Returns this lane's value if src lane is out of bounds or has exited. 
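// Hedged sketch of the word-splitting idea used below: a type wider than the
// 32-bit shuffle width is moved as an array of 32-bit words, mirroring what
// RAJA::detail::AsIntegerArray does for arbitrary trivially copyable types
// (double and the union layout here are purely illustrative).
RAJA_DEVICE RAJA_INLINE double shfl_xor_double_sketch(double var, int lane_mask)
{
  union { double value; unsigned int words[2]; } u;
  u.value = var;
  u.words[0] = ::__shfl_xor(u.words[0], lane_mask);
  u.words[1] = ::__shfl_xor(u.words[1], lane_mask);
  return u.value;  // reassembled from the two shuffled words
}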
+ * + ****************************************************************************** + */ +template +RAJA_DEVICE RAJA_INLINE T shfl_xor_sync(T var, int laneMask) +{ + RAJA::detail::AsIntegerArray u; + u.set_value(var); + + for (size_t i = 0; i < u.array_size(); ++i) { + u.array[i] = ::__shfl_xor(u.array[i], laneMask); + } + return u.get_value(); +} + +template +RAJA_DEVICE RAJA_INLINE T shfl_sync(T var, int srcLane) +{ + RAJA::detail::AsIntegerArray u; + u.set_value(var); + + for (size_t i = 0; i < u.array_size(); ++i) { + u.array[i] = ::__shfl(u.array[i], srcLane); + } + return u.get_value(); +} + + +template <> +RAJA_DEVICE RAJA_INLINE int shfl_xor_sync(int var, int laneMask) +{ + return ::__shfl_xor(var, laneMask); +} + +template <> +RAJA_DEVICE RAJA_INLINE float shfl_xor_sync(float var, int laneMask) +{ + return ::__shfl_xor(var, laneMask); +} + +template <> +RAJA_DEVICE RAJA_INLINE int shfl_sync(int var, int srcLane) +{ + return ::__shfl(var, srcLane); +} + +template <> +RAJA_DEVICE RAJA_INLINE float shfl_sync(float var, int srcLane) +{ + return ::__shfl(var, srcLane); +} + + +//! reduce values in block into thread 0 +template +RAJA_DEVICE RAJA_INLINE T warp_reduce(T val, T RAJA_UNUSED_ARG(identity)) +{ + int numThreads = blockDim.x * blockDim.y * blockDim.z; + + int threadId = threadIdx.x + blockDim.x * threadIdx.y + + (blockDim.x * blockDim.y) * threadIdx.z; + + T temp = val; + + if (numThreads % policy::hip::WARP_SIZE == 0) { + + // reduce each warp + for (int i = 1; i < policy::hip::WARP_SIZE; i *= 2) { + T rhs = shfl_xor_sync(temp, i); + Combiner{}(temp, rhs); + } + + } else { + + // reduce each warp + for (int i = 1; i < policy::hip::WARP_SIZE; i *= 2) { + int srcLane = threadId ^ i; + T rhs = shfl_sync(temp, srcLane); + // only add from threads that exist (don't double count own value) + if (srcLane < numThreads) { + Combiner{}(temp, rhs); + } + } + } + + return temp; +} + +/*! + * Allreduce values in a warp. + * + * + * This does a butterfly pattern leaving each lane with the full reduction + * + */ +template +RAJA_DEVICE RAJA_INLINE T warp_allreduce(T val) +{ + T temp = val; + + for (int i = 1; i < policy::hip::WARP_SIZE; i *= 2) { + T rhs = shfl_xor_sync(temp, i); + Combiner{}(temp, rhs); + } + + return temp; +} + + +//! 
reduce values in block into thread 0 +template +RAJA_DEVICE RAJA_INLINE T block_reduce(T val, T identity) +{ + int numThreads = blockDim.x * blockDim.y * blockDim.z; + + int threadId = threadIdx.x + blockDim.x * threadIdx.y + + (blockDim.x * blockDim.y) * threadIdx.z; + + int warpId = threadId % policy::hip::WARP_SIZE; + int warpNum = threadId / policy::hip::WARP_SIZE; + + T temp = val; + + if (numThreads % policy::hip::WARP_SIZE == 0) { + + // reduce each warp + for (int i = 1; i < policy::hip::WARP_SIZE; i *= 2) { + T rhs = shfl_xor_sync(temp, i); + Combiner{}(temp, rhs); + } + + } else { + + // reduce each warp + for (int i = 1; i < policy::hip::WARP_SIZE; i *= 2) { + int srcLane = threadId ^ i; + T rhs = shfl_sync(temp, srcLane); + // only add from threads that exist (don't double count own value) + if (srcLane < numThreads) { + Combiner{}(temp, rhs); + } + } + } + + // reduce per warp values + if (numThreads > policy::hip::WARP_SIZE) { + + static_assert(policy::hip::MAX_WARPS <= policy::hip::WARP_SIZE, + "Max Warps must be less than or equal to Warp Size for this algorithm to work"); + + __shared__ unsigned char tmpsd[sizeof(RAJA::detail::SoAArray)]; + RAJA::detail::SoAArray* sd = + reinterpret_cast *>(tmpsd); + + // write per warp values to shared memory + if (warpId == 0) { + sd->set(warpNum, temp); + } + + __syncthreads(); + + if (warpNum == 0) { + + // read per warp values + if (warpId * policy::hip::WARP_SIZE < numThreads) { + temp = sd->get(warpId); + } else { + temp = identity; + } + + for (int i = 1; i < policy::hip::MAX_WARPS; i *= 2) { + T rhs = shfl_xor_sync(temp, i); + Combiner{}(temp, rhs); + } + } + + __syncthreads(); + } + + return temp; +} + +} // end namespace impl + +} // end namespace hip + +} // end namespace RAJA + +#endif // closing endif for RAJA_ENABLE_HIP guard + +#endif // closing endif for header file include guard diff --git a/include/RAJA/policy/hip/kernel.hpp b/include/RAJA/policy/hip/kernel.hpp index 678d48e3c1..4f907f5f5f 100644 --- a/include/RAJA/policy/hip/kernel.hpp +++ b/include/RAJA/policy/hip/kernel.hpp @@ -4,7 +4,7 @@ * \file * * \brief RAJA header file containing constructs used to run kernel::forall - * traversals on GPU with CUDA. + * traversals on GPU with HIP. 
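// Minimal sketch of the xor-shuffle butterfly used by warp_reduce and
// warp_allreduce above, written against the raw HIP intrinsic for a float;
// the warp size is hard-coded to 64 (AMD) purely for illustration, whereas
// the real code reads policy::hip::WARP_SIZE.
RAJA_DEVICE RAJA_INLINE float warp_allreduce_sum_sketch(float val)
{
  // after log2(64) = 6 rounds every lane in the warp holds the full sum
  for (int lane_mask = 1; lane_mask < 64; lane_mask *= 2) {
    val += ::__shfl_xor(val, lane_mask);
  }
  return val;
}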
* ****************************************************************************** */ diff --git a/include/RAJA/policy/hip/kernel/For.hpp b/include/RAJA/policy/hip/kernel/For.hpp index ce8e87d869..848ea42edf 100644 --- a/include/RAJA/policy/hip/kernel/For.hpp +++ b/include/RAJA/policy/hip/kernel/For.hpp @@ -108,7 +108,7 @@ template , + RAJA::policy::hip::hip_indexer, kernel_sync_requirement::sync, IndexMapper>, EnclosedStmts...>, Types> { @@ -123,7 +123,7 @@ struct HipStatementExecutor< using diff_t = segment_diff_type; using DimensionCalculator = RAJA::internal::KernelDimensionCalculator< - RAJA::policy::hip::hip_indexer>; + RAJA::policy::hip::hip_indexer, kernel_sync_requirement::sync, IndexMapper>>; static inline RAJA_DEVICE @@ -180,7 +180,7 @@ template , + RAJA::policy::hip::hip_indexer, kernel_sync_requirement::none, IndexMapper>, EnclosedStmts...>, Types> { @@ -195,7 +195,7 @@ struct HipStatementExecutor< using diff_t = segment_diff_type; using DimensionCalculator = RAJA::internal::KernelDimensionCalculator< - RAJA::policy::hip::hip_indexer>; + RAJA::policy::hip::hip_indexer, kernel_sync_requirement::none, IndexMapper>>; static inline RAJA_DEVICE @@ -246,7 +246,7 @@ struct HipStatementExecutor< statement::For, Types> : HipStatementExecutor, kernel_sync_requirement::none, hip::IndexGlobal>, EnclosedStmts...>, Types> diff --git a/include/RAJA/policy/hip/kernel/ForICount.hpp b/include/RAJA/policy/hip/kernel/ForICount.hpp index 001cc28b77..014b4db3ac 100644 --- a/include/RAJA/policy/hip/kernel/ForICount.hpp +++ b/include/RAJA/policy/hip/kernel/ForICount.hpp @@ -103,20 +103,20 @@ template , + RAJA::policy::hip::hip_indexer, kernel_sync_requirement::sync, IndexMapper>, EnclosedStmts...>, Types> : public HipStatementExecutor< Data, statement::For, + RAJA::policy::hip::hip_indexer, kernel_sync_requirement::sync, IndexMapper>, EnclosedStmts...>, Types> { using Base = HipStatementExecutor< Data, statement::For, + RAJA::policy::hip::hip_indexer, kernel_sync_requirement::sync, IndexMapper>, EnclosedStmts...>, Types>; @@ -166,20 +166,20 @@ template , + RAJA::policy::hip::hip_indexer, kernel_sync_requirement::none, IndexMapper>, EnclosedStmts...>, Types> : public HipStatementExecutor< Data, statement::For, + RAJA::policy::hip::hip_indexer, kernel_sync_requirement::none, IndexMapper>, EnclosedStmts...>, Types> { using Base = HipStatementExecutor< Data, statement::For, + RAJA::policy::hip::hip_indexer, kernel_sync_requirement::none, IndexMapper>, EnclosedStmts...>, Types>; @@ -226,7 +226,7 @@ struct HipStatementExecutor< statement::ForICount, Types> : HipStatementExecutor, kernel_sync_requirement::none, hip::IndexGlobal>, EnclosedStmts...>, Types> diff --git a/include/RAJA/policy/hip/kernel/HipKernel.hpp b/include/RAJA/policy/hip/kernel/HipKernel.hpp index 67bea1299a..68156600b2 100644 --- a/include/RAJA/policy/hip/kernel/HipKernel.hpp +++ b/include/RAJA/policy/hip/kernel/HipKernel.hpp @@ -87,7 +87,7 @@ namespace statement */ template struct HipKernelExt - : public internal::Statement<::RAJA::policy::hip::hip_exec, EnclosedStmts...> { + : public internal::Statement<::RAJA::policy::hip::hip_exec, EnclosedStmts...> { }; @@ -263,7 +263,7 @@ struct HipLaunchHelper,Stmt inline static void recommended_blocks_threads(size_t shmem_size, int &recommended_blocks, int &recommended_threads) { - auto func = kernelGetter_t::get(); + auto func = reinterpret_cast(kernelGetter_t::get()); if (num_blocks <= 0) { @@ -273,8 +273,10 @@ struct HipLaunchHelper,Stmt // determine blocks at runtime // determine threads at 
runtime // - ::RAJA::hip::hip_occupancy_max_blocks_threads( - func, shmem_size, recommended_blocks, recommended_threads); + auto data = ::RAJA::hip::hip_occupancy_max_blocks_threads( + func, shmem_size); + recommended_blocks = data.func_max_blocks_per_device; + recommended_threads = data.func_max_threads_per_block; } else { @@ -284,8 +286,9 @@ struct HipLaunchHelper,Stmt // recommended_threads = num_threads; - ::RAJA::hip::hip_occupancy_max_blocks( - func, shmem_size, recommended_blocks); + auto data = ::RAJA::hip::hip_occupancy_max_blocks( + func, shmem_size); + recommended_blocks = data.func_max_blocks_per_sm * data.device_sm_per_device; } @@ -339,7 +342,7 @@ struct HipLaunchHelper,Stmt inline static void max_blocks(size_t shmem_size, int &max_blocks, int actual_threads) { - auto func = kernelGetter_t::get(); + auto func = reinterpret_cast(kernelGetter_t::get()); if (num_blocks <= 0) { @@ -352,16 +355,18 @@ struct HipLaunchHelper,Stmt // // determine blocks when actual_threads != num_threads // - ::RAJA::hip::hip_occupancy_max_blocks( - func, shmem_size, max_blocks, actual_threads); + auto data = ::RAJA::hip::hip_occupancy_max_blocks( + func, shmem_size, actual_threads); + max_blocks = data.func_max_blocks_per_sm * data.device_sm_per_device; } else { // // determine blocks when actual_threads == num_threads // - ::RAJA::hip::hip_occupancy_max_blocks( - func, shmem_size, max_blocks); + auto data = ::RAJA::hip::hip_occupancy_max_blocks( + func, shmem_size); + max_blocks = data.func_max_blocks_per_sm * data.device_sm_per_device; } diff --git a/include/RAJA/policy/hip/kernel/Tile.hpp b/include/RAJA/policy/hip/kernel/Tile.hpp index 24f38b7647..62dda7f20d 100644 --- a/include/RAJA/policy/hip/kernel/Tile.hpp +++ b/include/RAJA/policy/hip/kernel/Tile.hpp @@ -143,7 +143,7 @@ struct HipStatementExecutor< Data, statement::Tile, - RAJA::policy::hip::hip_indexer, + RAJA::policy::hip::hip_indexer, kernel_sync_requirement::sync, IndexMapper>, EnclosedStmts...>, Types> { @@ -153,7 +153,7 @@ struct HipStatementExecutor< using diff_t = segment_diff_type; - using DimensionCalculator = KernelDimensionCalculator>; + using DimensionCalculator = KernelDimensionCalculator, kernel_sync_requirement::sync, IndexMapper>>; static inline RAJA_DEVICE void exec(Data &data, bool thread_active) @@ -233,7 +233,7 @@ struct HipStatementExecutor< Data, statement::Tile, - RAJA::policy::hip::hip_indexer, + RAJA::policy::hip::hip_indexer, kernel_sync_requirement::none, IndexMapper>, EnclosedStmts...>, Types> { @@ -243,7 +243,7 @@ struct HipStatementExecutor< using diff_t = segment_diff_type; - using DimensionCalculator = KernelDimensionCalculator>; + using DimensionCalculator = KernelDimensionCalculator, kernel_sync_requirement::none, IndexMapper>>; static inline RAJA_DEVICE void exec(Data &data, bool thread_active) @@ -318,7 +318,7 @@ struct HipStatementExecutor< Data, statement::Tile, Types> : HipStatementExecutor, kernel_sync_requirement::none, hip::IndexGlobal>, EnclosedStmts...>, Types> diff --git a/include/RAJA/policy/hip/kernel/TileTCount.hpp b/include/RAJA/policy/hip/kernel/TileTCount.hpp index c92f92fb71..07637fbd8f 100644 --- a/include/RAJA/policy/hip/kernel/TileTCount.hpp +++ b/include/RAJA/policy/hip/kernel/TileTCount.hpp @@ -131,14 +131,14 @@ struct HipStatementExecutor< Data, statement::TileTCount, - RAJA::policy::hip::hip_indexer, + RAJA::policy::hip::hip_indexer, kernel_sync_requirement::sync, IndexMapper>, EnclosedStmts...>, Types> : public HipStatementExecutor< Data, statement::Tile, - 
RAJA::policy::hip::hip_indexer, + RAJA::policy::hip::hip_indexer, kernel_sync_requirement::sync, IndexMapper>, EnclosedStmts...>, Types> { @@ -146,7 +146,7 @@ struct HipStatementExecutor< Data, statement::Tile, - RAJA::policy::hip::hip_indexer, + RAJA::policy::hip::hip_indexer, kernel_sync_requirement::sync, IndexMapper>, EnclosedStmts...>, Types>; @@ -209,14 +209,14 @@ struct HipStatementExecutor< Data, statement::TileTCount, - RAJA::policy::hip::hip_indexer, + RAJA::policy::hip::hip_indexer, kernel_sync_requirement::none, IndexMapper>, EnclosedStmts...>, Types> : public HipStatementExecutor< Data, statement::Tile, - RAJA::policy::hip::hip_indexer, + RAJA::policy::hip::hip_indexer, kernel_sync_requirement::none, IndexMapper>, EnclosedStmts...>, Types> { @@ -224,7 +224,7 @@ struct HipStatementExecutor< Data, statement::Tile, - RAJA::policy::hip::hip_indexer, + RAJA::policy::hip::hip_indexer, kernel_sync_requirement::none, IndexMapper>, EnclosedStmts...>, Types>; @@ -281,7 +281,7 @@ struct HipStatementExecutor< Data, statement::TileTCount, Types> : HipStatementExecutor, kernel_sync_requirement::none, hip::IndexGlobal>, EnclosedStmts...>, Types> diff --git a/include/RAJA/policy/hip/kernel/internal.hpp b/include/RAJA/policy/hip/kernel/internal.hpp index 2c93520b93..aa0610d736 100644 --- a/include/RAJA/policy/hip/kernel/internal.hpp +++ b/include/RAJA/policy/hip/kernel/internal.hpp @@ -388,7 +388,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, sync, hip::IndexGlobal>> { @@ -402,7 +402,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, sync, hip::IndexGlobal>> { @@ -418,7 +418,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, sync, hip::IndexGlobal>> { @@ -436,7 +436,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, sync, hip::IndexGlobal>> { @@ -451,7 +451,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, sync, hip::IndexGlobal>> { @@ -469,7 +469,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, sync, hip::IndexGlobal>> { @@ -488,7 +488,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, sync, hip::IndexGlobal>> { @@ -508,7 +508,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, sync, hip::IndexGlobal>> { @@ -527,7 +527,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, sync, hip::IndexGlobal>> { diff --git a/include/RAJA/policy/hip/launch.hpp b/include/RAJA/policy/hip/launch.hpp index 2e54b16a81..76f592d20b 100644 --- a/include/RAJA/policy/hip/launch.hpp +++ b/include/RAJA/policy/hip/launch.hpp @@ -433,7 +433,7 @@ struct LoopExecute -struct LoopExecute, kernel_sync_requirement::none, IndexMapper>, SEGMENT> { @@ -457,7 +457,7 @@ struct LoopExecute -struct LoopExecute, kernel_sync_requirement::none, IndexMapper0, IndexMapper1>, @@ -493,7 +493,7 @@ struct LoopExecute -struct LoopExecute, kernel_sync_requirement::none, IndexMapper0, IndexMapper1, @@ -625,7 +625,7 @@ struct LoopICountExecute -struct LoopICountExecute, kernel_sync_requirement::none, IndexMapper>, SEGMENT> { @@ -649,7 +649,7 @@ struct LoopICountExecute -struct LoopICountExecute, kernel_sync_requirement::none, IndexMapper0, IndexMapper1>, @@ -686,7 +686,7 @@ struct LoopICountExecute -struct LoopICountExecute, kernel_sync_requirement::none, IndexMapper0, IndexMapper1, @@ -810,18 +810,18 @@ struct LoopExecute -struct LoopExecute, sync, IndexMapper0>, SEGMENT> - : LoopExecute, sync, IndexMapper0>, SEGMENT> {}; template -struct 
LoopExecute, kernel_sync_requirement::none, IndexMapper0, IndexMapper1>, @@ -852,7 +852,7 @@ struct LoopExecute -struct LoopExecute, kernel_sync_requirement::none, IndexMapper0, IndexMapper1, @@ -914,7 +914,7 @@ struct TileExecute -struct TileExecute, kernel_sync_requirement::none, IndexMapper>, SEGMENT> { @@ -964,7 +964,7 @@ struct TileTCountExecute -struct TileTCountExecute, kernel_sync_requirement::none, IndexMapper>, SEGMENT> { diff --git a/include/RAJA/policy/hip/policy.hpp b/include/RAJA/policy/hip/policy.hpp index 75f9abd878..c359a68de0 100644 --- a/include/RAJA/policy/hip/policy.hpp +++ b/include/RAJA/policy/hip/policy.hpp @@ -74,6 +74,110 @@ struct IndexGlobal; template struct IndexFlatten; +/*! + * Use the max occupancy of a kernel on the current device when launch + * parameters are not fully determined. + * Note that the maximum occupancy of the kernel may be less than the maximum + * occupancy of the device in terms of total threads. + */ +struct MaxOccupancyConcretizer +{ + template < typename IdxT, typename Data > + static IdxT get_max_grid_size(Data const& data) + { + IdxT device_sm_per_device = data.device_sm_per_device; + IdxT func_max_blocks_per_sm = data.func_max_blocks_per_sm; + + IdxT func_max_blocks_per_device = func_max_blocks_per_sm * device_sm_per_device; + + return func_max_blocks_per_device; + } +}; + +/*! + * Use a fraction and an offset of the max occupancy of a kernel on the current + * device when launch parameters are not fully determined. + * The following formula is used, with care to avoid zero, to determine the + * maximum grid size: + * (Fraction * kernel_max_blocks_per_sm + BLOCKS_PER_SM_OFFSET) * device_sm + */ +template < typename t_Fraction, std::ptrdiff_t BLOCKS_PER_SM_OFFSET > +struct FractionOffsetOccupancyConcretizer +{ + template < typename IdxT, typename Data > + static IdxT get_max_grid_size(Data const& data) + { + using Fraction = typename t_Fraction::template rebind; + + IdxT device_sm_per_device = data.device_sm_per_device; + IdxT func_max_blocks_per_sm = data.func_max_blocks_per_sm; + + if (Fraction::multiply(func_max_blocks_per_sm) > IdxT(0)) { + func_max_blocks_per_sm = Fraction::multiply(func_max_blocks_per_sm); + } + + if (IdxT(std::ptrdiff_t(func_max_blocks_per_sm) + BLOCKS_PER_SM_OFFSET) > IdxT(0)) { + func_max_blocks_per_sm = IdxT(std::ptrdiff_t(func_max_blocks_per_sm) + BLOCKS_PER_SM_OFFSET); + } + + IdxT func_max_blocks_per_device = func_max_blocks_per_sm * device_sm_per_device; + + return func_max_blocks_per_device; + } +}; + +/*! + * Use an occupancy that is less than the max occupancy of the device when + * launch parameters are not fully determined. + * Use the MaxOccupancyConcretizer if the maximum occupancy of the kernel is + * below the maximum occupancy of the device. + * Otherwise use the given AvoidMaxOccupancyCalculator to determine the + * maximum grid size. 
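// Hedged sketch of the fraction/offset grid-size formula documented for
// FractionOffsetOccupancyConcretizer above, using plain integers in place of
// RAJA's Fraction type; all numbers in the comments are assumed, not measured.
inline int fraction_offset_grid_size_sketch(int func_max_blocks_per_sm,  // from the occupancy calculator
                                            int device_sm_per_device,
                                            int numer, int denom,        // the Fraction
                                            int blocks_per_sm_offset)    // BLOCKS_PER_SM_OFFSET
{
  int blocks_per_sm = func_max_blocks_per_sm;
  // apply the fraction, but never drop to zero
  if ((blocks_per_sm * numer) / denom > 0) {
    blocks_per_sm = (blocks_per_sm * numer) / denom;
  }
  // apply the offset, but never drop to zero
  if (blocks_per_sm + blocks_per_sm_offset > 0) {
    blocks_per_sm = blocks_per_sm + blocks_per_sm_offset;
  }
  // e.g. 8 blocks/SM, fraction 1/2, offset -1, 104 SMs -> (4 - 1) * 104 = 312
  return blocks_per_sm * device_sm_per_device;
}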
+ */ +template < typename AvoidMaxOccupancyConcretizer > +struct AvoidDeviceMaxThreadOccupancyConcretizer +{ + template < typename IdxT, typename Data > + static IdxT get_max_grid_size(Data const& data) + { + IdxT device_max_threads_per_sm = data.device_max_threads_per_sm; + IdxT func_max_blocks_per_sm = data.func_max_blocks_per_sm; + IdxT func_threads_per_block = data.func_threads_per_block; + + IdxT func_max_threads_per_sm = func_threads_per_block * func_max_blocks_per_sm; + + if (func_max_threads_per_sm < device_max_threads_per_sm) { + return MaxOccupancyConcretizer::template get_max_grid_size(data); + } else { + return AvoidMaxOccupancyConcretizer::template get_max_grid_size(data); + } + } +}; + + +enum struct reduce_algorithm : int +{ + combine_last_block, + init_device_combine_atomic_block, + init_host_combine_atomic_block +}; + +enum struct block_communication_mode : int +{ + device_fence, + block_fence +}; + +template < reduce_algorithm t_algorithm, block_communication_mode t_comm_mode, + size_t t_replication, size_t t_atomic_stride > +struct ReduceTuning +{ + static constexpr reduce_algorithm algorithm = t_algorithm; + static constexpr block_communication_mode comm_mode = t_comm_mode; + static constexpr size_t replication = t_replication; + static constexpr size_t atomic_stride = t_atomic_stride; +}; + } // namespace hip namespace policy @@ -93,7 +197,8 @@ struct hip_flatten_indexer : public RAJA::make_policy_pattern_launch_platform_t< using IterationGetter = RAJA::hip::IndexFlatten<_IterationGetters...>; }; -template +template struct hip_exec : public RAJA::make_policy_pattern_launch_platform_t< RAJA::Policy::hip, RAJA::Pattern::forall, @@ -101,6 +206,7 @@ struct hip_exec : public RAJA::make_policy_pattern_launch_platform_t< RAJA::Platform::hip> { using IterationMapping = _IterationMapping; using IterationGetter = _IterationGetter; + using LaunchConcretizer = _LaunchConcretizer; }; template @@ -147,8 +253,9 @@ struct unordered_hip_loop_y_block_iter_x_threadblock_average /////////////////////////////////////////////////////////////////////// /// -template -struct hip_reduce_base + +template < typename tuning > +struct hip_reduce_policy : public RAJA:: make_policy_pattern_launch_platform_t; -using hip_reduce = hip_reduce_base; -using hip_reduce_atomic = hip_reduce_base; +template < RAJA::hip::reduce_algorithm algorithm, + RAJA::hip::block_communication_mode comm_mode, + size_t replication = named_usage::unspecified, + size_t atomic_stride = named_usage::unspecified > +using hip_reduce_tuning = hip_reduce_policy< RAJA::hip::ReduceTuning< + algorithm, comm_mode, replication, atomic_stride> >; + +// Policies for RAJA::Reduce* objects with specific behaviors. +// - *atomic* policies may use atomics to combine partial results and falls back +// on a non-atomic policy when atomics can't be used with the given type. The +// use of atomics leads to order of operation differences which change the +// results of floating point sum reductions run to run. The memory used with +// atomics is initialized on the device which can be expensive on some HW. +// On some HW this is faster overall than the non-atomic policies. +// - *atomic_host* policies are similar to the atomic policies above. However +// the memory used with atomics is initialized on the host which is +// significantly cheaper on some HW. On some HW this is faster overall than +// the non-atomic and atomic policies. +// - *device_fence policies use normal memory accesses with device scope fences +// in the implementation. 
This works on all HW. +// - *block_fence policies use special (atomic) memory accesses that only cache +// in a cache shared by the whole device to avoid having to use +// device scope fences. This improves performance on some HW but +// is more difficult to code correctly. +using hip_reduce_device_fence = hip_reduce_tuning< + RAJA::hip::reduce_algorithm::combine_last_block, + RAJA::hip::block_communication_mode::device_fence, + named_usage::unspecified, named_usage::unspecified>; +/// +using hip_reduce_block_fence = hip_reduce_tuning< + RAJA::hip::reduce_algorithm::combine_last_block, + RAJA::hip::block_communication_mode::block_fence, + named_usage::unspecified, named_usage::unspecified>; +/// +using hip_reduce_atomic_device_init_device_fence = hip_reduce_tuning< + RAJA::hip::reduce_algorithm::init_device_combine_atomic_block, + RAJA::hip::block_communication_mode::device_fence, + named_usage::unspecified, named_usage::unspecified>; +/// +using hip_reduce_atomic_device_init_block_fence = hip_reduce_tuning< + RAJA::hip::reduce_algorithm::init_device_combine_atomic_block, + RAJA::hip::block_communication_mode::block_fence, + named_usage::unspecified, named_usage::unspecified>; +/// +using hip_reduce_atomic_host_init_device_fence = hip_reduce_tuning< + RAJA::hip::reduce_algorithm::init_host_combine_atomic_block, + RAJA::hip::block_communication_mode::device_fence, + named_usage::unspecified, named_usage::unspecified>; +/// +using hip_reduce_atomic_host_init_block_fence = hip_reduce_tuning< + RAJA::hip::reduce_algorithm::init_host_combine_atomic_block, + RAJA::hip::block_communication_mode::block_fence, + named_usage::unspecified, named_usage::unspecified>; + +// Policy for RAJA::Reduce* objects that gives the same answer every time when +// used in the same way +using hip_reduce = hip_reduce_block_fence; + +// Policy for RAJA::Reduce* objects that may use atomics and may not give the +// same answer every time when used in the same way +using hip_reduce_atomic = hip_reduce_atomic_host_init_block_fence; + +// Policy for RAJA::Reduce* objects that lets you select the default atomic or +// non-atomic policy with a bool +template < bool with_atomic > +using hip_reduce_base = std::conditional_t; // Policy for RAJA::statement::Reduce that reduces threads in a block @@ -226,6 +397,7 @@ struct hip_thread_masked_loop {}; // Operations in the included files are parametrized using the following // values for HIP warp size and max block size. 
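// Hedged usage sketch for the policy aliases above: hip_reduce_atomic (host
// init + block fence) for the reducer, and hip_exec_with_reduce for the
// kernel, since the latter selects the concretizer tuned for kernels that
// carry reductions. The block size 256, the data, and sum_example itself are
// illustrative; x is assumed to be device-accessible.
#include "RAJA/RAJA.hpp"

inline void sum_example(const double* x, int n, double* result)
{
  RAJA::ReduceSum<RAJA::hip_reduce_atomic, double> sum(0.0);

  RAJA::forall<RAJA::hip_exec_with_reduce<256>>(
      RAJA::TypedRangeSegment<int>(0, n),
      [=] RAJA_DEVICE (int i) { sum += x[i]; });

  *result = sum.get();
}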
// +constexpr const RAJA::Index_type ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE = 64; // 128 on gfx90a #if defined(__HIP_PLATFORM_AMD__) constexpr const RAJA::Index_type WARP_SIZE = 64; #elif defined(__HIP_PLATFORM_NVIDIA__) @@ -816,6 +988,7 @@ struct IndexFlatten }; + // helper to get just the thread indexing part of IndexGlobal template < typename index_global > struct get_index_thread; @@ -876,30 +1049,100 @@ using global_z = IndexGlobal; } // namespace hip +// contretizers used in forall, scan, and sort policies + +using HipAvoidDeviceMaxThreadOccupancyConcretizer = hip::AvoidDeviceMaxThreadOccupancyConcretizer, -1>>; + +template < typename Fraction, std::ptrdiff_t BLOCKS_PER_SM_OFFSET > +using HipFractionOffsetOccupancyConcretizer = hip::FractionOffsetOccupancyConcretizer; + +using HipMaxOccupancyConcretizer = hip::MaxOccupancyConcretizer; + +using HipReduceDefaultConcretizer = HipFractionOffsetOccupancyConcretizer, 0>; + +using HipDefaultConcretizer = HipAvoidDeviceMaxThreadOccupancyConcretizer; + // policies usable with forall, scan, and sort + template using hip_exec_grid = policy::hip::hip_exec< - iteration_mapping::StridedLoop, hip::global_x, Async>; + iteration_mapping::StridedLoop, hip::global_x, + HipDefaultConcretizer, Async>; template using hip_exec_grid_async = policy::hip::hip_exec< - iteration_mapping::StridedLoop, hip::global_x, true>; + iteration_mapping::StridedLoop, hip::global_x, + HipDefaultConcretizer, true>; template using hip_exec = policy::hip::hip_exec< - iteration_mapping::Direct, hip::global_x, Async>; + iteration_mapping::Direct, hip::global_x, + HipDefaultConcretizer, Async>; template using hip_exec_async = policy::hip::hip_exec< - iteration_mapping::Direct, hip::global_x, true>; + iteration_mapping::Direct, hip::global_x, + HipDefaultConcretizer, true>; template using hip_exec_occ_calc = policy::hip::hip_exec< - iteration_mapping::StridedLoop, hip::global_x, Async>; + iteration_mapping::StridedLoop, hip::global_x, + HipDefaultConcretizer, Async>; template using hip_exec_occ_calc_async = policy::hip::hip_exec< - iteration_mapping::StridedLoop, hip::global_x, true>; + iteration_mapping::StridedLoop, hip::global_x, + HipDefaultConcretizer, true>; + +template +using hip_exec_occ_max = policy::hip::hip_exec< + iteration_mapping::StridedLoop, hip::global_x, + HipMaxOccupancyConcretizer, Async>; + +template +using hip_exec_occ_max_async = policy::hip::hip_exec< + iteration_mapping::StridedLoop, hip::global_x, + HipMaxOccupancyConcretizer, true>; + +template +using hip_exec_occ_fraction = policy::hip::hip_exec< + iteration_mapping::StridedLoop, hip::global_x, + HipFractionOffsetOccupancyConcretizer, Async>; + +template +using hip_exec_occ_fraction_async = policy::hip::hip_exec< + iteration_mapping::StridedLoop, hip::global_x, + HipFractionOffsetOccupancyConcretizer, true>; + +template +using hip_exec_occ_custom = policy::hip::hip_exec< + iteration_mapping::StridedLoop, hip::global_x, + Concretizer, Async>; + +template +using hip_exec_occ_custom_async = policy::hip::hip_exec< + iteration_mapping::StridedLoop, hip::global_x, + Concretizer, true>; + +template +using hip_exec_with_reduce = policy::hip::hip_exec< + iteration_mapping::StridedLoop, hip::global_x, + HipReduceDefaultConcretizer, Async>; + +template +using hip_exec_with_reduce_async = policy::hip::hip_exec< + iteration_mapping::StridedLoop, hip::global_x, + HipReduceDefaultConcretizer, true>; + +template +using hip_exec_base = std::conditional_t, + hip_exec>; + +template +using hip_exec_base_async = 
std::conditional_t, + hip_exec_async>; // policies usable with WorkGroup using policy::hip::hip_work; @@ -914,6 +1157,12 @@ using policy::hip::hip_atomic; using policy::hip::hip_atomic_explicit; // policies usable with reducers +using policy::hip::hip_reduce_device_fence; +using policy::hip::hip_reduce_block_fence; +using policy::hip::hip_reduce_atomic_device_init_device_fence; +using policy::hip::hip_reduce_atomic_device_init_block_fence; +using policy::hip::hip_reduce_atomic_host_init_device_fence; +using policy::hip::hip_reduce_atomic_host_init_block_fence; using policy::hip::hip_reduce_base; using policy::hip::hip_reduce; using policy::hip::hip_reduce_atomic; @@ -927,7 +1176,7 @@ using hip_warp_direct = RAJA::policy::hip::hip_indexer< kernel_sync_requirement::none, hip::thread_x>; using hip_warp_loop = RAJA::policy::hip::hip_indexer< - iteration_mapping::StridedLoop, + iteration_mapping::StridedLoop, kernel_sync_requirement::none, hip::thread_x>; @@ -953,13 +1202,13 @@ using hip_indexer_direct = policy::hip::hip_indexer< template < typename ... indexers > using hip_indexer_loop = policy::hip::hip_indexer< - iteration_mapping::StridedLoop, + iteration_mapping::StridedLoop, kernel_sync_requirement::none, indexers...>; template < typename ... indexers > using hip_indexer_syncable_loop = policy::hip::hip_indexer< - iteration_mapping::StridedLoop, + iteration_mapping::StridedLoop, kernel_sync_requirement::sync, indexers...>; @@ -971,7 +1220,7 @@ using hip_flatten_indexer_direct = policy::hip::hip_flatten_indexer< template < typename ... indexers > using hip_flatten_indexer_loop = policy::hip::hip_flatten_indexer< - iteration_mapping::StridedLoop, + iteration_mapping::StridedLoop, kernel_sync_requirement::none, indexers...>; diff --git a/include/RAJA/policy/hip/reduce.hpp b/include/RAJA/policy/hip/reduce.hpp index df47616cb6..2dbaf9f7e5 100644 --- a/include/RAJA/policy/hip/reduce.hpp +++ b/include/RAJA/policy/hip/reduce.hpp @@ -35,11 +35,13 @@ #include "RAJA/util/basic_mempool.hpp" #include "RAJA/util/mutex.hpp" #include "RAJA/util/types.hpp" +#include "RAJA/util/reduce.hpp" #include "RAJA/pattern/detail/reduce.hpp" #include "RAJA/pattern/reduce.hpp" #include "RAJA/policy/hip/MemUtils_HIP.hpp" +#include "RAJA/policy/hip/intrinsics.hpp" #include "RAJA/policy/hip/atomic.hpp" #include "RAJA/policy/hip/policy.hpp" #include "RAJA/policy/hip/raja_hiperrchk.hpp" @@ -52,6 +54,7 @@ namespace reduce namespace hip { + //! atomic operator version of Combiner object template struct atomic; @@ -60,7 +63,7 @@ template struct atomic> { RAJA_DEVICE RAJA_INLINE void operator()(T& val, const T v) { - RAJA::atomicAdd(RAJA::hip_atomic{}, &val, v); + RAJA::atomicAdd(RAJA::hip_atomic{}, &val, v); } }; @@ -68,7 +71,7 @@ template struct atomic> { RAJA_DEVICE RAJA_INLINE void operator()(T& val, const T v) { - RAJA::atomicMin(RAJA::hip_atomic{}, &val, v); + RAJA::atomicMin(RAJA::hip_atomic{}, &val, v); } }; @@ -76,7 +79,23 @@ template struct atomic> { RAJA_DEVICE RAJA_INLINE void operator()(T& val, const T v) { - RAJA::atomicMax(RAJA::hip_atomic{}, &val, v); + RAJA::atomicMax(RAJA::hip_atomic{}, &val, v); + } +}; + +template +struct atomic> { + RAJA_DEVICE RAJA_INLINE void operator()(T& val, const T v) + { + RAJA::atomicAnd(RAJA::hip_atomic{}, &val, v); + } +}; + +template +struct atomic> { + RAJA_DEVICE RAJA_INLINE void operator()(T& val, const T v) + { + RAJA::atomicOr(RAJA::hip_atomic{}, &val, v); } }; @@ -97,295 +116,69 @@ namespace hip namespace impl { -/*! 
- * \brief Abstracts T into an equal or greater size array of integers whose - * size is between min_integer_type_size and max_interger_type_size inclusive. - */ -template -union AsIntegerArray { - - static_assert(min_integer_type_size <= max_integer_type_size, - "incompatible min and max integer type size"); - using integer_type = typename std::conditional< - ((alignof(T) >= alignof(long long) && - sizeof(long long) <= max_integer_type_size) || - sizeof(long) < min_integer_type_size), - long long, - typename std::conditional< - ((alignof(T) >= alignof(long) && - sizeof(long) <= max_integer_type_size) || - sizeof(int) < min_integer_type_size), - long, - typename std::conditional< - ((alignof(T) >= alignof(int) && - sizeof(int) <= max_integer_type_size) || - sizeof(short) < min_integer_type_size), - int, - typename std::conditional< - ((alignof(T) >= alignof(short) && - sizeof(short) <= max_integer_type_size) || - sizeof(char) < min_integer_type_size), - short, - typename std::conditional< - ((alignof(T) >= alignof(char) && - sizeof(char) <= max_integer_type_size)), - char, - void>::type>::type>::type>::type>::type; - static_assert(!std::is_same::value, - "could not find a compatible integer type"); - static_assert(sizeof(integer_type) >= min_integer_type_size, - "integer_type smaller than min integer type size"); - static_assert(sizeof(integer_type) <= max_integer_type_size, - "integer_type greater than max integer type size"); - - constexpr static size_t num_integer_type = - (sizeof(T) + sizeof(integer_type) - 1) / sizeof(integer_type); - - T value; - integer_type array[num_integer_type]; - - RAJA_HOST_DEVICE constexpr AsIntegerArray(T value_) : value(value_){}; - - RAJA_HOST_DEVICE constexpr size_t array_size() const - { - return num_integer_type; - } -}; - -// hip only has shfl primitives for 32 bits -constexpr const size_t min_shfl_int_type_size = sizeof(int); -constexpr const size_t max_shfl_int_type_size = sizeof(int); - -/*! - ****************************************************************************** - * - * \brief Method to shuffle 32b registers in sum reduction for arbitrary type. - * - * \Note Returns an undefined value if src lane is inactive (divergence). - * Returns this lane's value if src lane is out of bounds or has exited. - * - ****************************************************************************** - */ -template -RAJA_DEVICE RAJA_INLINE T shfl_xor_sync(T var, int laneMask) -{ - AsIntegerArray u(var); - - for (size_t i = 0; i < u.array_size(); ++i) { - u.array[i] = ::__shfl_xor(u.array[i], laneMask); - } - return u.value; -} - -template -RAJA_DEVICE RAJA_INLINE T shfl_sync(T var, int srcLane) -{ - AsIntegerArray u(var); - - for (size_t i = 0; i < u.array_size(); ++i) { - u.array[i] = ::__shfl(u.array[i], srcLane); - } - return u.value; -} - - -template <> -RAJA_DEVICE RAJA_INLINE int shfl_xor_sync(int var, int laneMask) -{ - return ::__shfl_xor(var, laneMask); -} - -template <> -RAJA_DEVICE RAJA_INLINE float shfl_xor_sync(float var, int laneMask) -{ - return ::__shfl_xor(var, laneMask); -} - -template <> -RAJA_DEVICE RAJA_INLINE int shfl_sync(int var, int srcLane) -{ - return ::__shfl(var, srcLane); -} - -template <> -RAJA_DEVICE RAJA_INLINE float shfl_sync(float var, int srcLane) -{ - return ::__shfl(var, srcLane); -} - - -//! reduce values in block into thread 0 -template -RAJA_DEVICE RAJA_INLINE T warp_reduce(T val, T RAJA_UNUSED_ARG(identity)) +//! 
reduce values in grid into thread 0 of last running block +// returns true if put reduced value in val +template +RAJA_DEVICE RAJA_INLINE int grid_reduce_last_block(T& val, + T identity, + TempIterator in_device_mem, + unsigned int* device_count) { - int numThreads = blockDim.x * blockDim.y * blockDim.z; + typename TempIterator::template rebind_accessor device_mem(in_device_mem); int threadId = threadIdx.x + blockDim.x * threadIdx.y + (blockDim.x * blockDim.y) * threadIdx.z; - - T temp = val; - - if (numThreads % policy::hip::WARP_SIZE == 0) { - - // reduce each warp - for (int i = 1; i < policy::hip::WARP_SIZE; i *= 2) { - T rhs = shfl_xor_sync(temp, i); - Combiner{}(temp, rhs); - } - - } else { - - // reduce each warp - for (int i = 1; i < policy::hip::WARP_SIZE; i *= 2) { - int srcLane = threadId ^ i; - T rhs = shfl_sync(temp, srcLane); - // only add from threads that exist (don't double count own value) - if (srcLane < numThreads) { - Combiner{}(temp, rhs); - } - } - } - - return temp; -} - -/*! - * Allreduce values in a warp. - * - * - * This does a butterfly pattern leaving each lane with the full reduction - * - */ -template -RAJA_DEVICE RAJA_INLINE T warp_allreduce(T val) -{ - T temp = val; - - for (int i = 1; i < policy::hip::WARP_SIZE; i *= 2) { - T rhs = shfl_xor_sync(temp, i); - Combiner{}(temp, rhs); - } - - return temp; -} - - -//! reduce values in block into thread 0 -template -RAJA_DEVICE RAJA_INLINE T block_reduce(T val, T identity) -{ int numThreads = blockDim.x * blockDim.y * blockDim.z; - int threadId = threadIdx.x + blockDim.x * threadIdx.y + - (blockDim.x * blockDim.y) * threadIdx.z; - - int warpId = threadId % policy::hip::WARP_SIZE; - int warpNum = threadId / policy::hip::WARP_SIZE; - - T temp = val; - - if (numThreads % policy::hip::WARP_SIZE == 0) { - - // reduce each warp - for (int i = 1; i < policy::hip::WARP_SIZE; i *= 2) { - T rhs = shfl_xor_sync(temp, i); - Combiner{}(temp, rhs); - } - - } else { - - // reduce each warp - for (int i = 1; i < policy::hip::WARP_SIZE; i *= 2) { - int srcLane = threadId ^ i; - T rhs = shfl_sync(temp, srcLane); - // only add from threads that exist (don't double count own value) - if (srcLane < numThreads) { - Combiner{}(temp, rhs); - } - } - } - - // reduce per warp values - if (numThreads > policy::hip::WARP_SIZE) { - - static_assert(policy::hip::MAX_WARPS <= policy::hip::WARP_SIZE, - "Max Warps must be less than or equal to Warp Size for this algorithm to work"); - - __shared__ unsigned char tmpsd[sizeof(RAJA::detail::SoAArray)]; - RAJA::detail::SoAArray* sd = - reinterpret_cast *>(tmpsd); + int blockId = blockIdx.x + gridDim.x * blockIdx.y + + (gridDim.x * gridDim.y) * blockIdx.z; + int numBlocks = gridDim.x * gridDim.y * gridDim.z; - // write per warp values to shared memory - if (warpId == 0) { - sd->set(warpNum, temp); - } + int replicationId = blockId % replication; + int slotId = blockId / replication; - __syncthreads(); + int maxNumSlots = (numBlocks + replication - 1) / replication; + unsigned int numSlots = (numBlocks / replication) + + ((replicationId < (numBlocks % replication)) ? 
1 : 0); - if (warpNum == 0) { + int atomicOffset = replicationId * atomic_stride; + int beginSlots = replicationId * maxNumSlots; + int blockSlot = beginSlots + slotId; - // read per warp values - if (warpId * policy::hip::WARP_SIZE < numThreads) { - temp = sd->get(warpId); - } else { - temp = identity; - } + T temp = block_reduce(val, identity); - for (int i = 1; i < policy::hip::MAX_WARPS; i *= 2) { - T rhs = shfl_xor_sync(temp, i); - Combiner{}(temp, rhs); - } + if (numSlots <= 1u) { + if (threadId == 0) { + val = temp; } - - __syncthreads(); + return (threadId == 0) ? replicationId : replication; } - return temp; -} - - -//! reduce values in grid into thread 0 of last running block -// returns true if put reduced value in val -template -RAJA_DEVICE RAJA_INLINE bool grid_reduce(T& val, - T identity, - TempIterator device_mem, - unsigned int* device_count) -{ - int numBlocks = gridDim.x * gridDim.y * gridDim.z; - int numThreads = blockDim.x * blockDim.y * blockDim.z; - unsigned int wrap_around = numBlocks - 1; - - int blockId = blockIdx.x + gridDim.x * blockIdx.y + - (gridDim.x * gridDim.y) * blockIdx.z; - - int threadId = threadIdx.x + blockDim.x * threadIdx.y + - (blockDim.x * blockDim.y) * threadIdx.z; - - T temp = block_reduce(val, identity); - // one thread per block writes to device_mem - __shared__ bool lastBlock; + __shared__ bool isLastBlock; if (threadId == 0) { - device_mem.set(blockId, temp); + device_mem.set(blockSlot, temp); // ensure write visible to all threadblocks - __threadfence(); - // increment counter, (wraps back to zero if old count == wrap_around) - unsigned int old_count = ::atomicInc(device_count, wrap_around); - lastBlock = (old_count == wrap_around) ? 1: 0; + Accessor::fence_release(); + // increment counter, (wraps back to zero if old count == (numSlots-1)) + unsigned int old_count = ::atomicInc(&device_count[atomicOffset], (numSlots-1)); + isLastBlock = (old_count == (numSlots-1)); } // returns non-zero value if any thread passes in a non-zero value __syncthreads(); // last block accumulates values from device_mem - if (lastBlock) { + if (isLastBlock) { temp = identity; + Accessor::fence_acquire(); - for (int i = threadId; i < numBlocks; i += numThreads) { - Combiner{}(temp, device_mem.get(i)); + for (unsigned int i = threadId; + i < numSlots; + i += numThreads) { + Combiner{}(temp, device_mem.get(beginSlots + i)); } temp = block_reduce(temp, identity); @@ -396,7 +189,7 @@ RAJA_DEVICE RAJA_INLINE bool grid_reduce(T& val, } } - return lastBlock && threadId == 0; + return (isLastBlock && threadId == 0) ? replicationId : replication; } namespace expt { @@ -507,6 +300,7 @@ RAJA_DEVICE RAJA_INLINE void grid_reduce(RAJA::expt::detail::Reducer& red // last block accumulates values from device_mem if (lastBlock) { temp = OP::identity(); + __threadfence(); for (int i = threadId; i < numBlocks; i += numThreads) { temp = OP{}(temp, red.device_mem.get(i)); @@ -526,64 +320,106 @@ RAJA_DEVICE RAJA_INLINE void grid_reduce(RAJA::expt::detail::Reducer& red //! 
reduce values in grid into thread 0 of last running block // returns true if put reduced value in val -template -RAJA_DEVICE RAJA_INLINE bool grid_reduce_atomic(T& val, - T identity, - T* device_mem, - unsigned int* device_count) +template +RAJA_DEVICE RAJA_INLINE int grid_reduce_atomic_device_init(T& val, + T identity, + T* device_mem, + unsigned int* device_count) { - int numBlocks = gridDim.x * gridDim.y * gridDim.z; - unsigned int wrap_around = numBlocks + 1; - int threadId = threadIdx.x + blockDim.x * threadIdx.y + (blockDim.x * blockDim.y) * threadIdx.z; - // one thread in first block initializes device_mem + int blockId = blockIdx.x + gridDim.x * blockIdx.y + + (gridDim.x * gridDim.y) * blockIdx.z; + int numBlocks = gridDim.x * gridDim.y * gridDim.z; + + int replicationId = (blockId%replication); + int atomicOffset = replicationId*atomic_stride; + + unsigned int numSlots = (numBlocks / replication) + + ((replicationId < (numBlocks % replication)) ? 1 : 0); + + if (numSlots <= 1u) { + T temp = block_reduce(val, identity); + if (threadId == 0) { + val = temp; + } + return (threadId == 0) ? replicationId : replication; + } + + // the first block of each replication initializes device_mem if (threadId == 0) { - unsigned int old_val = ::atomicCAS(device_count, 0u, 1u); + unsigned int old_val = ::atomicCAS(&device_count[atomicOffset], 0u, 1u); if (old_val == 0u) { - device_mem[0] = identity; - __threadfence(); - ::atomicAdd(device_count, 1u); + Accessor::set(device_mem, atomicOffset, identity); + Accessor::fence_release(); + ::atomicAdd(&device_count[atomicOffset], 1u); } } T temp = block_reduce(val, identity); - // one thread per block performs atomic on device_mem - bool lastBlock = false; + // one thread per block performs an atomic on device_mem + bool isLastBlock = false; if (threadId == 0) { - // thread waits for device_mem to be initialized - while (static_cast(device_count)[0] < 2u) + // wait for device_mem to be initialized + while (::atomicAdd(&device_count[atomicOffset], 0u) < 2u) ; - __threadfence(); - RAJA::reduce::hip::atomic{}(device_mem[0], temp); - __threadfence(); - // increment counter, (wraps back to zero if old count == wrap_around) - unsigned int old_count = ::atomicInc(device_count, wrap_around); - lastBlock = (old_count == wrap_around); - - // last block gets value from device_mem - if (lastBlock) { - val = device_mem[0]; + Accessor::fence_acquire(); + RAJA::reduce::hip::atomic{}(device_mem[atomicOffset], temp); + Accessor::fence_release(); + // increment counter, (wraps back to zero if old count == (numSlots+1)) + unsigned int old_count = ::atomicInc(&device_count[atomicOffset], (numSlots+1)); + isLastBlock = (old_count == (numSlots+1)); + + // the last block for each replication gets the value from device_mem + if (isLastBlock) { + Accessor::fence_acquire(); + val = Accessor::get(device_mem, atomicOffset); } } - return lastBlock; + return isLastBlock ? replicationId : replication; +} + +//! 
reduce values in block into thread 0 and atomically combines into device_mem +template +RAJA_DEVICE RAJA_INLINE void grid_reduce_atomic_host_init(T& val, + T identity, + T* device_mem) +{ + int threadId = threadIdx.x + blockDim.x * threadIdx.y + + (blockDim.x * blockDim.y) * threadIdx.z; + + int blockId = blockIdx.x + gridDim.x * blockIdx.y + + (gridDim.x * gridDim.y) * blockIdx.z; + + int replicationId = (blockId%replication); + int atomicOffset = replicationId*atomic_stride; + + T temp = block_reduce(val, identity); + + // one thread per block performs an atomic on device_mem + if (threadId == 0 && temp != identity) { + RAJA::reduce::hip::atomic{}(device_mem[atomicOffset], temp); + } + } } // namespace impl //! Object that manages pinned memory buffers for reduction results // use one per reducer object -template +template class PinnedTally { public: //! Object put in Pinned memory with value and pointer to next Node struct Node { Node* next; - T value; + T values[num_slots]; }; //! Object per resource to keep track of pinned memory nodes struct ResourceNode { @@ -658,7 +494,7 @@ class PinnedTally return ret; } - T& operator*() { return m_n->value; } + auto operator*() -> T(&)[num_slots] { return m_n->values; } bool operator==(const ResourceNodeIterator& rhs) const { @@ -695,7 +531,7 @@ class PinnedTally ResourceNodeIterator end() { return {nullptr, nullptr}; } //! get new value for use in resource - T* new_value(::RAJA::resources::Hip res) + auto new_value(::RAJA::resources::Hip res) -> T(&)[num_slots] { #if defined(RAJA_ENABLE_OPENMP) lock_guard lock(m_mutex); @@ -712,10 +548,10 @@ class PinnedTally rn->node_list = nullptr; resource_list = rn; } - Node* n = hip::pinned_mempool_type::getInstance().template malloc(1); + Node* n = mempool::getInstance().template malloc(1); n->next = rn->node_list; rn->node_list = n; - return &n->value; + return n->values; } //! synchronize all resources used @@ -735,7 +571,7 @@ class PinnedTally while (rn->node_list) { Node* n = rn->node_list; rn->node_list = n->next; - hip::pinned_mempool_type::getInstance().free(n); + mempool::getInstance().free(n); } resource_list = rn->next; free(rn); @@ -762,23 +598,30 @@ class PinnedTally //! Reduction data for Hip Offload -- stores value, host pointer, and device //! pointer -template -struct Reduce_Data { +template +struct ReduceLastBlock_Data +{ + using tally_mempool_type = pinned_mempool_type; + using data_mempool_type = device_mempool_type; + using count_mempool_type = device_zeroed_mempool_type; + + static constexpr size_t tally_slots = replication; mutable T value; T identity; unsigned int* device_count; - RAJA::detail::SoAPtr device; + RAJA::detail::SoAPtr device; bool own_device_ptr; - Reduce_Data() : Reduce_Data(T(), T()){}; + ReduceLastBlock_Data() : ReduceLastBlock_Data(T(), T()){}; /*! \brief create from a default value and offload information * * allocates PinnedTally to hold device values */ - Reduce_Data(T initValue, T identity_) + ReduceLastBlock_Data(T initValue, T identity_) : value{initValue}, identity{identity_}, device_count{nullptr}, @@ -788,7 +631,7 @@ struct Reduce_Data { } RAJA_HOST_DEVICE - Reduce_Data(const Reduce_Data& other) + ReduceLastBlock_Data(const ReduceLastBlock_Data& other) : value{other.identity}, identity{other.identity}, device_count{other.device_count}, @@ -797,20 +640,28 @@ struct Reduce_Data { { } - Reduce_Data& operator=(const Reduce_Data&) = default; + ReduceLastBlock_Data& operator=(const ReduceLastBlock_Data&) = default; //! 
initialize output to identity to ensure never read // uninitialized memory - void init_grid_val(T* output) { *output = identity; } + T* init_grid_vals(T(&output)[tally_slots]) + { + for (size_t r = 0; r < tally_slots; ++r) { + output[r] = identity; + } + return &output[0]; + } //! reduce values in grid to single value, store in output RAJA_DEVICE void grid_reduce(T* output) { T temp = value; - - if (impl::grid_reduce(temp, identity, device, device_count)) { - *output = temp; + size_t replicationId = impl::grid_reduce_last_block< + Combiner, Accessor, replication, atomic_stride>( + temp, identity, device, device_count); + if (replicationId != replication) { + output[replicationId] = temp; } } @@ -822,9 +673,10 @@ struct Reduce_Data { if (act) { hip_dim_t gridDim = currentGridDim(); size_t numBlocks = gridDim.x * gridDim.y * gridDim.z; - device.allocate(numBlocks); - device_count = device_zeroed_mempool_type::getInstance() - .template malloc(1); + size_t maxNumSlots = (numBlocks + replication - 1) / replication; + device.allocate(maxNumSlots*replication); + device_count = count_mempool_type::getInstance() + .template malloc(replication*atomic_stride); own_device_ptr = true; } return act; @@ -837,7 +689,7 @@ struct Reduce_Data { bool act = own_device_ptr; if (act) { device.deallocate(); - device_zeroed_mempool_type::getInstance().free(device_count); + count_mempool_type::getInstance().free(device_count); device_count = nullptr; own_device_ptr = false; } @@ -847,8 +699,95 @@ struct Reduce_Data { //! Reduction data for Hip Offload -- stores value, host pointer -template -struct ReduceAtomic_Data { +template +struct ReduceAtomicHostInit_Data +{ + using tally_mempool_type = device_pinned_mempool_type; + + static constexpr size_t tally_slots = replication * atomic_stride; + + mutable T value; + T identity; + bool is_setup; + bool own_device_ptr; + + ReduceAtomicHostInit_Data() : ReduceAtomicHostInit_Data(T(), T()){} + + ReduceAtomicHostInit_Data(T initValue, T identity_) + : value{initValue}, + identity{identity_}, + is_setup{false}, + own_device_ptr{false} + { + } + + RAJA_HOST_DEVICE + ReduceAtomicHostInit_Data(const ReduceAtomicHostInit_Data& other) + : value{other.identity}, + identity{other.identity}, + is_setup{other.is_setup}, + own_device_ptr{false} + { + } + + ReduceAtomicHostInit_Data& operator=(const ReduceAtomicHostInit_Data&) = default; + + //! initialize output to identity to ensure never read + // uninitialized memory + T* init_grid_vals(T(&output)[tally_slots]) + { + for (size_t r = 0; r < tally_slots; ++r) { + output[r] = identity; + } + return &output[0]; + } + + //! reduce values in grid to single value, store in output + RAJA_DEVICE + void grid_reduce(T* output) + { + T temp = value; + + impl::grid_reduce_atomic_host_init( + temp, identity, output); + } + + //! check and setup for device + // allocate device pointers and get a new result buffer from the pinned tally + bool setupForDevice() + { + bool act = !is_setup && setupReducers(); + if (act) { + is_setup = true; + own_device_ptr = true; + } + return act; + } + + //! if own resources teardown device setup + // free device pointers + bool teardownForDevice() + { + bool act = own_device_ptr; + if (act) { + is_setup = false; + own_device_ptr = false; + } + return act; + } +}; + +//! 
Reduction data for Hip Offload -- stores value, host pointer +template +struct ReduceAtomicDeviceInit_Data +{ + using tally_mempool_type = pinned_mempool_type; + using data_mempool_type = device_mempool_type; + using count_mempool_type = device_zeroed_mempool_type; + + static constexpr size_t tally_slots = replication; mutable T value; T identity; @@ -856,9 +795,9 @@ struct ReduceAtomic_Data { T* device; bool own_device_ptr; - ReduceAtomic_Data() : ReduceAtomic_Data(T(), T()){}; + ReduceAtomicDeviceInit_Data() : ReduceAtomicDeviceInit_Data(T(), T()){} - ReduceAtomic_Data(T initValue, T identity_) + ReduceAtomicDeviceInit_Data(T initValue, T identity_) : value{initValue}, identity{identity_}, device_count{nullptr}, @@ -868,7 +807,7 @@ struct ReduceAtomic_Data { } RAJA_HOST_DEVICE - ReduceAtomic_Data(const ReduceAtomic_Data& other) + ReduceAtomicDeviceInit_Data(const ReduceAtomicDeviceInit_Data& other) : value{other.identity}, identity{other.identity}, device_count{other.device_count}, @@ -877,11 +816,17 @@ struct ReduceAtomic_Data { { } - ReduceAtomic_Data& operator=(const ReduceAtomic_Data&) = default; + ReduceAtomicDeviceInit_Data& operator=(const ReduceAtomicDeviceInit_Data&) = default; //! initialize output to identity to ensure never read // uninitialized memory - void init_grid_val(T* output) { *output = identity; } + T* init_grid_vals(T(&output)[tally_slots]) + { + for (size_t r = 0; r < tally_slots; ++r) { + output[r] = identity; + } + return &output[0]; + } //! reduce values in grid to single value, store in output RAJA_DEVICE @@ -889,9 +834,11 @@ struct ReduceAtomic_Data { { T temp = value; - if (impl::grid_reduce_atomic( - temp, identity, device, device_count)) { - *output = temp; + size_t replicationId = impl::grid_reduce_atomic_device_init< + Combiner, Accessor, replication, atomic_stride>( + temp, identity, device, device_count); + if (replicationId != replication) { + output[replicationId] = temp; } } @@ -901,9 +848,9 @@ struct ReduceAtomic_Data { { bool act = !device && setupReducers(); if (act) { - device = device_mempool_type::getInstance().template malloc(1); - device_count = device_zeroed_mempool_type::getInstance() - .template malloc(1); + device = data_mempool_type::getInstance().template malloc(replication*atomic_stride); + device_count = count_mempool_type::getInstance() + .template malloc(replication*atomic_stride); own_device_ptr = true; } return act; @@ -915,9 +862,9 @@ struct ReduceAtomic_Data { { bool act = own_device_ptr; if (act) { - device_mempool_type::getInstance().free(device); + data_mempool_type::getInstance().free(device); device = nullptr; - device_zeroed_mempool_type::getInstance().free(device_count); + count_mempool_type::getInstance().free(device_count); device_count = nullptr; own_device_ptr = false; } @@ -925,10 +872,56 @@ struct ReduceAtomic_Data { } }; + //! Hip Reduction entity -- generalize on reduction, and type -template +template class Reduce { + static constexpr size_t replication = (tuning::replication > 0) + ? tuning::replication + : 32; + static constexpr size_t atomic_stride = (tuning::atomic_stride > 0) + ? tuning::atomic_stride + : ((policy::hip::ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE > sizeof(T)) + ? 
RAJA_DIVIDE_CEILING_INT(policy::hip::ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE, sizeof(T)) + : 1); + + using Accessor = std::conditional_t<(tuning::comm_mode == block_communication_mode::block_fence), + impl::AccessorDeviceScopeUseBlockFence, + std::conditional_t<(tuning::comm_mode == block_communication_mode::device_fence), + impl::AccessorDeviceScopeUseDeviceFence, + void>>; + + static constexpr bool atomic_policy = + (tuning::algorithm == reduce_algorithm::init_device_combine_atomic_block) || + (tuning::algorithm == reduce_algorithm::init_host_combine_atomic_block); + static constexpr bool atomic_available = RAJA::reduce::hip::hip_atomic_available::value; + + //! hip reduction data storage class and folding algorithm + using reduce_data_type = std::conditional_t<(tuning::algorithm == reduce_algorithm::combine_last_block) || + (atomic_policy && !atomic_available), + hip::ReduceLastBlock_Data, + std::conditional_t, + std::conditional_t<(tuning::algorithm == reduce_algorithm::init_host_combine_atomic_block), + hip::ReduceAtomicHostInit_Data, + void>>, + void>>; + + static constexpr size_t tally_slots = reduce_data_type::tally_slots; + + using TallyType = PinnedTally; + + //! union to hold either pointer to PinnedTally or pointer to value + // only use list before setup for device and only use val_ptr after + union tally_u { + TallyType* list; + T* val_ptr; + constexpr tally_u(TallyType* l) : list(l){}; + constexpr tally_u(T* v_ptr) : val_ptr(v_ptr){}; + }; + public: Reduce() : Reduce(T(), Combiner::identity()) {} @@ -936,7 +929,7 @@ class Reduce // the original object's parent is itself explicit Reduce(T init_val, T identity_ = Combiner::identity()) : parent{this}, - tally_or_val_ptr{new PinnedTally}, + tally_or_val_ptr{new TallyType}, val(init_val, identity_) { } @@ -963,9 +956,8 @@ class Reduce #if !defined(RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE) if (parent) { if (val.setupForDevice()) { - tally_or_val_ptr.val_ptr = - tally_or_val_ptr.list->new_value(currentResource()); - val.init_grid_val(tally_or_val_ptr.val_ptr); + tally_or_val_ptr.val_ptr = val.init_grid_vals( + tally_or_val_ptr.list->new_value(currentResource())); parent = nullptr; } } @@ -1009,9 +1001,15 @@ class Reduce auto end = tally_or_val_ptr.list->end(); if (n != end) { tally_or_val_ptr.list->synchronize_resources(); + ::RAJA::detail::HighAccuracyReduce + reducer(std::move(val.value)); for (; n != end; ++n) { - Combiner{}(val.value, *n); + T(&values)[tally_slots] = *n; + for (size_t r = 0; r < tally_slots; ++r) { + reducer.combine(std::move(values[r])); + } } + val.value = reducer.get_and_clear(); tally_or_val_ptr.list->free_list(); } return val.value; @@ -1032,38 +1030,20 @@ class Reduce private: const Reduce* parent; - - //! union to hold either pointer to PinnedTally or poiter to value - // only use list before setup for device and only use val_ptr after - union tally_u { - PinnedTally* list; - T* val_ptr; - constexpr tally_u(PinnedTally* l) : list(l){}; - constexpr tally_u(T* v_ptr) : val_ptr(v_ptr){}; - }; - tally_u tally_or_val_ptr; - - //! hip reduction data storage class and folding algorithm - using reduce_data_type = typename std::conditional< - maybe_atomic && RAJA::reduce::hip::hip_atomic_available::value, - hip::ReduceAtomic_Data, - hip::Reduce_Data>::type; - - //! storage for reduction data reduce_data_type val; }; } // end namespace hip //! 
specialization of ReduceSum for hip_reduce -template -class ReduceSum, T> - : public hip::Reduce, T, maybe_atomic> +template +class ReduceSum, T> + : public hip::Reduce, T, tuning> { public: - using Base = hip::Reduce, T, maybe_atomic>; + using Base = hip::Reduce, T, tuning>; using Base::Base; //! enable operator+= for ReduceSum -- alias for combine() RAJA_HOST_DEVICE @@ -1075,13 +1055,13 @@ class ReduceSum, T> }; //! specialization of ReduceBitOr for hip_reduce -template -class ReduceBitOr, T> - : public hip::Reduce, T, maybe_atomic> +template +class ReduceBitOr, T> + : public hip::Reduce, T, tuning> { public: - using Base = hip::Reduce, T, maybe_atomic>; + using Base = hip::Reduce, T, tuning>; using Base::Base; //! enable operator|= for ReduceBitOr -- alias for combine() RAJA_HOST_DEVICE @@ -1093,13 +1073,13 @@ class ReduceBitOr, T> }; //! specialization of ReduceBitAnd for hip_reduce -template -class ReduceBitAnd, T> - : public hip::Reduce, T, maybe_atomic> +template +class ReduceBitAnd, T> + : public hip::Reduce, T, tuning> { public: - using Base = hip::Reduce, T, maybe_atomic>; + using Base = hip::Reduce, T, tuning>; using Base::Base; //! enable operator&= for ReduceBitAnd -- alias for combine() RAJA_HOST_DEVICE @@ -1111,13 +1091,13 @@ class ReduceBitAnd, T> }; //! specialization of ReduceMin for hip_reduce -template -class ReduceMin, T> - : public hip::Reduce, T, maybe_atomic> +template +class ReduceMin, T> + : public hip::Reduce, T, tuning> { public: - using Base = hip::Reduce, T, maybe_atomic>; + using Base = hip::Reduce, T, tuning>; using Base::Base; //! enable min() for ReduceMin -- alias for combine() RAJA_HOST_DEVICE @@ -1129,13 +1109,13 @@ class ReduceMin, T> }; //! specialization of ReduceMax for hip_reduce -template -class ReduceMax, T> - : public hip::Reduce, T, maybe_atomic> +template +class ReduceMax, T> + : public hip::Reduce, T, tuning> { public: - using Base = hip::Reduce, T, maybe_atomic>; + using Base = hip::Reduce, T, tuning>; using Base::Base; //! enable max() for ReduceMax -- alias for combine() RAJA_HOST_DEVICE @@ -1147,18 +1127,18 @@ class ReduceMax, T> }; //! specialization of ReduceMinLoc for hip_reduce -template -class ReduceMinLoc, T, IndexType> +template +class ReduceMinLoc, T, IndexType> : public hip::Reduce>, RAJA::reduce::detail::ValueLoc, - maybe_atomic> + tuning> { public: using value_type = RAJA::reduce::detail::ValueLoc; using Combiner = RAJA::reduce::min; using NonLocCombiner = RAJA::reduce::min; - using Base = hip::Reduce; + using Base = hip::Reduce; using Base::Base; //! constructor requires a default value for the reducer @@ -1197,18 +1177,18 @@ class ReduceMinLoc, T, IndexType> }; //! specialization of ReduceMaxLoc for hip_reduce -template -class ReduceMaxLoc, T, IndexType> +template +class ReduceMaxLoc, T, IndexType> : public hip:: Reduce>, RAJA::reduce::detail::ValueLoc, - maybe_atomic> + tuning> { public: using value_type = RAJA::reduce::detail::ValueLoc; using Combiner = RAJA::reduce::max; using NonLocCombiner = RAJA::reduce::max; - using Base = hip::Reduce; + using Base = hip::Reduce; using Base::Base; //! 
constructor requires a default value for the reducer diff --git a/include/RAJA/policy/hip/scan.hpp b/include/RAJA/policy/hip/scan.hpp index 40e44c2e19..cdf0a9b82d 100644 --- a/include/RAJA/policy/hip/scan.hpp +++ b/include/RAJA/policy/hip/scan.hpp @@ -49,6 +49,7 @@ namespace scan */ template @@ -56,7 +57,7 @@ RAJA_INLINE resources::EventProxy inclusive_inplace( resources::Hip hip_res, - ::RAJA::policy::hip::hip_exec, + ::RAJA::policy::hip::hip_exec, InputIter begin, InputIter end, Function binary_op) @@ -121,6 +122,7 @@ inclusive_inplace( */ template exclusive_inplace( resources::Hip hip_res, - ::RAJA::policy::hip::hip_exec, + ::RAJA::policy::hip::hip_exec, InputIter begin, InputIter end, Function binary_op, @@ -198,6 +200,7 @@ exclusive_inplace( */ template inclusive( resources::Hip hip_res, - ::RAJA::policy::hip::hip_exec, + ::RAJA::policy::hip::hip_exec, InputIter begin, InputIter end, OutputIter out, @@ -271,6 +274,7 @@ inclusive( */ template exclusive( resources::Hip hip_res, - ::RAJA::policy::hip::hip_exec, + ::RAJA::policy::hip::hip_exec, InputIter begin, InputIter end, OutputIter out, diff --git a/include/RAJA/policy/hip/sort.hpp b/include/RAJA/policy/hip/sort.hpp index a6918968c8..eb16246623 100644 --- a/include/RAJA/policy/hip/sort.hpp +++ b/include/RAJA/policy/hip/sort.hpp @@ -73,7 +73,9 @@ namespace detail /*! \brief static assert unimplemented stable sort */ -template +template concepts::enable_if_t, concepts::negate>, @@ -83,7 +85,7 @@ concepts::enable_if_t, camp::is_same>>>>>> stable( resources::Hip hip_res, - ::RAJA::policy::hip::hip_exec, + ::RAJA::policy::hip::hip_exec, Iter, Iter, Compare) @@ -102,13 +104,15 @@ stable( /*! \brief stable sort given range in ascending order */ -template +template concepts::enable_if_t, type_traits::is_arithmetic>, std::is_pointer> stable( resources::Hip hip_res, - ::RAJA::policy::hip::hip_exec, + ::RAJA::policy::hip::hip_exec, Iter begin, Iter end, operators::less>) @@ -190,13 +194,15 @@ stable( /*! \brief stable sort given range in descending order */ -template +template concepts::enable_if_t, type_traits::is_arithmetic>, std::is_pointer> stable( resources::Hip hip_res, - ::RAJA::policy::hip::hip_exec, + ::RAJA::policy::hip::hip_exec, Iter begin, Iter end, operators::greater>) @@ -279,7 +285,9 @@ stable( /*! \brief static assert unimplemented sort */ -template +template concepts::enable_if_t, concepts::negate>, @@ -289,7 +297,7 @@ concepts::enable_if_t, camp::is_same>>>>>> unstable( resources::Hip hip_res, - ::RAJA::policy::hip::hip_exec, + ::RAJA::policy::hip::hip_exec, Iter, Iter, Compare) @@ -308,13 +316,15 @@ unstable( /*! \brief sort given range in ascending order */ -template +template concepts::enable_if_t, type_traits::is_arithmetic>, std::is_pointer> unstable( resources::Hip hip_res, - ::RAJA::policy::hip::hip_exec p, + ::RAJA::policy::hip::hip_exec p, Iter begin, Iter end, operators::less> comp) @@ -325,13 +335,15 @@ unstable( /*! \brief sort given range in descending order */ -template +template concepts::enable_if_t, type_traits::is_arithmetic>, std::is_pointer> unstable( resources::Hip hip_res, - ::RAJA::policy::hip::hip_exec p, + ::RAJA::policy::hip::hip_exec p, Iter begin, Iter end, operators::greater> comp) @@ -343,7 +355,8 @@ unstable( /*! 
\brief static assert unimplemented stable sort pairs */ -template concepts::enable_if_t, concepts::negate, camp::is_same>>>>>> stable_pairs( resources::Hip hip_res, - ::RAJA::policy::hip::hip_exec, + ::RAJA::policy::hip::hip_exec, KeyIter, KeyIter, ValIter, @@ -379,7 +392,8 @@ stable_pairs( /*! \brief stable sort given range of pairs in ascending order of keys */ -template concepts::enable_if_t, type_traits::is_arithmetic>, @@ -387,7 +401,7 @@ concepts::enable_if_t, std::is_pointer> stable_pairs( resources::Hip hip_res, - ::RAJA::policy::hip::hip_exec, + ::RAJA::policy::hip::hip_exec, KeyIter keys_begin, KeyIter keys_end, ValIter vals_begin, @@ -483,7 +497,8 @@ stable_pairs( /*! \brief stable sort given range of pairs in descending order of keys */ -template concepts::enable_if_t, type_traits::is_arithmetic>, @@ -491,7 +506,7 @@ concepts::enable_if_t, std::is_pointer> stable_pairs( resources::Hip hip_res, - ::RAJA::policy::hip::hip_exec, + ::RAJA::policy::hip::hip_exec, KeyIter keys_begin, KeyIter keys_end, ValIter vals_begin, @@ -588,7 +603,8 @@ stable_pairs( /*! \brief static assert unimplemented sort pairs */ -template concepts::enable_if_t, concepts::negate, camp::is_same>>>>>> unstable_pairs( resources::Hip hip_res, - ::RAJA::policy::hip::hip_exec, + ::RAJA::policy::hip::hip_exec, KeyIter, KeyIter, ValIter, @@ -624,7 +640,8 @@ unstable_pairs( /*! \brief stable sort given range of pairs in ascending order of keys */ -template concepts::enable_if_t, type_traits::is_arithmetic>, @@ -632,7 +649,7 @@ concepts::enable_if_t, std::is_pointer> unstable_pairs( resources::Hip hip_res, - ::RAJA::policy::hip::hip_exec p, + ::RAJA::policy::hip::hip_exec p, KeyIter keys_begin, KeyIter keys_end, ValIter vals_begin, @@ -644,7 +661,8 @@ unstable_pairs( /*! \brief stable sort given range of pairs in descending order of keys */ -template concepts::enable_if_t, type_traits::is_arithmetic>, @@ -652,7 +670,7 @@ concepts::enable_if_t, std::is_pointer> unstable_pairs( resources::Hip hip_res, - ::RAJA::policy::hip::hip_exec p, + ::RAJA::policy::hip::hip_exec p, KeyIter keys_begin, KeyIter keys_end, ValIter vals_begin, diff --git a/include/RAJA/policy/loop.hpp b/include/RAJA/policy/loop.hpp deleted file mode 100644 index 2cd9525dcd..0000000000 --- a/include/RAJA/policy/loop.hpp +++ /dev/null @@ -1,35 +0,0 @@ -/*! -****************************************************************************** -* -* \file -* -* \brief Header file containing RAJA headers for sequential execution. -* -* These methods work on all platforms. -* -****************************************************************************** -*/ - -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC -// and RAJA project contributors. See the RAJA/LICENSE file for details. 
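The file deletions starting here retire the deprecated `loop` policy headers. As the removed policy.hpp further below makes explicit, every `loop_*` name was already a plain alias for its `seq_*` counterpart, so downstream code should only need a rename. A minimal migration sketch, assuming "RAJA/RAJA.hpp" is included and that x, y, a, N are placeholder user data:

    // before this patch: RAJA::forall<RAJA::loop_exec>(RAJA::RangeSegment(0, N), ...);
    RAJA::forall<RAJA::seq_exec>(RAJA::RangeSegment(0, N),
        [=](int i) { y[i] = a * x[i] + y[i]; });  // same behavior: loop_exec was an alias for seq_exec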
-// -// SPDX-License-Identifier: (BSD-3-Clause) -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - -#ifndef RAJA_loop_HPP -#define RAJA_loop_HPP - -#if !defined(RAJA_ENABLE_DESUL_ATOMICS) - #include "RAJA/policy/sequential/atomic.hpp" -#endif - -#include "RAJA/policy/sequential/forall.hpp" -#include "RAJA/policy/sequential/kernel.hpp" -#include "RAJA/policy/loop/policy.hpp" -#include "RAJA/policy/sequential/scan.hpp" -#include "RAJA/policy/sequential/sort.hpp" -#include "RAJA/policy/sequential/launch.hpp" -#include "RAJA/policy/sequential/WorkGroup.hpp" - -#endif // closing endif for header file include guard diff --git a/include/RAJA/policy/loop/policy.hpp b/include/RAJA/policy/loop/policy.hpp deleted file mode 100644 index 1bf34250bb..0000000000 --- a/include/RAJA/policy/loop/policy.hpp +++ /dev/null @@ -1,87 +0,0 @@ -/*! - ****************************************************************************** - * - * \file - * - * \brief Header file containing RAJA sequential policy definitions. - * - ****************************************************************************** - */ - -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC -// and RAJA project contributors. See the RAJA/LICENSE file for details. -// -// SPDX-License-Identifier: (BSD-3-Clause) -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - -#ifndef policy_loop_HPP -#define policy_loop_HPP - -#include "RAJA/policy/PolicyBase.hpp" - -#include "RAJA/policy/sequential/policy.hpp" - -namespace RAJA -{ -namespace policy -{ -namespace loop -{ - -// -////////////////////////////////////////////////////////////////////// -// -// Execution policies -// -////////////////////////////////////////////////////////////////////// -// - -/// -/// Segment execution policies -/// - -using loop_exec = seq_exec; - -/// -/// Index set segment iteration policies -/// -using loop_segit = seq_exec; - -/// -/// WorkGroup execution policies -/// -using loop_work = seq_work; - -/// -/////////////////////////////////////////////////////////////////////// -/// -/// Reduction execution policies -/// -/////////////////////////////////////////////////////////////////////// -/// -using loop_reduce = seq_reduce; - - -/// -/////////////////////////////////////////////////////////////////////// -/// -/// Atomic execution policies -/// -/////////////////////////////////////////////////////////////////////// -/// -using loop_atomic = seq_atomic; - -} // end namespace loop - -} // end namespace policy - -using policy::loop::loop_atomic; -using policy::loop::loop_exec; -using policy::loop::loop_reduce; -using policy::loop::loop_segit; -using policy::loop::loop_work; - -} // namespace RAJA - -#endif diff --git a/include/RAJA/policy/sycl/launch.hpp b/include/RAJA/policy/sycl/launch.hpp index 0dffee6a21..9176444cd4 100644 --- a/include/RAJA/policy/sycl/launch.hpp +++ b/include/RAJA/policy/sycl/launch.hpp @@ -56,13 +56,13 @@ struct LaunchExecute> { // Compute the number of blocks and threads // - const ::sycl::range<3> blockSize(params.threads.value[0], + const ::sycl::range<3> blockSize(params.threads.value[2], params.threads.value[1], - params.threads.value[2]); + params.threads.value[0]); - const ::sycl::range<3> gridSize(params.threads.value[0] * params.teams.value[0], + const ::sycl::range<3> gridSize(params.threads.value[2] * params.teams.value[2], params.threads.value[1] * params.teams.value[1], - 
params.threads.value[2] * params.teams.value[2]); + params.threads.value[0] * params.teams.value[0]); // Only launch kernel if we have something to iterate over constexpr size_t zero = 0; @@ -138,13 +138,13 @@ struct LaunchExecute> { // Compute the number of blocks and threads // - const ::sycl::range<3> blockSize(params.threads.value[0], + const ::sycl::range<3> blockSize(params.threads.value[2], params.threads.value[1], - params.threads.value[2]); + params.threads.value[0]); - const ::sycl::range<3> gridSize(params.threads.value[0] * params.teams.value[0], + const ::sycl::range<3> gridSize(params.threads.value[2] * params.teams.value[2], params.threads.value[1] * params.teams.value[1], - params.threads.value[2] * params.teams.value[2]); + params.threads.value[0] * params.teams.value[0]); // Only launch kernel if we have something to iterate over constexpr size_t zero = 0; diff --git a/include/RAJA/util/Operators.hpp b/include/RAJA/util/Operators.hpp index d76b862c22..b4249e7182 100644 --- a/include/RAJA/util/Operators.hpp +++ b/include/RAJA/util/Operators.hpp @@ -42,9 +42,20 @@ namespace operators namespace detail { +// truly associative (does not include fp add/multiply) struct associative_tag { }; +// associative up to floating point rounding differences +struct fp_associative_tag : associative_tag { +}; + +// get associativity tag appropriate for the type +template < typename T > +using associative_or_fp_associative_tag = + std::conditional_t>::value, + fp_associative_tag, associative_tag>; + template struct binary_function { using first_argument_type = Arg1; @@ -327,7 +338,7 @@ static_assert(check(), template struct plus : public detail::binary_function, - detail::associative_tag { + detail::associative_or_fp_associative_tag { RAJA_HOST_DEVICE constexpr Ret operator()(const Arg1& lhs, const Arg2& rhs) const { @@ -347,7 +358,7 @@ struct minus : public detail::binary_function { template struct multiplies : public detail::binary_function, - detail::associative_tag { + detail::associative_or_fp_associative_tag { RAJA_HOST_DEVICE constexpr Ret operator()(const Arg1& lhs, const Arg2& rhs) const @@ -569,6 +580,12 @@ struct is_associative { std::is_base_of::value; }; +template +struct is_fp_associative { + static constexpr const bool value = + std::is_base_of::value; +}; + template struct safe_plus : public plus + // for RAJA::reduce::detail::ValueLoc #include "RAJA/pattern/detail/reduce.hpp" +#include "RAJA/util/types.hpp" namespace RAJA { @@ -38,18 +41,37 @@ namespace detail */ template > + RAJA::basic_mempool::generic_allocator>, + typename accessor = DefaultAccessor > class SoAPtr { - using value_type = T; + template < typename, typename, typename > + friend class SoAPtr; // friend other instantiations of this class public: + using value_type = T; + + template < typename rhs_accessor > + using rebind_accessor = SoAPtr; + SoAPtr() = default; + SoAPtr(SoAPtr const&) = default; + SoAPtr(SoAPtr &&) = default; + SoAPtr& operator=(SoAPtr const&) = default; + SoAPtr& operator=(SoAPtr &&) = default; + explicit SoAPtr(size_t size) : mem(mempool::getInstance().template malloc(size)) { } + template < typename rhs_accessor, + std::enable_if_t::value>* = nullptr > + RAJA_HOST_DEVICE + explicit SoAPtr(SoAPtr const& rhs) + : mem(rhs.mem) + { } + SoAPtr& allocate(size_t size) { mem = mempool::getInstance().template malloc(size); @@ -65,8 +87,8 @@ class SoAPtr RAJA_HOST_DEVICE bool allocated() const { return mem != nullptr; } - RAJA_HOST_DEVICE value_type get(size_t i) const { return mem[i]; } - 
RAJA_HOST_DEVICE void set(size_t i, value_type val) { mem[i] = val; } + RAJA_HOST_DEVICE value_type get(size_t i) const { return accessor::get(mem, i); } + RAJA_HOST_DEVICE void set(size_t i, value_type val) { accessor::set(mem, i, val); } private: value_type* mem = nullptr; @@ -75,21 +97,41 @@ class SoAPtr /*! * @brief Specialization for RAJA::reduce::detail::ValueLoc. */ -template -class SoAPtr, mempool> +template +class SoAPtr, mempool, accessor> { - using value_type = RAJA::reduce::detail::ValueLoc; using first_type = T; using second_type = IndexType; + template < typename, typename, typename > + friend class SoAPtr; // fiend other instantiations of this class + public: + using value_type = RAJA::reduce::detail::ValueLoc; + + template < typename rhs_accessor > + using rebind_accessor = SoAPtr; + SoAPtr() = default; + SoAPtr(SoAPtr const&) = default; + SoAPtr(SoAPtr &&) = default; + SoAPtr& operator=(SoAPtr const&) = default; + SoAPtr& operator=(SoAPtr &&) = default; + explicit SoAPtr(size_t size) : mem(mempool::getInstance().template malloc(size)), mem_idx(mempool::getInstance().template malloc(size)) { } + template < typename rhs_accessor, + std::enable_if_t::value>* = nullptr > + RAJA_HOST_DEVICE + explicit SoAPtr(SoAPtr const& rhs) + : mem(rhs.mem) + , mem_idx(rhs.mem_idx) + { } + SoAPtr& allocate(size_t size) { mem = mempool::getInstance().template malloc(size); @@ -110,12 +152,12 @@ class SoAPtr, mempool> RAJA_HOST_DEVICE value_type get(size_t i) const { - return value_type(mem[i], mem_idx[i]); + return value_type(accessor::get(mem, i), accessor::get(mem_idx, i)); } RAJA_HOST_DEVICE void set(size_t i, value_type val) { - mem[i] = val; - mem_idx[i] = val.getLoc(); + accessor::set(mem, i, first_type(val)); + accessor::set(mem_idx, i, val.getLoc()); } private: diff --git a/include/RAJA/util/basic_mempool.hpp b/include/RAJA/util/basic_mempool.hpp index 61624e0725..f0208ccbd3 100644 --- a/include/RAJA/util/basic_mempool.hpp +++ b/include/RAJA/util/basic_mempool.hpp @@ -309,6 +309,7 @@ class MemPool } + /// Free all backing allocations, even if they are currently in use void free_chunks() { #if defined(RAJA_ENABLE_OPENMP) diff --git a/include/RAJA/util/for_each.hpp b/include/RAJA/util/for_each.hpp new file mode 100644 index 0000000000..b279ec29ff --- /dev/null +++ b/include/RAJA/util/for_each.hpp @@ -0,0 +1,95 @@ +/*! +****************************************************************************** +* +* \file +* +* \brief Header file providing RAJA for_each templates. +* +****************************************************************************** +*/ + +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#ifndef RAJA_util_for_each_HPP +#define RAJA_util_for_each_HPP + +#include "RAJA/config.hpp" + +#include +#include + +#include "camp/list.hpp" + +#include "RAJA/pattern/detail/algorithm.hpp" + +#include "RAJA/util/macros.hpp" +#include "RAJA/util/types.hpp" + +namespace RAJA +{ + +namespace detail +{ + +// runtime loop applying func to each element in the range in order +template +RAJA_HOST_DEVICE RAJA_INLINE +UnaryFunc for_each(Iter begin, Iter end, UnaryFunc func) +{ + for (; begin != end; ++begin) { + func(*begin); + } + + return func; +} + +// compile time expansion applying func to a each type in the list in order +template +RAJA_HOST_DEVICE RAJA_INLINE +UnaryFunc for_each_type(camp::list const&, UnaryFunc func) +{ + // braced init lists are evaluated in order + int seq_unused_array[] = {0, (func(Ts{}), 0)...}; + RAJA_UNUSED_VAR(seq_unused_array); + + return func; +} + +} // namespace detail + + +/*! + \brief Apply func to all the elements in the given range in order + using a sequential for loop in O(N) operations and O(1) extra memory + see https://en.cppreference.com/w/cpp/algorithm/for_each +*/ +template +RAJA_HOST_DEVICE RAJA_INLINE +concepts::enable_if_t> + for_each(Container&& c, UnaryFunc func) +{ + using std::begin; + using std::end; + + return detail::for_each(begin(c), end(c), std::move(func)); +} + +/*! + \brief Apply func to each type in the given list in order + using a compile-time expansion in O(N) operations and O(1) extra memory +*/ +template +RAJA_HOST_DEVICE RAJA_INLINE +UnaryFunc for_each_type(camp::list const& c, UnaryFunc func) +{ + return detail::for_each_type(c, std::move(func)); +} + +} // namespace RAJA + +#endif diff --git a/include/RAJA/util/macros.hpp b/include/RAJA/util/macros.hpp index fc83f8999b..55e90010d8 100644 --- a/include/RAJA/util/macros.hpp +++ b/include/RAJA/util/macros.hpp @@ -56,6 +56,8 @@ #define RAJA_HOST __host__ #define RAJA_SUPPRESS_HD_WARN +#define RAJA_USE_HIP_INTRINSICS + #else #define RAJA_HOST_DEVICE @@ -64,6 +66,13 @@ #define RAJA_SUPPRESS_HD_WARN #endif + +#if defined(__has_builtin) +#define RAJA_INTERNAL_CLANG_HAS_BUILTIN(x) __has_builtin(x) +#else +#define RAJA_INTERNAL_CLANG_HAS_BUILTIN(x) 0 +#endif + /*! ******************************************************************************* * \def RAJA_USED_ARG(x) diff --git a/include/RAJA/util/math.hpp b/include/RAJA/util/math.hpp new file mode 100644 index 0000000000..36c7cca1a0 --- /dev/null +++ b/include/RAJA/util/math.hpp @@ -0,0 +1,75 @@ +/*! +****************************************************************************** +* +* \file +* +* \brief Header file providing RAJA math templates. +* +****************************************************************************** +*/ + +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#ifndef RAJA_util_math_HPP +#define RAJA_util_math_HPP + +#include "RAJA/config.hpp" + +#include +#include + +namespace RAJA +{ + +/*! + \brief evaluate log base 2 of n + + For positive n calculate log base 2 of n, and round the result down to the + nearest integer. 
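For reference, the two helpers added in for_each.hpp above can be used as sketched below; the std::array, the camp::list contents, and the lambdas are illustrative rather than taken from this patch:

    #include "RAJA/util/for_each.hpp"
    #include "camp/list.hpp"
    #include <array>
    #include <iostream>

    void for_each_demo()
    {
      std::array<int, 3> a{{1, 2, 3}};
      // runtime loop: applies the lambda to each element in order
      RAJA::for_each(a, [](int x) { std::cout << x << "\n"; });

      // compile-time expansion: the lambda is called with a value-initialized
      // object of each type in the list, in order
      RAJA::for_each_type(camp::list<int, long, double>{}, [](auto t) {
        std::cout << sizeof(t) << "\n";
      });
    }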
+ For zero or negative n return 0 + +*/ +template < typename T, + std::enable_if_t::value>* = nullptr > +RAJA_HOST_DEVICE RAJA_INLINE +constexpr T log2(T n) noexcept +{ + T result = 0; + if (n > 0) { + while(n >>= 1) { + ++result; + } + } + return result; +} + +/*! + \brief "round up" to the next greatest power of 2 + + For a integer n, + if n is non-negative, + if n is a power of 2, return n + if n is not a power of 2, return the next greater power of 2 + if n is negative, return 0 +*/ +template < typename T, + std::enable_if_t::value>* = nullptr > +RAJA_HOST_DEVICE +constexpr T next_pow2(T n) noexcept +{ + --n; + for (size_t s = 1; s < CHAR_BIT*sizeof(T); s *= 2) { + n |= n >> s; + } + ++n; + return n; +} + +} // namespace RAJA + +#endif diff --git a/include/RAJA/util/reduce.hpp b/include/RAJA/util/reduce.hpp new file mode 100644 index 0000000000..6d0c28f861 --- /dev/null +++ b/include/RAJA/util/reduce.hpp @@ -0,0 +1,400 @@ +/*! +****************************************************************************** +* +* \file +* +* \brief Header file providing RAJA sort templates. +* +****************************************************************************** +*/ + +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#ifndef RAJA_util_reduce_HPP +#define RAJA_util_reduce_HPP + +#include "RAJA/config.hpp" + +#include +#include +#include +#include + +#include "RAJA/pattern/detail/algorithm.hpp" + +#include "RAJA/util/macros.hpp" +#include "RAJA/util/concepts.hpp" +#include "RAJA/util/math.hpp" +#include "RAJA/util/Operators.hpp" + +namespace RAJA +{ + +namespace detail +{ + +/*! + \brief Reduce class that does a reduction with a left fold. +*/ +template +struct LeftFoldReduce +{ + RAJA_HOST_DEVICE RAJA_INLINE + constexpr explicit LeftFoldReduce(T init = BinaryOp::identity(), + BinaryOp op = BinaryOp{}) noexcept + : m_op(std::move(op)) + , m_accumulated_value(std::move(init)) + { + + } + + LeftFoldReduce(LeftFoldReduce const&) = delete; + LeftFoldReduce& operator=(LeftFoldReduce const&) = delete; + LeftFoldReduce(LeftFoldReduce &&) = delete; + LeftFoldReduce& operator=(LeftFoldReduce &&) = delete; + + ~LeftFoldReduce() = default; + + + /*! + \brief reset the combined value of the reducer to the identity + */ + RAJA_HOST_DEVICE RAJA_INLINE + void clear() noexcept + { + m_accumulated_value = BinaryOp::identity(); + } + + /*! + \brief return the combined value and clear the reducer + */ + RAJA_HOST_DEVICE RAJA_INLINE + T get_and_clear() + { + T accumulated_value = std::move(m_accumulated_value); + + clear(); + + return accumulated_value; + } + + /*! + \brief return the combined value + */ + RAJA_HOST_DEVICE RAJA_INLINE + T get() + { + return m_accumulated_value; + } + + /*! + \brief combine a value into the reducer + */ + RAJA_HOST_DEVICE RAJA_INLINE + void combine(T val) + { + m_accumulated_value = m_op(std::move(m_accumulated_value), std::move(val)); + } + +private: + BinaryOp m_op; + T m_accumulated_value; +}; + +/*! + \brief Reduce class that does a reduction with a binary tree. 
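A few compile-time checks make the behavior of the math.hpp helpers above concrete; these asserts are a sketch, not part of the patch, and assume an integral argument type:

    #include "RAJA/util/math.hpp"

    static_assert(RAJA::log2(10) == 3, "rounds down to the nearest integer");
    static_assert(RAJA::log2(0) == 0, "zero and negative inputs give 0");
    static_assert(RAJA::next_pow2(10) == 16, "rounds up to the next power of 2");
    static_assert(RAJA::next_pow2(8) == 8, "powers of 2 are returned unchanged");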
+*/ +template +struct BinaryTreeReduce +{ + static_assert(std::is_unsigned::value, "SizeType must be unsigned"); + static_assert(t_num_levels <= CHAR_BIT*sizeof(SizeType), "SizeType must be large enough to act at a bitset for num_levels"); + + static constexpr SizeType num_levels = t_num_levels; + + RAJA_HOST_DEVICE RAJA_INLINE + constexpr explicit BinaryTreeReduce(T init = BinaryOp::identity(), + BinaryOp op = BinaryOp{}) noexcept + : m_op(std::move(op)) + { + combine(std::move(init)); + } + + BinaryTreeReduce(BinaryTreeReduce const&) = delete; + BinaryTreeReduce& operator=(BinaryTreeReduce const&) = delete; + BinaryTreeReduce(BinaryTreeReduce &&) = delete; + BinaryTreeReduce& operator=(BinaryTreeReduce &&) = delete; + + RAJA_HOST_DEVICE RAJA_INLINE + ~BinaryTreeReduce() + { + clear(); + } + + + /*! + \brief reset the combined value of the reducer to the identity + */ + RAJA_HOST_DEVICE RAJA_INLINE + void clear() noexcept + { + // destroy all values on the tree stack and reset count to 0 + for (SizeType level = 0, mask = 1; m_count; ++level, mask <<= 1) { + + if (m_count & mask) { + + get_value(level)->~T(); + + m_count ^= mask; + + } + } + } + + /*! + \brief return the combined value and clear the reducer + */ + RAJA_HOST_DEVICE RAJA_INLINE + T get_and_clear() + { + // accumulate all values + T value = BinaryOp::identity(); + + for (SizeType level = 0, mask = 1; m_count; ++level, mask <<= 1) { + + if (m_count & mask) { + + value = m_op(std::move(value), std::move(*get_value(level))); + get_value(level)->~T(); + + m_count ^= mask; + } + } + + return value; + } + + /*! + \brief return the combined value + */ + RAJA_HOST_DEVICE RAJA_INLINE + T get() + { + // accumulate all values + T value = BinaryOp::identity(); + + for (SizeType count = m_count, level = 0, mask = 1; count; ++level, mask <<= 1) { + + if (count & mask) { + + value = m_op(std::move(value), *get_value(level)); + + count ^= mask; + } + } + + return value; + } + + /*! + \brief combine a value into the reducer + */ + RAJA_HOST_DEVICE RAJA_INLINE + void combine(T value) + { + // accumulate values and store in the first unused level found + // clear values from used levels along the way + SizeType level = 0; + for (SizeType mask = 1; m_count & mask; ++level, mask <<= 1) { + + value = m_op(std::move(*get_value(level)), std::move(value)); + get_value(level)->~T(); + + } + + new(get_storage(level)) T(std::move(value)); + + ++m_count; + } + +private: + BinaryOp m_op; + + // A counter of the number of inputs combined. + // The bits of count indicate which levels of tree stack have a value + SizeType m_count = 0; + + // Each level in tree stack has a value that holds the accumulation of 2^level + // values or is unused and has no value. + std::aligned_storage_t m_tree_stack[num_levels]; + + RAJA_HOST_DEVICE RAJA_INLINE + void* get_storage(SizeType level) + { + return &m_tree_stack[level]; + } + + RAJA_HOST_DEVICE RAJA_INLINE + T* get_value(SizeType level) + { +#if __cplusplus >= 201703L && !defined(RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE) + // TODO: check that launder is supported in device code + return std::launder(reinterpret_cast(&m_tree_stack[level])); +#else + return reinterpret_cast(&m_tree_stack[level]); +#endif + } +}; + + +template +using HighAccuracyReduce = std::conditional_t< + RAJA::operators::is_fp_associative::value, + BinaryTreeReduce, + LeftFoldReduce>; + + +/*! 
+ \brief Combine into a single value using a left fold with the given + operation using O(N) operations and O(1) memory +*/ +template +RAJA_HOST_DEVICE RAJA_INLINE +T left_fold_reduce(Iter begin, + Iter end, + T init, + BinaryOp op) +{ + LeftFoldReduce reducer(std::move(init), std::move(op)); + + for (; begin != end; ++begin) { + + reducer.combine(*begin); + + } + + return reducer.get_and_clear(); +} + +/*! + \brief reduce using a binary tree with the given operation + and using O(N) operations and O(lg(n)) memory + + This is more accurate than sequentially adding into a single value for + floating point types. +*/ +template +RAJA_HOST_DEVICE RAJA_INLINE +T binary_tree_reduce(Iter begin, + Iter end, + T init, + BinaryOp op) +{ + using std::distance; + using SizeType = std::make_unsigned_t; + BinaryTreeReduce reducer(std::move(init), std::move(op)); + + for (; begin != end; ++begin) { + + reducer.combine(*begin); + + } + + return reducer.get_and_clear(); +} + + +/*! + \brief reducer that uses a high accuracy implementation when round-off error + is a concern, or a faster algorithm with it is not a concern +*/ +template +RAJA_HOST_DEVICE RAJA_INLINE +T high_accuracy_reduce(Iter begin, + Iter end, + T init, + BinaryOp op) +{ + HighAccuracyReduce reducer(std::move(init), std::move(op)); + + for (; begin != end; ++begin) { + + reducer.combine(*begin); + + } + + return reducer.get_and_clear(); +} + +} // namespace detail + +/*! + \brief Accumulate given range to a single value + using a left fold algorithm in O(N) operations and O(1) extra memory + see https://en.cppreference.com/w/cpp/algorithm/accumulate +*/ +template , + typename BinaryOp = operators::plus> +RAJA_HOST_DEVICE RAJA_INLINE +concepts::enable_if_t> + accumulate(Container&& c, T init = BinaryOp::identity(), BinaryOp op = BinaryOp{}) +{ + using std::begin; + using std::end; + static_assert(type_traits::is_binary_function::value, + "BinaryOp must model BinaryFunction"); + + return detail::left_fold_reduce(begin(c), end(c), std::move(init), std::move(op)); +} + +/*! + \brief Reduce given range to a single value + using a binary tree algorithm in O(N) operations and O(lg(N)) extra memory + see https://en.cppreference.com/w/cpp/algorithm/reduce +*/ +template , + typename BinaryOp = operators::plus> +RAJA_HOST_DEVICE RAJA_INLINE +concepts::enable_if_t> + binary_tree_reduce(Container&& c, T init = BinaryOp::identity(), BinaryOp op = BinaryOp{}) +{ + using std::begin; + using std::end; + static_assert(type_traits::is_binary_function::value, + "BinaryOp must model BinaryFunction"); + + return detail::binary_tree_reduce(begin(c), end(c), std::move(init), std::move(op)); +} + +/*! 
+ \brief Reduce given range to a single value + using an algorithm with high accuracy when floating point round off is a + concern + see https://en.cppreference.com/w/cpp/algorithm/reduce +*/ +template , + typename BinaryOp = operators::plus> +RAJA_HOST_DEVICE RAJA_INLINE +concepts::enable_if_t> + high_accuracy_reduce(Container&& c, T init = BinaryOp::identity(), BinaryOp op = BinaryOp{}) +{ + using std::begin; + using std::end; + static_assert(type_traits::is_binary_function::value, + "BinaryOp must model BinaryFunction"); + + return detail::high_accuracy_reduce(begin(c), end(c), std::move(init), std::move(op)); +} + +} // namespace RAJA + +#endif diff --git a/include/RAJA/util/resource.hpp b/include/RAJA/util/resource.hpp index a54ce434a2..28a476d951 100644 --- a/include/RAJA/util/resource.hpp +++ b/include/RAJA/util/resource.hpp @@ -65,8 +65,9 @@ namespace RAJA using type = camp::resources::Cuda; }; - template - struct get_resource<::RAJA::policy::cuda::cuda_exec_explicit>{ + template + struct get_resource<::RAJA::policy::cuda::cuda_exec_explicit>{ using type = camp::resources::Cuda; }; @@ -75,8 +76,9 @@ namespace RAJA using type = camp::resources::Cuda; }; - template - struct get_resource>>{ + template + struct get_resource>>{ using type = camp::resources::Cuda; }; #endif @@ -87,8 +89,9 @@ namespace RAJA using type = camp::resources::Hip; }; - template - struct get_resource<::RAJA::policy::hip::hip_exec>{ + template + struct get_resource<::RAJA::policy::hip::hip_exec>{ using type = camp::resources::Hip; }; @@ -97,8 +100,9 @@ namespace RAJA using type = camp::resources::Hip; }; - template - struct get_resource>>{ + template + struct get_resource>>{ using type = camp::resources::Hip; }; #endif diff --git a/include/RAJA/util/sort.hpp b/include/RAJA/util/sort.hpp index f1eebfc282..bbec03dfe1 100644 --- a/include/RAJA/util/sort.hpp +++ b/include/RAJA/util/sort.hpp @@ -26,8 +26,8 @@ #include "RAJA/pattern/detail/algorithm.hpp" #include "RAJA/util/macros.hpp" - #include "RAJA/util/concepts.hpp" +#include "RAJA/util/math.hpp" namespace RAJA { @@ -35,23 +35,6 @@ namespace RAJA namespace detail { -/*! - \brief evaluate log base 2 of N rounded down to the nearest integer >= 0 -*/ -RAJA_HOST_DEVICE RAJA_INLINE -unsigned -ulog2(size_t N) -{ - unsigned val = 0; - - while (N > 1) { - val += 1; - N >>= 1; - } - - return val; -} - /*! 
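The three container-level entry points added in util/reduce.hpp above differ mainly in how they trade speed for floating-point round-off. A host-side usage sketch; the vector, its contents, and reduce_demo are illustrative only:

    #include "RAJA/util/reduce.hpp"
    #include <vector>

    double reduce_demo()
    {
      std::vector<double> x(1000, 0.1);          // hypothetical data
      double a = RAJA::accumulate(x);            // left fold, O(1) extra memory
      double b = RAJA::binary_tree_reduce(x);    // pairwise tree, O(lg N) extra memory, less round-off
      double c = RAJA::high_accuracy_reduce(x);  // tree reduce for FP '+', left fold otherwise
      return a + b + c;
    }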
\brief unstable partition given range inplace using predicate function and using O(N) predicate evaluations and O(1) memory @@ -426,7 +409,7 @@ intro_sort(Iter begin, auto N = end - begin; // set max depth to 2*lg(N) - unsigned max_depth = 2*detail::ulog2(N); + unsigned max_depth = 2*RAJA::log2(N); #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) // limit max_depth statically in device code to allow compiler to remove recursion diff --git a/include/RAJA/util/types.hpp b/include/RAJA/util/types.hpp index 811f681b9b..7e331ef00e 100644 --- a/include/RAJA/util/types.hpp +++ b/include/RAJA/util/types.hpp @@ -30,6 +30,9 @@ #include "camp/helpers.hpp" +#include "RAJA/util/macros.hpp" + + namespace RAJA { @@ -67,6 +70,18 @@ enum struct kernel_sync_requirement : int namespace iteration_mapping { +struct DirectBase {}; +struct LoopBase {}; +struct ContiguousLoopBase : LoopBase {}; +struct StridedLoopBase : LoopBase {}; +struct UnsizedLoopBase {}; +struct SizedLoopBase {}; +template < size_t t_max_iterations > +struct SizedLoopSpecifyingBase : SizedLoopBase +{ + static constexpr size_t max_iterations = t_max_iterations; +}; + /// /// Direct assumes the loop has enough iterations for all of the indices and /// maps directly from an iteration to an index. @@ -88,7 +103,7 @@ namespace iteration_mapping /// // 3 -> {3} /// // 4 -> {} /// -struct Direct {}; +struct Direct : DirectBase {}; /// /// Contiguousloop assumes the loop has fewer iterations than indices and @@ -115,7 +130,10 @@ struct Direct {}; /// // 1 -> {3, 4, 5} /// // 2 -> {6, 7} /// -struct Contiguousloop {}; +template < size_t max_iterations > +struct Contiguousloop : ContiguousLoopBase, + std::conditional_t<(max_iterations != named_usage::unspecified), + SizedLoopSpecifyingBase, UnsizedLoopBase> {}; /// /// StridedLoop assumes the loop has fewer iterations than indices and @@ -142,7 +160,10 @@ struct Contiguousloop {}; /// // 1 -> {1, 4, 7} /// // 2 -> {2, 5} /// -struct StridedLoop {}; +template < size_t max_iterations > +struct StridedLoop : StridedLoopBase, + std::conditional_t<(max_iterations != named_usage::unspecified), + SizedLoopSpecifyingBase, UnsizedLoopBase> {}; } // namespace iteration_mapping @@ -172,6 +193,28 @@ struct SizeList { }; +/// +/// Compile time fraction for use with integral types +/// +template +struct Fraction +{ + static_assert(denominator != int_t(0), "denominator must not be zero"); + + using inverse = Fraction; + + template < typename new_int_t > + using rebind = Fraction; + + static constexpr int_t multiply(int_t val) noexcept + { + return (val / denominator) * numerator + + (val % denominator) * numerator / denominator; + } + +}; + + /*! ****************************************************************************** * @@ -823,6 +866,98 @@ using const_UnalignedReal_ptr = ConstRestrictRealPtr; #endif + +namespace detail { + +/*! + * \brief Abstracts access to memory using normal memory accesses. + */ +struct DefaultAccessor +{ + template < typename T > + static RAJA_HOST_DEVICE RAJA_INLINE T get(T* ptr, size_t i) + { + return ptr[i]; + } + + template < typename T > + static RAJA_HOST_DEVICE RAJA_INLINE void set(T* ptr, size_t i, T val) + { + ptr[i] = val; + } +}; + + +/*! + * \brief Abstracts T into an equal or greater size array of integers whose + * size is between min_integer_type_size and max_interger_type_size inclusive. 
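RAJA::Fraction, added to types.hpp above, is the compile-time ratio that the new *_exec_occ_fraction test policies later in this patch are parameterized with; multiply() scales an integer without the overflow risk of computing val*numerator/denominator directly. A small sketch with assumed template arguments:

    #include <type_traits>
    #include "RAJA/util/types.hpp"

    using Half          = RAJA::Fraction<size_t, 1, 2>;
    using ThreeQuarters = RAJA::Fraction<size_t, 3, 4>;

    static_assert(Half::multiply(11) == 5, "(11/2)*1 + (11%2)*1/2");
    static_assert(ThreeQuarters::multiply(10) == 7, "7.5 truncated toward zero");
    static_assert(std::is_same<Half::inverse, RAJA::Fraction<size_t, 2, 1>>::value,
                  "inverse swaps numerator and denominator");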
+ */ +template +struct AsIntegerArray +{ + static_assert(min_integer_type_size <= max_integer_type_size, + "incompatible min and max integer type size"); + using integer_type = std::conditional_t< + ((alignof(T) >= alignof(unsigned long long) && + sizeof(unsigned long long) <= max_integer_type_size) || + sizeof(unsigned long) < min_integer_type_size), + unsigned long long, + std::conditional_t< + ((alignof(T) >= alignof(unsigned long) && + sizeof(unsigned long) <= max_integer_type_size) || + sizeof(unsigned int) < min_integer_type_size), + unsigned long, + std::conditional_t< + ((alignof(T) >= alignof(unsigned int) && + sizeof(unsigned int) <= max_integer_type_size) || + sizeof(unsigned short) < min_integer_type_size), + unsigned int, + std::conditional_t< + ((alignof(T) >= alignof(unsigned short) && + sizeof(unsigned short) <= max_integer_type_size) || + sizeof(unsigned char) < min_integer_type_size), + unsigned short, + std::conditional_t< + ((alignof(T) >= alignof(unsigned char) && + sizeof(unsigned char) <= max_integer_type_size)), + unsigned char, + void>>>>>; + static_assert(!std::is_same::value, + "could not find a compatible integer type"); + static_assert(sizeof(integer_type) >= min_integer_type_size, + "integer_type smaller than min integer type size"); + static_assert(sizeof(integer_type) <= max_integer_type_size, + "integer_type greater than max integer type size"); + + static constexpr size_t num_integer_type = + (sizeof(T) + sizeof(integer_type) - 1) / sizeof(integer_type); + + integer_type array[num_integer_type] = {0}; + + AsIntegerArray() = default; + + RAJA_HOST_DEVICE constexpr size_t array_size() const + { + return num_integer_type; + } + + RAJA_HOST_DEVICE constexpr T get_value() const + { + T value; + memcpy(&value, &array[0], sizeof(T)); + return value; + } + + RAJA_HOST_DEVICE constexpr void set_value(T value) + { + memcpy(&array[0], &value, sizeof(T)); + } +}; + +} // namespace detail + } // namespace RAJA #endif // closing endif for header file include guard diff --git a/scripts/radiuss-spack-configs b/scripts/radiuss-spack-configs index af75606a7f..a8d22367e0 160000 --- a/scripts/radiuss-spack-configs +++ b/scripts/radiuss-spack-configs @@ -1 +1 @@ -Subproject commit af75606a7fc0492e35cdd3860337c4e873f43124 +Subproject commit a8d22367e03d4c9c180a11886414430bdf6428a8 diff --git a/scripts/uberenv b/scripts/uberenv index 4941c237ee..cf91883ef0 160000 --- a/scripts/uberenv +++ b/scripts/uberenv @@ -1 +1 @@ -Subproject commit 4941c237eec514d6d68872243efb9f4af8843f4d +Subproject commit cf91883ef0500a808338ad6c8b56647da15fa5f3 diff --git a/test/functional/launch/shared_mem/tests/test-launch-DynamicMem.hpp b/test/functional/launch/shared_mem/tests/test-launch-DynamicMem.hpp index cdb8940256..8da7b81eb7 100644 --- a/test/functional/launch/shared_mem/tests/test-launch-DynamicMem.hpp +++ b/test/functional/launch/shared_mem/tests/test-launch-DynamicMem.hpp @@ -36,12 +36,16 @@ void LaunchDynamicMemTestImpl(INDEX_TYPE block_range, INDEX_TYPE thread_range) for(s_type b=0; b (RAJA::LaunchParams(RAJA::Teams(RAJA::stripIndexType(block_range)), RAJA::Threads(RAJA::stripIndexType(thread_range)), shared_mem_size), @@ -52,7 +56,11 @@ void LaunchDynamicMemTestImpl(INDEX_TYPE block_range, INDEX_TYPE thread_range) INDEX_TYPE * tile_ptr = ctx.getSharedMemory(RAJA::stripIndexType(thread_range)); RAJA::View> Tile(tile_ptr, RAJA::stripIndexType(thread_range)); + int * int_tile_ptr = ctx.getSharedMemory(RAJA::stripIndexType(thread_range)); + RAJA::View> Int_Tile(int_tile_ptr, 
RAJA::stripIndexType(thread_range)); + RAJA::loop(ctx, inner_range, [&](INDEX_TYPE tid) { + Int_Tile(RAJA::stripIndexType(tid)) = RAJA::stripIndexType(tid); Tile(RAJA::stripIndexType(thread_range)-RAJA::stripIndexType(tid)-1) = thread_range-tid-1 + thread_range*bid; }); @@ -60,7 +68,7 @@ void LaunchDynamicMemTestImpl(INDEX_TYPE block_range, INDEX_TYPE thread_range) RAJA::loop(ctx, inner_range, [&](INDEX_TYPE tid) { INDEX_TYPE idx = tid + thread_range * bid; - working_array[RAJA::stripIndexType(idx)] = Tile(RAJA::stripIndexType(tid)); + working_array[RAJA::stripIndexType(idx)] = Tile(RAJA::stripIndexType(tid)) + Int_Tile(RAJA::stripIndexType(tid)); }); ctx.releaseSharedMemory(); diff --git a/test/include/RAJA_test-forall-execpol.hpp b/test/include/RAJA_test-forall-execpol.hpp index 2fe790ff93..40adaccc8c 100644 --- a/test/include/RAJA_test-forall-execpol.hpp +++ b/test/include/RAJA_test-forall-execpol.hpp @@ -108,7 +108,9 @@ using OpenMPTargetForallAtomicExecPols = OpenMPTargetForallExecPols; using CudaForallExecPols = camp::list< RAJA::cuda_exec<128>, RAJA::cuda_exec_occ_calc<256>, RAJA::cuda_exec_grid<256, 64>, - RAJA::cuda_exec_explicit<256,2> >; + RAJA::cuda_exec_explicit<256,2>, + RAJA::cuda_exec_occ_fraction<256, RAJA::Fraction>, + RAJA::cuda_exec_occ_custom<256, RAJA::CudaAvoidDeviceMaxThreadOccupancyConcretizer> >; using CudaForallReduceExecPols = CudaForallExecPols; @@ -119,7 +121,9 @@ using CudaForallAtomicExecPols = CudaForallExecPols; #if defined(RAJA_ENABLE_HIP) using HipForallExecPols = camp::list< RAJA::hip_exec<128>, RAJA::hip_exec_occ_calc<256>, - RAJA::hip_exec_grid<256, 64> >; + RAJA::hip_exec_grid<256, 64>, + RAJA::hip_exec_occ_fraction<256, RAJA::Fraction>, + RAJA::hip_exec_occ_custom<256, RAJA::HipAvoidDeviceMaxThreadOccupancyConcretizer> >; using HipForallReduceExecPols = HipForallExecPols; diff --git a/test/include/RAJA_test-launch-direct-teams-threads-1D-execpol.hpp b/test/include/RAJA_test-launch-direct-teams-threads-1D-execpol.hpp index 5b5dfdbebf..7179e48fdc 100644 --- a/test/include/RAJA_test-launch-direct-teams-threads-1D-execpol.hpp +++ b/test/include/RAJA_test-launch-direct-teams-threads-1D-execpol.hpp @@ -81,8 +81,8 @@ using Hip_launch_policies = camp::list; using sycl_direct_policies = camp::list< RAJA::LaunchPolicy>, - RAJA::LoopPolicy, - RAJA::LoopPolicy + RAJA::LoopPolicy, + RAJA::LoopPolicy >; using Sycl_launch_policies = camp::list; diff --git a/test/include/RAJA_test-launch-direct-teams-threads-3D-execpol.hpp b/test/include/RAJA_test-launch-direct-teams-threads-3D-execpol.hpp index 38bc4c8bb0..f84823e414 100644 --- a/test/include/RAJA_test-launch-direct-teams-threads-3D-execpol.hpp +++ b/test/include/RAJA_test-launch-direct-teams-threads-3D-execpol.hpp @@ -100,12 +100,12 @@ using Hip_launch_policies = camp::list; using sycl_direct_policies = camp::list< RAJA::LaunchPolicy>, - RAJA::LoopPolicy, + RAJA::LoopPolicy, //slowest RAJA::LoopPolicy, - RAJA::LoopPolicy, - RAJA::LoopPolicy, + RAJA::LoopPolicy, //fastest + RAJA::LoopPolicy, RAJA::LoopPolicy, - RAJA::LoopPolicy + RAJA::LoopPolicy >; using Sycl_launch_policies = camp::list; diff --git a/test/include/RAJA_test-launch-execpol.hpp b/test/include/RAJA_test-launch-execpol.hpp index 9961cd0741..fea90a8305 100644 --- a/test/include/RAJA_test-launch-execpol.hpp +++ b/test/include/RAJA_test-launch-execpol.hpp @@ -68,7 +68,7 @@ using Hip_launch_policies = camp::list< using sycl_policies = camp::list< RAJA::LaunchPolicy>, - RAJA::LoopPolicy>; + RAJA::LoopPolicy>; using Sycl_launch_policies = camp::list< 
sycl_policies diff --git a/test/include/RAJA_test-launch-loop-teams-threads-1D-execpol.hpp b/test/include/RAJA_test-launch-loop-teams-threads-1D-execpol.hpp index 9e5779853c..6173fc6ffa 100644 --- a/test/include/RAJA_test-launch-loop-teams-threads-1D-execpol.hpp +++ b/test/include/RAJA_test-launch-loop-teams-threads-1D-execpol.hpp @@ -75,8 +75,8 @@ using Hip_launch_policies = camp::list< #if defined(RAJA_ENABLE_SYCL) using sycl_loop_policies = camp::list< RAJA::LaunchPolicy>, - RAJA::LoopPolicy, - RAJA::LoopPolicy + RAJA::LoopPolicy, + RAJA::LoopPolicy >; using Sycl_launch_policies = camp::list< diff --git a/test/include/RAJA_test-launch-loop-teams-threads-3D-execpol.hpp b/test/include/RAJA_test-launch-loop-teams-threads-3D-execpol.hpp index 9d217757b2..d703216a13 100644 --- a/test/include/RAJA_test-launch-loop-teams-threads-3D-execpol.hpp +++ b/test/include/RAJA_test-launch-loop-teams-threads-3D-execpol.hpp @@ -95,12 +95,12 @@ using Hip_launch_policies = camp::list< #if defined(RAJA_ENABLE_SYCL) using sycl_loop_policies = camp::list< RAJA::LaunchPolicy>, - RAJA::LoopPolicy, + RAJA::LoopPolicy, //slowest index RAJA::LoopPolicy, - RAJA::LoopPolicy, - RAJA::LoopPolicy, + RAJA::LoopPolicy, //fastest index + RAJA::LoopPolicy, RAJA::LoopPolicy, - RAJA::LoopPolicy + RAJA::LoopPolicy >; using Sycl_launch_policies = camp::list< diff --git a/test/include/RAJA_test-launch-runtime-execpol.hpp b/test/include/RAJA_test-launch-runtime-execpol.hpp index bec07358e6..fa2b39f761 100644 --- a/test/include/RAJA_test-launch-runtime-execpol.hpp +++ b/test/include/RAJA_test-launch-runtime-execpol.hpp @@ -52,8 +52,8 @@ using Sequential_launch_policies = camp::list; using seq_sycl_policies = camp::list< RAJA::LaunchPolicy>, - RAJA::LoopPolicy, - RAJA::LoopPolicy + RAJA::LoopPolicy, + RAJA::LoopPolicy >; using Sequential_launch_policies = camp::list; @@ -110,8 +110,8 @@ using OpenMP_launch_policies = camp::list; using omp_sycl_policies = camp::list< RAJA::LaunchPolicy>, - RAJA::LoopPolicy, - RAJA::LoopPolicy + RAJA::LoopPolicy, + RAJA::LoopPolicy >; using OpenMP_launch_policies = camp::list; diff --git a/test/include/RAJA_test-reducepol.hpp b/test/include/RAJA_test-reducepol.hpp index d8d5fc670b..e9e075b287 100644 --- a/test/include/RAJA_test-reducepol.hpp +++ b/test/include/RAJA_test-reducepol.hpp @@ -34,11 +34,21 @@ using OpenMPTargetReducePols = #endif #if defined(RAJA_ENABLE_CUDA) -using CudaReducePols = camp::list< RAJA::cuda_reduce >; +using CudaReducePols = camp::list< RAJA::cuda_reduce_device_fence, + RAJA::cuda_reduce_block_fence, + RAJA::cuda_reduce_atomic_device_init_device_fence, + RAJA::cuda_reduce_atomic_device_init_block_fence, + RAJA::cuda_reduce_atomic_host_init_device_fence, + RAJA::cuda_reduce_atomic_host_init_block_fence >; #endif #if defined(RAJA_ENABLE_HIP) -using HipReducePols = camp::list< RAJA::hip_reduce >; +using HipReducePols = camp::list< RAJA::hip_reduce_device_fence, + RAJA::hip_reduce_block_fence, + RAJA::hip_reduce_atomic_device_init_device_fence, + RAJA::hip_reduce_atomic_device_init_block_fence, + RAJA::hip_reduce_atomic_host_init_device_fence, + RAJA::hip_reduce_atomic_host_init_block_fence >; #endif #if defined(RAJA_ENABLE_SYCL) diff --git a/test/unit/algorithm/CMakeLists.txt b/test/unit/algorithm/CMakeLists.txt index 856e4519b6..ea93727d59 100644 --- a/test/unit/algorithm/CMakeLists.txt +++ b/test/unit/algorithm/CMakeLists.txt @@ -48,43 +48,64 @@ foreach( SORT_BACKEND ${SORT_BACKENDS} ) endforeach() -set( SEQUENTIAL_UTIL_SORTS Shell Heap Intro Merge ) -set( CUDA_UTIL_SORTS 
Shell Heap Intro ) -set( HIP_UTIL_SORTS Shell Heap Intro ) -macro(RAJA_GENERATE_ALGORITHM_UTIL_SORT_TESTS SORT_BACKEND_in SORT_SIZE_in UTIL_SORTS) - set( SORT_BACKEND ${SORT_BACKEND_in} ) - set( SORT_SIZE ${SORT_SIZE_in} ) - foreach( UTIL_SORT ${UTIL_SORTS} ) - configure_file( test-algorithm-util-sort.cpp.in - test-algorithm-util-sort-${UTIL_SORT}-${SORT_BACKEND}.cpp ) +macro(RAJA_GENERATE_ALGORITHM_UTIL_TESTS ALG ALG_BACKEND_in ALG_SIZE_in UTIL_ALGS) + set( ALG_BACKEND ${ALG_BACKEND_in} ) + set( ALG_SIZE ${ALG_SIZE_in} ) + foreach( UTIL_ALG ${UTIL_ALGS} ) + configure_file( test-algorithm-util-${ALG}.cpp.in + test-algorithm-util-${ALG}-${UTIL_ALG}-${ALG_BACKEND}.cpp ) - raja_add_test( NAME test-algorithm-util-sort-${UTIL_SORT}-${SORT_BACKEND} - SOURCES ${CMAKE_CURRENT_BINARY_DIR}/test-algorithm-util-sort-${UTIL_SORT}-${SORT_BACKEND}.cpp ) + raja_add_test( NAME test-algorithm-util-${ALG}-${UTIL_ALG}-${ALG_BACKEND} + SOURCES ${CMAKE_CURRENT_BINARY_DIR}/test-algorithm-util-${ALG}-${UTIL_ALG}-${ALG_BACKEND}.cpp ) - target_include_directories(test-algorithm-util-sort-${UTIL_SORT}-${SORT_BACKEND}.exe + target_include_directories(test-algorithm-util-${ALG}-${UTIL_ALG}-${ALG_BACKEND}.exe PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/tests) endforeach() - unset( SORT_SIZE ) - unset( SORT_BACKEND ) + unset( ALG_SIZE ) + unset( ALG_BACKEND ) endmacro() -RAJA_GENERATE_ALGORITHM_UTIL_SORT_TESTS( Sequential Default "${SEQUENTIAL_UTIL_SORTS}" ) -RAJA_GENERATE_ALGORITHM_UTIL_SORT_TESTS( Sequential Small "Insertion" ) +set( SEQUENTIAL_UTIL_SORTS Shell Heap Intro Merge ) +set( CUDA_UTIL_SORTS Shell Heap Intro ) +set( HIP_UTIL_SORTS Shell Heap Intro ) + +RAJA_GENERATE_ALGORITHM_UTIL_TESTS( sort Sequential Default "${SEQUENTIAL_UTIL_SORTS}" ) +RAJA_GENERATE_ALGORITHM_UTIL_TESTS( sort Sequential Small "Insertion" ) if(RAJA_ENABLE_CUDA) - RAJA_GENERATE_ALGORITHM_UTIL_SORT_TESTS( Cuda Small "${CUDA_UTIL_SORTS}" ) - RAJA_GENERATE_ALGORITHM_UTIL_SORT_TESTS( Cuda Tiny "Insertion" ) + RAJA_GENERATE_ALGORITHM_UTIL_TESTS( sort Cuda Small "${CUDA_UTIL_SORTS}" ) + RAJA_GENERATE_ALGORITHM_UTIL_TESTS( sort Cuda Tiny "Insertion" ) endif() if(RAJA_ENABLE_HIP) - RAJA_GENERATE_ALGORITHM_UTIL_SORT_TESTS( Hip Small "${HIP_UTIL_SORTS}" ) - RAJA_GENERATE_ALGORITHM_UTIL_SORT_TESTS( Hip Tiny "Insertion" ) + RAJA_GENERATE_ALGORITHM_UTIL_TESTS( sort Hip Small "${HIP_UTIL_SORTS}" ) + RAJA_GENERATE_ALGORITHM_UTIL_TESTS( sort Hip Tiny "Insertion" ) endif() + +set( UTIL_REDUCES BinaryTree Accumulate ) + +RAJA_GENERATE_ALGORITHM_UTIL_TESTS( reduce Sequential Default "${UTIL_REDUCES}" ) + +if(RAJA_ENABLE_CUDA) + RAJA_GENERATE_ALGORITHM_UTIL_TESTS( reduce Cuda Small "${UTIL_REDUCES}" ) +endif() + +if(RAJA_ENABLE_HIP) + RAJA_GENERATE_ALGORITHM_UTIL_TESTS( reduce Hip Small "${UTIL_REDUCES}" ) +endif() + + unset( SORT_BACKENDS ) unset( SEQUENTIAL_UTIL_SORTS ) unset( CUDA_UTIL_SORTS ) unset( HIP_UTIL_SORTS ) +unset( UTIL_REDUCES ) + + +raja_add_test( + NAME test-algorithm-util-for_each + SOURCES test-algorithm-util-for_each.cpp) diff --git a/test/unit/algorithm/test-algorithm-util-for_each.cpp b/test/unit/algorithm/test-algorithm-util-for_each.cpp new file mode 100644 index 0000000000..db918ad234 --- /dev/null +++ b/test/unit/algorithm/test-algorithm-util-for_each.cpp @@ -0,0 +1,150 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +/// +/// Source file containing unit tests for for_each +/// + +#include "RAJA_test-base.hpp" + +#include "RAJA_unit-test-types.hpp" + +#include "camp/resource.hpp" + +#include +#include +#include + +template +class ForEachUnitTest : public ::testing::Test {}; + +TYPED_TEST_SUITE(ForEachUnitTest, UnitIndexTypes); + + +TYPED_TEST(ForEachUnitTest, EmptyRange) +{ + std::vector numbers; + + std::vector copies; + RAJA::for_each(numbers, [&](TypeParam& number) { + number += 1; + copies.push_back(number); + }); + + ASSERT_EQ(copies.size(), 0); + ASSERT_EQ(numbers.size(), 0); +} + +TYPED_TEST(ForEachUnitTest, VectorRange) +{ + std::vector numbers; + for (TypeParam i = 0; i < 13; ++i) { + numbers.push_back(i); + } + + std::vector copies; + RAJA::for_each(numbers, [&](TypeParam& number) { + copies.push_back(number); + number += 1; + }); + + ASSERT_EQ(copies.size(), 13); + for (TypeParam i = 0; i < 13; ++i) { + ASSERT_EQ(numbers[i], copies[i]+1); + } +} + +TYPED_TEST(ForEachUnitTest, RajaSpanRange) +{ + std::vector numbers; + for (TypeParam i = 0; i < 11; ++i) { + numbers.push_back(i); + } + + std::vector copies; + RAJA::for_each(RAJA::make_span(numbers.data(), 11), [&](TypeParam& number) { + copies.push_back(number); + number += 1; + }); + + ASSERT_EQ(copies.size(), 11); + for (TypeParam i = 0; i < 11; ++i) { + ASSERT_EQ(numbers[i], copies[i]+1); + } +} + +TYPED_TEST(ForEachUnitTest, SetRange) +{ + std::set numbers; + for (TypeParam i = 0; i < 6; ++i) { + numbers.insert(i); + } + + std::vector copies; + RAJA::for_each(numbers, [&](TypeParam const& number) { + copies.push_back(number); + }); + + ASSERT_EQ(copies.size(), 6); + for (TypeParam i = 0; i < 6; ++i) { + ASSERT_EQ(i, copies[i]); + ASSERT_EQ(numbers.count(i), 1); + } +} + + +TYPED_TEST(ForEachUnitTest, EmptyTypeList) +{ + using numbers = camp::list<>; + + std::vector copies; + RAJA::for_each_type(numbers{}, [&](auto number) { + copies.push_back(number); + }); + + ASSERT_EQ(copies.size(), 0); +} + + +template < typename T, T val > +T get_num(std::integral_constant) +{ + return val; +} + +template < typename TypeParam, + std::enable_if_t::value>* = nullptr > +void run_int_type_test() +{ + using numbers = camp::list, + std::integral_constant, + std::integral_constant, + std::integral_constant, + std::integral_constant>; + + std::vector copies; + RAJA::for_each_type(numbers{}, [&](auto number) { + copies.push_back(get_num(number)); + }); + + ASSERT_EQ(copies.size(), 5); + for (TypeParam i = 0; i < 5; ++i) { + ASSERT_EQ(i, copies[i]); + } +} +/// +template < typename TypeParam, + std::enable_if_t::value>* = nullptr > +void run_int_type_test() +{ + // ignore non-ints +} + +TYPED_TEST(ForEachUnitTest, IntTypeList) +{ + run_int_type_test(); +} diff --git a/test/unit/algorithm/test-algorithm-util-reduce.cpp.in b/test/unit/algorithm/test-algorithm-util-reduce.cpp.in new file mode 100644 index 0000000000..d7dd20bcd2 --- /dev/null +++ b/test/unit/algorithm/test-algorithm-util-reduce.cpp.in @@ -0,0 +1,36 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +// +// test/include headers +// +#include "RAJA_test-base.hpp" +#include "RAJA_test-camp.hpp" + +// +// Header for tests in ./tests directory +// +// Note: CMake adds ./tests as an include dir for these tests. +// +#include "test-algorithm-util-reduce.hpp" + + +// +// Cartesian product of types used in parameterized tests +// +using @ALG_BACKEND@@UTIL_ALG@ReduceTypes = + Test< camp::cartesian_product<@ALG_BACKEND@@UTIL_ALG@ReduceReducers, + @ALG_BACKEND@ResourceList, + ReduceValTypeList, + ReduceMaxNList@ALG_SIZE@ > >::Types; + +// +// Instantiate parameterized test +// +INSTANTIATE_TYPED_TEST_SUITE_P( @ALG_BACKEND@, + ReduceUnitTest, + @ALG_BACKEND@@UTIL_ALG@ReduceTypes ); diff --git a/test/unit/algorithm/test-algorithm-util-sort.cpp.in b/test/unit/algorithm/test-algorithm-util-sort.cpp.in index 7dbb0dcd93..0555a9e9f0 100644 --- a/test/unit/algorithm/test-algorithm-util-sort.cpp.in +++ b/test/unit/algorithm/test-algorithm-util-sort.cpp.in @@ -22,15 +22,15 @@ // // Cartesian product of types used in parameterized tests // -using @SORT_BACKEND@@UTIL_SORT@SortTypes = - Test< camp::cartesian_product<@SORT_BACKEND@@UTIL_SORT@SortSorters, - @SORT_BACKEND@ResourceList, +using @ALG_BACKEND@@UTIL_ALG@SortTypes = + Test< camp::cartesian_product<@ALG_BACKEND@@UTIL_ALG@SortSorters, + @ALG_BACKEND@ResourceList, SortKeyTypeList, - SortMaxNList@SORT_SIZE@ > >::Types; + SortMaxNList@ALG_SIZE@ > >::Types; // // Instantiate parameterized test // -INSTANTIATE_TYPED_TEST_SUITE_P( @SORT_BACKEND@, +INSTANTIATE_TYPED_TEST_SUITE_P( @ALG_BACKEND@, SortUnitTest, - @SORT_BACKEND@@UTIL_SORT@SortTypes ); + @ALG_BACKEND@@UTIL_ALG@SortTypes ); diff --git a/test/unit/algorithm/tests/test-algorithm-reduce-utils.hpp b/test/unit/algorithm/tests/test-algorithm-reduce-utils.hpp new file mode 100644 index 0000000000..4e3f9fb795 --- /dev/null +++ b/test/unit/algorithm/tests/test-algorithm-reduce-utils.hpp @@ -0,0 +1,350 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC. +// +// Produced at the Lawrence Livermore National Laboratory +// +// LLNL-CODE-689114 +// +// All rights reserved. +// +// This file is part of RAJA. +// +// For details about use and distribution, please read RAJA/LICENSE. 
+// +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +/// +/// Header file containing test infrastructure for reduce tests +/// + +#ifndef __TEST_ALGORITHM_REDUCE_UTILS_HPP__ +#define __TEST_ALGORITHM_REDUCE_UTILS_HPP__ + +#include "RAJA_test-base.hpp" +#include "RAJA_test-camp.hpp" +#include "RAJA_test-forall-data.hpp" +#include "type_helper.hpp" +#include "RAJA_unit-test-forone.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include + + +// tag classes to differentiate reduce by attributes and apply correct testing +struct left_fold_reduce_tag { }; +struct unordered_reduce_tag { }; + +struct reduce_interface_tag { }; + +struct reduce_default_interface_tag { }; +struct reduce_init_interface_tag { }; +struct reduce_init_op_interface_tag { }; + + +// synchronize based on a RAJA execution policy +template < typename policy > +struct PolicySynchronize +{ + void synchronize() + { + // no synchronization needed + } +}; + +#if defined(RAJA_ENABLE_CUDA) +// partial specialization for cuda_exec +template < size_t BLOCK_SIZE, bool Async > +struct PolicySynchronize> +{ + void synchronize() + { + if (Async) { RAJA::synchronize(); } + } +}; +#endif + +#if defined(RAJA_ENABLE_HIP) +// partial specialization for hip_exec +template < size_t BLOCK_SIZE, bool Async > +struct PolicySynchronize> +{ + void synchronize() + { + if (Async) { RAJA::synchronize(); } + } +}; +#endif + + +template +struct ReduceData; + +template +struct ReduceData +{ + ValType* values = nullptr; + ValType* reduced_value = nullptr; + Res m_res; + + template < typename RandomGenerator > + ReduceData(size_t N, Res res, RandomGenerator gen_random) + : m_res(res) + { + if (N > 0) { + values = m_res.template allocate(N, camp::resources::MemoryAccess::Managed); + } + reduced_value = m_res.template allocate(1, camp::resources::MemoryAccess::Managed); + + for (size_t i = 0; i < N; i++) { + values[i] = gen_random(); + } + } + + void copy_data(size_t N) + { + if ( N == 0 ) return; + } + + Res resource() + { + return m_res; + } + + ReduceData(ReduceData const&) = delete; + ReduceData& operator=(ReduceData const&) = delete; + + ~ReduceData() + { + if (values != nullptr) { + m_res.deallocate(values, camp::resources::MemoryAccess::Managed); + m_res.deallocate(reduced_value, camp::resources::MemoryAccess::Managed); + } + } +}; + + +template +void doReduce(ReduceData & data, + RAJA::Index_type N, + T, + BinaryOp, + Reducer reducer, reduce_interface_tag, reduce_default_interface_tag) +{ + data.copy_data(N); + data.resource().wait(); + reducer(data.reduced_value, RAJA::make_span(data.values, N)); + reducer.synchronize(); +} + +template +void doReduce(ReduceData & data, + RAJA::Index_type N, + T init, + BinaryOp, + Reducer reducer, reduce_interface_tag, reduce_init_interface_tag) +{ + data.copy_data(N); + data.resource().wait(); + reducer(data.reduced_value, RAJA::make_span(data.values, N), init); + reducer.synchronize(); +} + +template +void doReduce(ReduceData & data, + RAJA::Index_type N, + T init, + BinaryOp op, + Reducer reducer, reduce_interface_tag, reduce_init_op_interface_tag) +{ + data.copy_data(N); + data.resource().wait(); + reducer(data.reduced_value, RAJA::make_span(data.values, N), init, op); + reducer.synchronize(); +} + + +template +::testing::AssertionResult testReduce( + const char* test_name, + const unsigned seed, + ReduceData & data, + RAJA::Index_type N, + T init, + BinaryOp op, + TestReducer test_reducer, left_fold_reduce_tag, reduce_interface_tag si, 
BinaryOpInterface ci) +{ + doReduce(data, N, init, op, test_reducer, si, ci); + + T reduced_check_value = init; + for (RAJA::Index_type i = 0; i < N; i++) { + reduced_check_value = op(std::move(reduced_check_value), data.values[i]); + } + + if (reduced_check_value != *data.reduced_value) { + return ::testing::AssertionFailure() + << test_reducer.name() << " (left fold reduce) " << test_name + << " (with N " << N << " with seed " << seed << ")" + << " incorrect " << *data.reduced_value + << ", expected " << reduced_check_value; + } + + return ::testing::AssertionSuccess(); +} + +template +::testing::AssertionResult testReduce( + const char* test_name, + const unsigned seed, + ReduceData & data, + RAJA::Index_type N, + T init, + BinaryOp op, + TestReducer test_reducer, unordered_reduce_tag, reduce_interface_tag si, BinaryOpInterface ci) +{ + doReduce(data, N, init, op, test_reducer, si, ci); + + T reduced_check_value = init; + for (RAJA::Index_type i = 0; i < N; i++) { + reduced_check_value = op(std::move(reduced_check_value), data.values[i]); + } + + if (reduced_check_value != *data.reduced_value) { + return ::testing::AssertionFailure() + << test_reducer.name() << " (unordered reduce) " << test_name + << " (with N " << N << " with seed " << seed << ")" + << " incorrect " << *data.reduced_value + << ", expected " << reduced_check_value; + } + + return ::testing::AssertionSuccess(); +} + + +template +void testReducerInterfaces(unsigned seed, RAJA::Index_type MaxN, Reducer reducer, Res res) +{ + using reduce_category = typename Reducer::reduce_category ; + using interface_category = typename Reducer::reduce_interface ; + using no_init_operator = reduce_default_interface_tag; + using init_no_operator = reduce_init_interface_tag; + using init_operator = reduce_init_op_interface_tag; + + std::mt19937 rng(seed); + RAJA::Index_type N = std::uniform_int_distribution((MaxN+1)/2, MaxN)(rng); + std::uniform_int_distribution dist(-N, N); + + ReduceData data(N, res, [&](){ return dist(rng); }); + + ASSERT_TRUE(testReduce("default", seed, data, N, RAJA::operators::plus::identity(), RAJA::operators::plus{}, + reducer, reduce_category{}, interface_category{}, no_init_operator{})); + ASSERT_TRUE(testReduce("init", seed, data, N, ValType(N), RAJA::operators::plus{}, + reducer, reduce_category{}, interface_category{}, init_no_operator{})); + ASSERT_TRUE(testReduce("minimum", seed, data, N, ValType(0), RAJA::operators::minimum{}, + reducer, reduce_category{}, interface_category{}, init_operator{})); + ASSERT_TRUE(testReduce("Maximum", seed, data, N, ValType(0), RAJA::operators::maximum{}, + reducer, reduce_category{}, interface_category{}, init_operator{})); +} + +template +void testReducer(unsigned seed, RAJA::Index_type MaxN, Reducer reducer, Res res) +{ + testReducerInterfaces(seed, 0, reducer, res); + for (RAJA::Index_type n = 1; n <= MaxN; n *= 10) { + testReducerInterfaces(seed, n, reducer, res); + } +} + +inline unsigned get_random_seed() +{ + static unsigned seed = std::random_device{}(); + return seed; +} + + +TYPED_TEST_SUITE_P(ReduceUnitTest); + +template < typename T > +class ReduceUnitTest : public ::testing::Test +{ }; + +TYPED_TEST_P(ReduceUnitTest, UnitReduce) +{ + using Reducer = typename camp::at>::type; + using ResType = typename camp::at>::type; + using ValType = typename camp::at>::type; + using MaxNType = typename camp::at>::type; + + unsigned seed = get_random_seed(); + RAJA::Index_type MaxN = MaxNType::value; + Reducer reducer{}; + ResType res = ResType::get_default(); + + 
testReducer(seed, MaxN, reducer, res); +} + +REGISTER_TYPED_TEST_SUITE_P(ReduceUnitTest, UnitReduce); + + +// +// Key types for reduce tests +// +using ReduceValTypeList = + camp::list< + RAJA::Index_type, + int, +#if defined(RAJA_TEST_EXHAUSTIVE) + unsigned, + long long, + unsigned long long, + float, +#endif + double + >; + +// Max test lengths for reduce tests +using ReduceMaxNListDefault = + camp::list< + camp::num<10000> + >; + +using ReduceMaxNListSmall = + camp::list< + camp::num<1000> + >; + +using ReduceMaxNListTiny = + camp::list< + camp::num<100> + >; + +#endif //__TEST_ALGORITHM_REDUCE_UTILS_HPP__ + diff --git a/test/unit/algorithm/tests/test-algorithm-util-reduce.hpp b/test/unit/algorithm/tests/test-algorithm-util-reduce.hpp new file mode 100644 index 0000000000..062e0f9b91 --- /dev/null +++ b/test/unit/algorithm/tests/test-algorithm-util-reduce.hpp @@ -0,0 +1,205 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC. +// +// Produced at the Lawrence Livermore National Laboratory +// +// LLNL-CODE-689114 +// +// All rights reserved. +// +// This file is part of RAJA. +// +// For details about use and distribution, please read RAJA/LICENSE. +// +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +/// +/// Header file containing Reducer classes for util reduce tests +/// + +#ifndef __TEST_ALGORITHM_UTIL_REDUCE_HPP__ +#define __TEST_ALGORITHM_UTIL_REDUCE_HPP__ + +#include "test-algorithm-reduce-utils.hpp" + + +template < typename test_policy > +using ForoneSynchronize = PolicySynchronize>; + + +template < typename test_policy, typename platform = test_platform > +struct BinaryTreeReduce; + +template < typename test_policy, typename platform = test_platform > +struct Accumulate; + + +template < typename test_policy > +struct BinaryTreeReduce + : ForoneSynchronize +{ + using reduce_category = unordered_reduce_tag; + using reduce_interface = reduce_interface_tag; + + const char* name() + { + return "RAJA::binary_tree_reduce"; + } + + template < typename T, typename... Args > + void operator()(T* reduced_value, Args&&... args) + { + *reduced_value = RAJA::binary_tree_reduce(std::forward(args)...); + } +}; + +template < typename test_policy > +struct Accumulate + : ForoneSynchronize +{ + using reduce_category = left_fold_reduce_tag; + using reduce_interface = reduce_interface_tag; + + const char* name() + { + return "RAJA::accumulate"; + } + + template < typename T, typename... Args > + void operator()(T* reduced_value, Args&&... 
args) + { + *reduced_value = RAJA::accumulate(std::forward(args)...); + } +}; + +#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) + +template < typename test_policy > +struct BinaryTreeReduce + : ForoneSynchronize +{ + using reduce_category = unordered_reduce_tag; + using reduce_interface = reduce_interface_tag; + + std::string m_name; + + BinaryTreeReduce() + : m_name(std::string("RAJA::binary_tree_reduce<") + test_policy_info::name() + std::string(">")) + { } + + const char* name() + { + return m_name.c_str(); + } + + template < typename T, typename Container > + void operator()(T* reduced_value, Container&& c) + { + forone( [=] RAJA_DEVICE() { + *reduced_value = RAJA::binary_tree_reduce(c); + }); + } + + template < typename T, typename Container > + void operator()(T* reduced_value, Container&& c, RAJA::detail::ContainerVal init) + { + forone( [=] RAJA_DEVICE() { + *reduced_value = RAJA::binary_tree_reduce(c, init); + }); + } + + template < typename T, typename Container, typename BinaryOp > + void operator()(T* reduced_value, Container&& c, RAJA::detail::ContainerVal init, BinaryOp op) + { + forone( [=] RAJA_DEVICE() { + *reduced_value = RAJA::binary_tree_reduce(c, init, op); + }); + } +}; + +template < typename test_policy > +struct Accumulate + : ForoneSynchronize +{ + using reduce_category = left_fold_reduce_tag; + using reduce_interface = reduce_interface_tag; + + std::string m_name; + + Accumulate() + : m_name(std::string("RAJA::accumulate<") + test_policy_info::name() + std::string(">")) + { } + + const char* name() + { + return m_name.c_str(); + } + + template < typename T, typename Container > + void operator()(T* reduced_value, Container&& c) + { + forone( [=] RAJA_DEVICE() { + *reduced_value = RAJA::accumulate(c); + }); + } + + template < typename T, typename Container > + void operator()(T* reduced_value, Container&& c, RAJA::detail::ContainerVal init) + { + forone( [=] RAJA_DEVICE() { + *reduced_value = RAJA::accumulate(c, init); + }); + } + + template < typename T, typename Container, typename BinaryOp > + void operator()(T* reduced_value, Container&& c, RAJA::detail::ContainerVal init, BinaryOp op) + { + forone( [=] RAJA_DEVICE() { + *reduced_value = RAJA::accumulate(c, init, op); + }); + } +}; + +#endif + + +using SequentialBinaryTreeReduceReducers = + camp::list< + BinaryTreeReduce + >; + +using SequentialAccumulateReduceReducers = + camp::list< + Accumulate + >; + +#if defined(RAJA_ENABLE_CUDA) + +using CudaBinaryTreeReduceReducers = + camp::list< + BinaryTreeReduce + >; + +using CudaAccumulateReduceReducers = + camp::list< + Accumulate + >; + +#endif + +#if defined(RAJA_ENABLE_HIP) + +using HipBinaryTreeReduceReducers = + camp::list< + BinaryTreeReduce + >; + +using HipAccumulateReduceReducers = + camp::list< + Accumulate + >; + +#endif + +#endif //__TEST_ALGORITHM_UTIL_REDUCE_HPP__ + diff --git a/test/unit/util/CMakeLists.txt b/test/unit/util/CMakeLists.txt index fdec220da9..869b897714 100644 --- a/test/unit/util/CMakeLists.txt +++ b/test/unit/util/CMakeLists.txt @@ -21,4 +21,8 @@ raja_add_test( NAME test-span SOURCES test-span.cpp) +raja_add_test( + NAME test-fraction + SOURCES test-fraction.cpp) + add_subdirectory(operator) diff --git a/test/unit/util/test-fraction.cpp b/test/unit/util/test-fraction.cpp new file mode 100644 index 0000000000..5161b2bb3a --- /dev/null +++ b/test/unit/util/test-fraction.cpp @@ -0,0 +1,64 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National 
Security, LLC
+// and RAJA project contributors. See the RAJA/LICENSE file for details.
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+///
+/// Source file containing tests for Fraction
+///
+
+#include
+#include "RAJA_gtest.hpp"
+#include
+
+template < typename IntegerType, IntegerType numerator, IntegerType denominator >
+void testFractionMultiplyTypesValues()
+{
+  using Frac = RAJA::Fraction<IntegerType, numerator, denominator>;
+
+  ASSERT_EQ(Frac::multiply(IntegerType(0)), IntegerType(0));
+
+  ASSERT_EQ(Frac::multiply(IntegerType(1)),
+            IntegerType(double(numerator) / double(denominator)));
+
+  ASSERT_EQ(Frac::multiply(IntegerType(100)),
+            IntegerType(double(numerator) / double(denominator) * double(100)));
+
+  ASSERT_EQ(Frac::multiply(IntegerType(101)),
+            IntegerType(double(numerator) / double(denominator) * double(101)));
+
+  // Test where the naive algorithm causes overflow, when within the precision of double
+  if /*constexpr*/ (sizeof(IntegerType) < sizeof(double)) {
+
+    static constexpr IntegerType max = std::numeric_limits<IntegerType>::max();
+    static constexpr IntegerType val = (numerator > denominator) ?
+                                       (max / numerator * denominator) : max;
+
+    ASSERT_EQ(Frac::multiply(IntegerType(val)),
+              IntegerType(double(numerator) / double(denominator) * double(val)));
+  }
+
+}
+
+template < typename IntegerType >
+void testFractionMultiplyTypes()
+{
+  testFractionMultiplyTypesValues();
+  testFractionMultiplyTypesValues();
+  testFractionMultiplyTypesValues();
+  testFractionMultiplyTypesValues();
+  testFractionMultiplyTypesValues();
+  testFractionMultiplyTypesValues();
+}
+
+
+#define RAJA_FRACTION_RUN_TEST(test) \
+  test(); \
+  test();
+
+TEST(Fraction, basic_multiply_Fraction)
+{
+  RAJA_FRACTION_RUN_TEST(testFractionMultiplyTypes)
+}
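
Note (illustrative, not part of the patch): the test headers above only name the new execution and reduction policies added in this release. The sketch below shows one way those policies might be combined in user code, assuming a CUDA-enabled RAJA build. Only the policy names (cuda_exec_occ_fraction, RAJA::Fraction, cuda_reduce_atomic_host_init_block_fence) come from this patch; the Fraction arguments <size_t, 1, 2>, the function name, and the kernel body are hypothetical.

#include "RAJA/RAJA.hpp"

// Hypothetical example: sum a device-accessible array using the new
// occupancy-fraction execution policy and one of the new reduction
// policy variants exercised by the tests in this patch.
double sum_with_new_policies(const double* x, int N)
{
  // Run at an assumed one-half of the occupancy reported by the occupancy
  // calculator; the Fraction<size_t, 1, 2> arguments are an assumption.
  using exec_pol =
      RAJA::cuda_exec_occ_fraction<256, RAJA::Fraction<size_t, 1, 2>>;

  // Initialize reduction data on the host and use block-scope fences instead
  // of device-scope fences (name taken from RAJA_test-reducepol.hpp above).
  using reduce_pol = RAJA::cuda_reduce_atomic_host_init_block_fence;

  RAJA::ReduceSum<reduce_pol, double> sum(0.0);

  RAJA::forall<exec_pol>(RAJA::TypedRangeSegment<int>(0, N),
    [=] RAJA_DEVICE (int i) {
      sum += x[i];
    });

  return sum.get();
}

The HIP analogues listed in the same headers (hip_exec_occ_fraction, hip_reduce_atomic_host_init_block_fence) would follow the same pattern.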