From 978637d7703e55f8f3d05f73e3fcb932c971f04d Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 6 Jul 2023 09:02:23 -0700 Subject: [PATCH 01/34] Fix memory management in cuda/hip Free memory in resolve instead of leaking. Use device_mempool_type instead of cuda/hipMalloc --- include/RAJA/policy/cuda/params/reduce.hpp | 16 +++++++++++++--- include/RAJA/policy/hip/params/reduce.hpp | 16 +++++++++++++--- 2 files changed, 26 insertions(+), 6 deletions(-) diff --git a/include/RAJA/policy/cuda/params/reduce.hpp b/include/RAJA/policy/cuda/params/reduce.hpp index 6d142ca19b..cb87ee68cc 100644 --- a/include/RAJA/policy/cuda/params/reduce.hpp +++ b/include/RAJA/policy/cuda/params/reduce.hpp @@ -17,7 +17,7 @@ namespace detail { camp::concepts::enable_if< type_traits::is_cuda_policy > init(Reducer& red, const RAJA::cuda::detail::cudaInfo & cs) { - cudaMalloc( (void**)(&(red.devicetarget)), sizeof(T)); + red.devicetarget = RAJA::cuda::device_mempool_type::getInstance().template malloc(1); red.device_mem.allocate(cs.gridDim.x * cs.gridDim.y * cs.gridDim.z); red.device_count = RAJA::cuda::device_zeroed_mempool_type::getInstance().template malloc(1); } @@ -26,17 +26,27 @@ namespace detail { template RAJA_HOST_DEVICE camp::concepts::enable_if< type_traits::is_cuda_policy > - combine(Reducer& red) { + combine(Reducer& red) + { RAJA::cuda::impl::expt::grid_reduce(red); } // Resolve template camp::concepts::enable_if< type_traits::is_cuda_policy > - resolve(Reducer& red) { + resolve(Reducer& red) + { + // complete reduction cudaDeviceSynchronize(); cudaMemcpy(&red.val, red.devicetarget, sizeof(T), cudaMemcpyDeviceToHost); *red.target = OP{}(red.val, *red.target); + + // free memory + RAJA::cuda::device_zeroed_mempool_type::getInstance().free(red.device_count); + red.device_count = nullptr; + red.device_mem.deallocate(); + RAJA::cuda::device_mempool_type::getInstance().free(red.devicetarget); + red.devicetarget = nullptr; } } // namespace detail diff --git a/include/RAJA/policy/hip/params/reduce.hpp b/include/RAJA/policy/hip/params/reduce.hpp index ee6b39548e..0a06a76c0c 100644 --- a/include/RAJA/policy/hip/params/reduce.hpp +++ b/include/RAJA/policy/hip/params/reduce.hpp @@ -17,7 +17,7 @@ namespace detail { camp::concepts::enable_if< type_traits::is_hip_policy > init(Reducer& red, const RAJA::hip::detail::hipInfo & cs) { - hipMalloc( (void**)(&(red.devicetarget)), sizeof(T)); + red.devicetarget = RAJA::hip::device_mempool_type::getInstance().template malloc(1); red.device_mem.allocate(cs.gridDim.x * cs.gridDim.y * cs.gridDim.z); red.device_count = RAJA::hip::device_zeroed_mempool_type::getInstance().template malloc(1); } @@ -26,17 +26,27 @@ namespace detail { template RAJA_HOST_DEVICE camp::concepts::enable_if< type_traits::is_hip_policy > - combine(Reducer& red) { + combine(Reducer& red) + { RAJA::hip::impl::expt::grid_reduce(red); } // Resolve template camp::concepts::enable_if< type_traits::is_hip_policy > - resolve(Reducer& red) { + resolve(Reducer& red) + { + // complete reduction hipDeviceSynchronize(); hipMemcpy(&red.val, red.devicetarget, sizeof(T), hipMemcpyDeviceToHost); *red.target = OP{}(red.val, *red.target); + + // free memory + RAJA::hip::device_zeroed_mempool_type::getInstance().free(red.device_count); + red.device_count = nullptr; + red.device_mem.deallocate(); + RAJA::hip::device_mempool_type::getInstance().free(red.devicetarget); + red.devicetarget = nullptr; } } // namespace detail From 59ad47498ef90017bab87b20520343004acff4fd Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 6 
Jul 2023 09:44:48 -0700 Subject: [PATCH 02/34] Use policy info in cuda/hip expt::grid_reduce and block_reduce --- include/RAJA/policy/cuda/params/reduce.hpp | 2 +- include/RAJA/policy/cuda/policy.hpp | 20 ++++++++++++ include/RAJA/policy/cuda/reduce.hpp | 38 ++++++++++------------ include/RAJA/policy/hip/params/reduce.hpp | 2 +- include/RAJA/policy/hip/policy.hpp | 20 ++++++++++++ include/RAJA/policy/hip/reduce.hpp | 38 ++++++++++------------ 6 files changed, 76 insertions(+), 44 deletions(-) diff --git a/include/RAJA/policy/cuda/params/reduce.hpp b/include/RAJA/policy/cuda/params/reduce.hpp index cb87ee68cc..442114d183 100644 --- a/include/RAJA/policy/cuda/params/reduce.hpp +++ b/include/RAJA/policy/cuda/params/reduce.hpp @@ -28,7 +28,7 @@ namespace detail { camp::concepts::enable_if< type_traits::is_cuda_policy > combine(Reducer& red) { - RAJA::cuda::impl::expt::grid_reduce(red); + RAJA::cuda::impl::expt::grid_reduce(red); } // Resolve diff --git a/include/RAJA/policy/cuda/policy.hpp b/include/RAJA/policy/cuda/policy.hpp index 64ec12a4cb..7dec1f9fab 100644 --- a/include/RAJA/policy/cuda/policy.hpp +++ b/include/RAJA/policy/cuda/policy.hpp @@ -748,6 +748,26 @@ struct IndexGlobal } }; +// helper to get just the thread indexing part of IndexGlobal +template < typename index_global > +struct get_index_thread; +/// +template < named_dim dim, int BLOCK_SIZE, int GRID_SIZE > +struct get_index_thread> +{ + using type = IndexGlobal; +}; + +// helper to get just the block indexing part of IndexGlobal +template < typename index_global > +struct get_index_block; +/// +template < named_dim dim, int BLOCK_SIZE, int GRID_SIZE > +struct get_index_block> +{ + using type = IndexGlobal; +}; + template using thread_x = IndexGlobal; diff --git a/include/RAJA/policy/cuda/reduce.hpp b/include/RAJA/policy/cuda/reduce.hpp index fba82934f5..8d011ccdd8 100644 --- a/include/RAJA/policy/cuda/reduce.hpp +++ b/include/RAJA/policy/cuda/reduce.hpp @@ -528,16 +528,14 @@ RAJA_DEVICE RAJA_INLINE bool grid_reduce(T& val, namespace expt { -template +template RAJA_DEVICE RAJA_INLINE T block_reduce(T val, T identity) { - int numThreads = blockDim.x * blockDim.y * blockDim.z; - - int threadId = threadIdx.x + blockDim.x * threadIdx.y + - (blockDim.x * blockDim.y) * threadIdx.z; + const int numThreads = ThreadIterationGetter::size(); + const int threadId = ThreadIterationGetter::index(); - int warpId = threadId % RAJA::policy::cuda::WARP_SIZE; - int warpNum = threadId / RAJA::policy::cuda::WARP_SIZE; + const int warpId = threadId % RAJA::policy::cuda::WARP_SIZE; + const int warpNum = threadId / RAJA::policy::cuda::WARP_SIZE; T temp = val; @@ -604,20 +602,20 @@ RAJA_DEVICE RAJA_INLINE T block_reduce(T val, T identity) } -template -RAJA_DEVICE RAJA_INLINE bool grid_reduce(RAJA::expt::detail::Reducer& red) { - - int numBlocks = gridDim.x * gridDim.y * gridDim.z; - int numThreads = blockDim.x * blockDim.y * blockDim.z; - unsigned int wrap_around = numBlocks - 1; +template +RAJA_DEVICE RAJA_INLINE void grid_reduce(RAJA::expt::detail::Reducer& red) +{ + using BlockIterationGetter = typename get_index_block::type; + using ThreadIterationGetter = typename get_index_thread::type; - int blockId = blockIdx.x + gridDim.x * blockIdx.y + - (gridDim.x * gridDim.y) * blockIdx.z; + const int numBlocks = BlockIterationGetter::size(); + const int numThreads = ThreadIterationGetter::size(); + const unsigned int wrap_around = numBlocks - 1; - int threadId = threadIdx.x + blockDim.x * threadIdx.y + - (blockDim.x * blockDim.y) * threadIdx.z; + const 
int blockId = BlockIterationGetter::index(); + const int threadId = ThreadIterationGetter::index(); - T temp = block_reduce(red.val, OP::identity()); + T temp = block_reduce(red.val, OP::identity()); // one thread per block writes to device_mem bool lastBlock = false; @@ -641,15 +639,13 @@ RAJA_DEVICE RAJA_INLINE bool grid_reduce(RAJA::expt::detail::Reducer& red temp = OP{}(temp, red.device_mem.get(i)); } - temp = block_reduce(temp, OP::identity()); + temp = block_reduce(temp, OP::identity()); // one thread returns value if (threadId == 0) { *(red.devicetarget) = temp; } } - - return lastBlock && threadId == 0; } } // namespace expt diff --git a/include/RAJA/policy/hip/params/reduce.hpp b/include/RAJA/policy/hip/params/reduce.hpp index 0a06a76c0c..a683885b8b 100644 --- a/include/RAJA/policy/hip/params/reduce.hpp +++ b/include/RAJA/policy/hip/params/reduce.hpp @@ -28,7 +28,7 @@ namespace detail { camp::concepts::enable_if< type_traits::is_hip_policy > combine(Reducer& red) { - RAJA::hip::impl::expt::grid_reduce(red); + RAJA::hip::impl::expt::grid_reduce(red); } // Resolve diff --git a/include/RAJA/policy/hip/policy.hpp b/include/RAJA/policy/hip/policy.hpp index 8b7f29462d..f326716bce 100644 --- a/include/RAJA/policy/hip/policy.hpp +++ b/include/RAJA/policy/hip/policy.hpp @@ -743,6 +743,26 @@ struct IndexGlobal } }; +// helper to get just the thread indexing part of IndexGlobal +template < typename index_global > +struct get_index_thread; +/// +template < named_dim dim, int BLOCK_SIZE, int GRID_SIZE > +struct get_index_thread> +{ + using type = IndexGlobal; +}; + +// helper to get just the block indexing part of IndexGlobal +template < typename index_global > +struct get_index_block; +/// +template < named_dim dim, int BLOCK_SIZE, int GRID_SIZE > +struct get_index_block> +{ + using type = IndexGlobal; +}; + template using thread_x = IndexGlobal; diff --git a/include/RAJA/policy/hip/reduce.hpp b/include/RAJA/policy/hip/reduce.hpp index a3127b50aa..584534db14 100644 --- a/include/RAJA/policy/hip/reduce.hpp +++ b/include/RAJA/policy/hip/reduce.hpp @@ -401,16 +401,14 @@ RAJA_DEVICE RAJA_INLINE bool grid_reduce(T& val, namespace expt { -template +template RAJA_DEVICE RAJA_INLINE T block_reduce(T val, T identity) { - int numThreads = blockDim.x * blockDim.y * blockDim.z; - - int threadId = threadIdx.x + blockDim.x * threadIdx.y + - (blockDim.x * blockDim.y) * threadIdx.z; + const int numThreads = ThreadIterationGetter::size(); + const int threadId = ThreadIterationGetter::index(); - int warpId = threadId % RAJA::policy::hip::WARP_SIZE; - int warpNum = threadId / RAJA::policy::hip::WARP_SIZE; + const int warpId = threadId % RAJA::policy::hip::WARP_SIZE; + const int warpNum = threadId / RAJA::policy::hip::WARP_SIZE; T temp = val; @@ -477,20 +475,20 @@ RAJA_DEVICE RAJA_INLINE T block_reduce(T val, T identity) } -template -RAJA_DEVICE RAJA_INLINE bool grid_reduce(RAJA::expt::detail::Reducer& red) { - - int numBlocks = gridDim.x * gridDim.y * gridDim.z; - int numThreads = blockDim.x * blockDim.y * blockDim.z; - unsigned int wrap_around = numBlocks - 1; +template +RAJA_DEVICE RAJA_INLINE void grid_reduce(RAJA::expt::detail::Reducer& red) +{ + using BlockIterationGetter = typename get_index_block::type; + using ThreadIterationGetter = typename get_index_thread::type; - int blockId = blockIdx.x + gridDim.x * blockIdx.y + - (gridDim.x * gridDim.y) * blockIdx.z; + const int numBlocks = BlockIterationGetter::size(); + const int numThreads = ThreadIterationGetter::size(); + const unsigned int wrap_around = 
numBlocks - 1; - int threadId = threadIdx.x + blockDim.x * threadIdx.y + - (blockDim.x * blockDim.y) * threadIdx.z; + const int blockId = BlockIterationGetter::index(); + const int threadId = ThreadIterationGetter::index(); - T temp = block_reduce(red.val, OP::identity()); + T temp = block_reduce(red.val, OP::identity()); // one thread per block writes to device_mem bool lastBlock = false; @@ -514,15 +512,13 @@ RAJA_DEVICE RAJA_INLINE bool grid_reduce(RAJA::expt::detail::Reducer& red temp = OP{}(temp, red.device_mem.get(i)); } - temp = block_reduce(temp, OP::identity()); + temp = block_reduce(temp, OP::identity()); // one thread returns value if (threadId == 0) { *(red.devicetarget) = temp; } } - - return lastBlock && threadId == 0; } } // namespace expt From 8a994e9213ccf8660134f77bf166c84a7a5c0bb8 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 6 Jul 2023 10:17:25 -0700 Subject: [PATCH 03/34] Add param support to detail_resolve --- include/RAJA/pattern/params/forall.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/RAJA/pattern/params/forall.hpp b/include/RAJA/pattern/params/forall.hpp index 499b442cc5..61ecfbe33e 100644 --- a/include/RAJA/pattern/params/forall.hpp +++ b/include/RAJA/pattern/params/forall.hpp @@ -56,9 +56,9 @@ namespace expt } // Resolve - template - static constexpr void detail_resolve(EXEC_POL, camp::idx_seq, ForallParamPack& f_params ) { - CAMP_EXPAND(detail::resolve( camp::get(f_params.param_tup) )); + template + static constexpr void detail_resolve(EXEC_POL, camp::idx_seq, ForallParamPack& f_params, Args&& ...args) { + CAMP_EXPAND(detail::resolve( camp::get(f_params.param_tup), std::forward(args)... )); } // Used to construct the argument TYPES that will be invoked with the lambda. From 5dbb0f4f375517f25033639778b264a18236ecdf Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 6 Jul 2023 09:52:51 -0700 Subject: [PATCH 04/34] Use resource to copy and sync in resolve --- include/RAJA/policy/cuda/forall.hpp | 2 +- include/RAJA/policy/cuda/params/reduce.hpp | 10 +++++----- include/RAJA/policy/hip/forall.hpp | 2 +- include/RAJA/policy/hip/params/reduce.hpp | 10 +++++----- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/include/RAJA/policy/cuda/forall.hpp b/include/RAJA/policy/cuda/forall.hpp index 7d6bb820f6..91af0cb321 100644 --- a/include/RAJA/policy/cuda/forall.hpp +++ b/include/RAJA/policy/cuda/forall.hpp @@ -629,7 +629,7 @@ forall_impl(resources::Cuda cuda_res, void *args[] = {(void*)&body, (void*)&begin, (void*)&len, (void*)&f_params}; RAJA::cuda::launch(func, dims.blocks, dims.threads, args, shmem, cuda_res, Async); - RAJA::expt::ParamMultiplexer::resolve(f_params); + RAJA::expt::ParamMultiplexer::resolve(f_params, launch_info); } RAJA_FT_END; diff --git a/include/RAJA/policy/cuda/params/reduce.hpp b/include/RAJA/policy/cuda/params/reduce.hpp index 442114d183..80f625080c 100644 --- a/include/RAJA/policy/cuda/params/reduce.hpp +++ b/include/RAJA/policy/cuda/params/reduce.hpp @@ -15,10 +15,10 @@ namespace detail { // Init template camp::concepts::enable_if< type_traits::is_cuda_policy > - init(Reducer& red, const RAJA::cuda::detail::cudaInfo & cs) + init(Reducer& red, RAJA::cuda::detail::cudaInfo& ci) { red.devicetarget = RAJA::cuda::device_mempool_type::getInstance().template malloc(1); - red.device_mem.allocate(cs.gridDim.x * cs.gridDim.y * cs.gridDim.z); + red.device_mem.allocate(ci.gridDim.x * ci.gridDim.y * ci.gridDim.z); red.device_count = 
RAJA::cuda::device_zeroed_mempool_type::getInstance().template malloc(1); } @@ -34,11 +34,11 @@ namespace detail { // Resolve template camp::concepts::enable_if< type_traits::is_cuda_policy > - resolve(Reducer& red) + resolve(Reducer& red, RAJA::cuda::detail::cudaInfo& ci) { // complete reduction - cudaDeviceSynchronize(); - cudaMemcpy(&red.val, red.devicetarget, sizeof(T), cudaMemcpyDeviceToHost); + ci.res.memcpy(&red.val, red.devicetarget, sizeof(T)); + ci.res.wait(); *red.target = OP{}(red.val, *red.target); // free memory diff --git a/include/RAJA/policy/hip/forall.hpp b/include/RAJA/policy/hip/forall.hpp index 6483d9b59a..03d530ab2d 100644 --- a/include/RAJA/policy/hip/forall.hpp +++ b/include/RAJA/policy/hip/forall.hpp @@ -622,7 +622,7 @@ forall_impl(resources::Hip hip_res, void *args[] = {(void*)&body, (void*)&begin, (void*)&len, (void*)&f_params}; RAJA::hip::launch(func, dims.blocks, dims.threads, args, shmem, hip_res, Async); - RAJA::expt::ParamMultiplexer::resolve(f_params); + RAJA::expt::ParamMultiplexer::resolve(f_params, launch_info); } RAJA_FT_END; diff --git a/include/RAJA/policy/hip/params/reduce.hpp b/include/RAJA/policy/hip/params/reduce.hpp index a683885b8b..fa73e2da2e 100644 --- a/include/RAJA/policy/hip/params/reduce.hpp +++ b/include/RAJA/policy/hip/params/reduce.hpp @@ -15,10 +15,10 @@ namespace detail { // Init template camp::concepts::enable_if< type_traits::is_hip_policy > - init(Reducer& red, const RAJA::hip::detail::hipInfo & cs) + init(Reducer& red, RAJA::hip::detail::hipInfo& hi) { red.devicetarget = RAJA::hip::device_mempool_type::getInstance().template malloc(1); - red.device_mem.allocate(cs.gridDim.x * cs.gridDim.y * cs.gridDim.z); + red.device_mem.allocate(hi.gridDim.x * hi.gridDim.y * hi.gridDim.z); red.device_count = RAJA::hip::device_zeroed_mempool_type::getInstance().template malloc(1); } @@ -34,11 +34,11 @@ namespace detail { // Resolve template camp::concepts::enable_if< type_traits::is_hip_policy > - resolve(Reducer& red) + resolve(Reducer& red, RAJA::hip::detail::hipInfo& hi) { // complete reduction - hipDeviceSynchronize(); - hipMemcpy(&red.val, red.devicetarget, sizeof(T), hipMemcpyDeviceToHost); + hi.res.memcpy(&red.val, red.devicetarget, sizeof(T)); + hi.res.wait(); *red.target = OP{}(red.val, *red.target); // free memory From edf7a227d9e6d22f32fd65e74570d96862dd6990 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 6 Jul 2023 09:58:06 -0700 Subject: [PATCH 05/34] Put devicetarget in pinned to avoid memcpy --- include/RAJA/policy/cuda/params/reduce.hpp | 7 +++---- include/RAJA/policy/hip/params/reduce.hpp | 7 +++---- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/include/RAJA/policy/cuda/params/reduce.hpp b/include/RAJA/policy/cuda/params/reduce.hpp index 80f625080c..f60117dba9 100644 --- a/include/RAJA/policy/cuda/params/reduce.hpp +++ b/include/RAJA/policy/cuda/params/reduce.hpp @@ -17,7 +17,7 @@ namespace detail { camp::concepts::enable_if< type_traits::is_cuda_policy > init(Reducer& red, RAJA::cuda::detail::cudaInfo& ci) { - red.devicetarget = RAJA::cuda::device_mempool_type::getInstance().template malloc(1); + red.devicetarget = RAJA::cuda::pinned_mempool_type::getInstance().template malloc(1); red.device_mem.allocate(ci.gridDim.x * ci.gridDim.y * ci.gridDim.z); red.device_count = RAJA::cuda::device_zeroed_mempool_type::getInstance().template malloc(1); } @@ -37,15 +37,14 @@ namespace detail { resolve(Reducer& red, RAJA::cuda::detail::cudaInfo& ci) { // complete reduction - ci.res.memcpy(&red.val, red.devicetarget, 
sizeof(T)); ci.res.wait(); - *red.target = OP{}(red.val, *red.target); + *red.target = OP{}(*red.devicetarget, *red.target); // free memory RAJA::cuda::device_zeroed_mempool_type::getInstance().free(red.device_count); red.device_count = nullptr; red.device_mem.deallocate(); - RAJA::cuda::device_mempool_type::getInstance().free(red.devicetarget); + RAJA::cuda::pinned_mempool_type::getInstance().free(red.devicetarget); red.devicetarget = nullptr; } diff --git a/include/RAJA/policy/hip/params/reduce.hpp b/include/RAJA/policy/hip/params/reduce.hpp index fa73e2da2e..4d090e9e92 100644 --- a/include/RAJA/policy/hip/params/reduce.hpp +++ b/include/RAJA/policy/hip/params/reduce.hpp @@ -17,7 +17,7 @@ namespace detail { camp::concepts::enable_if< type_traits::is_hip_policy > init(Reducer& red, RAJA::hip::detail::hipInfo& hi) { - red.devicetarget = RAJA::hip::device_mempool_type::getInstance().template malloc(1); + red.devicetarget = RAJA::hip::pinned_mempool_type::getInstance().template malloc(1); red.device_mem.allocate(hi.gridDim.x * hi.gridDim.y * hi.gridDim.z); red.device_count = RAJA::hip::device_zeroed_mempool_type::getInstance().template malloc(1); } @@ -37,15 +37,14 @@ namespace detail { resolve(Reducer& red, RAJA::hip::detail::hipInfo& hi) { // complete reduction - hi.res.memcpy(&red.val, red.devicetarget, sizeof(T)); hi.res.wait(); - *red.target = OP{}(red.val, *red.target); + *red.target = OP{}(*red.devicetarget, *red.target); // free memory RAJA::hip::device_zeroed_mempool_type::getInstance().free(red.device_count); red.device_count = nullptr; red.device_mem.deallocate(); - RAJA::hip::device_mempool_type::getInstance().free(red.devicetarget); + RAJA::hip::pinned_mempool_type::getInstance().free(red.devicetarget); red.devicetarget = nullptr; } From 6e7bbd3ee99a3d5c9816cdc020e95e25ed5035f1 Mon Sep 17 00:00:00 2001 From: Robert Chen Date: Thu, 6 Jul 2023 18:02:37 -0700 Subject: [PATCH 06/34] Decorate unused vec_arg. --- include/RAJA/util/TypedViewBase.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/RAJA/util/TypedViewBase.hpp b/include/RAJA/util/TypedViewBase.hpp index 5f9d53d9c7..93bdca06cc 100644 --- a/include/RAJA/util/TypedViewBase.hpp +++ b/include/RAJA/util/TypedViewBase.hpp @@ -457,7 +457,7 @@ namespace internal using type = RAJA::expt::StaticTensorIndex>; static constexpr RAJA_HOST_DEVICE RAJA_INLINE - type extract(RAJA::expt::StaticTensorIndex> vec_arg){ + type extract(RAJA::expt::StaticTensorIndex> RAJA_UNUSED_ARG(vec_arg)){ return type(); } }; From bf251da88ce2a51e6228e8f4c739052020247786 Mon Sep 17 00:00:00 2001 From: Robert Chen Date: Thu, 6 Jul 2023 18:03:13 -0700 Subject: [PATCH 07/34] Type casts for unsigned int cases. 
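When INDEX_TYPE is deduced to an unsigned type, a comparison such as g < 0 is always false, and mixed comparisons like g >= groups draw signed/unsigned warnings. Casting every index to int before comparing keeps the out-of-bounds check meaningful for all index types. A minimal sketch of the pattern, assuming an unsigned index (illustrative only, not taken from the test below):

  unsigned int g = 3u;  // stand-in for an unsigned INDEX_TYPE value
  int groups = 4;
  bool always_false = (g < 0);                      // g is unsigned, so this never fires
  bool oob = ((int)g < 0 || (int)g >= groups);      // intended bounds check after the casts
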
--- .../kernel/hyperplane/tests/test-kernel-hyperplane-2D.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/functional/kernel/hyperplane/tests/test-kernel-hyperplane-2D.hpp b/test/functional/kernel/hyperplane/tests/test-kernel-hyperplane-2D.hpp index b72ad745e9..ec1ca0ecb0 100644 --- a/test/functional/kernel/hyperplane/tests/test-kernel-hyperplane-2D.hpp +++ b/test/functional/kernel/hyperplane/tests/test-kernel-hyperplane-2D.hpp @@ -50,7 +50,7 @@ void KernelHyperplane2DTestImpl(const int groups, const int idim, const int jdim RAJA::kernel ( RAJA::make_tuple( Grange, Irange, Jrange ), [=] RAJA_HOST_DEVICE ( INDEX_TYPE g, INDEX_TYPE ii, INDEX_TYPE jj ) { - if ((int)g < 0 || g >= groups || (int)ii < 0 || ii >= idim || (int)jj < 0 || jj >= jdim) { + if ((int)g < 0 || (int)g >= groups || (int)ii < 0 || (int)ii >= idim || (int)jj < 0 || (int)jj >= jdim) { oob_count += 1; } From 72cc6ac158e046c478eb870c8cd7bd061ac9e432 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Wed, 12 Jul 2023 12:24:10 -0700 Subject: [PATCH 08/34] Expose CMake option for enabling/disabling OpenMP task alg options --- RELEASE_NOTES.md | 4 ++++ cmake/SetupRajaOptions.cmake | 2 ++ include/RAJA/config.hpp.in | 8 +++++--- include/RAJA/policy/openmp/sort.hpp | 4 ++-- 4 files changed, 13 insertions(+), 5 deletions(-) diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md index 5200a41ed4..7390020bcc 100644 --- a/RELEASE_NOTES.md +++ b/RELEASE_NOTES.md @@ -16,6 +16,10 @@ Notable changes include: * New features / API changes: * Build changes/improvements: + * RAJA_ENABLE_OPENMP_TASK CMake option added to enable/disable algorithm + options based on OpenMP task construct. Currently, this only applies + to RAJA's OpenMP sort implementation. The default is 'Off'. The option + allows users to choose a task implementation if they wish. 
* Bug fixes/improvements: diff --git a/cmake/SetupRajaOptions.cmake b/cmake/SetupRajaOptions.cmake index c8c11efc72..3a27bc8cd0 100644 --- a/cmake/SetupRajaOptions.cmake +++ b/cmake/SetupRajaOptions.cmake @@ -17,6 +17,8 @@ option(RAJA_ENABLE_SYCL "Build SYCL support" Off) option(RAJA_ENABLE_VECTORIZATION "Build experimental vectorization support" On) +option(RAJA_ENABLE_OPENMP_TASK "Build OpenMP task variants of certain algorithms" Off) + option(RAJA_ENABLE_REPRODUCERS "Build issue reproducers" Off) option(RAJA_ENABLE_EXERCISES "Build exercises " On) diff --git a/include/RAJA/config.hpp.in b/include/RAJA/config.hpp.in index 3ffa9cc217..c6df29447c 100644 --- a/include/RAJA/config.hpp.in +++ b/include/RAJA/config.hpp.in @@ -254,12 +254,14 @@ namespace RAJA { #if defined(RAJA_ENABLE_OPENMP) && !defined(__HIP_DEVICE_COMPILE__) #if defined(_OPENMP) #if (_OPENMP >= 200805) -#define RAJA_ENABLE_OPENMP_TASK +#if defined(RAJA_ENABLE_OPENMP_TASK) +#define RAJA_ENABLE_OPENMP_TASK_INTERNAL #endif +#endif // _OPENMP >= 200805 #else #error RAJA configured with RAJA_ENABLE_OPENMP, but _OPENMP is not defined in this code section -#endif // _OPENMP -#endif // RAJA_ENABLE_OPENMP && __HIP_DEVICE_COMPILE__ +#endif // else +#endif // RAJA_ENABLE_OPENMP && !__HIP_DEVICE_COMPILE__ #if defined(RAJA_ENABLE_CUDA) && defined(__CUDACC__) #define RAJA_CUDA_ACTIVE diff --git a/include/RAJA/policy/openmp/sort.hpp b/include/RAJA/policy/openmp/sort.hpp index 8abd2ade64..8502843ed0 100644 --- a/include/RAJA/policy/openmp/sort.hpp +++ b/include/RAJA/policy/openmp/sort.hpp @@ -49,7 +49,7 @@ namespace openmp // this number is arbitrary constexpr int get_min_iterates_per_task() { return 128; } -#ifdef RAJA_ENABLE_OPENMP_TASK +#if defined(RAJA_ENABLE_OPENMP_TASK_INTERNAL) /*! \brief sort given range using sorter and comparison function by spawning tasks @@ -159,7 +159,7 @@ void sort(Sorter sorter, const diff_type max_threads = omp_get_max_threads(); -#ifdef RAJA_ENABLE_OPENMP_TASK +#if defined(RAJA_ENABLE_OPENMP_TASK_INTERNAL) const diff_type iterates_per_task = std::max(n/(2*max_threads), min_iterates_per_task); From f7bcb023bce47f6814fb77c7d8c7ed6092babf25 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Wed, 12 Jul 2023 12:25:48 -0700 Subject: [PATCH 09/34] Point radiuss spack configs submodule to branch with changes to try with GitLab CI updates --- scripts/radiuss-spack-configs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/radiuss-spack-configs b/scripts/radiuss-spack-configs index ffde592300..307d08fa68 160000 --- a/scripts/radiuss-spack-configs +++ b/scripts/radiuss-spack-configs @@ -1 +1 @@ -Subproject commit ffde592300392a7bbebbfe0a8d752187c1b93132 +Subproject commit 307d08fa68cf367095ce112424356f44a475f2b9 From 9b32c3c9b420dbc11bf247f43999321c6929a11f Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Wed, 12 Jul 2023 13:36:40 -0700 Subject: [PATCH 10/34] Bump to new version of radiuss shared ci --- .gitlab-ci.yml | 2 +- .gitlab/corona-build-and-test-extra.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 8ba8c96d8e..162e262e19 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -59,7 +59,7 @@ stages: include: - local: '.gitlab/custom-jobs-and-variables.yml' - project: 'radiuss/radiuss-shared-ci' - ref: v2023.03.1 + ref: v2023.06.0 file: '${CI_MACHINE}-build-and-test.yml' - local: '.gitlab/${CI_MACHINE}-build-and-test-extra.yml' strategy: depend diff --git a/.gitlab/corona-build-and-test-extra.yml 
b/.gitlab/corona-build-and-test-extra.yml index e65bba3947..f9a7509432 100644 --- a/.gitlab/corona-build-and-test-extra.yml +++ b/.gitlab/corona-build-and-test-extra.yml @@ -21,7 +21,7 @@ # ${PROJECT__DEPS} in the extra jobs. There is no reason not to fully # describe the spec here. -rocmcc_5_4_1_hip_desul_atomics: +rocmcc_5_5_0_hip_desul_atomics: variables: SPEC: " ~shared +rocm ~openmp +tests +desul amdgpu_target=gfx906 %rocmcc@5.4.1 ^hip@5.4.1 ^blt@develop" extends: .build_and_test_on_corona From 4da2d0b204cbec755025925de413e43540a187c6 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Wed, 12 Jul 2023 14:49:25 -0700 Subject: [PATCH 11/34] Really change the compiler version this time. --- .gitlab/corona-build-and-test-extra.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab/corona-build-and-test-extra.yml b/.gitlab/corona-build-and-test-extra.yml index f9a7509432..a4ad4ff693 100644 --- a/.gitlab/corona-build-and-test-extra.yml +++ b/.gitlab/corona-build-and-test-extra.yml @@ -23,6 +23,6 @@ rocmcc_5_5_0_hip_desul_atomics: variables: - SPEC: " ~shared +rocm ~openmp +tests +desul amdgpu_target=gfx906 %rocmcc@5.4.1 ^hip@5.4.1 ^blt@develop" + SPEC: " ~shared +rocm ~openmp +tests +desul amdgpu_target=gfx906 %rocmcc@5.5.0 ^hip@5.5.0 ^blt@develop" extends: .build_and_test_on_corona From 7bb6c6078f5126e4031c45583aed9d5024ac2f7c Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Wed, 12 Jul 2023 14:49:45 -0700 Subject: [PATCH 12/34] Delete build dirs to avoid running out of space --- Dockerfile | 36 ++++++++++++++++++++++++------------ 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/Dockerfile b/Dockerfile index 9120208427..aad0b078e5 100644 --- a/Dockerfile +++ b/Dockerfile @@ -11,7 +11,8 @@ COPY . /home/raja/workspace WORKDIR /home/raja/workspace/build RUN cmake -DCMAKE_CXX_COMPILER=g++ -DRAJA_ENABLE_WARNINGS=On -DRAJA_ENABLE_TBB=On -DRAJA_DEPRECATED_TESTS=On -DENABLE_OPENMP=On .. && \ make -j 6 &&\ - ctest -T test --output-on-failure + ctest -T test --output-on-failure && \ + cd .. && rm -rf build FROM ghcr.io/rse-ops/gcc-ubuntu-20.04:gcc-8.1.0 AS gcc8.1.0 ENV GTEST_COLOR=1 @@ -19,7 +20,8 @@ COPY . /home/raja/workspace WORKDIR /home/raja/workspace/build RUN cmake -DCMAKE_CXX_COMPILER=g++ -DRAJA_ENABLE_WARNINGS=On -DRAJA_ENABLE_WARNINGS_AS_ERRORS=On -DENABLE_COVERAGE=On -DRAJA_ENABLE_TBB=On -DENABLE_OPENMP=On .. && \ make -j 6 &&\ - ctest -T test --output-on-failure + ctest -T test --output-on-failure && \ + cd .. && rm -rf build FROM ghcr.io/rse-ops/gcc-ubuntu-20.04:gcc-9.4.0 AS gcc9.4.0 ENV GTEST_COLOR=1 @@ -27,7 +29,8 @@ COPY . /home/raja/workspace WORKDIR /home/raja/workspace/build RUN cmake -DCMAKE_CXX_COMPILER=g++ -DRAJA_ENABLE_WARNINGS=On -DRAJA_ENABLE_TBB=On -DRAJA_ENABLE_RUNTIME_PLUGINS=On -DENABLE_OPENMP=On .. && \ make -j 6 &&\ - ctest -T test --output-on-failure + ctest -T test --output-on-failure && \ + cd .. && rm -rf build FROM ghcr.io/rse-ops/gcc-ubuntu-20.04:gcc-11.2.0 AS gcc11.2.0 ENV GTEST_COLOR=1 @@ -35,7 +38,8 @@ COPY . /home/raja/workspace WORKDIR /home/raja/workspace/build RUN cmake -DCMAKE_CXX_COMPILER=g++ -DCMAKE_CXX_COMPILER=g++ -DRAJA_ENABLE_WARNINGS=On -DRAJA_ENABLE_TBB=On -DRAJA_ENABLE_BOUNDS_CHECK=ON -DENABLE_OPENMP=On .. && \ make -j 6 &&\ - ctest -T test --output-on-failure + ctest -T test --output-on-failure && \ + cd .. && rm -rf build FROM ghcr.io/rse-ops/clang-ubuntu-20.04:llvm-11.0.0 AS clang11.0.0 ENV GTEST_COLOR=1 @@ -44,7 +48,8 @@ WORKDIR /home/raja/workspace/build RUN . 
/opt/spack/share/spack/setup-env.sh && export LD_LIBRARY_PATH=/opt/view/lib:$LD_LIBRARY_PATH && \ cmake -DCMAKE_CXX_COMPILER=clang++ -DRAJA_ENABLE_TBB=On -DENABLE_OPENMP=On .. && \ make -j 6 &&\ - ctest -T test --output-on-failure + ctest -T test --output-on-failure && \ + cd .. && rm -rf build FROM ghcr.io/rse-ops/clang-ubuntu-20.04:llvm-11.0.0 AS clang11.0.0-debug ENV GTEST_COLOR=1 @@ -53,7 +58,8 @@ WORKDIR /home/raja/workspace/build RUN . /opt/spack/share/spack/setup-env.sh && export LD_LIBRARY_PATH=/opt/view/lib:$LD_LIBRARY_PATH && \ cmake -DCMAKE_CXX_COMPILER=clang++ -DENABLE_OPENMP=On -DCMAKE_BUILD_TYPE=Debug .. && \ make -j 6 &&\ - ctest -T test --output-on-failure + ctest -T test --output-on-failure && \ + cd .. && rm -rf build FROM ghcr.io/rse-ops/clang-ubuntu-22.04:llvm-13.0.0 AS clang13.0.0 ENV GTEST_COLOR=1 @@ -62,7 +68,8 @@ WORKDIR /home/raja/workspace/build RUN . /opt/spack/share/spack/setup-env.sh && export LD_LIBRARY_PATH=/opt/view/lib:$LD_LIBRARY_PATH && \ cmake -DCMAKE_CXX_COMPILER=clang++ -DENABLE_OPENMP=On -DCMAKE_BUILD_TYPE=Release .. && \ make -j 6 &&\ - ctest -T test --output-on-failure + ctest -T test --output-on-failure && \ + cd .. && rm -rf build FROM ghcr.io/rse-ops/cuda:cuda-10.1.243-ubuntu-18.04 AS nvcc10.1.243 ENV GTEST_COLOR=1 @@ -70,7 +77,8 @@ COPY . /home/raja/workspace WORKDIR /home/raja/workspace/build RUN . /opt/spack/share/spack/setup-env.sh && spack load cuda && \ cmake -DCMAKE_CXX_COMPILER=g++ -DENABLE_CUDA=On -DCMAKE_CUDA_STANDARD=14 -DCMAKE_CUDA_ARCHITECTURES=70 -DENABLE_OPENMP=On .. && \ - make -j 4 + make -j 4 && \ + cd .. && rm -rf build FROM ghcr.io/rse-ops/cuda-ubuntu-20.04:cuda-11.1.1 AS nvcc11.1.1 ENV GTEST_COLOR=1 @@ -78,7 +86,8 @@ COPY . /home/raja/workspace WORKDIR /home/raja/workspace/build RUN . /opt/spack/share/spack/setup-env.sh && spack load cuda && \ cmake -DCMAKE_CXX_COMPILER=g++ -DENABLE_CUDA=On -DCMAKE_CUDA_STANDARD=14 -DCMAKE_CUDA_ARCHITECTURES=70 -DENABLE_OPENMP=On .. && \ - make -j 4 + make -j 4 && \ + cd .. && rm -rf build FROM ghcr.io/rse-ops/cuda-ubuntu-20.04:cuda-11.1.1 AS nvcc11.1.-debug ENV GTEST_COLOR=1 @@ -86,7 +95,8 @@ COPY . /home/raja/workspace WORKDIR /home/raja/workspace/build RUN . /opt/spack/share/spack/setup-env.sh && spack load cuda && \ cmake -DCMAKE_BUILD_TYPE=Debug -DCMAKE_CXX_COMPILER=g++ -DENABLE_CUDA=On -DCMAKE_CUDA_STANDARD=14 -DCMAKE_CUDA_ARCHITECTURES=70 -DENABLE_OPENMP=On .. && \ - make -j 4 + make -j 4 && \ + cd .. && rm -rf build FROM ghcr.io/rse-ops/hip-ubuntu-20.04:hip-5.1.3 AS hip5.1.3 ENV GTEST_COLOR=1 @@ -95,7 +105,8 @@ COPY . /home/raja/workspace WORKDIR /home/raja/workspace/build RUN . /opt/spack/share/spack/setup-env.sh && spack load hip llvm-amdgpu && \ cmake -DCMAKE_CXX_COMPILER=clang++ -DHIP_PATH=/opt -DENABLE_HIP=On -DENABLE_CUDA=Off -DRAJA_ENABLE_WARNINGS_AS_ERRORS=Off .. && \ - make -j 6 + make -j 6 && \ + cd .. && rm -rf build FROM ghcr.io/rse-ops/intel-ubuntu-22.04:intel-2022.1.0 AS sycl ENV GTEST_COLOR=1 @@ -104,4 +115,5 @@ WORKDIR /home/raja/workspace/build RUN /bin/bash -c "source /opt/view/setvars.sh && \ cmake -DCMAKE_CXX_COMPILER=dpcpp -DRAJA_ENABLE_SYCL=On -DENABLE_OPENMP=Off -DENABLE_ALL_WARNINGS=Off -DBLT_CXX_STD=c++17 .. && \ make -j 6 &&\ - ctest -T test --output-on-failure" + ctest -T test --output-on-failure" && \ + cd .. 
&& rm -rf build From 0f76d55b01a669e9149bebb20771f058553713d0 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Wed, 12 Jul 2023 14:55:09 -0700 Subject: [PATCH 13/34] Pull in variant addition --- scripts/radiuss-spack-configs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/radiuss-spack-configs b/scripts/radiuss-spack-configs index 307d08fa68..e2afaf8e00 160000 --- a/scripts/radiuss-spack-configs +++ b/scripts/radiuss-spack-configs @@ -1 +1 @@ -Subproject commit 307d08fa68cf367095ce112424356f44a475f2b9 +Subproject commit e2afaf8e00ec8110d4199bed5e39d36d987340e3 From 04b38fd68d79a8ef64e8ea4ee20ce5215e5d2077 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Wed, 12 Jul 2023 15:09:52 -0700 Subject: [PATCH 14/34] Add missing cmakedefine --- include/RAJA/config.hpp.in | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/RAJA/config.hpp.in b/include/RAJA/config.hpp.in index c6df29447c..957cdadec0 100644 --- a/include/RAJA/config.hpp.in +++ b/include/RAJA/config.hpp.in @@ -176,6 +176,8 @@ static_assert(RAJA_HAS_SOME_CXX14, #cmakedefine RAJA_ENABLE_CLANG_CUDA #cmakedefine RAJA_ENABLE_HIP #cmakedefine RAJA_ENABLE_SYCL + +#cmakedefine RAJA_ENABLE_OMP_TASK #cmakedefine RAJA_ENABLE_VECTORIZATION #cmakedefine RAJA_ENABLE_NV_TOOLS_EXT From 7dfe81a5fd9f557164e326aa980177b3caaf261d Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Thu, 13 Jul 2023 08:11:55 -0700 Subject: [PATCH 15/34] Remove GPU builds from azure --- Dockerfile | 66 ++++++++++++++++++++++----------------------- azure-pipelines.yml | 12 ++++----- 2 files changed, 39 insertions(+), 39 deletions(-) diff --git a/Dockerfile b/Dockerfile index aad0b078e5..64939607b9 100644 --- a/Dockerfile +++ b/Dockerfile @@ -71,42 +71,42 @@ RUN . /opt/spack/share/spack/setup-env.sh && export LD_LIBRARY_PATH=/opt/view/li ctest -T test --output-on-failure && \ cd .. && rm -rf build -FROM ghcr.io/rse-ops/cuda:cuda-10.1.243-ubuntu-18.04 AS nvcc10.1.243 -ENV GTEST_COLOR=1 -COPY . /home/raja/workspace -WORKDIR /home/raja/workspace/build -RUN . /opt/spack/share/spack/setup-env.sh && spack load cuda && \ - cmake -DCMAKE_CXX_COMPILER=g++ -DENABLE_CUDA=On -DCMAKE_CUDA_STANDARD=14 -DCMAKE_CUDA_ARCHITECTURES=70 -DENABLE_OPENMP=On .. && \ - make -j 4 && \ - cd .. && rm -rf build +##FROM ghcr.io/rse-ops/cuda:cuda-10.1.243-ubuntu-18.04 AS nvcc10.1.243 +##ENV GTEST_COLOR=1 +##COPY . /home/raja/workspace +##WORKDIR /home/raja/workspace/build +##RUN . /opt/spack/share/spack/setup-env.sh && spack load cuda && \ +## cmake -DCMAKE_CXX_COMPILER=g++ -DENABLE_CUDA=On -DCMAKE_CUDA_STANDARD=14 -DCMAKE_CUDA_ARCHITECTURES=70 -DENABLE_OPENMP=On .. && \ +## make -j 4 && \ +## cd .. && rm -rf build -FROM ghcr.io/rse-ops/cuda-ubuntu-20.04:cuda-11.1.1 AS nvcc11.1.1 -ENV GTEST_COLOR=1 -COPY . /home/raja/workspace -WORKDIR /home/raja/workspace/build -RUN . /opt/spack/share/spack/setup-env.sh && spack load cuda && \ - cmake -DCMAKE_CXX_COMPILER=g++ -DENABLE_CUDA=On -DCMAKE_CUDA_STANDARD=14 -DCMAKE_CUDA_ARCHITECTURES=70 -DENABLE_OPENMP=On .. && \ - make -j 4 && \ - cd .. && rm -rf build +##FROM ghcr.io/rse-ops/cuda-ubuntu-20.04:cuda-11.1.1 AS nvcc11.1.1 +##ENV GTEST_COLOR=1 +##COPY . /home/raja/workspace +##WORKDIR /home/raja/workspace/build +##RUN . /opt/spack/share/spack/setup-env.sh && spack load cuda && \ +## cmake -DCMAKE_CXX_COMPILER=g++ -DENABLE_CUDA=On -DCMAKE_CUDA_STANDARD=14 -DCMAKE_CUDA_ARCHITECTURES=70 -DENABLE_OPENMP=On .. && \ +## make -j 4 && \ +## cd .. 
&& rm -rf build -FROM ghcr.io/rse-ops/cuda-ubuntu-20.04:cuda-11.1.1 AS nvcc11.1.-debug -ENV GTEST_COLOR=1 -COPY . /home/raja/workspace -WORKDIR /home/raja/workspace/build -RUN . /opt/spack/share/spack/setup-env.sh && spack load cuda && \ - cmake -DCMAKE_BUILD_TYPE=Debug -DCMAKE_CXX_COMPILER=g++ -DENABLE_CUDA=On -DCMAKE_CUDA_STANDARD=14 -DCMAKE_CUDA_ARCHITECTURES=70 -DENABLE_OPENMP=On .. && \ - make -j 4 && \ - cd .. && rm -rf build +##FROM ghcr.io/rse-ops/cuda-ubuntu-20.04:cuda-11.1.1 AS nvcc11.1.-debug +##ENV GTEST_COLOR=1 +##COPY . /home/raja/workspace +##WORKDIR /home/raja/workspace/build +##RUN . /opt/spack/share/spack/setup-env.sh && spack load cuda && \ +## cmake -DCMAKE_BUILD_TYPE=Debug -DCMAKE_CXX_COMPILER=g++ -DENABLE_CUDA=On -DCMAKE_CUDA_STANDARD=14 -DCMAKE_CUDA_ARCHITECTURES=70 -DENABLE_OPENMP=On .. && \ +## make -j 4 && \ +## cd .. && rm -rf build -FROM ghcr.io/rse-ops/hip-ubuntu-20.04:hip-5.1.3 AS hip5.1.3 -ENV GTEST_COLOR=1 -ENV HCC_AMDGPU_TARGET=gfx900 -COPY . /home/raja/workspace -WORKDIR /home/raja/workspace/build -RUN . /opt/spack/share/spack/setup-env.sh && spack load hip llvm-amdgpu && \ - cmake -DCMAKE_CXX_COMPILER=clang++ -DHIP_PATH=/opt -DENABLE_HIP=On -DENABLE_CUDA=Off -DRAJA_ENABLE_WARNINGS_AS_ERRORS=Off .. && \ - make -j 6 && \ - cd .. && rm -rf build +##FROM ghcr.io/rse-ops/hip-ubuntu-20.04:hip-5.1.3 AS hip5.1.3 +##ENV GTEST_COLOR=1 +##ENV HCC_AMDGPU_TARGET=gfx900 +##COPY . /home/raja/workspace +##WORKDIR /home/raja/workspace/build +##RUN . /opt/spack/share/spack/setup-env.sh && spack load hip llvm-amdgpu && \ +## cmake -DCMAKE_CXX_COMPILER=clang++ -DHIP_PATH=/opt -DENABLE_HIP=On -DENABLE_CUDA=Off -DRAJA_ENABLE_WARNINGS_AS_ERRORS=Off .. && \ +## make -j 6 && \ +## cd .. && rm -rf build FROM ghcr.io/rse-ops/intel-ubuntu-22.04:intel-2022.1.0 AS sycl ENV GTEST_COLOR=1 diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 048c9b93e8..33e3bfa8c0 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -48,14 +48,14 @@ jobs: docker_target: clang11.0.0-debug clang13.0.0: docker_target: clang13.0.0 - nvcc10.1.243: - docker_target: nvcc10.1.243 - nvcc11.1.1: - docker_target: nvcc11.1.1 +## nvcc10.1.243: +## docker_target: nvcc10.1.243 +## nvcc11.1.1: +## docker_target: nvcc11.1.1 ## nvcc11.1.1-debug: ## docker_target: nvcc11.1.1-debug - hip5.1.3: - docker_target: hip5.1.3 +## hip5.1.3: +## docker_target: hip5.1.3 sycl: docker_target: sycl pool: From e7931c5810e09578f83312be6814bd8497e59713 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Thu, 13 Jul 2023 10:09:16 -0700 Subject: [PATCH 16/34] enable OpenMP task option for some algorithms --- .gitlab/ruby-build-and-test-extra.yml | 17 ++++++++++++++++- .gitlab/tioga-build-and-test-extra.yml | 2 +- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/.gitlab/ruby-build-and-test-extra.yml b/.gitlab/ruby-build-and-test-extra.yml index c6666e51d5..a8cd89a49b 100644 --- a/.gitlab/ruby-build-and-test-extra.yml +++ b/.gitlab/ruby-build-and-test-extra.yml @@ -12,7 +12,22 @@ # We keep ${PROJECT__VARIANTS} and ${PROJECT__DEPS} So that # the comparison with the original job is easier. -# No overridden jobs so far. 
+clang_14_0_6: + variables: + SPEC: " ~shared +openmp +omptask +tests %clang@14.0.6" + extends: .build_and_test_on_ruby + +gcc_10_3_1: + variables: + SPEC: " ~shared +openmp +omptask +tests %gcc@10.3.1" + RUBY_BUILD_AND_TEST_JOB_ALLOC: "--time=60 --nodes=1" + extends: .build_and_test_on_ruby + +intel_19_1_2_gcc_8_5_0: + variables: + SPEC: " ~shared +openmp +omptask +tests %intel@19.1.2.gcc.8.5.0" + RUBY_BUILD_AND_TEST_JOB_ALLOC: "--time=90 --nodes=1" + extends: .build_and_test_on_ruby ############ # Extra jobs diff --git a/.gitlab/tioga-build-and-test-extra.yml b/.gitlab/tioga-build-and-test-extra.yml index 25b9880982..a211ea2d8a 100644 --- a/.gitlab/tioga-build-and-test-extra.yml +++ b/.gitlab/tioga-build-and-test-extra.yml @@ -28,5 +28,5 @@ rocmcc_5_4_3_hip_desul_atomics: rocmcc_5_4_3_hip_openmp: variables: - SPEC: "~shared +rocm +openmp +tests amdgpu_target=gfx90a %rocmcc@5.4.3 ^hip@5.4.3 ^blt@develop" + SPEC: "~shared +rocm +openmp +omptask +tests amdgpu_target=gfx90a %rocmcc@5.4.3 ^hip@5.4.3 ^blt@develop" extends: .build_and_test_on_tioga From 8ad8bad2ff30ba568f5242ddf395b397391c133a Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Thu, 13 Jul 2023 10:52:27 -0700 Subject: [PATCH 17/34] Fix spec --- .gitlab/lassen-build-and-test-extra.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.gitlab/lassen-build-and-test-extra.yml b/.gitlab/lassen-build-and-test-extra.yml index 71950b0ba7..2e4118df90 100644 --- a/.gitlab/lassen-build-and-test-extra.yml +++ b/.gitlab/lassen-build-and-test-extra.yml @@ -36,6 +36,11 @@ xl_2022_08_19_gcc_8_3_1_cuda_11_7_0: # ${PROJECT__DEPS} in the extra jobs. There is no reason not to fully # describe the spec here. +gcc_8_3_1_omptask: + variables: + SPEC: " ~shared +openmp +omptask +tests %gcc@8.3.1" + extends: .build_and_test_on_lassen + gcc_8_3_1_cuda_11_5_0_ats_disabled: extends: .build_and_test_on_lassen variables: From 79a4a08d1f514242f24f66ecbe5e75f20332fbe1 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Thu, 13 Jul 2023 11:17:51 -0700 Subject: [PATCH 18/34] Add omptask option to raja perf package --- scripts/radiuss-spack-configs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/radiuss-spack-configs b/scripts/radiuss-spack-configs index e2afaf8e00..8935d00676 160000 --- a/scripts/radiuss-spack-configs +++ b/scripts/radiuss-spack-configs @@ -1 +1 @@ -Subproject commit e2afaf8e00ec8110d4199bed5e39d36d987340e3 +Subproject commit 8935d006761aeb495a648b1f1373a0ca540ef78c From 4f29f3510b99dbf228395fd457bad3be180f7e13 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Fri, 14 Jul 2023 07:58:09 -0700 Subject: [PATCH 19/34] Switch radiuss spack configs to main branch --- scripts/radiuss-spack-configs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/radiuss-spack-configs b/scripts/radiuss-spack-configs index 8935d00676..955f0c3a67 160000 --- a/scripts/radiuss-spack-configs +++ b/scripts/radiuss-spack-configs @@ -1 +1 @@ -Subproject commit 8935d006761aeb495a648b1f1373a0ca540ef78c +Subproject commit 955f0c3a67e28fac4648ac41787e93511207b22e From 8c214c168e7852574ea3f0e7bda410510794b831 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Fri, 21 Jul 2023 11:22:04 -0700 Subject: [PATCH 20/34] Fix warnings from KernelDimensionCalculator --- include/RAJA/policy/cuda/kernel/internal.hpp | 8 ++++---- include/RAJA/policy/hip/kernel/internal.hpp | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/include/RAJA/policy/cuda/kernel/internal.hpp b/include/RAJA/policy/cuda/kernel/internal.hpp index 
aaa6eb1215..82760d46ea 100644 --- a/include/RAJA/policy/cuda/kernel/internal.hpp +++ b/include/RAJA/policy/cuda/kernel/internal.hpp @@ -314,7 +314,7 @@ struct KernelDimensionCalculator; template < typename IdxT > - static void set_dimensions(CudaDims& dims, CudaDims& min_dims, IdxT len) + static void set_dimensions(CudaDims& RAJA_UNUSED_ARG(dims), CudaDims& RAJA_UNUSED_ARG(min_dims), IdxT len) { if (len > static_cast(0)) { RAJA_ABORT_OR_THROW("must know one of block_size or grid_size"); @@ -427,7 +427,7 @@ struct KernelDimensionCalculator; template < typename IdxT > - static void set_dimensions(CudaDims& dims, CudaDims& min_dims, IdxT len) + static void set_dimensions(CudaDims& dims, CudaDims& min_dims, IdxT RAJA_UNUSED_ARG(len)) { set_cuda_dim(dims.threads, static_cast(IndexMapper::block_size)); set_cuda_dim(min_dims.threads, static_cast(IndexMapper::block_size)); @@ -460,7 +460,7 @@ struct KernelDimensionCalculator; template < typename IdxT > - static void set_dimensions(CudaDims& dims, CudaDims& min_dims, IdxT len) + static void set_dimensions(CudaDims& dims, CudaDims& min_dims, IdxT RAJA_UNUSED_ARG(len)) { set_cuda_dim(dims.blocks, static_cast(IndexMapper::grid_size)); set_cuda_dim(min_dims.blocks, static_cast(IndexMapper::grid_size)); @@ -537,7 +537,7 @@ struct KernelDimensionCalculator; template < typename IdxT > - static void set_dimensions(CudaDims& dims, CudaDims& min_dims, IdxT len) + static void set_dimensions(CudaDims& dims, CudaDims& min_dims, IdxT RAJA_UNUSED_ARG(len)) { set_cuda_dim(dims.threads, static_cast(IndexMapper::block_size)); set_cuda_dim(dims.blocks, static_cast(IndexMapper::grid_size)); diff --git a/include/RAJA/policy/hip/kernel/internal.hpp b/include/RAJA/policy/hip/kernel/internal.hpp index 51e17d5837..f874b40e2d 100644 --- a/include/RAJA/policy/hip/kernel/internal.hpp +++ b/include/RAJA/policy/hip/kernel/internal.hpp @@ -314,7 +314,7 @@ struct KernelDimensionCalculator; template < typename IdxT > - static void set_dimensions(HipDims& dims, HipDims& min_dims, IdxT len) + static void set_dimensions(HipDims& RAJA_UNUSED_ARG(dims), HipDims& RAJA_UNUSED_ARG(min_dims), IdxT len) { if (len > static_cast(0)) { RAJA_ABORT_OR_THROW("must know one of block_size or grid_size"); @@ -427,7 +427,7 @@ struct KernelDimensionCalculator; template < typename IdxT > - static void set_dimensions(HipDims& dims, HipDims& min_dims, IdxT len) + static void set_dimensions(HipDims& dims, HipDims& min_dims, IdxT RAJA_UNUSED_ARG(len)) { set_hip_dim(dims.threads, static_cast(IndexMapper::block_size)); set_hip_dim(min_dims.threads, static_cast(IndexMapper::block_size)); @@ -460,7 +460,7 @@ struct KernelDimensionCalculator; template < typename IdxT > - static void set_dimensions(HipDims& dims, HipDims& min_dims, IdxT len) + static void set_dimensions(HipDims& dims, HipDims& min_dims, IdxT RAJA_UNUSED_ARG(len)) { set_hip_dim(dims.blocks, static_cast(IndexMapper::grid_size)); set_hip_dim(min_dims.blocks, static_cast(IndexMapper::grid_size)); @@ -537,7 +537,7 @@ struct KernelDimensionCalculator; template < typename IdxT > - static void set_dimensions(HipDims& dims, HipDims& min_dims, IdxT len) + static void set_dimensions(HipDims& dims, HipDims& min_dims, IdxT RAJA_UNUSED_ARG(len)) { set_hip_dim(dims.threads, static_cast(IndexMapper::block_size)); set_hip_dim(dims.blocks, static_cast(IndexMapper::grid_size)); From 1ea7b69a19b160f52e59fc107578654a8ecd3cb6 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 19 Jul 2023 14:11:32 -0700 Subject: [PATCH 21/34] zero working_array_x in kernel 
fission fusion tests --- .../tests/basic-fission-fusion-loop-impl.hpp | 7 +++---- .../tests/conditional-fission-fusion-loop-impl.hpp | 6 +++--- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/test/functional/kernel/basic-fission-fusion-loop/tests/basic-fission-fusion-loop-impl.hpp b/test/functional/kernel/basic-fission-fusion-loop/tests/basic-fission-fusion-loop-impl.hpp index ec32fdddec..bea0fd1564 100644 --- a/test/functional/kernel/basic-fission-fusion-loop/tests/basic-fission-fusion-loop-impl.hpp +++ b/test/functional/kernel/basic-fission-fusion-loop/tests/basic-fission-fusion-loop-impl.hpp @@ -53,10 +53,9 @@ void KernelBasicFissionFusionLoopTestImpl( &test_array_y); - memset(static_cast(test_array_x), - 0, - sizeof(DATA_TYPE) * RAJA::stripIndexType(data_len)); - + working_res.memset(working_array_x, + 0, + sizeof(DATA_TYPE) * RAJA::stripIndexType(data_len)); RAJA::kernel( RAJA::make_tuple(seg, seg), diff --git a/test/functional/kernel/conditional-fission-fusion-loop/tests/conditional-fission-fusion-loop-impl.hpp b/test/functional/kernel/conditional-fission-fusion-loop/tests/conditional-fission-fusion-loop-impl.hpp index 97761bacb6..52cd9fc861 100644 --- a/test/functional/kernel/conditional-fission-fusion-loop/tests/conditional-fission-fusion-loop-impl.hpp +++ b/test/functional/kernel/conditional-fission-fusion-loop/tests/conditional-fission-fusion-loop-impl.hpp @@ -53,9 +53,9 @@ void KernelConditionalFissionFusionLoopTestImpl( &test_array_y); - memset(static_cast(test_array_x), - 0, - sizeof(DATA_TYPE) * RAJA::stripIndexType(data_len)); + working_res.memset(working_array_x, + 0, + sizeof(DATA_TYPE) * RAJA::stripIndexType(data_len)); for (int param = 0; param < 2; ++param) { From 784f85f29fdf3f5e38d7bad6081d6fcc97378c8d Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Tue, 1 Aug 2023 08:26:31 -0700 Subject: [PATCH 22/34] Make type use consistent in hip/cuda kernel and occupancy calculator functions --- include/RAJA/policy/cuda/MemUtils_CUDA.hpp | 33 +++++----- .../RAJA/policy/cuda/kernel/CudaKernel.hpp | 60 +++++++++---------- include/RAJA/policy/hip/MemUtils_HIP.hpp | 21 ++++--- include/RAJA/policy/hip/kernel/HipKernel.hpp | 12 ++-- 4 files changed, 65 insertions(+), 61 deletions(-) diff --git a/include/RAJA/policy/cuda/MemUtils_CUDA.hpp b/include/RAJA/policy/cuda/MemUtils_CUDA.hpp index 7eae7e29c0..697ed225a0 100644 --- a/include/RAJA/policy/cuda/MemUtils_CUDA.hpp +++ b/include/RAJA/policy/cuda/MemUtils_CUDA.hpp @@ -318,19 +318,20 @@ size_t cuda_max_blocks(size_t block_size) struct CudaOccMaxBlocksThreadsData { - int prev_shmem_size; + size_t prev_shmem_size; int max_blocks; int max_threads; }; template < typename RAJA_UNUSED_ARG(UniqueMarker), typename Func > RAJA_INLINE -void cuda_occupancy_max_blocks_threads(Func&& func, int shmem_size, - size_t &max_blocks, size_t &max_threads) +void cuda_occupancy_max_blocks_threads(Func&& func, size_t shmem_size, + int &max_blocks, int &max_threads) { static constexpr int uninitialized = -1; + static constexpr size_t uninitialized_size_t = std::numeric_limits::max(); static thread_local CudaOccMaxBlocksThreadsData data = { - uninitialized, uninitialized, uninitialized}; + uninitialized_size_t, uninitialized, uninitialized}; if (data.prev_shmem_size != shmem_size) { @@ -348,19 +349,20 @@ void cuda_occupancy_max_blocks_threads(Func&& func, int shmem_size, struct CudaOccMaxBlocksFixedThreadsData { - int prev_shmem_size; + size_t prev_shmem_size; int max_blocks; int multiProcessorCount; }; -template < typename 
RAJA_UNUSED_ARG(UniqueMarker), size_t num_threads, typename Func > +template < typename RAJA_UNUSED_ARG(UniqueMarker), int num_threads, typename Func > RAJA_INLINE -void cuda_occupancy_max_blocks(Func&& func, int shmem_size, - size_t &max_blocks) +void cuda_occupancy_max_blocks(Func&& func, size_t shmem_size, + int &max_blocks) { static constexpr int uninitialized = -1; + static constexpr size_t uninitialized_size_t = std::numeric_limits::max(); static thread_local CudaOccMaxBlocksFixedThreadsData data = { - uninitialized, uninitialized, uninitialized}; + uninitialized_size_t, uninitialized, uninitialized}; if (data.prev_shmem_size != shmem_size) { @@ -385,7 +387,7 @@ void cuda_occupancy_max_blocks(Func&& func, int shmem_size, struct CudaOccMaxBlocksVariableThreadsData { - int prev_shmem_size; + size_t prev_shmem_size; int prev_num_threads; int max_blocks; int multiProcessorCount; @@ -393,20 +395,19 @@ struct CudaOccMaxBlocksVariableThreadsData template < typename RAJA_UNUSED_ARG(UniqueMarker), typename Func > RAJA_INLINE -void cuda_occupancy_max_blocks(Func&& func, int shmem_size, - size_t &max_blocks, size_t num_threads) +void cuda_occupancy_max_blocks(Func&& func, size_t shmem_size, + int &max_blocks, int num_threads) { static constexpr int uninitialized = 0; + static constexpr size_t uninitialized_size_t = std::numeric_limits::max(); static thread_local CudaOccMaxBlocksVariableThreadsData data = { - uninitialized, uninitialized, uninitialized, uninitialized}; + uninitialized_size_t, uninitialized, uninitialized, uninitialized}; if ( data.prev_shmem_size != shmem_size || data.prev_num_threads != num_threads ) { - int tmp_max_blocks; cudaErrchk(cudaOccupancyMaxActiveBlocksPerMultiprocessor( - &tmp_max_blocks, func, static_cast(num_threads), shmem_size)); - data.max_blocks = tmp_max_blocks; + &data.max_blocks, func, num_threads, shmem_size)); if (data.multiProcessorCount == uninitialized) { diff --git a/include/RAJA/policy/cuda/kernel/CudaKernel.hpp b/include/RAJA/policy/cuda/kernel/CudaKernel.hpp index 8de879aadf..0e2966e8a8 100644 --- a/include/RAJA/policy/cuda/kernel/CudaKernel.hpp +++ b/include/RAJA/policy/cuda/kernel/CudaKernel.hpp @@ -50,7 +50,7 @@ namespace RAJA * Num_blocks is chosen to maximize the number of blocks running concurrently. * Blocks per SM must be chosen by the user. */ -template +template struct cuda_explicit_launch {}; /*! @@ -66,7 +66,7 @@ struct cuda_explicit_launch {}; * Num_threads is 1024, which may not be appropriate for all kernels. * Blocks per SM defaults to 1. */ -template +template using cuda_launch = cuda_explicit_launch; /*! @@ -74,7 +74,7 @@ using cuda_launch = cuda_explicit_launch +template using cuda_occ_calc_launch = cuda_explicit_launch; namespace statement @@ -97,7 +97,7 @@ struct CudaKernelExt * calculator determine the unspecified values. * The kernel launch is synchronous. */ -template +template using CudaKernelExp = CudaKernelExt, EnclosedStmts...>; @@ -107,7 +107,7 @@ using CudaKernelExp = * calculator determine the unspecified values. * The kernel launch is asynchronous. */ -template +template using CudaKernelExpAsync = CudaKernelExt, EnclosedStmts...>; @@ -134,9 +134,9 @@ using CudaKernelOccAsync = * number of threads (specified by num_threads) * The kernel launch is synchronous. */ -template +template using CudaKernelFixed = - CudaKernelExt::max(), num_threads>, + CudaKernelExt::max(), num_threads>, EnclosedStmts...>; /*! 
@@ -144,9 +144,9 @@ using CudaKernelFixed = * number of threads (specified by num_threads) * The kernel launch is asynchronous. */ -template +template using CudaKernelFixedAsync = - CudaKernelExt::max(), num_threads>, + CudaKernelExt::max(), num_threads>, EnclosedStmts...>; /*! @@ -154,9 +154,9 @@ using CudaKernelFixedAsync = * number of threads (specified by num_threads) and min blocks per sm. * The kernel launch is synchronous. */ -template +template using CudaKernelFixedSM = - CudaKernelExt::max(), num_threads, blocks_per_sm>, + CudaKernelExt::max(), num_threads, blocks_per_sm>, EnclosedStmts...>; /*! @@ -164,9 +164,9 @@ using CudaKernelFixedSM = * number of threads (specified by num_threads) and min blocks per sm. * The kernel launch is asynchronous. */ -template +template using CudaKernelFixedSMAsync = - CudaKernelExt::max(), num_threads, blocks_per_sm>, + CudaKernelExt::max(), num_threads, blocks_per_sm>, EnclosedStmts...>; /*! @@ -210,7 +210,7 @@ __global__ void CudaKernelLauncher(Data data) * * This launcher is used by the CudaKerelFixed policies. */ -template +template __launch_bounds__(BlockSize, BlocksPerSM) __global__ void CudaKernelLauncherFixed(Data data) { @@ -231,7 +231,7 @@ __launch_bounds__(BlockSize, BlocksPerSM) __global__ * The default case handles BlockSize != 0 and gets the fixed max block size * version of the kernel. */ -template +template struct CudaKernelLauncherGetter { using type = camp::decay)>; @@ -270,7 +270,7 @@ struct CudaLaunchHelper; * The user may specify the number of threads and blocks or let one or both be * determined at runtime using the CUDA occupancy calculator. */ -template +template struct CudaLaunchHelper,StmtList,Data,Types> { using Self = CudaLaunchHelper; @@ -281,8 +281,8 @@ struct CudaLaunchHelper; - inline static void recommended_blocks_threads(int shmem_size, - size_t &recommended_blocks, size_t &recommended_threads) + inline static void recommended_blocks_threads(size_t shmem_size, + int &recommended_blocks, int &recommended_threads) { auto func = kernelGetter_t::get(); @@ -337,7 +337,7 @@ struct CudaLaunchHelper 0 || num_threads > 0) { // // Setup shared memory buffers // - int shmem = 0; + size_t shmem = 0; // // Compute the recommended physical kernel blocks and threads // - size_t recommended_blocks; - size_t recommended_threads; + int recommended_blocks; + int recommended_threads; launch_t::recommended_blocks_threads( shmem, recommended_blocks, recommended_threads); @@ -518,7 +518,7 @@ struct StatementExecutor< // // Compute the MAX physical kernel threads // - size_t max_threads; + int max_threads; launch_t::max_threads(shmem, max_threads); @@ -551,10 +551,10 @@ struct StatementExecutor< // // Compute the MAX physical kernel blocks // - size_t max_blocks; + int max_blocks; launch_t::max_blocks(shmem, max_blocks, launch_dims.num_threads()); - size_t use_blocks; + int use_blocks; if ( launch_dims.num_threads() == recommended_threads ) { diff --git a/include/RAJA/policy/hip/MemUtils_HIP.hpp b/include/RAJA/policy/hip/MemUtils_HIP.hpp index 9006dc54e2..bc6a454782 100644 --- a/include/RAJA/policy/hip/MemUtils_HIP.hpp +++ b/include/RAJA/policy/hip/MemUtils_HIP.hpp @@ -319,19 +319,20 @@ int hip_max_blocks(int block_size) struct HipOccMaxBlocksThreadsData { - int prev_shmem_size; + size_t prev_shmem_size; int max_blocks; int max_threads; }; template < typename RAJA_UNUSED_ARG(UniqueMarker), typename Func > RAJA_INLINE -void hip_occupancy_max_blocks_threads(Func&& func, int shmem_size, +void hip_occupancy_max_blocks_threads(Func&& func, 
size_t shmem_size, int &max_blocks, int &max_threads) { static constexpr int uninitialized = -1; + static constexpr size_t uninitialized_size_t = std::numeric_limits::max(); static thread_local HipOccMaxBlocksThreadsData data = { - uninitialized, uninitialized, uninitialized}; + uninitialized_size_t, uninitialized, uninitialized}; if (data.prev_shmem_size != shmem_size) { @@ -356,19 +357,20 @@ void hip_occupancy_max_blocks_threads(Func&& func, int shmem_size, struct HipOccMaxBlocksFixedThreadsData { - int prev_shmem_size; + size_t prev_shmem_size; int max_blocks; int multiProcessorCount; }; template < typename RAJA_UNUSED_ARG(UniqueMarker), int num_threads, typename Func > RAJA_INLINE -void hip_occupancy_max_blocks(Func&& func, int shmem_size, +void hip_occupancy_max_blocks(Func&& func, size_t shmem_size, int &max_blocks) { static constexpr int uninitialized = -1; + static constexpr size_t uninitialized_size_t = std::numeric_limits::max(); static thread_local HipOccMaxBlocksFixedThreadsData data = { - uninitialized, uninitialized, uninitialized}; + uninitialized_size_t, uninitialized, uninitialized}; if (data.prev_shmem_size != shmem_size) { @@ -399,7 +401,7 @@ void hip_occupancy_max_blocks(Func&& func, int shmem_size, struct HipOccMaxBlocksVariableThreadsData { - int prev_shmem_size; + size_t prev_shmem_size; int prev_num_threads; int max_blocks; int multiProcessorCount; @@ -407,12 +409,13 @@ struct HipOccMaxBlocksVariableThreadsData template < typename RAJA_UNUSED_ARG(UniqueMarker), typename Func > RAJA_INLINE -void hip_occupancy_max_blocks(Func&& func, int shmem_size, +void hip_occupancy_max_blocks(Func&& func, size_t shmem_size, int &max_blocks, int num_threads) { static constexpr int uninitialized = 0; + static constexpr size_t uninitialized_size_t = std::numeric_limits::max(); static thread_local HipOccMaxBlocksVariableThreadsData data = { - uninitialized, uninitialized, uninitialized, uninitialized}; + uninitialized_size_t, uninitialized, uninitialized, uninitialized}; if ( data.prev_shmem_size != shmem_size || data.prev_num_threads != num_threads ) { diff --git a/include/RAJA/policy/hip/kernel/HipKernel.hpp b/include/RAJA/policy/hip/kernel/HipKernel.hpp index 016fc980b8..f7d6a0e6a4 100644 --- a/include/RAJA/policy/hip/kernel/HipKernel.hpp +++ b/include/RAJA/policy/hip/kernel/HipKernel.hpp @@ -189,7 +189,7 @@ __global__ void HipKernelLauncher(Data data) * * This launcher is used by the HipKerelFixed policies. */ -template +template __launch_bounds__(BlockSize, 1) __global__ void HipKernelLauncherFixed(Data data) { @@ -210,7 +210,7 @@ __launch_bounds__(BlockSize, 1) __global__ * The default case handles BlockSize != 0 and gets the fixed max block size * version of the kernel. */ -template +template struct HipKernelLauncherGetter { using type = camp::decay)>; @@ -260,7 +260,7 @@ struct HipLaunchHelper,Stmt using kernelGetter_t = HipKernelLauncherGetter<(num_threads <= 0) ? 
                                                 0 : num_threads, Data, executor_t>;

-  inline static void recommended_blocks_threads(int shmem_size,
+  inline static void recommended_blocks_threads(size_t shmem_size,
       int &recommended_blocks, int &recommended_threads)
   {
     auto func = kernelGetter_t::get();
@@ -316,7 +316,7 @@ struct HipLaunchHelper,Stmt
     }
   }

-  inline static void max_threads(int RAJA_UNUSED_ARG(shmem_size), int &max_threads)
+  inline static void max_threads(size_t RAJA_UNUSED_ARG(shmem_size), int &max_threads)
   {
     if (num_threads <= 0) {
@@ -336,7 +336,7 @@ struct HipLaunchHelper,Stmt
     }
   }

-  inline static void max_blocks(int shmem_size,
+  inline static void max_blocks(size_t shmem_size,
       int &max_blocks, int actual_threads)
   {
     auto func = kernelGetter_t::get();
@@ -482,7 +482,7 @@ struct StatementExecutor<
   //
   // Setup shared memory buffers
   //
-  int shmem = 0;
+  size_t shmem = 0;

   //
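The int-to-size_t change to the shared-memory parameters above matches the CUDA/HIP runtime occupancy APIs, which take the dynamic shared-memory size as a size_t; carrying it as int narrowed the value. A minimal sketch of the kind of call these helpers wrap (the kernel name is illustrative, not from the patch):

```cpp
#include <cuda_runtime.h>

__global__ void example_kernel() {}  // illustrative kernel, not from the patch

void query_occupancy(size_t shmem_size)
{
  int min_grid_size = 0;  // suggested minimum grid size to reach full occupancy
  int block_size    = 0;  // suggested block size
  // dynamicSMemSize is a size_t in the runtime API, so passing an int-typed
  // shared memory size would narrow the value on the way in.
  cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size,
                                     example_kernel, shmem_size);
}
```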
From 5f211b531043129f477b63ad113c58d280667251 Mon Sep 17 00:00:00 2001
From: Jason Burmark
Date: Thu, 3 Aug 2023 08:16:51 -0700
Subject: [PATCH 23/34] add occupancy policies

---
 docs/sphinx/user_guide/feature/policies.rst | 25 ++++++++++++++++-----
 include/RAJA/policy/cuda/policy.hpp         | 16 +++++++++++++
 include/RAJA/policy/hip/policy.hpp          |  8 +++++++
 3 files changed, 44 insertions(+), 5 deletions(-)

diff --git a/docs/sphinx/user_guide/feature/policies.rst b/docs/sphinx/user_guide/feature/policies.rst
index b425473aba..760d6ee7de 100644
--- a/docs/sphinx/user_guide/feature/policies.rst
+++ b/docs/sphinx/user_guide/feature/policies.rst
@@ -279,12 +279,27 @@ policies have the prefix ``hip_``.
 ========================================= ============= =======================================
 CUDA/HIP Execution Policies               Works with    Brief description
 ========================================= ============= =======================================
 cuda/hip_exec                             forall,       Execute loop iterations
-                                          scan,         in a GPU kernel launched
-                                          sort          with given thread-block
-                                                        size. Note that the
+                                          scan,         directly mapped to global threads
+                                          sort          in a GPU kernel launched
+                                                        with given thread-block
+                                                        size and unbounded grid size.
+                                                        Note that the thread-block
+                                                        size must be provided,
+                                                        there is no default.
+ cuda/hip_exec_occupancy                  forall        Execute loop iterations
+                                                        mapped to global threads via
+                                                        grid striding with multiple
+                                                        iterations per global thread
+                                                        in a GPU kernel launched
+                                                        with given thread-block
+                                                        size and grid size bounded
+                                                        by the maximum occupancy of
+                                                        the kernel. Note that the
                                                         thread-block size must
-                                                        be provided, there is
-                                                        no default.
+                                                        be provided, there is no
+                                                        default. Note this can improve
+                                                        reducer performance in kernels
+                                                        with large iteration counts.
 cuda/hip_launch_t                         launch        Launches a device kernel,
                                                         any code expressed within
                                                         the lambda is executed
diff --git a/include/RAJA/policy/cuda/policy.hpp b/include/RAJA/policy/cuda/policy.hpp
index 64ec12a4cb..72ba5c8f0c 100644
--- a/include/RAJA/policy/cuda/policy.hpp
+++ b/include/RAJA/policy/cuda/policy.hpp
@@ -805,6 +805,22 @@ template 
 using cuda_exec_async = policy::cuda::cuda_exec_explicit<
     iteration_mapping::Direct, cuda::global_x, policy::cuda::MIN_BLOCKS_PER_SM, true>;

+template 
+using cuda_exec_occupancy_explicit = policy::cuda::cuda_exec_explicit<
+    iteration_mapping::StridedLoop, cuda::global_x, BLOCKS_PER_SM, Async>;
+
+template 
+using cuda_exec_occupancy_explicit_async = policy::cuda::cuda_exec_explicit<
+    iteration_mapping::StridedLoop, cuda::global_x, BLOCKS_PER_SM, true>;
+
+template 
+using cuda_exec_occupancy = policy::cuda::cuda_exec_explicit<
+    iteration_mapping::StridedLoop, cuda::global_x, policy::cuda::MIN_BLOCKS_PER_SM, Async>;
+
+template 
+using cuda_exec_occupancy_async = policy::cuda::cuda_exec_explicit<
+    iteration_mapping::StridedLoop, cuda::global_x, policy::cuda::MIN_BLOCKS_PER_SM, true>;
+
 template 
 using cuda_work_explicit = policy::cuda::cuda_work_explicit;

diff --git a/include/RAJA/policy/hip/policy.hpp b/include/RAJA/policy/hip/policy.hpp
index 8b7f29462d..ecc5b756b4 100644
--- a/include/RAJA/policy/hip/policy.hpp
+++ b/include/RAJA/policy/hip/policy.hpp
@@ -783,6 +783,14 @@ template 
 using hip_exec_async = policy::hip::hip_exec<
     iteration_mapping::Direct, hip::global_x, true>;

+template 
+using hip_exec_occupancy = policy::hip::hip_exec<
+    iteration_mapping::StridedLoop, hip::global_x, Async>;
+
+template 
+using hip_exec_occupancy_async = policy::hip::hip_exec<
+    iteration_mapping::StridedLoop, hip::global_x, true>;
+
 using policy::hip::hip_work;

 template 
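The occupancy policies added above are drop-in forall execution policies. A minimal usage sketch, assuming device-accessible arrays (names and extent are illustrative; a later patch in this series renames these aliases to cuda/hip_exec_occ_calc):

```cpp
#include "RAJA/RAJA.hpp"

void saxpy(double* x, double* y, double a, int N)
{
  // The occupancy calculator chooses the grid size; each global thread then
  // grid-strides over its share of the N iterations.
  RAJA::forall<RAJA::cuda_exec_occupancy<256>>(
      RAJA::RangeSegment(0, N),
      [=] RAJA_DEVICE (int i) { y[i] = a * x[i] + y[i]; });
}
```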
From d2abaec15680ceb37ec03155289b29cecaee6c2b Mon Sep 17 00:00:00 2001
From: Jason Burmark
Date: Thu, 3 Aug 2023 08:38:13 -0700
Subject: [PATCH 24/34] add more comments

---
 include/RAJA/policy/cuda/policy.hpp | 9 ++++++++-
 include/RAJA/policy/hip/policy.hpp  | 8 ++++++++
 2 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/include/RAJA/policy/cuda/policy.hpp b/include/RAJA/policy/cuda/policy.hpp
index 72ba5c8f0c..499993b84e 100644
--- a/include/RAJA/policy/cuda/policy.hpp
+++ b/include/RAJA/policy/cuda/policy.hpp
@@ -772,7 +772,7 @@ using global_z = IndexGlobal;

 }  // namespace cuda

-
+// policies usable with forall, scan, and sort
 template 
 using cuda_exec_grid_explicit = policy::cuda::cuda_exec_explicit<
     iteration_mapping::Direct, cuda::global_x, BLOCKS_PER_SM, Async>;
@@ -821,6 +821,7 @@ template 
 using cuda_exec_occupancy_async = policy::cuda::cuda_exec_explicit<
     iteration_mapping::StridedLoop, cuda::global_x, policy::cuda::MIN_BLOCKS_PER_SM, true>;

+// policies usable with WorkGroup
 template 
 using cuda_work_explicit = policy::cuda::cuda_work_explicit;
@@ -835,13 +836,16 @@ using cuda_work_async = policy::cuda::cuda_work_explicit
 using cuda_launch_explicit_t = policy::cuda::cuda_launch_explicit_t;
@@ -869,6 +875,7 @@ template 
 using cuda_launch_t = policy::cuda::cuda_launch_explicit_t;

+// policies usable with kernel and launch
 template < typename ...
            indexers >
 using cuda_indexer_direct = policy::cuda::cuda_indexer<
     iteration_mapping::Direct,
diff --git a/include/RAJA/policy/hip/policy.hpp b/include/RAJA/policy/hip/policy.hpp
index ecc5b756b4..527c3c4411 100644
--- a/include/RAJA/policy/hip/policy.hpp
+++ b/include/RAJA/policy/hip/policy.hpp
@@ -767,6 +767,7 @@ using global_z = IndexGlobal;

 }  // namespace hip

+// policies usable with forall, scan, and sort
 template 
 using hip_exec_grid = policy::hip::hip_exec<
     iteration_mapping::Direct, hip::global_x, Async>;
@@ -791,6 +792,7 @@ template 
 using hip_exec_occupancy_async = policy::hip::hip_exec<
     iteration_mapping::StridedLoop, hip::global_x, true>;

+// policies usable with WorkGroup
 using policy::hip::hip_work;

 template 
@@ -798,13 +800,16 @@ using hip_work_async = policy::hip::hip_work;

 using policy::hip::unordered_hip_loop_y_block_iter_x_threadblock_average;

+// policies usable with atomics
 using policy::hip::hip_atomic;
 using policy::hip::hip_atomic_explicit;

+// policies usable with reducers
 using policy::hip::hip_reduce_base;
 using policy::hip::hip_reduce;
 using policy::hip::hip_reduce_atomic;

+// policies usable with kernel
 using policy::hip::hip_block_reduce;
 using policy::hip::hip_warp_reduce;
@@ -823,11 +828,14 @@ using policy::hip::hip_warp_masked_loop;
 using policy::hip::hip_thread_masked_direct;
 using policy::hip::hip_thread_masked_loop;

+// policies usable with synchronize
 using policy::hip::hip_synchronize;

+// policies usable with launch
 using policy::hip::hip_launch_t;

+// policies usable with kernel and launch
 template < typename ... indexers >
 using hip_indexer_direct = policy::hip::hip_indexer<
     iteration_mapping::Direct,
 */
 inline
-hip_dim_t fitHipDims(unsigned int limit, hip_dim_t result, hip_dim_t minimum = hip_dim_t()){
+hip_dim_t fitHipDims(hip_dim_member_t limit, hip_dim_t result, hip_dim_t minimum = hip_dim_t()){

   // clamp things to at least 1

From d3f47c8ae7f93177f9490bf4828a0d4749f8ea84 Mon Sep 17 00:00:00 2001
From: Jason Burmark
Date: Thu, 3 Aug 2023 08:43:06 -0700
Subject: [PATCH 26/34] make naming more consistent between kernel and forall
 policy names

---
 include/RAJA/policy/cuda/policy.hpp | 8 ++++----
 include/RAJA/policy/hip/policy.hpp  | 4 ++--
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/include/RAJA/policy/cuda/policy.hpp b/include/RAJA/policy/cuda/policy.hpp
index 499993b84e..58424edb88 100644
--- a/include/RAJA/policy/cuda/policy.hpp
+++ b/include/RAJA/policy/cuda/policy.hpp
@@ -806,19 +806,19 @@ using cuda_exec_async = policy::cuda::cuda_exec_explicit<
     iteration_mapping::Direct, cuda::global_x, policy::cuda::MIN_BLOCKS_PER_SM, true>;

 template 
-using cuda_exec_occupancy_explicit = policy::cuda::cuda_exec_explicit<
+using cuda_exec_occ_calc_explicit = policy::cuda::cuda_exec_explicit<
     iteration_mapping::StridedLoop, cuda::global_x, BLOCKS_PER_SM, Async>;

 template 
-using cuda_exec_occupancy_explicit_async = policy::cuda::cuda_exec_explicit<
+using cuda_exec_occ_calc_explicit_async = policy::cuda::cuda_exec_explicit<
     iteration_mapping::StridedLoop, cuda::global_x, BLOCKS_PER_SM, true>;

 template 
-using cuda_exec_occupancy = policy::cuda::cuda_exec_explicit<
+using cuda_exec_occ_calc = policy::cuda::cuda_exec_explicit<
     iteration_mapping::StridedLoop, cuda::global_x, policy::cuda::MIN_BLOCKS_PER_SM, Async>;

 template 
-using cuda_exec_occupancy_async = policy::cuda::cuda_exec_explicit<
+using cuda_exec_occ_calc_async = policy::cuda::cuda_exec_explicit<
     iteration_mapping::StridedLoop, cuda::global_x, policy::cuda::MIN_BLOCKS_PER_SM, true>;

diff --git a/include/RAJA/policy/hip/policy.hpp b/include/RAJA/policy/hip/policy.hpp
index 527c3c4411..2000bde42b 100644
--- a/include/RAJA/policy/hip/policy.hpp
+++ b/include/RAJA/policy/hip/policy.hpp
@@ -785,11 +785,11 @@ using hip_exec_async = policy::hip::hip_exec<
     iteration_mapping::Direct, hip::global_x, true>;

 template 
-using hip_exec_occupancy = policy::hip::hip_exec<
+using hip_exec_occ_calc = policy::hip::hip_exec<
     iteration_mapping::StridedLoop, hip::global_x, Async>;

 template 
-using hip_exec_occupancy_async = policy::hip::hip_exec<
+using hip_exec_occ_calc_async = policy::hip::hip_exec<
     iteration_mapping::StridedLoop, hip::global_x, true>;

From 0865243d0ea0c2c2fa5a8d3dff8d9ac9ba215502 Mon Sep 17 00:00:00 2001
From: Jason Burmark
Date: Thu, 3 Aug 2023 08:43:58 -0700
Subject: [PATCH 27/34] fix docs

---
 docs/sphinx/user_guide/feature/policies.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/sphinx/user_guide/feature/policies.rst b/docs/sphinx/user_guide/feature/policies.rst
index 760d6ee7de..ed6808f6e6 100644
--- a/docs/sphinx/user_guide/feature/policies.rst
+++ b/docs/sphinx/user_guide/feature/policies.rst
@@ -286,7 +286,7 @@ policies have the prefix ``hip_``.
                                                         Note that the thread-block
                                                         size must be provided,
                                                         there is no default.
- cuda/hip_exec_occupancy                  forall        Execute loop iterations
+ cuda/hip_exec_occ_calc                   forall        Execute loop iterations
                                                         mapped to global threads via
                                                         grid striding with multiple
                                                         iterations per global thread

From 282fb01bf78f233b7a9cc6be40b105fd4e96c7ad Mon Sep 17 00:00:00 2001
From: Jason Burmark
Date: Thu, 3 Aug 2023 08:49:17 -0700
Subject: [PATCH 28/34] Add occ_calc and grid forall policy tests

---
 test/include/RAJA_test-forall-execpol.hpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/test/include/RAJA_test-forall-execpol.hpp b/test/include/RAJA_test-forall-execpol.hpp
index 85c48e96d9..520f1af413 100644
--- a/test/include/RAJA_test-forall-execpol.hpp
+++ b/test/include/RAJA_test-forall-execpol.hpp
@@ -120,7 +120,8 @@ using OpenMPTargetForallAtomicExecPols = OpenMPTargetForallExecPols;
 #if defined(RAJA_ENABLE_CUDA)
 using CudaForallExecPols = camp::list< RAJA::cuda_exec<128>,
-                                       RAJA::cuda_exec<256>,
+                                       RAJA::cuda_exec_occ_calc<256>,
+                                       RAJA::cuda_exec_grid<256, 64>,
                                        RAJA::cuda_exec_explicit<256,2> >;

 using CudaForallReduceExecPols = CudaForallExecPols;
@@ -131,7 +132,8 @@ using CudaForallAtomicExecPols = CudaForallExecPols;
 #if defined(RAJA_ENABLE_HIP)
 using HipForallExecPols = camp::list< RAJA::hip_exec<128>,
-                                      RAJA::hip_exec<256> >;
+                                      RAJA::hip_exec_occ_calc<256>,
+                                      RAJA::hip_exec_grid<256, 64> >;

 using HipForallReduceExecPols = HipForallExecPols;

From 27358456564d571c2b96b1f1d0e4b80961f865ef Mon Sep 17 00:00:00 2001
From: Jason Burmark
Date: Thu, 3 Aug 2023 09:06:09 -0700
Subject: [PATCH 29/34] Change grid policies to use strided loop

add documentation for grid forall policies
---
 docs/sphinx/user_guide/feature/policies.rst | 10 ++++++++++
 include/RAJA/policy/cuda/policy.hpp         |  8 ++++----
 include/RAJA/policy/hip/policy.hpp          |  4 ++--
 3 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/docs/sphinx/user_guide/feature/policies.rst b/docs/sphinx/user_guide/feature/policies.rst
index ed6808f6e6..2f707c5efd 100644
--- a/docs/sphinx/user_guide/feature/policies.rst
+++ b/docs/sphinx/user_guide/feature/policies.rst
@@ -286,6 +286,16 @@ policies have the prefix ``hip_``.
                                                         Note that the thread-block
                                                         size must be provided,
                                                         there is no default.
+ cuda/hip_exec_grid                       forall,       Execute loop iterations
+                                                        mapped to global threads via
+                                                        grid striding with multiple
+                                                        iterations per global thread
+                                                        in a GPU kernel launched
+                                                        with given thread-block
+                                                        size and grid size.
+                                                        Note that the thread-block
+                                                        size and grid size must be
+                                                        provided, there is no default.
 cuda/hip_exec_occ_calc                   forall        Execute loop iterations
                                                        mapped to global threads via
                                                        grid striding with multiple
                                                        iterations per global thread
                                                        in a GPU kernel launched
                                                        with given thread-block
                                                        size and grid size bounded
                                                        by the maximum occupancy of
                                                        the kernel. Note that the
                                                        thread-block size must
                                                        be provided, there is no
                                                        default. Note this can improve
                                                        reducer performance in kernels
                                                        with large iteration counts.
diff --git a/include/RAJA/policy/cuda/policy.hpp b/include/RAJA/policy/cuda/policy.hpp
index 58424edb88..fec6d70b7b 100644
--- a/include/RAJA/policy/cuda/policy.hpp
+++ b/include/RAJA/policy/cuda/policy.hpp
@@ -775,19 +775,19 @@ using global_z = IndexGlobal;
 // policies usable with forall, scan, and sort
 template 
 using cuda_exec_grid_explicit = policy::cuda::cuda_exec_explicit<
-    iteration_mapping::Direct, cuda::global_x, BLOCKS_PER_SM, Async>;
+    iteration_mapping::StridedLoop, cuda::global_x, BLOCKS_PER_SM, Async>;

 template 
 using cuda_exec_grid_explicit_async = policy::cuda::cuda_exec_explicit<
-    iteration_mapping::Direct, cuda::global_x, BLOCKS_PER_SM, true>;
+    iteration_mapping::StridedLoop, cuda::global_x, BLOCKS_PER_SM, true>;

 template 
 using cuda_exec_grid = policy::cuda::cuda_exec_explicit<
-    iteration_mapping::Direct, cuda::global_x, policy::cuda::MIN_BLOCKS_PER_SM, Async>;
+    iteration_mapping::StridedLoop, cuda::global_x, policy::cuda::MIN_BLOCKS_PER_SM, Async>;

 template 
 using cuda_exec_grid_async = policy::cuda::cuda_exec_explicit<
-    iteration_mapping::Direct, cuda::global_x, policy::cuda::MIN_BLOCKS_PER_SM, true>;
+    iteration_mapping::StridedLoop, cuda::global_x, policy::cuda::MIN_BLOCKS_PER_SM, true>;

 template 
 using cuda_exec_explicit = policy::cuda::cuda_exec_explicit<
diff --git a/include/RAJA/policy/hip/policy.hpp b/include/RAJA/policy/hip/policy.hpp
index 2000bde42b..f943b74461 100644
--- a/include/RAJA/policy/hip/policy.hpp
+++ b/include/RAJA/policy/hip/policy.hpp
@@ -770,11 +770,11 @@ using global_z = IndexGlobal;
 // policies usable with forall, scan, and sort
 template 
 using hip_exec_grid = policy::hip::hip_exec<
-    iteration_mapping::Direct, hip::global_x, Async>;
+    iteration_mapping::StridedLoop, hip::global_x, Async>;

 template 
 using hip_exec_grid_async = policy::hip::hip_exec<
-    iteration_mapping::Direct, hip::global_x, true>;
+    iteration_mapping::StridedLoop, hip::global_x, true>;

 template 
 using hip_exec = policy::hip::hip_exec<
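With iteration_mapping::StridedLoop, the grid policies launch a fixed grid and each global thread covers multiple iterations. For illustration, the equivalent hand-written CUDA pattern (kernel and array names are not from the patch):

```cpp
// A grid-stride loop: a fixed grid of gridDim.x * blockDim.x global threads,
// each stepping through the iteration space by the total thread count.
__global__ void strided_copy(double* dst, const double* src, int N)
{
  int stride = blockDim.x * gridDim.x;
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += stride) {
    dst[i] = src[i];
  }
}
```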
From d347ece4627350ba58635fd63820255e018d2224 Mon Sep 17 00:00:00 2001
From: Jason Burmark
Date: Fri, 4 Aug 2023 10:57:30 -0700
Subject: [PATCH 30/34] Use camp default stream in DeviceZeroedAllocators

---
 include/RAJA/policy/cuda/MemUtils_CUDA.hpp | 4 +++-
 include/RAJA/policy/hip/MemUtils_HIP.hpp   | 4 +++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/include/RAJA/policy/cuda/MemUtils_CUDA.hpp b/include/RAJA/policy/cuda/MemUtils_CUDA.hpp
index 697ed225a0..df9922b06d 100644
--- a/include/RAJA/policy/cuda/MemUtils_CUDA.hpp
+++ b/include/RAJA/policy/cuda/MemUtils_CUDA.hpp
@@ -95,9 +95,11 @@ struct DeviceZeroedAllocator {
   // returns a valid pointer on success, nullptr on failure
   void* malloc(size_t nbytes)
   {
+    auto res = ::camp::resources::Cuda::get_default();
     void* ptr;
     cudaErrchk(cudaMalloc(&ptr, nbytes));
-    cudaErrchk(cudaMemset(ptr, 0, nbytes));
+    cudaErrchk(cudaMemsetAsync(ptr, 0, nbytes, res.get_stream()));
+    cudaErrchk(cudaStreamSynchronize(res.get_stream()));
     return ptr;
   }

diff --git a/include/RAJA/policy/hip/MemUtils_HIP.hpp b/include/RAJA/policy/hip/MemUtils_HIP.hpp
index bc6a454782..6c856114ff 100644
--- a/include/RAJA/policy/hip/MemUtils_HIP.hpp
+++ b/include/RAJA/policy/hip/MemUtils_HIP.hpp
@@ -96,9 +96,11 @@ struct DeviceZeroedAllocator {
   // returns a valid pointer on success, nullptr on failure
   void* malloc(size_t nbytes)
   {
+    auto res = ::camp::resources::Hip::get_default();
     void* ptr;
     hipErrchk(hipMalloc(&ptr, nbytes));
-    hipErrchk(hipMemset(ptr, 0, nbytes));
+    hipErrchk(hipMemsetAsync(ptr, 0, nbytes, res.get_stream()));
+    hipErrchk(hipStreamSynchronize(res.get_stream()));
     return ptr;
   }

From d6bed3599a73eb13a1db00e7df4bfde1641b1e21 Mon Sep 17 00:00:00 2001
From: Jason Burmark
Date: Fri, 4 Aug 2023 10:57:53 -0700
Subject: [PATCH 31/34] Avoid stream 0 in WorkGroup Dispatcher

---
 include/RAJA/policy/cuda/WorkGroup/Dispatcher.hpp | 11 +++++++----
 include/RAJA/policy/hip/WorkGroup/Dispatcher.hpp  | 12 +++++++-----
 2 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/include/RAJA/policy/cuda/WorkGroup/Dispatcher.hpp b/include/RAJA/policy/cuda/WorkGroup/Dispatcher.hpp
index 6ae361149d..0c63d2dbc8 100644
--- a/include/RAJA/policy/cuda/WorkGroup/Dispatcher.hpp
+++ b/include/RAJA/policy/cuda/WorkGroup/Dispatcher.hpp
@@ -20,6 +20,8 @@
 #include "RAJA/config.hpp"

+#include "camp/resource.hpp"
+
 #include "RAJA/policy/cuda/policy.hpp"

 #include "RAJA/pattern/WorkGroup/Dispatcher.hpp"
@@ -77,11 +79,12 @@ inline auto get_value(Factory&& factory)
   using value_type = typename std::decay_t::value_type;

   const std::lock_guard lock(get_value_mutex());
+  auto res = ::camp::resources::Cuda::get_default();
   auto ptr = static_cast(get_cached_value_ptr(sizeof(value_type)));
-  get_value_global><<<1,1>>>(
-      ptr, std::forward(factory));
-  cudaErrchk(cudaGetLastError());
-  cudaErrchk(cudaDeviceSynchronize());
+  auto func = reinterpret_cast(&get_value_global>);
+  void *args[] = {(void*)&ptr, (void*)&factory};
+  cudaErrchk(cudaLaunchKernel(func, 1, 1, args, 0, res.get_stream()));
+  cudaErrchk(cudaStreamSynchronize(res.get_stream()));
   return *ptr;
 }

diff --git a/include/RAJA/policy/hip/WorkGroup/Dispatcher.hpp b/include/RAJA/policy/hip/WorkGroup/Dispatcher.hpp
index 2f2bec7f0e..dffb773740 100644
--- a/include/RAJA/policy/hip/WorkGroup/Dispatcher.hpp
+++ b/include/RAJA/policy/hip/WorkGroup/Dispatcher.hpp
@@ -20,6 +20,8 @@
 #include "RAJA/config.hpp"

+#include "camp/resource.hpp"
+
 #include "RAJA/policy/hip/policy.hpp"

 #include "RAJA/pattern/WorkGroup/Dispatcher.hpp"
@@ -77,12 +79,12 @@ inline auto get_value(Factory&& factory)
   using value_type = typename std::decay_t::value_type;

   const std::lock_guard lock(get_value_mutex());
+  auto res = ::camp::resources::Hip::get_default();
   auto ptr = static_cast(get_cached_value_ptr(sizeof(value_type)));
-  auto func = get_value_global>;
-  hipLaunchKernelGGL(func, dim3(1), dim3(1), 0, 0,
-      ptr, std::forward(factory));
-  hipErrchk(hipGetLastError());
-  hipErrchk(hipDeviceSynchronize());
+  auto func = reinterpret_cast(&get_value_global>);
+  void *args[] = {(void*)&ptr, (void*)&factory};
+  hipErrchk(hipLaunchKernel(func, 1, 1, args, 0, res.get_stream()));
+  hipErrchk(hipStreamSynchronize(res.get_stream()));
   return *ptr;
 }
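Both patches above follow the same pattern: submit work to the stream of the camp default resource and synchronize only that stream, rather than using stream 0 with device-wide synchronization. A standalone sketch of the pattern, assuming CUDA (the kernel is illustrative):

```cpp
#include <cuda_runtime.h>
#include "camp/resource.hpp"

__global__ void init_value(int* out) { *out = 1; }  // illustrative kernel

void launch_on_default_resource(int* device_out)
{
  // Use the camp default resource's stream instead of stream 0 so this work
  // does not serialize against other streams on the device.
  auto res = ::camp::resources::Cuda::get_default();
  void* args[] = {(void*)&device_out};
  cudaLaunchKernel((const void*)&init_value, dim3(1), dim3(1), args,
                   0, res.get_stream());
  // Wait only on this stream, not on the whole device.
  cudaStreamSynchronize(res.get_stream());
}
```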
From f0569d421b70527615998edbb8b384fbec4fe891 Mon Sep 17 00:00:00 2001
From: Rich Hornung
Date: Fri, 4 Aug 2023 14:33:48 -0700
Subject: [PATCH 32/34] Update scripts to build with Intel compilers

---
 .../lc-builds/toss3/icpc_X_gcc8headers.cmake |  2 +-
 .../lc-builds/toss4/icpc-classic_X.cmake     | 18 ++++++
 scripts/lc-builds/toss3_icpc.sh              |  2 +-
 scripts/lc-builds/toss4_icpc-classic.sh      | 63 +++++++++++++++++++
 4 files changed, 83 insertions(+), 2 deletions(-)
 create mode 100755 host-configs/lc-builds/toss4/icpc-classic_X.cmake
 create mode 100755 scripts/lc-builds/toss4_icpc-classic.sh

diff --git a/host-configs/lc-builds/toss3/icpc_X_gcc8headers.cmake b/host-configs/lc-builds/toss3/icpc_X_gcc8headers.cmake
index b49e950244..eb4c5b949a 100755
--- a/host-configs/lc-builds/toss3/icpc_X_gcc8headers.cmake
+++ b/host-configs/lc-builds/toss3/icpc_X_gcc8headers.cmake
@@ -7,7 +7,7 @@

 set(RAJA_COMPILER "RAJA_COMPILER_ICC" CACHE STRING "")

-set(COMMON_FLAGS "-gxx-name=/usr/tce/packages/gcc/gcc-8.1.0/bin/g++")
+set(COMMON_FLAGS "-gxx-name=/usr/tce/packages/gcc/gcc-8.3.1/bin/g++")

 set(CMAKE_CXX_FLAGS_RELEASE "${COMMON_FLAGS} -O3 -march=native -ansi-alias -diag-disable cpu-dispatch" CACHE STRING "")
 set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${COMMON_FLAGS} -O3 -g -march=native -ansi-alias -diag-disable cpu-dispatch" CACHE STRING "")
diff --git a/host-configs/lc-builds/toss4/icpc-classic_X.cmake b/host-configs/lc-builds/toss4/icpc-classic_X.cmake
new file mode 100755
index 0000000000..762c36db44
--- /dev/null
+++ b/host-configs/lc-builds/toss4/icpc-classic_X.cmake
@@ -0,0 +1,18 @@
+###############################################################################
+# Copyright (c) 2016-23, Lawrence Livermore National Security, LLC
+# and RAJA project contributors. See the RAJA/LICENSE file for details.
+#
+# SPDX-License-Identifier: (BSD-3-Clause)
+###############################################################################
+
+set(RAJA_COMPILER "RAJA_COMPILER_ICC" CACHE STRING "")
+
+set(COMMON_FLAGS "-gxx-name=/usr/tce/packages/gcc/gcc-10.3.1/bin/g++")
+
+set(CMAKE_CXX_FLAGS_RELEASE "${COMMON_FLAGS} -O3 -march=native -ansi-alias -diag-disable cpu-dispatch" CACHE STRING "")
+set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${COMMON_FLAGS} -O3 -g -march=native -ansi-alias -diag-disable cpu-dispatch" CACHE STRING "")
+set(CMAKE_CXX_FLAGS_DEBUG "${COMMON_FLAGS} -O0 -g" CACHE STRING "")
+
+set(RAJA_DATA_ALIGN 64 CACHE STRING "")
+
+set(RAJA_HOST_CONFIG_LOADED On CACHE BOOL "")
diff --git a/scripts/lc-builds/toss3_icpc.sh b/scripts/lc-builds/toss3_icpc.sh
index 5a87d4568c..42f2177f9d 100755
--- a/scripts/lc-builds/toss3_icpc.sh
+++ b/scripts/lc-builds/toss3_icpc.sh
@@ -19,7 +19,7 @@ shift 1

 COMP_MAJOR_VER=${COMP_VER:0:2}
 GCC_HEADER_VER=7
-USE_TBB=On
+USE_TBB=Off

 if [ ${COMP_MAJOR_VER} -gt 18 ]
 then
diff --git a/scripts/lc-builds/toss4_icpc-classic.sh b/scripts/lc-builds/toss4_icpc-classic.sh
new file mode 100755
index 0000000000..0c08f93914
--- /dev/null
+++ b/scripts/lc-builds/toss4_icpc-classic.sh
@@ -0,0 +1,63 @@
+#!/usr/bin/env bash
+
+###############################################################################
+# Copyright (c) 2016-23, Lawrence Livermore National Security, LLC
+# and RAJA project contributors. See the RAJA/LICENSE file for details.
+#
+# SPDX-License-Identifier: (BSD-3-Clause)
+###############################################################################
+
+if [ "$1" == "" ]; then
+  echo
+  echo "You must pass a compiler version number to script. For example,"
+  echo "    toss4_icpc-classic.sh 19.1.2"
+  exit
+fi
+
+COMP_VER=$1
+shift 1
+
+USE_TBB=Off
+
+BUILD_SUFFIX=lc_toss4-icpc-classic-${COMP_VER}
+
+echo
+echo "Creating build directory build_${BUILD_SUFFIX} and generating configuration in it"
+echo "Configuration extra arguments:"
+echo "   $@"
+echo
+
+rm -rf build_${BUILD_SUFFIX} 2>/dev/null
+mkdir build_${BUILD_SUFFIX} && cd build_${BUILD_SUFFIX}
+
+module load cmake/3.21.1
+
+##
+# CMake option -DRAJA_ENABLE_FORCEINLINE_RECURSIVE=Off used to speed up compile
+# times at a potential cost of slower 'forall' execution.
+##
+
+cmake \
+  -DCMAKE_BUILD_TYPE=Release \
+  -DCMAKE_CXX_COMPILER=/usr/tce/packages/intel-classic/intel-classic-${COMP_VER}/bin/icpc \
+  -DCMAKE_C_COMPILER=/usr/tce/packages/intel-classic/intel-classic-${COMP_VER}/bin/icc \
+  -DBLT_CXX_STD=c++14 \
+  -C ../host-configs/lc-builds/toss4/icpc-classic_X.cmake \
+  -DRAJA_ENABLE_FORCEINLINE_RECURSIVE=Off \
+  -DENABLE_OPENMP=On \
+  -DRAJA_ENABLE_TBB=${USE_TBB} \
+  -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \
+  "$@" \
+  ..
+
+echo
+echo "***********************************************************************"
+echo
+echo "cd into directory build_${BUILD_SUFFIX} and run make to build RAJA"
+echo
+echo "  Please note that you may need to add some intel openmp libraries to your"
+echo "  LD_LIBRARY_PATH to run with openmp."
+echo
+echo "  LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:/usr/tce/packages/intel/intel-${COMP_VER}/compiler/lib/intel64_lin"
+echo
+echo "***********************************************************************"

From b5f7f20e72fe2bea5a7b1ce1fb3181cdf071b5d2 Mon Sep 17 00:00:00 2001
From: Rich Hornung
Date: Mon, 14 Aug 2023 12:23:50 -0700
Subject: [PATCH 33/34] Update version number

---
 CMakeLists.txt | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 24856b7eeb..4814246f12 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -14,9 +14,9 @@ endif()
 include(CMakeDependentOption)

 # Set version number
-set(RAJA_VERSION_MAJOR 2022)
-set(RAJA_VERSION_MINOR 10)
-set(RAJA_VERSION_PATCHLEVEL 5)
+set(RAJA_VERSION_MAJOR 2023)
+set(RAJA_VERSION_MINOR 06)
+set(RAJA_VERSION_PATCHLEVEL 1)

 if (RAJA_LOADED AND (NOT RAJA_LOADED STREQUAL "${RAJA_VERSION_MAJOR}.${RAJA_VERSION_MINOR}.${RAJA_VERSION_PATCHLEVEL}"))
   message(FATAL_ERROR "You are mixing RAJA versions. Loaded is ${RAJA_LOADED}, expected ${RAJA_VERSION_MAJOR}.${RAJA_VERSION_MINOR}.${RAJA_VERSION_PATCHLEVEL}")

From e7ee7f8ee7c432b9e81f488678300680f93bf941 Mon Sep 17 00:00:00 2001
From: Rich Hornung
Date: Mon, 14 Aug 2023 12:24:08 -0700
Subject: [PATCH 34/34] Add release notes.

---
 RELEASE_NOTES.md | 27 ++++++++++++++++++++++++++-
 1 file changed, 26 insertions(+), 1 deletion(-)

diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md
index 7390020bcc..d4c291b2de 100644
--- a/RELEASE_NOTES.md
+++ b/RELEASE_NOTES.md
@@ -6,7 +6,7 @@
 [comment]: # (# SPDX-License-Identifier: BSD-3-Clause)
 [comment]: # (#################################################################)

-Version vxx.yy.zz -- Release date 20yy-mm-dd
+Version YYYY.MM.PP -- Release date 20yy-mm-dd
 ============================================

 This release contains ...

 Notable changes include:

   * New features / API changes:
+
   * Build changes/improvements:
+
+
   * Bug fixes/improvements:
+
+
+Version 2023.06.1 -- Release date 2023-08-16
+============================================
+
+This release contains various smaller RAJA improvements.
+
+Notable changes include:
+
+  * New features / API changes:
+    * Add compile time block size optimization for new reduction interface.
+    * Changed default stream usage for Workgroup constructs to use the
+      stream associated with the default (camp) resource. Previously, we were
+      using stream zero. Specifically, this change affects where we memset
+      memory in the zeroed device memory pool and where we get device function
+      pointers for WorkGroup.
+
+  * Build changes/improvements:
     * RAJA_ENABLE_OPENMP_TASK CMake option added to enable/disable
       algorithm options based on OpenMP task construct.
       Currently, this only applies to RAJA's OpenMP sort implementation.
       The default is 'Off'. The option allows users to choose a task
       implementation if they wish.
+    * Resolve several compiler warnings.

   * Bug fixes/improvements:
+    * Fix compilation of GPU occupancy calculator and use common types for
+      HIP and CUDA backends in the occupancy calculator, kernel policies,
+      and kernel launch helper routines.
+    * Fix direct cudaMalloc/hipMalloc calls and memory leaks.


Version 2023.06.0 -- Release date 2023-07-06
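For reference, the "compile time block size optimization for new reduction interface" item refers to the experimental, params-style reductions; a minimal sketch of that interface, where a compile-time block size policy lets the reduction be specialized for the block size (array name and extent are illustrative):

```cpp
#include "RAJA/RAJA.hpp"

void sum_example(const double* a, int N)
{
  double total = 0.0;
  // cuda_exec<256> fixes the block size at compile time, allowing the
  // reduction implementation to specialize for a 256-thread block.
  RAJA::forall<RAJA::cuda_exec<256>>(
      RAJA::RangeSegment(0, N),
      RAJA::expt::Reduce<RAJA::operators::plus>(&total),
      [=] RAJA_DEVICE (int i, double& t) { t += a[i]; });
}
```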