From 978637d7703e55f8f3d05f73e3fcb932c971f04d Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 6 Jul 2023 09:02:23 -0700 Subject: [PATCH 01/34] Fix memory management in cuda/hip Free memory in resolve instead of leaking. Use device_mempool_type instead of cuda/hipMalloc --- include/RAJA/policy/cuda/params/reduce.hpp | 16 +++++++++++++--- include/RAJA/policy/hip/params/reduce.hpp | 16 +++++++++++++--- 2 files changed, 26 insertions(+), 6 deletions(-) diff --git a/include/RAJA/policy/cuda/params/reduce.hpp b/include/RAJA/policy/cuda/params/reduce.hpp index 6d142ca19b..cb87ee68cc 100644 --- a/include/RAJA/policy/cuda/params/reduce.hpp +++ b/include/RAJA/policy/cuda/params/reduce.hpp @@ -17,7 +17,7 @@ namespace detail { camp::concepts::enable_if< type_traits::is_cuda_policy > init(Reducer& red, const RAJA::cuda::detail::cudaInfo & cs) { - cudaMalloc( (void**)(&(red.devicetarget)), sizeof(T)); + red.devicetarget = RAJA::cuda::device_mempool_type::getInstance().template malloc(1); red.device_mem.allocate(cs.gridDim.x * cs.gridDim.y * cs.gridDim.z); red.device_count = RAJA::cuda::device_zeroed_mempool_type::getInstance().template malloc(1); } @@ -26,17 +26,27 @@ namespace detail { template RAJA_HOST_DEVICE camp::concepts::enable_if< type_traits::is_cuda_policy > - combine(Reducer& red) { + combine(Reducer& red) + { RAJA::cuda::impl::expt::grid_reduce(red); } // Resolve template camp::concepts::enable_if< type_traits::is_cuda_policy > - resolve(Reducer& red) { + resolve(Reducer& red) + { + // complete reduction cudaDeviceSynchronize(); cudaMemcpy(&red.val, red.devicetarget, sizeof(T), cudaMemcpyDeviceToHost); *red.target = OP{}(red.val, *red.target); + + // free memory + RAJA::cuda::device_zeroed_mempool_type::getInstance().free(red.device_count); + red.device_count = nullptr; + red.device_mem.deallocate(); + RAJA::cuda::device_mempool_type::getInstance().free(red.devicetarget); + red.devicetarget = nullptr; } } // namespace detail diff --git a/include/RAJA/policy/hip/params/reduce.hpp b/include/RAJA/policy/hip/params/reduce.hpp index ee6b39548e..0a06a76c0c 100644 --- a/include/RAJA/policy/hip/params/reduce.hpp +++ b/include/RAJA/policy/hip/params/reduce.hpp @@ -17,7 +17,7 @@ namespace detail { camp::concepts::enable_if< type_traits::is_hip_policy > init(Reducer& red, const RAJA::hip::detail::hipInfo & cs) { - hipMalloc( (void**)(&(red.devicetarget)), sizeof(T)); + red.devicetarget = RAJA::hip::device_mempool_type::getInstance().template malloc(1); red.device_mem.allocate(cs.gridDim.x * cs.gridDim.y * cs.gridDim.z); red.device_count = RAJA::hip::device_zeroed_mempool_type::getInstance().template malloc(1); } @@ -26,17 +26,27 @@ namespace detail { template RAJA_HOST_DEVICE camp::concepts::enable_if< type_traits::is_hip_policy > - combine(Reducer& red) { + combine(Reducer& red) + { RAJA::hip::impl::expt::grid_reduce(red); } // Resolve template camp::concepts::enable_if< type_traits::is_hip_policy > - resolve(Reducer& red) { + resolve(Reducer& red) + { + // complete reduction hipDeviceSynchronize(); hipMemcpy(&red.val, red.devicetarget, sizeof(T), hipMemcpyDeviceToHost); *red.target = OP{}(red.val, *red.target); + + // free memory + RAJA::hip::device_zeroed_mempool_type::getInstance().free(red.device_count); + red.device_count = nullptr; + red.device_mem.deallocate(); + RAJA::hip::device_mempool_type::getInstance().free(red.devicetarget); + red.devicetarget = nullptr; } } // namespace detail From 59ad47498ef90017bab87b20520343004acff4fd Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 6 
Jul 2023 09:44:48 -0700 Subject: [PATCH 02/34] Use policy info in cuda/hip expt::grid_reduce and block_reduce --- include/RAJA/policy/cuda/params/reduce.hpp | 2 +- include/RAJA/policy/cuda/policy.hpp | 20 ++++++++++++ include/RAJA/policy/cuda/reduce.hpp | 38 ++++++++++------------ include/RAJA/policy/hip/params/reduce.hpp | 2 +- include/RAJA/policy/hip/policy.hpp | 20 ++++++++++++ include/RAJA/policy/hip/reduce.hpp | 38 ++++++++++------------ 6 files changed, 76 insertions(+), 44 deletions(-) diff --git a/include/RAJA/policy/cuda/params/reduce.hpp b/include/RAJA/policy/cuda/params/reduce.hpp index cb87ee68cc..442114d183 100644 --- a/include/RAJA/policy/cuda/params/reduce.hpp +++ b/include/RAJA/policy/cuda/params/reduce.hpp @@ -28,7 +28,7 @@ namespace detail { camp::concepts::enable_if< type_traits::is_cuda_policy > combine(Reducer& red) { - RAJA::cuda::impl::expt::grid_reduce(red); + RAJA::cuda::impl::expt::grid_reduce(red); } // Resolve diff --git a/include/RAJA/policy/cuda/policy.hpp b/include/RAJA/policy/cuda/policy.hpp index 64ec12a4cb..7dec1f9fab 100644 --- a/include/RAJA/policy/cuda/policy.hpp +++ b/include/RAJA/policy/cuda/policy.hpp @@ -748,6 +748,26 @@ struct IndexGlobal } }; +// helper to get just the thread indexing part of IndexGlobal +template < typename index_global > +struct get_index_thread; +/// +template < named_dim dim, int BLOCK_SIZE, int GRID_SIZE > +struct get_index_thread> +{ + using type = IndexGlobal; +}; + +// helper to get just the block indexing part of IndexGlobal +template < typename index_global > +struct get_index_block; +/// +template < named_dim dim, int BLOCK_SIZE, int GRID_SIZE > +struct get_index_block> +{ + using type = IndexGlobal; +}; + template using thread_x = IndexGlobal; diff --git a/include/RAJA/policy/cuda/reduce.hpp b/include/RAJA/policy/cuda/reduce.hpp index fba82934f5..8d011ccdd8 100644 --- a/include/RAJA/policy/cuda/reduce.hpp +++ b/include/RAJA/policy/cuda/reduce.hpp @@ -528,16 +528,14 @@ RAJA_DEVICE RAJA_INLINE bool grid_reduce(T& val, namespace expt { -template +template RAJA_DEVICE RAJA_INLINE T block_reduce(T val, T identity) { - int numThreads = blockDim.x * blockDim.y * blockDim.z; - - int threadId = threadIdx.x + blockDim.x * threadIdx.y + - (blockDim.x * blockDim.y) * threadIdx.z; + const int numThreads = ThreadIterationGetter::size(); + const int threadId = ThreadIterationGetter::index(); - int warpId = threadId % RAJA::policy::cuda::WARP_SIZE; - int warpNum = threadId / RAJA::policy::cuda::WARP_SIZE; + const int warpId = threadId % RAJA::policy::cuda::WARP_SIZE; + const int warpNum = threadId / RAJA::policy::cuda::WARP_SIZE; T temp = val; @@ -604,20 +602,20 @@ RAJA_DEVICE RAJA_INLINE T block_reduce(T val, T identity) } -template -RAJA_DEVICE RAJA_INLINE bool grid_reduce(RAJA::expt::detail::Reducer& red) { - - int numBlocks = gridDim.x * gridDim.y * gridDim.z; - int numThreads = blockDim.x * blockDim.y * blockDim.z; - unsigned int wrap_around = numBlocks - 1; +template +RAJA_DEVICE RAJA_INLINE void grid_reduce(RAJA::expt::detail::Reducer& red) +{ + using BlockIterationGetter = typename get_index_block::type; + using ThreadIterationGetter = typename get_index_thread::type; - int blockId = blockIdx.x + gridDim.x * blockIdx.y + - (gridDim.x * gridDim.y) * blockIdx.z; + const int numBlocks = BlockIterationGetter::size(); + const int numThreads = ThreadIterationGetter::size(); + const unsigned int wrap_around = numBlocks - 1; - int threadId = threadIdx.x + blockDim.x * threadIdx.y + - (blockDim.x * blockDim.y) * threadIdx.z; + const 
int blockId = BlockIterationGetter::index(); + const int threadId = ThreadIterationGetter::index(); - T temp = block_reduce(red.val, OP::identity()); + T temp = block_reduce(red.val, OP::identity()); // one thread per block writes to device_mem bool lastBlock = false; @@ -641,15 +639,13 @@ RAJA_DEVICE RAJA_INLINE bool grid_reduce(RAJA::expt::detail::Reducer& red temp = OP{}(temp, red.device_mem.get(i)); } - temp = block_reduce(temp, OP::identity()); + temp = block_reduce(temp, OP::identity()); // one thread returns value if (threadId == 0) { *(red.devicetarget) = temp; } } - - return lastBlock && threadId == 0; } } // namespace expt diff --git a/include/RAJA/policy/hip/params/reduce.hpp b/include/RAJA/policy/hip/params/reduce.hpp index 0a06a76c0c..a683885b8b 100644 --- a/include/RAJA/policy/hip/params/reduce.hpp +++ b/include/RAJA/policy/hip/params/reduce.hpp @@ -28,7 +28,7 @@ namespace detail { camp::concepts::enable_if< type_traits::is_hip_policy > combine(Reducer& red) { - RAJA::hip::impl::expt::grid_reduce(red); + RAJA::hip::impl::expt::grid_reduce(red); } // Resolve diff --git a/include/RAJA/policy/hip/policy.hpp b/include/RAJA/policy/hip/policy.hpp index 8b7f29462d..f326716bce 100644 --- a/include/RAJA/policy/hip/policy.hpp +++ b/include/RAJA/policy/hip/policy.hpp @@ -743,6 +743,26 @@ struct IndexGlobal } }; +// helper to get just the thread indexing part of IndexGlobal +template < typename index_global > +struct get_index_thread; +/// +template < named_dim dim, int BLOCK_SIZE, int GRID_SIZE > +struct get_index_thread> +{ + using type = IndexGlobal; +}; + +// helper to get just the block indexing part of IndexGlobal +template < typename index_global > +struct get_index_block; +/// +template < named_dim dim, int BLOCK_SIZE, int GRID_SIZE > +struct get_index_block> +{ + using type = IndexGlobal; +}; + template using thread_x = IndexGlobal; diff --git a/include/RAJA/policy/hip/reduce.hpp b/include/RAJA/policy/hip/reduce.hpp index a3127b50aa..584534db14 100644 --- a/include/RAJA/policy/hip/reduce.hpp +++ b/include/RAJA/policy/hip/reduce.hpp @@ -401,16 +401,14 @@ RAJA_DEVICE RAJA_INLINE bool grid_reduce(T& val, namespace expt { -template +template RAJA_DEVICE RAJA_INLINE T block_reduce(T val, T identity) { - int numThreads = blockDim.x * blockDim.y * blockDim.z; - - int threadId = threadIdx.x + blockDim.x * threadIdx.y + - (blockDim.x * blockDim.y) * threadIdx.z; + const int numThreads = ThreadIterationGetter::size(); + const int threadId = ThreadIterationGetter::index(); - int warpId = threadId % RAJA::policy::hip::WARP_SIZE; - int warpNum = threadId / RAJA::policy::hip::WARP_SIZE; + const int warpId = threadId % RAJA::policy::hip::WARP_SIZE; + const int warpNum = threadId / RAJA::policy::hip::WARP_SIZE; T temp = val; @@ -477,20 +475,20 @@ RAJA_DEVICE RAJA_INLINE T block_reduce(T val, T identity) } -template -RAJA_DEVICE RAJA_INLINE bool grid_reduce(RAJA::expt::detail::Reducer& red) { - - int numBlocks = gridDim.x * gridDim.y * gridDim.z; - int numThreads = blockDim.x * blockDim.y * blockDim.z; - unsigned int wrap_around = numBlocks - 1; +template +RAJA_DEVICE RAJA_INLINE void grid_reduce(RAJA::expt::detail::Reducer& red) +{ + using BlockIterationGetter = typename get_index_block::type; + using ThreadIterationGetter = typename get_index_thread::type; - int blockId = blockIdx.x + gridDim.x * blockIdx.y + - (gridDim.x * gridDim.y) * blockIdx.z; + const int numBlocks = BlockIterationGetter::size(); + const int numThreads = ThreadIterationGetter::size(); + const unsigned int wrap_around = 
numBlocks - 1; - int threadId = threadIdx.x + blockDim.x * threadIdx.y + - (blockDim.x * blockDim.y) * threadIdx.z; + const int blockId = BlockIterationGetter::index(); + const int threadId = ThreadIterationGetter::index(); - T temp = block_reduce(red.val, OP::identity()); + T temp = block_reduce(red.val, OP::identity()); // one thread per block writes to device_mem bool lastBlock = false; @@ -514,15 +512,13 @@ RAJA_DEVICE RAJA_INLINE bool grid_reduce(RAJA::expt::detail::Reducer& red temp = OP{}(temp, red.device_mem.get(i)); } - temp = block_reduce(temp, OP::identity()); + temp = block_reduce(temp, OP::identity()); // one thread returns value if (threadId == 0) { *(red.devicetarget) = temp; } } - - return lastBlock && threadId == 0; } } // namespace expt From 8a994e9213ccf8660134f77bf166c84a7a5c0bb8 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 6 Jul 2023 10:17:25 -0700 Subject: [PATCH 03/34] Add param support to detail_resolve --- include/RAJA/pattern/params/forall.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/RAJA/pattern/params/forall.hpp b/include/RAJA/pattern/params/forall.hpp index 499b442cc5..61ecfbe33e 100644 --- a/include/RAJA/pattern/params/forall.hpp +++ b/include/RAJA/pattern/params/forall.hpp @@ -56,9 +56,9 @@ namespace expt } // Resolve - template - static constexpr void detail_resolve(EXEC_POL, camp::idx_seq, ForallParamPack& f_params ) { - CAMP_EXPAND(detail::resolve( camp::get(f_params.param_tup) )); + template + static constexpr void detail_resolve(EXEC_POL, camp::idx_seq, ForallParamPack& f_params, Args&& ...args) { + CAMP_EXPAND(detail::resolve( camp::get(f_params.param_tup), std::forward(args)... )); } // Used to construct the argument TYPES that will be invoked with the lambda. From 5dbb0f4f375517f25033639778b264a18236ecdf Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 6 Jul 2023 09:52:51 -0700 Subject: [PATCH 04/34] Use resource to copy and sync in resolve --- include/RAJA/policy/cuda/forall.hpp | 2 +- include/RAJA/policy/cuda/params/reduce.hpp | 10 +++++----- include/RAJA/policy/hip/forall.hpp | 2 +- include/RAJA/policy/hip/params/reduce.hpp | 10 +++++----- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/include/RAJA/policy/cuda/forall.hpp b/include/RAJA/policy/cuda/forall.hpp index 7d6bb820f6..91af0cb321 100644 --- a/include/RAJA/policy/cuda/forall.hpp +++ b/include/RAJA/policy/cuda/forall.hpp @@ -629,7 +629,7 @@ forall_impl(resources::Cuda cuda_res, void *args[] = {(void*)&body, (void*)&begin, (void*)&len, (void*)&f_params}; RAJA::cuda::launch(func, dims.blocks, dims.threads, args, shmem, cuda_res, Async); - RAJA::expt::ParamMultiplexer::resolve(f_params); + RAJA::expt::ParamMultiplexer::resolve(f_params, launch_info); } RAJA_FT_END; diff --git a/include/RAJA/policy/cuda/params/reduce.hpp b/include/RAJA/policy/cuda/params/reduce.hpp index 442114d183..80f625080c 100644 --- a/include/RAJA/policy/cuda/params/reduce.hpp +++ b/include/RAJA/policy/cuda/params/reduce.hpp @@ -15,10 +15,10 @@ namespace detail { // Init template camp::concepts::enable_if< type_traits::is_cuda_policy > - init(Reducer& red, const RAJA::cuda::detail::cudaInfo & cs) + init(Reducer& red, RAJA::cuda::detail::cudaInfo& ci) { red.devicetarget = RAJA::cuda::device_mempool_type::getInstance().template malloc(1); - red.device_mem.allocate(cs.gridDim.x * cs.gridDim.y * cs.gridDim.z); + red.device_mem.allocate(ci.gridDim.x * ci.gridDim.y * ci.gridDim.z); red.device_count = 
RAJA::cuda::device_zeroed_mempool_type::getInstance().template malloc(1); } @@ -34,11 +34,11 @@ namespace detail { // Resolve template camp::concepts::enable_if< type_traits::is_cuda_policy > - resolve(Reducer& red) + resolve(Reducer& red, RAJA::cuda::detail::cudaInfo& ci) { // complete reduction - cudaDeviceSynchronize(); - cudaMemcpy(&red.val, red.devicetarget, sizeof(T), cudaMemcpyDeviceToHost); + ci.res.memcpy(&red.val, red.devicetarget, sizeof(T)); + ci.res.wait(); *red.target = OP{}(red.val, *red.target); // free memory diff --git a/include/RAJA/policy/hip/forall.hpp b/include/RAJA/policy/hip/forall.hpp index 6483d9b59a..03d530ab2d 100644 --- a/include/RAJA/policy/hip/forall.hpp +++ b/include/RAJA/policy/hip/forall.hpp @@ -622,7 +622,7 @@ forall_impl(resources::Hip hip_res, void *args[] = {(void*)&body, (void*)&begin, (void*)&len, (void*)&f_params}; RAJA::hip::launch(func, dims.blocks, dims.threads, args, shmem, hip_res, Async); - RAJA::expt::ParamMultiplexer::resolve(f_params); + RAJA::expt::ParamMultiplexer::resolve(f_params, launch_info); } RAJA_FT_END; diff --git a/include/RAJA/policy/hip/params/reduce.hpp b/include/RAJA/policy/hip/params/reduce.hpp index a683885b8b..fa73e2da2e 100644 --- a/include/RAJA/policy/hip/params/reduce.hpp +++ b/include/RAJA/policy/hip/params/reduce.hpp @@ -15,10 +15,10 @@ namespace detail { // Init template camp::concepts::enable_if< type_traits::is_hip_policy > - init(Reducer& red, const RAJA::hip::detail::hipInfo & cs) + init(Reducer& red, RAJA::hip::detail::hipInfo& hi) { red.devicetarget = RAJA::hip::device_mempool_type::getInstance().template malloc(1); - red.device_mem.allocate(cs.gridDim.x * cs.gridDim.y * cs.gridDim.z); + red.device_mem.allocate(hi.gridDim.x * hi.gridDim.y * hi.gridDim.z); red.device_count = RAJA::hip::device_zeroed_mempool_type::getInstance().template malloc(1); } @@ -34,11 +34,11 @@ namespace detail { // Resolve template camp::concepts::enable_if< type_traits::is_hip_policy > - resolve(Reducer& red) + resolve(Reducer& red, RAJA::hip::detail::hipInfo& hi) { // complete reduction - hipDeviceSynchronize(); - hipMemcpy(&red.val, red.devicetarget, sizeof(T), hipMemcpyDeviceToHost); + hi.res.memcpy(&red.val, red.devicetarget, sizeof(T)); + hi.res.wait(); *red.target = OP{}(red.val, *red.target); // free memory From edf7a227d9e6d22f32fd65e74570d96862dd6990 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 6 Jul 2023 09:58:06 -0700 Subject: [PATCH 05/34] Put devicetarget in pinned to avoid memcpy --- include/RAJA/policy/cuda/params/reduce.hpp | 7 +++---- include/RAJA/policy/hip/params/reduce.hpp | 7 +++---- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/include/RAJA/policy/cuda/params/reduce.hpp b/include/RAJA/policy/cuda/params/reduce.hpp index 80f625080c..f60117dba9 100644 --- a/include/RAJA/policy/cuda/params/reduce.hpp +++ b/include/RAJA/policy/cuda/params/reduce.hpp @@ -17,7 +17,7 @@ namespace detail { camp::concepts::enable_if< type_traits::is_cuda_policy > init(Reducer& red, RAJA::cuda::detail::cudaInfo& ci) { - red.devicetarget = RAJA::cuda::device_mempool_type::getInstance().template malloc(1); + red.devicetarget = RAJA::cuda::pinned_mempool_type::getInstance().template malloc(1); red.device_mem.allocate(ci.gridDim.x * ci.gridDim.y * ci.gridDim.z); red.device_count = RAJA::cuda::device_zeroed_mempool_type::getInstance().template malloc(1); } @@ -37,15 +37,14 @@ namespace detail { resolve(Reducer& red, RAJA::cuda::detail::cudaInfo& ci) { // complete reduction - ci.res.memcpy(&red.val, red.devicetarget, 
sizeof(T)); ci.res.wait(); - *red.target = OP{}(red.val, *red.target); + *red.target = OP{}(*red.devicetarget, *red.target); // free memory RAJA::cuda::device_zeroed_mempool_type::getInstance().free(red.device_count); red.device_count = nullptr; red.device_mem.deallocate(); - RAJA::cuda::device_mempool_type::getInstance().free(red.devicetarget); + RAJA::cuda::pinned_mempool_type::getInstance().free(red.devicetarget); red.devicetarget = nullptr; } diff --git a/include/RAJA/policy/hip/params/reduce.hpp b/include/RAJA/policy/hip/params/reduce.hpp index fa73e2da2e..4d090e9e92 100644 --- a/include/RAJA/policy/hip/params/reduce.hpp +++ b/include/RAJA/policy/hip/params/reduce.hpp @@ -17,7 +17,7 @@ namespace detail { camp::concepts::enable_if< type_traits::is_hip_policy > init(Reducer& red, RAJA::hip::detail::hipInfo& hi) { - red.devicetarget = RAJA::hip::device_mempool_type::getInstance().template malloc(1); + red.devicetarget = RAJA::hip::pinned_mempool_type::getInstance().template malloc(1); red.device_mem.allocate(hi.gridDim.x * hi.gridDim.y * hi.gridDim.z); red.device_count = RAJA::hip::device_zeroed_mempool_type::getInstance().template malloc(1); } @@ -37,15 +37,14 @@ namespace detail { resolve(Reducer& red, RAJA::hip::detail::hipInfo& hi) { // complete reduction - hi.res.memcpy(&red.val, red.devicetarget, sizeof(T)); hi.res.wait(); - *red.target = OP{}(red.val, *red.target); + *red.target = OP{}(*red.devicetarget, *red.target); // free memory RAJA::hip::device_zeroed_mempool_type::getInstance().free(red.device_count); red.device_count = nullptr; red.device_mem.deallocate(); - RAJA::hip::device_mempool_type::getInstance().free(red.devicetarget); + RAJA::hip::pinned_mempool_type::getInstance().free(red.devicetarget); red.devicetarget = nullptr; } From 6e7bbd3ee99a3d5c9816cdc020e95e25ed5035f1 Mon Sep 17 00:00:00 2001 From: Robert Chen Date: Thu, 6 Jul 2023 18:02:37 -0700 Subject: [PATCH 06/34] Decorate unused vec_arg. --- include/RAJA/util/TypedViewBase.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/RAJA/util/TypedViewBase.hpp b/include/RAJA/util/TypedViewBase.hpp index 5f9d53d9c7..93bdca06cc 100644 --- a/include/RAJA/util/TypedViewBase.hpp +++ b/include/RAJA/util/TypedViewBase.hpp @@ -457,7 +457,7 @@ namespace internal using type = RAJA::expt::StaticTensorIndex>; static constexpr RAJA_HOST_DEVICE RAJA_INLINE - type extract(RAJA::expt::StaticTensorIndex> vec_arg){ + type extract(RAJA::expt::StaticTensorIndex> RAJA_UNUSED_ARG(vec_arg)){ return type(); } }; From bf251da88ce2a51e6228e8f4c739052020247786 Mon Sep 17 00:00:00 2001 From: Robert Chen Date: Thu, 6 Jul 2023 18:03:13 -0700 Subject: [PATCH 07/34] Type casts for unsigned int cases. 
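When INDEX_TYPE is deduced to an unsigned type, a comparison such as g < 0 is always false, and mixed comparisons like g >= groups draw signed/unsigned warnings. Casting every index to int before comparing keeps the out-of-bounds check meaningful for all index types. A minimal sketch of the pattern, assuming an unsigned index (illustrative only, not taken from the test below):

  unsigned int g = 3u;  // stand-in for an unsigned INDEX_TYPE value
  int groups = 4;
  bool always_false = (g < 0);                      // g is unsigned, so this never fires
  bool oob = ((int)g < 0 || (int)g >= groups);      // intended bounds check after the casts
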
--- .../kernel/hyperplane/tests/test-kernel-hyperplane-2D.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/functional/kernel/hyperplane/tests/test-kernel-hyperplane-2D.hpp b/test/functional/kernel/hyperplane/tests/test-kernel-hyperplane-2D.hpp index b72ad745e9..ec1ca0ecb0 100644 --- a/test/functional/kernel/hyperplane/tests/test-kernel-hyperplane-2D.hpp +++ b/test/functional/kernel/hyperplane/tests/test-kernel-hyperplane-2D.hpp @@ -50,7 +50,7 @@ void KernelHyperplane2DTestImpl(const int groups, const int idim, const int jdim RAJA::kernel ( RAJA::make_tuple( Grange, Irange, Jrange ), [=] RAJA_HOST_DEVICE ( INDEX_TYPE g, INDEX_TYPE ii, INDEX_TYPE jj ) { - if ((int)g < 0 || g >= groups || (int)ii < 0 || ii >= idim || (int)jj < 0 || jj >= jdim) { + if ((int)g < 0 || (int)g >= groups || (int)ii < 0 || (int)ii >= idim || (int)jj < 0 || (int)jj >= jdim) { oob_count += 1; } From 72cc6ac158e046c478eb870c8cd7bd061ac9e432 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Wed, 12 Jul 2023 12:24:10 -0700 Subject: [PATCH 08/34] Expose CMake option for enabling/disabling OpenMP task alg options --- RELEASE_NOTES.md | 4 ++++ cmake/SetupRajaOptions.cmake | 2 ++ include/RAJA/config.hpp.in | 8 +++++--- include/RAJA/policy/openmp/sort.hpp | 4 ++-- 4 files changed, 13 insertions(+), 5 deletions(-) diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md index 5200a41ed4..7390020bcc 100644 --- a/RELEASE_NOTES.md +++ b/RELEASE_NOTES.md @@ -16,6 +16,10 @@ Notable changes include: * New features / API changes: * Build changes/improvements: + * RAJA_ENABLE_OPENMP_TASK CMake option added to enable/disable algorithm + options based on OpenMP task construct. Currently, this only applies + to RAJA's OpenMP sort implementation. The default is 'Off'. The option + allows users to choose a task implementation if they wish. 
* Bug fixes/improvements: diff --git a/cmake/SetupRajaOptions.cmake b/cmake/SetupRajaOptions.cmake index c8c11efc72..3a27bc8cd0 100644 --- a/cmake/SetupRajaOptions.cmake +++ b/cmake/SetupRajaOptions.cmake @@ -17,6 +17,8 @@ option(RAJA_ENABLE_SYCL "Build SYCL support" Off) option(RAJA_ENABLE_VECTORIZATION "Build experimental vectorization support" On) +option(RAJA_ENABLE_OPENMP_TASK "Build OpenMP task variants of certain algorithms" Off) + option(RAJA_ENABLE_REPRODUCERS "Build issue reproducers" Off) option(RAJA_ENABLE_EXERCISES "Build exercises " On) diff --git a/include/RAJA/config.hpp.in b/include/RAJA/config.hpp.in index 3ffa9cc217..c6df29447c 100644 --- a/include/RAJA/config.hpp.in +++ b/include/RAJA/config.hpp.in @@ -254,12 +254,14 @@ namespace RAJA { #if defined(RAJA_ENABLE_OPENMP) && !defined(__HIP_DEVICE_COMPILE__) #if defined(_OPENMP) #if (_OPENMP >= 200805) -#define RAJA_ENABLE_OPENMP_TASK +#if defined(RAJA_ENABLE_OPENMP_TASK) +#define RAJA_ENABLE_OPENMP_TASK_INTERNAL #endif +#endif // _OPENMP >= 200805 #else #error RAJA configured with RAJA_ENABLE_OPENMP, but _OPENMP is not defined in this code section -#endif // _OPENMP -#endif // RAJA_ENABLE_OPENMP && __HIP_DEVICE_COMPILE__ +#endif // else +#endif // RAJA_ENABLE_OPENMP && !__HIP_DEVICE_COMPILE__ #if defined(RAJA_ENABLE_CUDA) && defined(__CUDACC__) #define RAJA_CUDA_ACTIVE diff --git a/include/RAJA/policy/openmp/sort.hpp b/include/RAJA/policy/openmp/sort.hpp index 8abd2ade64..8502843ed0 100644 --- a/include/RAJA/policy/openmp/sort.hpp +++ b/include/RAJA/policy/openmp/sort.hpp @@ -49,7 +49,7 @@ namespace openmp // this number is arbitrary constexpr int get_min_iterates_per_task() { return 128; } -#ifdef RAJA_ENABLE_OPENMP_TASK +#if defined(RAJA_ENABLE_OPENMP_TASK_INTERNAL) /*! \brief sort given range using sorter and comparison function by spawning tasks @@ -159,7 +159,7 @@ void sort(Sorter sorter, const diff_type max_threads = omp_get_max_threads(); -#ifdef RAJA_ENABLE_OPENMP_TASK +#if defined(RAJA_ENABLE_OPENMP_TASK_INTERNAL) const diff_type iterates_per_task = std::max(n/(2*max_threads), min_iterates_per_task); From f7bcb023bce47f6814fb77c7d8c7ed6092babf25 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Wed, 12 Jul 2023 12:25:48 -0700 Subject: [PATCH 09/34] Point radiuss spack configs submodule to branch with changes to try with GitLab CI updates --- scripts/radiuss-spack-configs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/radiuss-spack-configs b/scripts/radiuss-spack-configs index ffde592300..307d08fa68 160000 --- a/scripts/radiuss-spack-configs +++ b/scripts/radiuss-spack-configs @@ -1 +1 @@ -Subproject commit ffde592300392a7bbebbfe0a8d752187c1b93132 +Subproject commit 307d08fa68cf367095ce112424356f44a475f2b9 From 9b32c3c9b420dbc11bf247f43999321c6929a11f Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Wed, 12 Jul 2023 13:36:40 -0700 Subject: [PATCH 10/34] Bump to new version of radiuss shared ci --- .gitlab-ci.yml | 2 +- .gitlab/corona-build-and-test-extra.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 8ba8c96d8e..162e262e19 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -59,7 +59,7 @@ stages: include: - local: '.gitlab/custom-jobs-and-variables.yml' - project: 'radiuss/radiuss-shared-ci' - ref: v2023.03.1 + ref: v2023.06.0 file: '${CI_MACHINE}-build-and-test.yml' - local: '.gitlab/${CI_MACHINE}-build-and-test-extra.yml' strategy: depend diff --git a/.gitlab/corona-build-and-test-extra.yml 
b/.gitlab/corona-build-and-test-extra.yml index e65bba3947..f9a7509432 100644 --- a/.gitlab/corona-build-and-test-extra.yml +++ b/.gitlab/corona-build-and-test-extra.yml @@ -21,7 +21,7 @@ # ${PROJECT__DEPS} in the extra jobs. There is no reason not to fully # describe the spec here. -rocmcc_5_4_1_hip_desul_atomics: +rocmcc_5_5_0_hip_desul_atomics: variables: SPEC: " ~shared +rocm ~openmp +tests +desul amdgpu_target=gfx906 %rocmcc@5.4.1 ^hip@5.4.1 ^blt@develop" extends: .build_and_test_on_corona From 4da2d0b204cbec755025925de413e43540a187c6 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Wed, 12 Jul 2023 14:49:25 -0700 Subject: [PATCH 11/34] Really change the compiler version this time. --- .gitlab/corona-build-and-test-extra.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab/corona-build-and-test-extra.yml b/.gitlab/corona-build-and-test-extra.yml index f9a7509432..a4ad4ff693 100644 --- a/.gitlab/corona-build-and-test-extra.yml +++ b/.gitlab/corona-build-and-test-extra.yml @@ -23,6 +23,6 @@ rocmcc_5_5_0_hip_desul_atomics: variables: - SPEC: " ~shared +rocm ~openmp +tests +desul amdgpu_target=gfx906 %rocmcc@5.4.1 ^hip@5.4.1 ^blt@develop" + SPEC: " ~shared +rocm ~openmp +tests +desul amdgpu_target=gfx906 %rocmcc@5.5.0 ^hip@5.5.0 ^blt@develop" extends: .build_and_test_on_corona From 7bb6c6078f5126e4031c45583aed9d5024ac2f7c Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Wed, 12 Jul 2023 14:49:45 -0700 Subject: [PATCH 12/34] Delete build dirs to avoid running out of space --- Dockerfile | 36 ++++++++++++++++++++++++------------ 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/Dockerfile b/Dockerfile index 9120208427..aad0b078e5 100644 --- a/Dockerfile +++ b/Dockerfile @@ -11,7 +11,8 @@ COPY . /home/raja/workspace WORKDIR /home/raja/workspace/build RUN cmake -DCMAKE_CXX_COMPILER=g++ -DRAJA_ENABLE_WARNINGS=On -DRAJA_ENABLE_TBB=On -DRAJA_DEPRECATED_TESTS=On -DENABLE_OPENMP=On .. && \ make -j 6 &&\ - ctest -T test --output-on-failure + ctest -T test --output-on-failure && \ + cd .. && rm -rf build FROM ghcr.io/rse-ops/gcc-ubuntu-20.04:gcc-8.1.0 AS gcc8.1.0 ENV GTEST_COLOR=1 @@ -19,7 +20,8 @@ COPY . /home/raja/workspace WORKDIR /home/raja/workspace/build RUN cmake -DCMAKE_CXX_COMPILER=g++ -DRAJA_ENABLE_WARNINGS=On -DRAJA_ENABLE_WARNINGS_AS_ERRORS=On -DENABLE_COVERAGE=On -DRAJA_ENABLE_TBB=On -DENABLE_OPENMP=On .. && \ make -j 6 &&\ - ctest -T test --output-on-failure + ctest -T test --output-on-failure && \ + cd .. && rm -rf build FROM ghcr.io/rse-ops/gcc-ubuntu-20.04:gcc-9.4.0 AS gcc9.4.0 ENV GTEST_COLOR=1 @@ -27,7 +29,8 @@ COPY . /home/raja/workspace WORKDIR /home/raja/workspace/build RUN cmake -DCMAKE_CXX_COMPILER=g++ -DRAJA_ENABLE_WARNINGS=On -DRAJA_ENABLE_TBB=On -DRAJA_ENABLE_RUNTIME_PLUGINS=On -DENABLE_OPENMP=On .. && \ make -j 6 &&\ - ctest -T test --output-on-failure + ctest -T test --output-on-failure && \ + cd .. && rm -rf build FROM ghcr.io/rse-ops/gcc-ubuntu-20.04:gcc-11.2.0 AS gcc11.2.0 ENV GTEST_COLOR=1 @@ -35,7 +38,8 @@ COPY . /home/raja/workspace WORKDIR /home/raja/workspace/build RUN cmake -DCMAKE_CXX_COMPILER=g++ -DCMAKE_CXX_COMPILER=g++ -DRAJA_ENABLE_WARNINGS=On -DRAJA_ENABLE_TBB=On -DRAJA_ENABLE_BOUNDS_CHECK=ON -DENABLE_OPENMP=On .. && \ make -j 6 &&\ - ctest -T test --output-on-failure + ctest -T test --output-on-failure && \ + cd .. && rm -rf build FROM ghcr.io/rse-ops/clang-ubuntu-20.04:llvm-11.0.0 AS clang11.0.0 ENV GTEST_COLOR=1 @@ -44,7 +48,8 @@ WORKDIR /home/raja/workspace/build RUN . 
/opt/spack/share/spack/setup-env.sh && export LD_LIBRARY_PATH=/opt/view/lib:$LD_LIBRARY_PATH && \ cmake -DCMAKE_CXX_COMPILER=clang++ -DRAJA_ENABLE_TBB=On -DENABLE_OPENMP=On .. && \ make -j 6 &&\ - ctest -T test --output-on-failure + ctest -T test --output-on-failure && \ + cd .. && rm -rf build FROM ghcr.io/rse-ops/clang-ubuntu-20.04:llvm-11.0.0 AS clang11.0.0-debug ENV GTEST_COLOR=1 @@ -53,7 +58,8 @@ WORKDIR /home/raja/workspace/build RUN . /opt/spack/share/spack/setup-env.sh && export LD_LIBRARY_PATH=/opt/view/lib:$LD_LIBRARY_PATH && \ cmake -DCMAKE_CXX_COMPILER=clang++ -DENABLE_OPENMP=On -DCMAKE_BUILD_TYPE=Debug .. && \ make -j 6 &&\ - ctest -T test --output-on-failure + ctest -T test --output-on-failure && \ + cd .. && rm -rf build FROM ghcr.io/rse-ops/clang-ubuntu-22.04:llvm-13.0.0 AS clang13.0.0 ENV GTEST_COLOR=1 @@ -62,7 +68,8 @@ WORKDIR /home/raja/workspace/build RUN . /opt/spack/share/spack/setup-env.sh && export LD_LIBRARY_PATH=/opt/view/lib:$LD_LIBRARY_PATH && \ cmake -DCMAKE_CXX_COMPILER=clang++ -DENABLE_OPENMP=On -DCMAKE_BUILD_TYPE=Release .. && \ make -j 6 &&\ - ctest -T test --output-on-failure + ctest -T test --output-on-failure && \ + cd .. && rm -rf build FROM ghcr.io/rse-ops/cuda:cuda-10.1.243-ubuntu-18.04 AS nvcc10.1.243 ENV GTEST_COLOR=1 @@ -70,7 +77,8 @@ COPY . /home/raja/workspace WORKDIR /home/raja/workspace/build RUN . /opt/spack/share/spack/setup-env.sh && spack load cuda && \ cmake -DCMAKE_CXX_COMPILER=g++ -DENABLE_CUDA=On -DCMAKE_CUDA_STANDARD=14 -DCMAKE_CUDA_ARCHITECTURES=70 -DENABLE_OPENMP=On .. && \ - make -j 4 + make -j 4 && \ + cd .. && rm -rf build FROM ghcr.io/rse-ops/cuda-ubuntu-20.04:cuda-11.1.1 AS nvcc11.1.1 ENV GTEST_COLOR=1 @@ -78,7 +86,8 @@ COPY . /home/raja/workspace WORKDIR /home/raja/workspace/build RUN . /opt/spack/share/spack/setup-env.sh && spack load cuda && \ cmake -DCMAKE_CXX_COMPILER=g++ -DENABLE_CUDA=On -DCMAKE_CUDA_STANDARD=14 -DCMAKE_CUDA_ARCHITECTURES=70 -DENABLE_OPENMP=On .. && \ - make -j 4 + make -j 4 && \ + cd .. && rm -rf build FROM ghcr.io/rse-ops/cuda-ubuntu-20.04:cuda-11.1.1 AS nvcc11.1.-debug ENV GTEST_COLOR=1 @@ -86,7 +95,8 @@ COPY . /home/raja/workspace WORKDIR /home/raja/workspace/build RUN . /opt/spack/share/spack/setup-env.sh && spack load cuda && \ cmake -DCMAKE_BUILD_TYPE=Debug -DCMAKE_CXX_COMPILER=g++ -DENABLE_CUDA=On -DCMAKE_CUDA_STANDARD=14 -DCMAKE_CUDA_ARCHITECTURES=70 -DENABLE_OPENMP=On .. && \ - make -j 4 + make -j 4 && \ + cd .. && rm -rf build FROM ghcr.io/rse-ops/hip-ubuntu-20.04:hip-5.1.3 AS hip5.1.3 ENV GTEST_COLOR=1 @@ -95,7 +105,8 @@ COPY . /home/raja/workspace WORKDIR /home/raja/workspace/build RUN . /opt/spack/share/spack/setup-env.sh && spack load hip llvm-amdgpu && \ cmake -DCMAKE_CXX_COMPILER=clang++ -DHIP_PATH=/opt -DENABLE_HIP=On -DENABLE_CUDA=Off -DRAJA_ENABLE_WARNINGS_AS_ERRORS=Off .. && \ - make -j 6 + make -j 6 && \ + cd .. && rm -rf build FROM ghcr.io/rse-ops/intel-ubuntu-22.04:intel-2022.1.0 AS sycl ENV GTEST_COLOR=1 @@ -104,4 +115,5 @@ WORKDIR /home/raja/workspace/build RUN /bin/bash -c "source /opt/view/setvars.sh && \ cmake -DCMAKE_CXX_COMPILER=dpcpp -DRAJA_ENABLE_SYCL=On -DENABLE_OPENMP=Off -DENABLE_ALL_WARNINGS=Off -DBLT_CXX_STD=c++17 .. && \ make -j 6 &&\ - ctest -T test --output-on-failure" + ctest -T test --output-on-failure" && \ + cd .. 
&& rm -rf build From 0f76d55b01a669e9149bebb20771f058553713d0 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Wed, 12 Jul 2023 14:55:09 -0700 Subject: [PATCH 13/34] Pull in variant addition --- scripts/radiuss-spack-configs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/radiuss-spack-configs b/scripts/radiuss-spack-configs index 307d08fa68..e2afaf8e00 160000 --- a/scripts/radiuss-spack-configs +++ b/scripts/radiuss-spack-configs @@ -1 +1 @@ -Subproject commit 307d08fa68cf367095ce112424356f44a475f2b9 +Subproject commit e2afaf8e00ec8110d4199bed5e39d36d987340e3 From 04b38fd68d79a8ef64e8ea4ee20ce5215e5d2077 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Wed, 12 Jul 2023 15:09:52 -0700 Subject: [PATCH 14/34] Add missing cmakedefine --- include/RAJA/config.hpp.in | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/RAJA/config.hpp.in b/include/RAJA/config.hpp.in index c6df29447c..957cdadec0 100644 --- a/include/RAJA/config.hpp.in +++ b/include/RAJA/config.hpp.in @@ -176,6 +176,8 @@ static_assert(RAJA_HAS_SOME_CXX14, #cmakedefine RAJA_ENABLE_CLANG_CUDA #cmakedefine RAJA_ENABLE_HIP #cmakedefine RAJA_ENABLE_SYCL + +#cmakedefine RAJA_ENABLE_OMP_TASK #cmakedefine RAJA_ENABLE_VECTORIZATION #cmakedefine RAJA_ENABLE_NV_TOOLS_EXT From 7dfe81a5fd9f557164e326aa980177b3caaf261d Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Thu, 13 Jul 2023 08:11:55 -0700 Subject: [PATCH 15/34] Remove GPU builds from azure --- Dockerfile | 66 ++++++++++++++++++++++----------------------- azure-pipelines.yml | 12 ++++----- 2 files changed, 39 insertions(+), 39 deletions(-) diff --git a/Dockerfile b/Dockerfile index aad0b078e5..64939607b9 100644 --- a/Dockerfile +++ b/Dockerfile @@ -71,42 +71,42 @@ RUN . /opt/spack/share/spack/setup-env.sh && export LD_LIBRARY_PATH=/opt/view/li ctest -T test --output-on-failure && \ cd .. && rm -rf build -FROM ghcr.io/rse-ops/cuda:cuda-10.1.243-ubuntu-18.04 AS nvcc10.1.243 -ENV GTEST_COLOR=1 -COPY . /home/raja/workspace -WORKDIR /home/raja/workspace/build -RUN . /opt/spack/share/spack/setup-env.sh && spack load cuda && \ - cmake -DCMAKE_CXX_COMPILER=g++ -DENABLE_CUDA=On -DCMAKE_CUDA_STANDARD=14 -DCMAKE_CUDA_ARCHITECTURES=70 -DENABLE_OPENMP=On .. && \ - make -j 4 && \ - cd .. && rm -rf build +##FROM ghcr.io/rse-ops/cuda:cuda-10.1.243-ubuntu-18.04 AS nvcc10.1.243 +##ENV GTEST_COLOR=1 +##COPY . /home/raja/workspace +##WORKDIR /home/raja/workspace/build +##RUN . /opt/spack/share/spack/setup-env.sh && spack load cuda && \ +## cmake -DCMAKE_CXX_COMPILER=g++ -DENABLE_CUDA=On -DCMAKE_CUDA_STANDARD=14 -DCMAKE_CUDA_ARCHITECTURES=70 -DENABLE_OPENMP=On .. && \ +## make -j 4 && \ +## cd .. && rm -rf build -FROM ghcr.io/rse-ops/cuda-ubuntu-20.04:cuda-11.1.1 AS nvcc11.1.1 -ENV GTEST_COLOR=1 -COPY . /home/raja/workspace -WORKDIR /home/raja/workspace/build -RUN . /opt/spack/share/spack/setup-env.sh && spack load cuda && \ - cmake -DCMAKE_CXX_COMPILER=g++ -DENABLE_CUDA=On -DCMAKE_CUDA_STANDARD=14 -DCMAKE_CUDA_ARCHITECTURES=70 -DENABLE_OPENMP=On .. && \ - make -j 4 && \ - cd .. && rm -rf build +##FROM ghcr.io/rse-ops/cuda-ubuntu-20.04:cuda-11.1.1 AS nvcc11.1.1 +##ENV GTEST_COLOR=1 +##COPY . /home/raja/workspace +##WORKDIR /home/raja/workspace/build +##RUN . /opt/spack/share/spack/setup-env.sh && spack load cuda && \ +## cmake -DCMAKE_CXX_COMPILER=g++ -DENABLE_CUDA=On -DCMAKE_CUDA_STANDARD=14 -DCMAKE_CUDA_ARCHITECTURES=70 -DENABLE_OPENMP=On .. && \ +## make -j 4 && \ +## cd .. 
&& rm -rf build -FROM ghcr.io/rse-ops/cuda-ubuntu-20.04:cuda-11.1.1 AS nvcc11.1.-debug -ENV GTEST_COLOR=1 -COPY . /home/raja/workspace -WORKDIR /home/raja/workspace/build -RUN . /opt/spack/share/spack/setup-env.sh && spack load cuda && \ - cmake -DCMAKE_BUILD_TYPE=Debug -DCMAKE_CXX_COMPILER=g++ -DENABLE_CUDA=On -DCMAKE_CUDA_STANDARD=14 -DCMAKE_CUDA_ARCHITECTURES=70 -DENABLE_OPENMP=On .. && \ - make -j 4 && \ - cd .. && rm -rf build +##FROM ghcr.io/rse-ops/cuda-ubuntu-20.04:cuda-11.1.1 AS nvcc11.1.-debug +##ENV GTEST_COLOR=1 +##COPY . /home/raja/workspace +##WORKDIR /home/raja/workspace/build +##RUN . /opt/spack/share/spack/setup-env.sh && spack load cuda && \ +## cmake -DCMAKE_BUILD_TYPE=Debug -DCMAKE_CXX_COMPILER=g++ -DENABLE_CUDA=On -DCMAKE_CUDA_STANDARD=14 -DCMAKE_CUDA_ARCHITECTURES=70 -DENABLE_OPENMP=On .. && \ +## make -j 4 && \ +## cd .. && rm -rf build -FROM ghcr.io/rse-ops/hip-ubuntu-20.04:hip-5.1.3 AS hip5.1.3 -ENV GTEST_COLOR=1 -ENV HCC_AMDGPU_TARGET=gfx900 -COPY . /home/raja/workspace -WORKDIR /home/raja/workspace/build -RUN . /opt/spack/share/spack/setup-env.sh && spack load hip llvm-amdgpu && \ - cmake -DCMAKE_CXX_COMPILER=clang++ -DHIP_PATH=/opt -DENABLE_HIP=On -DENABLE_CUDA=Off -DRAJA_ENABLE_WARNINGS_AS_ERRORS=Off .. && \ - make -j 6 && \ - cd .. && rm -rf build +##FROM ghcr.io/rse-ops/hip-ubuntu-20.04:hip-5.1.3 AS hip5.1.3 +##ENV GTEST_COLOR=1 +##ENV HCC_AMDGPU_TARGET=gfx900 +##COPY . /home/raja/workspace +##WORKDIR /home/raja/workspace/build +##RUN . /opt/spack/share/spack/setup-env.sh && spack load hip llvm-amdgpu && \ +## cmake -DCMAKE_CXX_COMPILER=clang++ -DHIP_PATH=/opt -DENABLE_HIP=On -DENABLE_CUDA=Off -DRAJA_ENABLE_WARNINGS_AS_ERRORS=Off .. && \ +## make -j 6 && \ +## cd .. && rm -rf build FROM ghcr.io/rse-ops/intel-ubuntu-22.04:intel-2022.1.0 AS sycl ENV GTEST_COLOR=1 diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 048c9b93e8..33e3bfa8c0 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -48,14 +48,14 @@ jobs: docker_target: clang11.0.0-debug clang13.0.0: docker_target: clang13.0.0 - nvcc10.1.243: - docker_target: nvcc10.1.243 - nvcc11.1.1: - docker_target: nvcc11.1.1 +## nvcc10.1.243: +## docker_target: nvcc10.1.243 +## nvcc11.1.1: +## docker_target: nvcc11.1.1 ## nvcc11.1.1-debug: ## docker_target: nvcc11.1.1-debug - hip5.1.3: - docker_target: hip5.1.3 +## hip5.1.3: +## docker_target: hip5.1.3 sycl: docker_target: sycl pool: From e7931c5810e09578f83312be6814bd8497e59713 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Thu, 13 Jul 2023 10:09:16 -0700 Subject: [PATCH 16/34] enable OpenMP task option for some algorithms --- .gitlab/ruby-build-and-test-extra.yml | 17 ++++++++++++++++- .gitlab/tioga-build-and-test-extra.yml | 2 +- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/.gitlab/ruby-build-and-test-extra.yml b/.gitlab/ruby-build-and-test-extra.yml index c6666e51d5..a8cd89a49b 100644 --- a/.gitlab/ruby-build-and-test-extra.yml +++ b/.gitlab/ruby-build-and-test-extra.yml @@ -12,7 +12,22 @@ # We keep ${PROJECT__VARIANTS} and ${PROJECT__DEPS} So that # the comparison with the original job is easier. -# No overridden jobs so far. 
+clang_14_0_6: + variables: + SPEC: " ~shared +openmp +omptask +tests %clang@14.0.6" + extends: .build_and_test_on_ruby + +gcc_10_3_1: + variables: + SPEC: " ~shared +openmp +omptask +tests %gcc@10.3.1" + RUBY_BUILD_AND_TEST_JOB_ALLOC: "--time=60 --nodes=1" + extends: .build_and_test_on_ruby + +intel_19_1_2_gcc_8_5_0: + variables: + SPEC: " ~shared +openmp +omptask +tests %intel@19.1.2.gcc.8.5.0" + RUBY_BUILD_AND_TEST_JOB_ALLOC: "--time=90 --nodes=1" + extends: .build_and_test_on_ruby ############ # Extra jobs diff --git a/.gitlab/tioga-build-and-test-extra.yml b/.gitlab/tioga-build-and-test-extra.yml index 25b9880982..a211ea2d8a 100644 --- a/.gitlab/tioga-build-and-test-extra.yml +++ b/.gitlab/tioga-build-and-test-extra.yml @@ -28,5 +28,5 @@ rocmcc_5_4_3_hip_desul_atomics: rocmcc_5_4_3_hip_openmp: variables: - SPEC: "~shared +rocm +openmp +tests amdgpu_target=gfx90a %rocmcc@5.4.3 ^hip@5.4.3 ^blt@develop" + SPEC: "~shared +rocm +openmp +omptask +tests amdgpu_target=gfx90a %rocmcc@5.4.3 ^hip@5.4.3 ^blt@develop" extends: .build_and_test_on_tioga From 8ad8bad2ff30ba568f5242ddf395b397391c133a Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Thu, 13 Jul 2023 10:52:27 -0700 Subject: [PATCH 17/34] Fix spec --- .gitlab/lassen-build-and-test-extra.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.gitlab/lassen-build-and-test-extra.yml b/.gitlab/lassen-build-and-test-extra.yml index 71950b0ba7..2e4118df90 100644 --- a/.gitlab/lassen-build-and-test-extra.yml +++ b/.gitlab/lassen-build-and-test-extra.yml @@ -36,6 +36,11 @@ xl_2022_08_19_gcc_8_3_1_cuda_11_7_0: # ${PROJECT__DEPS} in the extra jobs. There is no reason not to fully # describe the spec here. +gcc_8_3_1_omptask: + variables: + SPEC: " ~shared +openmp +omptask +tests %gcc@8.3.1" + extends: .build_and_test_on_lassen + gcc_8_3_1_cuda_11_5_0_ats_disabled: extends: .build_and_test_on_lassen variables: From 79a4a08d1f514242f24f66ecbe5e75f20332fbe1 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Thu, 13 Jul 2023 11:17:51 -0700 Subject: [PATCH 18/34] Add omptask option to raja perf package --- scripts/radiuss-spack-configs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/radiuss-spack-configs b/scripts/radiuss-spack-configs index e2afaf8e00..8935d00676 160000 --- a/scripts/radiuss-spack-configs +++ b/scripts/radiuss-spack-configs @@ -1 +1 @@ -Subproject commit e2afaf8e00ec8110d4199bed5e39d36d987340e3 +Subproject commit 8935d006761aeb495a648b1f1373a0ca540ef78c From 4f29f3510b99dbf228395fd457bad3be180f7e13 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Fri, 14 Jul 2023 07:58:09 -0700 Subject: [PATCH 19/34] Switch radiuss spack configs to main branch --- scripts/radiuss-spack-configs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/radiuss-spack-configs b/scripts/radiuss-spack-configs index 8935d00676..955f0c3a67 160000 --- a/scripts/radiuss-spack-configs +++ b/scripts/radiuss-spack-configs @@ -1 +1 @@ -Subproject commit 8935d006761aeb495a648b1f1373a0ca540ef78c +Subproject commit 955f0c3a67e28fac4648ac41787e93511207b22e From 8c214c168e7852574ea3f0e7bda410510794b831 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Fri, 21 Jul 2023 11:22:04 -0700 Subject: [PATCH 20/34] Fix warnings from KernelDimensionCalculator --- include/RAJA/policy/cuda/kernel/internal.hpp | 8 ++++---- include/RAJA/policy/hip/kernel/internal.hpp | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/include/RAJA/policy/cuda/kernel/internal.hpp b/include/RAJA/policy/cuda/kernel/internal.hpp index 
aaa6eb1215..82760d46ea 100644 --- a/include/RAJA/policy/cuda/kernel/internal.hpp +++ b/include/RAJA/policy/cuda/kernel/internal.hpp @@ -314,7 +314,7 @@ struct KernelDimensionCalculator; template < typename IdxT > - static void set_dimensions(CudaDims& dims, CudaDims& min_dims, IdxT len) + static void set_dimensions(CudaDims& RAJA_UNUSED_ARG(dims), CudaDims& RAJA_UNUSED_ARG(min_dims), IdxT len) { if (len > static_cast(0)) { RAJA_ABORT_OR_THROW("must know one of block_size or grid_size"); @@ -427,7 +427,7 @@ struct KernelDimensionCalculator; template < typename IdxT > - static void set_dimensions(CudaDims& dims, CudaDims& min_dims, IdxT len) + static void set_dimensions(CudaDims& dims, CudaDims& min_dims, IdxT RAJA_UNUSED_ARG(len)) { set_cuda_dim(dims.threads, static_cast(IndexMapper::block_size)); set_cuda_dim(min_dims.threads, static_cast(IndexMapper::block_size)); @@ -460,7 +460,7 @@ struct KernelDimensionCalculator; template < typename IdxT > - static void set_dimensions(CudaDims& dims, CudaDims& min_dims, IdxT len) + static void set_dimensions(CudaDims& dims, CudaDims& min_dims, IdxT RAJA_UNUSED_ARG(len)) { set_cuda_dim(dims.blocks, static_cast(IndexMapper::grid_size)); set_cuda_dim(min_dims.blocks, static_cast(IndexMapper::grid_size)); @@ -537,7 +537,7 @@ struct KernelDimensionCalculator; template < typename IdxT > - static void set_dimensions(CudaDims& dims, CudaDims& min_dims, IdxT len) + static void set_dimensions(CudaDims& dims, CudaDims& min_dims, IdxT RAJA_UNUSED_ARG(len)) { set_cuda_dim(dims.threads, static_cast(IndexMapper::block_size)); set_cuda_dim(dims.blocks, static_cast(IndexMapper::grid_size)); diff --git a/include/RAJA/policy/hip/kernel/internal.hpp b/include/RAJA/policy/hip/kernel/internal.hpp index 51e17d5837..f874b40e2d 100644 --- a/include/RAJA/policy/hip/kernel/internal.hpp +++ b/include/RAJA/policy/hip/kernel/internal.hpp @@ -314,7 +314,7 @@ struct KernelDimensionCalculator; template < typename IdxT > - static void set_dimensions(HipDims& dims, HipDims& min_dims, IdxT len) + static void set_dimensions(HipDims& RAJA_UNUSED_ARG(dims), HipDims& RAJA_UNUSED_ARG(min_dims), IdxT len) { if (len > static_cast(0)) { RAJA_ABORT_OR_THROW("must know one of block_size or grid_size"); @@ -427,7 +427,7 @@ struct KernelDimensionCalculator; template < typename IdxT > - static void set_dimensions(HipDims& dims, HipDims& min_dims, IdxT len) + static void set_dimensions(HipDims& dims, HipDims& min_dims, IdxT RAJA_UNUSED_ARG(len)) { set_hip_dim(dims.threads, static_cast(IndexMapper::block_size)); set_hip_dim(min_dims.threads, static_cast(IndexMapper::block_size)); @@ -460,7 +460,7 @@ struct KernelDimensionCalculator; template < typename IdxT > - static void set_dimensions(HipDims& dims, HipDims& min_dims, IdxT len) + static void set_dimensions(HipDims& dims, HipDims& min_dims, IdxT RAJA_UNUSED_ARG(len)) { set_hip_dim(dims.blocks, static_cast(IndexMapper::grid_size)); set_hip_dim(min_dims.blocks, static_cast(IndexMapper::grid_size)); @@ -537,7 +537,7 @@ struct KernelDimensionCalculator; template < typename IdxT > - static void set_dimensions(HipDims& dims, HipDims& min_dims, IdxT len) + static void set_dimensions(HipDims& dims, HipDims& min_dims, IdxT RAJA_UNUSED_ARG(len)) { set_hip_dim(dims.threads, static_cast(IndexMapper::block_size)); set_hip_dim(dims.blocks, static_cast(IndexMapper::grid_size)); From 1ea7b69a19b160f52e59fc107578654a8ecd3cb6 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 19 Jul 2023 14:11:32 -0700 Subject: [PATCH 21/34] zero working_array_x in kernel 
fission fusion tests --- .../tests/basic-fission-fusion-loop-impl.hpp | 7 +++---- .../tests/conditional-fission-fusion-loop-impl.hpp | 6 +++--- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/test/functional/kernel/basic-fission-fusion-loop/tests/basic-fission-fusion-loop-impl.hpp b/test/functional/kernel/basic-fission-fusion-loop/tests/basic-fission-fusion-loop-impl.hpp index ec32fdddec..bea0fd1564 100644 --- a/test/functional/kernel/basic-fission-fusion-loop/tests/basic-fission-fusion-loop-impl.hpp +++ b/test/functional/kernel/basic-fission-fusion-loop/tests/basic-fission-fusion-loop-impl.hpp @@ -53,10 +53,9 @@ void KernelBasicFissionFusionLoopTestImpl( &test_array_y); - memset(static_cast(test_array_x), - 0, - sizeof(DATA_TYPE) * RAJA::stripIndexType(data_len)); - + working_res.memset(working_array_x, + 0, + sizeof(DATA_TYPE) * RAJA::stripIndexType(data_len)); RAJA::kernel( RAJA::make_tuple(seg, seg), diff --git a/test/functional/kernel/conditional-fission-fusion-loop/tests/conditional-fission-fusion-loop-impl.hpp b/test/functional/kernel/conditional-fission-fusion-loop/tests/conditional-fission-fusion-loop-impl.hpp index 97761bacb6..52cd9fc861 100644 --- a/test/functional/kernel/conditional-fission-fusion-loop/tests/conditional-fission-fusion-loop-impl.hpp +++ b/test/functional/kernel/conditional-fission-fusion-loop/tests/conditional-fission-fusion-loop-impl.hpp @@ -53,9 +53,9 @@ void KernelConditionalFissionFusionLoopTestImpl( &test_array_y); - memset(static_cast(test_array_x), - 0, - sizeof(DATA_TYPE) * RAJA::stripIndexType(data_len)); + working_res.memset(working_array_x, + 0, + sizeof(DATA_TYPE) * RAJA::stripIndexType(data_len)); for (int param = 0; param < 2; ++param) { From 784f85f29fdf3f5e38d7bad6081d6fcc97378c8d Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Tue, 1 Aug 2023 08:26:31 -0700 Subject: [PATCH 22/34] Make type use consistent in hip/cuda kernel and occupancy calculator functions --- include/RAJA/policy/cuda/MemUtils_CUDA.hpp | 33 +++++----- .../RAJA/policy/cuda/kernel/CudaKernel.hpp | 60 +++++++++---------- include/RAJA/policy/hip/MemUtils_HIP.hpp | 21 ++++--- include/RAJA/policy/hip/kernel/HipKernel.hpp | 12 ++-- 4 files changed, 65 insertions(+), 61 deletions(-) diff --git a/include/RAJA/policy/cuda/MemUtils_CUDA.hpp b/include/RAJA/policy/cuda/MemUtils_CUDA.hpp index 7eae7e29c0..697ed225a0 100644 --- a/include/RAJA/policy/cuda/MemUtils_CUDA.hpp +++ b/include/RAJA/policy/cuda/MemUtils_CUDA.hpp @@ -318,19 +318,20 @@ size_t cuda_max_blocks(size_t block_size) struct CudaOccMaxBlocksThreadsData { - int prev_shmem_size; + size_t prev_shmem_size; int max_blocks; int max_threads; }; template < typename RAJA_UNUSED_ARG(UniqueMarker), typename Func > RAJA_INLINE -void cuda_occupancy_max_blocks_threads(Func&& func, int shmem_size, - size_t &max_blocks, size_t &max_threads) +void cuda_occupancy_max_blocks_threads(Func&& func, size_t shmem_size, + int &max_blocks, int &max_threads) { static constexpr int uninitialized = -1; + static constexpr size_t uninitialized_size_t = std::numeric_limits::max(); static thread_local CudaOccMaxBlocksThreadsData data = { - uninitialized, uninitialized, uninitialized}; + uninitialized_size_t, uninitialized, uninitialized}; if (data.prev_shmem_size != shmem_size) { @@ -348,19 +349,20 @@ void cuda_occupancy_max_blocks_threads(Func&& func, int shmem_size, struct CudaOccMaxBlocksFixedThreadsData { - int prev_shmem_size; + size_t prev_shmem_size; int max_blocks; int multiProcessorCount; }; -template < typename 
RAJA_UNUSED_ARG(UniqueMarker), size_t num_threads, typename Func > +template < typename RAJA_UNUSED_ARG(UniqueMarker), int num_threads, typename Func > RAJA_INLINE -void cuda_occupancy_max_blocks(Func&& func, int shmem_size, - size_t &max_blocks) +void cuda_occupancy_max_blocks(Func&& func, size_t shmem_size, + int &max_blocks) { static constexpr int uninitialized = -1; + static constexpr size_t uninitialized_size_t = std::numeric_limits::max(); static thread_local CudaOccMaxBlocksFixedThreadsData data = { - uninitialized, uninitialized, uninitialized}; + uninitialized_size_t, uninitialized, uninitialized}; if (data.prev_shmem_size != shmem_size) { @@ -385,7 +387,7 @@ void cuda_occupancy_max_blocks(Func&& func, int shmem_size, struct CudaOccMaxBlocksVariableThreadsData { - int prev_shmem_size; + size_t prev_shmem_size; int prev_num_threads; int max_blocks; int multiProcessorCount; @@ -393,20 +395,19 @@ struct CudaOccMaxBlocksVariableThreadsData template < typename RAJA_UNUSED_ARG(UniqueMarker), typename Func > RAJA_INLINE -void cuda_occupancy_max_blocks(Func&& func, int shmem_size, - size_t &max_blocks, size_t num_threads) +void cuda_occupancy_max_blocks(Func&& func, size_t shmem_size, + int &max_blocks, int num_threads) { static constexpr int uninitialized = 0; + static constexpr size_t uninitialized_size_t = std::numeric_limits::max(); static thread_local CudaOccMaxBlocksVariableThreadsData data = { - uninitialized, uninitialized, uninitialized, uninitialized}; + uninitialized_size_t, uninitialized, uninitialized, uninitialized}; if ( data.prev_shmem_size != shmem_size || data.prev_num_threads != num_threads ) { - int tmp_max_blocks; cudaErrchk(cudaOccupancyMaxActiveBlocksPerMultiprocessor( - &tmp_max_blocks, func, static_cast(num_threads), shmem_size)); - data.max_blocks = tmp_max_blocks; + &data.max_blocks, func, num_threads, shmem_size)); if (data.multiProcessorCount == uninitialized) { diff --git a/include/RAJA/policy/cuda/kernel/CudaKernel.hpp b/include/RAJA/policy/cuda/kernel/CudaKernel.hpp index 8de879aadf..0e2966e8a8 100644 --- a/include/RAJA/policy/cuda/kernel/CudaKernel.hpp +++ b/include/RAJA/policy/cuda/kernel/CudaKernel.hpp @@ -50,7 +50,7 @@ namespace RAJA * Num_blocks is chosen to maximize the number of blocks running concurrently. * Blocks per SM must be chosen by the user. */ -template +template struct cuda_explicit_launch {}; /*! @@ -66,7 +66,7 @@ struct cuda_explicit_launch {}; * Num_threads is 1024, which may not be appropriate for all kernels. * Blocks per SM defaults to 1. */ -template +template using cuda_launch = cuda_explicit_launch; /*! @@ -74,7 +74,7 @@ using cuda_launch = cuda_explicit_launch +template using cuda_occ_calc_launch = cuda_explicit_launch; namespace statement @@ -97,7 +97,7 @@ struct CudaKernelExt * calculator determine the unspecified values. * The kernel launch is synchronous. */ -template +template using CudaKernelExp = CudaKernelExt, EnclosedStmts...>; @@ -107,7 +107,7 @@ using CudaKernelExp = * calculator determine the unspecified values. * The kernel launch is asynchronous. */ -template +template using CudaKernelExpAsync = CudaKernelExt, EnclosedStmts...>; @@ -134,9 +134,9 @@ using CudaKernelOccAsync = * number of threads (specified by num_threads) * The kernel launch is synchronous. */ -template +template using CudaKernelFixed = - CudaKernelExt::max(), num_threads>, + CudaKernelExt::max(), num_threads>, EnclosedStmts...>; /*! 
@@ -144,9 +144,9 @@ using CudaKernelFixed = * number of threads (specified by num_threads) * The kernel launch is asynchronous. */ -template +template using CudaKernelFixedAsync = - CudaKernelExt::max(), num_threads>, + CudaKernelExt::max(), num_threads>, EnclosedStmts...>; /*! @@ -154,9 +154,9 @@ using CudaKernelFixedAsync = * number of threads (specified by num_threads) and min blocks per sm. * The kernel launch is synchronous. */ -template +template using CudaKernelFixedSM = - CudaKernelExt::max(), num_threads, blocks_per_sm>, + CudaKernelExt::max(), num_threads, blocks_per_sm>, EnclosedStmts...>; /*! @@ -164,9 +164,9 @@ using CudaKernelFixedSM = * number of threads (specified by num_threads) and min blocks per sm. * The kernel launch is asynchronous. */ -template +template using CudaKernelFixedSMAsync = - CudaKernelExt::max(), num_threads, blocks_per_sm>, + CudaKernelExt::max(), num_threads, blocks_per_sm>, EnclosedStmts...>; /*! @@ -210,7 +210,7 @@ __global__ void CudaKernelLauncher(Data data) * * This launcher is used by the CudaKerelFixed policies. */ -template +template __launch_bounds__(BlockSize, BlocksPerSM) __global__ void CudaKernelLauncherFixed(Data data) { @@ -231,7 +231,7 @@ __launch_bounds__(BlockSize, BlocksPerSM) __global__ * The default case handles BlockSize != 0 and gets the fixed max block size * version of the kernel. */ -template +template struct CudaKernelLauncherGetter { using type = camp::decay)>; @@ -270,7 +270,7 @@ struct CudaLaunchHelper; * The user may specify the number of threads and blocks or let one or both be * determined at runtime using the CUDA occupancy calculator. */ -template +template struct CudaLaunchHelper,StmtList,Data,Types> { using Self = CudaLaunchHelper; @@ -281,8 +281,8 @@ struct CudaLaunchHelper; - inline static void recommended_blocks_threads(int shmem_size, - size_t &recommended_blocks, size_t &recommended_threads) + inline static void recommended_blocks_threads(size_t shmem_size, + int &recommended_blocks, int &recommended_threads) { auto func = kernelGetter_t::get(); @@ -337,7 +337,7 @@ struct CudaLaunchHelper 0 || num_threads > 0) { // // Setup shared memory buffers // - int shmem = 0; + size_t shmem = 0; // // Compute the recommended physical kernel blocks and threads // - size_t recommended_blocks; - size_t recommended_threads; + int recommended_blocks; + int recommended_threads; launch_t::recommended_blocks_threads( shmem, recommended_blocks, recommended_threads); @@ -518,7 +518,7 @@ struct StatementExecutor< // // Compute the MAX physical kernel threads // - size_t max_threads; + int max_threads; launch_t::max_threads(shmem, max_threads); @@ -551,10 +551,10 @@ struct StatementExecutor< // // Compute the MAX physical kernel blocks // - size_t max_blocks; + int max_blocks; launch_t::max_blocks(shmem, max_blocks, launch_dims.num_threads()); - size_t use_blocks; + int use_blocks; if ( launch_dims.num_threads() == recommended_threads ) { diff --git a/include/RAJA/policy/hip/MemUtils_HIP.hpp b/include/RAJA/policy/hip/MemUtils_HIP.hpp index 9006dc54e2..bc6a454782 100644 --- a/include/RAJA/policy/hip/MemUtils_HIP.hpp +++ b/include/RAJA/policy/hip/MemUtils_HIP.hpp @@ -319,19 +319,20 @@ int hip_max_blocks(int block_size) struct HipOccMaxBlocksThreadsData { - int prev_shmem_size; + size_t prev_shmem_size; int max_blocks; int max_threads; }; template < typename RAJA_UNUSED_ARG(UniqueMarker), typename Func > RAJA_INLINE -void hip_occupancy_max_blocks_threads(Func&& func, int shmem_size, +void hip_occupancy_max_blocks_threads(Func&& func, 
size_t shmem_size, int &max_blocks, int &max_threads) { static constexpr int uninitialized = -1; + static constexpr size_t uninitialized_size_t = std::numeric_limits::max(); static thread_local HipOccMaxBlocksThreadsData data = { - uninitialized, uninitialized, uninitialized}; + uninitialized_size_t, uninitialized, uninitialized}; if (data.prev_shmem_size != shmem_size) { @@ -356,19 +357,20 @@ void hip_occupancy_max_blocks_threads(Func&& func, int shmem_size, struct HipOccMaxBlocksFixedThreadsData { - int prev_shmem_size; + size_t prev_shmem_size; int max_blocks; int multiProcessorCount; }; template < typename RAJA_UNUSED_ARG(UniqueMarker), int num_threads, typename Func > RAJA_INLINE -void hip_occupancy_max_blocks(Func&& func, int shmem_size, +void hip_occupancy_max_blocks(Func&& func, size_t shmem_size, int &max_blocks) { static constexpr int uninitialized = -1; + static constexpr size_t uninitialized_size_t = std::numeric_limits::max(); static thread_local HipOccMaxBlocksFixedThreadsData data = { - uninitialized, uninitialized, uninitialized}; + uninitialized_size_t, uninitialized, uninitialized}; if (data.prev_shmem_size != shmem_size) { @@ -399,7 +401,7 @@ void hip_occupancy_max_blocks(Func&& func, int shmem_size, struct HipOccMaxBlocksVariableThreadsData { - int prev_shmem_size; + size_t prev_shmem_size; int prev_num_threads; int max_blocks; int multiProcessorCount; @@ -407,12 +409,13 @@ struct HipOccMaxBlocksVariableThreadsData template < typename RAJA_UNUSED_ARG(UniqueMarker), typename Func > RAJA_INLINE -void hip_occupancy_max_blocks(Func&& func, int shmem_size, +void hip_occupancy_max_blocks(Func&& func, size_t shmem_size, int &max_blocks, int num_threads) { static constexpr int uninitialized = 0; + static constexpr size_t uninitialized_size_t = std::numeric_limits::max(); static thread_local HipOccMaxBlocksVariableThreadsData data = { - uninitialized, uninitialized, uninitialized, uninitialized}; + uninitialized_size_t, uninitialized, uninitialized, uninitialized}; if ( data.prev_shmem_size != shmem_size || data.prev_num_threads != num_threads ) { diff --git a/include/RAJA/policy/hip/kernel/HipKernel.hpp b/include/RAJA/policy/hip/kernel/HipKernel.hpp index 016fc980b8..f7d6a0e6a4 100644 --- a/include/RAJA/policy/hip/kernel/HipKernel.hpp +++ b/include/RAJA/policy/hip/kernel/HipKernel.hpp @@ -189,7 +189,7 @@ __global__ void HipKernelLauncher(Data data) * * This launcher is used by the HipKerelFixed policies. */ -template +template __launch_bounds__(BlockSize, 1) __global__ void HipKernelLauncherFixed(Data data) { @@ -210,7 +210,7 @@ __launch_bounds__(BlockSize, 1) __global__ * The default case handles BlockSize != 0 and gets the fixed max block size * version of the kernel. */ -template +template struct HipKernelLauncherGetter { using type = camp::decay)>; @@ -260,7 +260,7 @@ struct HipLaunchHelper,Stmt using kernelGetter_t = HipKernelLauncherGetter<(num_threads <= 0) ? 
                                                 0 : num_threads, Data, executor_t>;

-  inline static void recommended_blocks_threads(int shmem_size,
+  inline static void recommended_blocks_threads(size_t shmem_size,
       int &recommended_blocks, int &recommended_threads)
   {
     auto func = kernelGetter_t::get();
@@ -316,7 +316,7 @@ struct HipLaunchHelper,Stmt
     }
   }

-  inline static void max_threads(int RAJA_UNUSED_ARG(shmem_size), int &max_threads)
+  inline static void max_threads(size_t RAJA_UNUSED_ARG(shmem_size), int &max_threads)
   {
     if (num_threads <= 0) {
@@ -336,7 +336,7 @@ struct HipLaunchHelper,Stmt
     }
   }

-  inline static void max_blocks(int shmem_size,
+  inline static void max_blocks(size_t shmem_size,
       int &max_blocks, int actual_threads)
   {
     auto func = kernelGetter_t::get();
@@ -482,7 +482,7 @@ struct StatementExecutor<
   //
   // Setup shared memory buffers
   //
-  int shmem = 0;
+  size_t shmem = 0;

   //
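The int-to-size_t change to the shared-memory parameters above matches the CUDA/HIP runtime occupancy APIs, which take the dynamic shared-memory size as a size_t; carrying it as int narrowed the value. A minimal sketch of the kind of call these helpers wrap (the kernel name is illustrative, not from the patch):

```cpp
#include <cuda_runtime.h>

__global__ void example_kernel() {}  // illustrative kernel, not from the patch

void query_occupancy(size_t shmem_size)
{
  int min_grid_size = 0;  // suggested minimum grid size to reach full occupancy
  int block_size    = 0;  // suggested block size
  // dynamicSMemSize is a size_t in the runtime API, so passing an int-typed
  // shared memory size would narrow the value on the way in.
  cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size,
                                     example_kernel, shmem_size);
}
```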
From 5f211b531043129f477b63ad113c58d280667251 Mon Sep 17 00:00:00 2001
From: Jason Burmark
Date: Thu, 3 Aug 2023 08:16:51 -0700
Subject: [PATCH 23/34] add occupancy policies

---
 docs/sphinx/user_guide/feature/policies.rst | 25 ++++++++++++++++-----
 include/RAJA/policy/cuda/policy.hpp         | 16 +++++++++++++
 include/RAJA/policy/hip/policy.hpp          |  8 +++++++
 3 files changed, 44 insertions(+), 5 deletions(-)

diff --git a/docs/sphinx/user_guide/feature/policies.rst b/docs/sphinx/user_guide/feature/policies.rst
index b425473aba..760d6ee7de 100644
--- a/docs/sphinx/user_guide/feature/policies.rst
+++ b/docs/sphinx/user_guide/feature/policies.rst
@@ -279,12 +279,27 @@ policies have the prefix ``hip_``.
 ========================================= ============= =======================================
 CUDA/HIP Execution Policies               Works with    Brief description
 ========================================= ============= =======================================
 cuda/hip_exec                             forall,       Execute loop iterations
-                                          scan,         in a GPU kernel launched
-                                          sort          with given thread-block
-                                                        size. Note that the
+                                          scan,         directly mapped to global threads
+                                          sort          in a GPU kernel launched
+                                                        with given thread-block
+                                                        size and unbounded grid size.
+                                                        Note that the thread-block
+                                                        size must be provided,
+                                                        there is no default.
+ cuda/hip_exec_occupancy                  forall        Execute loop iterations
+                                                        mapped to global threads via
+                                                        grid striding with multiple
+                                                        iterations per global thread
+                                                        in a GPU kernel launched
+                                                        with given thread-block
+                                                        size and grid size bounded
+                                                        by the maximum occupancy of
+                                                        the kernel. Note that the
                                                         thread-block size must
-                                                        be provided, there is
-                                                        no default.
+                                                        be provided, there is no
+                                                        default. Note this can improve
+                                                        reducer performance in kernels
+                                                        with large iteration counts.
 cuda/hip_launch_t                         launch        Launches a device kernel,
                                                         any code expressed within
                                                         the lambda is executed
diff --git a/include/RAJA/policy/cuda/policy.hpp b/include/RAJA/policy/cuda/policy.hpp
index 64ec12a4cb..72ba5c8f0c 100644
--- a/include/RAJA/policy/cuda/policy.hpp
+++ b/include/RAJA/policy/cuda/policy.hpp
@@ -805,6 +805,22 @@ template 
 using cuda_exec_async = policy::cuda::cuda_exec_explicit<
     iteration_mapping::Direct, cuda::global_x, policy::cuda::MIN_BLOCKS_PER_SM, true>;

+template 
+using cuda_exec_occupancy_explicit = policy::cuda::cuda_exec_explicit<
+    iteration_mapping::StridedLoop, cuda::global_x, BLOCKS_PER_SM, Async>;
+
+template 
+using cuda_exec_occupancy_explicit_async = policy::cuda::cuda_exec_explicit<
+    iteration_mapping::StridedLoop, cuda::global_x, BLOCKS_PER_SM, true>;
+
+template 
+using cuda_exec_occupancy = policy::cuda::cuda_exec_explicit<
+    iteration_mapping::StridedLoop, cuda::global_x, policy::cuda::MIN_BLOCKS_PER_SM, Async>;
+
+template 
+using cuda_exec_occupancy_async = policy::cuda::cuda_exec_explicit<
+    iteration_mapping::StridedLoop, cuda::global_x, policy::cuda::MIN_BLOCKS_PER_SM, true>;
+
 template 
 using cuda_work_explicit = policy::cuda::cuda_work_explicit;

diff --git a/include/RAJA/policy/hip/policy.hpp b/include/RAJA/policy/hip/policy.hpp
index 8b7f29462d..ecc5b756b4 100644
--- a/include/RAJA/policy/hip/policy.hpp
+++ b/include/RAJA/policy/hip/policy.hpp
@@ -783,6 +783,14 @@ template 
 using hip_exec_async = policy::hip::hip_exec<
     iteration_mapping::Direct, hip::global_x, true>;

+template 
+using hip_exec_occupancy = policy::hip::hip_exec<
+    iteration_mapping::StridedLoop, hip::global_x, Async>;
+
+template 
+using hip_exec_occupancy_async = policy::hip::hip_exec<
+    iteration_mapping::StridedLoop, hip::global_x, true>;
+
 using policy::hip::hip_work;

 template 
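The occupancy policies added above are drop-in forall execution policies. A minimal usage sketch, assuming device-accessible arrays (names and extent are illustrative; a later patch in this series renames these aliases to cuda/hip_exec_occ_calc):

```cpp
#include "RAJA/RAJA.hpp"

void saxpy(double* x, double* y, double a, int N)
{
  // The occupancy calculator chooses the grid size; each global thread then
  // grid-strides over its share of the N iterations.
  RAJA::forall<RAJA::cuda_exec_occupancy<256>>(
      RAJA::RangeSegment(0, N),
      [=] RAJA_DEVICE (int i) { y[i] = a * x[i] + y[i]; });
}
```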
From d2abaec15680ceb37ec03155289b29cecaee6c2b Mon Sep 17 00:00:00 2001
From: Jason Burmark
Date: Thu, 3 Aug 2023 08:38:13 -0700
Subject: [PATCH 24/34] add more comments

---
 include/RAJA/policy/cuda/policy.hpp | 9 ++++++++-
 include/RAJA/policy/hip/policy.hpp  | 8 ++++++++
 2 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/include/RAJA/policy/cuda/policy.hpp b/include/RAJA/policy/cuda/policy.hpp
index 72ba5c8f0c..499993b84e 100644
--- a/include/RAJA/policy/cuda/policy.hpp
+++ b/include/RAJA/policy/cuda/policy.hpp
@@ -772,7 +772,7 @@ using global_z = IndexGlobal;

 }  // namespace cuda

-
+// policies usable with forall, scan, and sort
 template 
 using cuda_exec_grid_explicit = policy::cuda::cuda_exec_explicit<
     iteration_mapping::Direct, cuda::global_x, BLOCKS_PER_SM, Async>;
@@ -821,6 +821,7 @@ template 
 using cuda_exec_occupancy_async = policy::cuda::cuda_exec_explicit<
     iteration_mapping::StridedLoop, cuda::global_x, policy::cuda::MIN_BLOCKS_PER_SM, true>;

+// policies usable with WorkGroup
 template 
 using cuda_work_explicit = policy::cuda::cuda_work_explicit;
@@ -835,13 +836,16 @@ using cuda_work_async = policy::cuda::cuda_work_explicit
 using cuda_launch_explicit_t = policy::cuda::cuda_launch_explicit_t;
@@ -869,6 +875,7 @@ template 
 using cuda_launch_t = policy::cuda::cuda_launch_explicit_t;

+// policies usable with kernel and launch
 template < typename ...
            indexers >
 using cuda_indexer_direct = policy::cuda::cuda_indexer<
     iteration_mapping::Direct,
diff --git a/include/RAJA/policy/hip/policy.hpp b/include/RAJA/policy/hip/policy.hpp
index ecc5b756b4..527c3c4411 100644
--- a/include/RAJA/policy/hip/policy.hpp
+++ b/include/RAJA/policy/hip/policy.hpp
@@ -767,6 +767,7 @@ using global_z = IndexGlobal;

 }  // namespace hip

+// policies usable with forall, scan, and sort
 template 
 using hip_exec_grid = policy::hip::hip_exec<
     iteration_mapping::Direct, hip::global_x, Async>;
@@ -791,6 +792,7 @@ template 
 using hip_exec_occupancy_async = policy::hip::hip_exec<
     iteration_mapping::StridedLoop, hip::global_x, true>;

+// policies usable with WorkGroup
 using policy::hip::hip_work;

 template 
@@ -798,13 +800,16 @@ using hip_work_async = policy::hip::hip_work;

 using policy::hip::unordered_hip_loop_y_block_iter_x_threadblock_average;

+// policies usable with atomics
 using policy::hip::hip_atomic;
 using policy::hip::hip_atomic_explicit;

+// policies usable with reducers
 using policy::hip::hip_reduce_base;
 using policy::hip::hip_reduce;
 using policy::hip::hip_reduce_atomic;

+// policies usable with kernel
 using policy::hip::hip_block_reduce;
 using policy::hip::hip_warp_reduce;
@@ -823,11 +828,14 @@ using policy::hip::hip_warp_masked_loop;
 using policy::hip::hip_thread_masked_direct;
 using policy::hip::hip_thread_masked_loop;

+// policies usable with synchronize
 using policy::hip::hip_synchronize;

+// policies usable with launch
 using policy::hip::hip_launch_t;

+// policies usable with kernel and launch
 template < typename ... indexers >
 using hip_indexer_direct = policy::hip::hip_indexer<
     iteration_mapping::Direct,
 */
 inline
-hip_dim_t fitHipDims(unsigned int limit, hip_dim_t result, hip_dim_t minimum = hip_dim_t()){
+hip_dim_t fitHipDims(hip_dim_member_t limit, hip_dim_t result, hip_dim_t minimum = hip_dim_t()){

   // clamp things to at least 1

From d3f47c8ae7f93177f9490bf4828a0d4749f8ea84 Mon Sep 17 00:00:00 2001
From: Jason Burmark
Date: Thu, 3 Aug 2023 08:43:06 -0700
Subject: [PATCH 26/34] make naming more consistent between kernel and forall
 policy names

---
 include/RAJA/policy/cuda/policy.hpp | 8 ++++----
 include/RAJA/policy/hip/policy.hpp  | 4 ++--
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/include/RAJA/policy/cuda/policy.hpp b/include/RAJA/policy/cuda/policy.hpp
index 499993b84e..58424edb88 100644
--- a/include/RAJA/policy/cuda/policy.hpp
+++ b/include/RAJA/policy/cuda/policy.hpp
@@ -806,19 +806,19 @@ using cuda_exec_async = policy::cuda::cuda_exec_explicit<
     iteration_mapping::Direct, cuda::global_x, policy::cuda::MIN_BLOCKS_PER_SM, true>;

 template 
-using cuda_exec_occupancy_explicit = policy::cuda::cuda_exec_explicit<
+using cuda_exec_occ_calc_explicit = policy::cuda::cuda_exec_explicit<
     iteration_mapping::StridedLoop, cuda::global_x, BLOCKS_PER_SM, Async>;

 template 
-using cuda_exec_occupancy_explicit_async = policy::cuda::cuda_exec_explicit<
+using cuda_exec_occ_calc_explicit_async = policy::cuda::cuda_exec_explicit<
     iteration_mapping::StridedLoop, cuda::global_x, BLOCKS_PER_SM, true>;

 template 
-using cuda_exec_occupancy = policy::cuda::cuda_exec_explicit<
+using cuda_exec_occ_calc = policy::cuda::cuda_exec_explicit<
     iteration_mapping::StridedLoop, cuda::global_x, policy::cuda::MIN_BLOCKS_PER_SM, Async>;

 template 
-using cuda_exec_occupancy_async = policy::cuda::cuda_exec_explicit<
+using cuda_exec_occ_calc_async = policy::cuda::cuda_exec_explicit<
     iteration_mapping::StridedLoop, cuda::global_x, policy::cuda::MIN_BLOCKS_PER_SM, true>;

diff --git a/include/RAJA/policy/hip/policy.hpp b/include/RAJA/policy/hip/policy.hpp
index 527c3c4411..2000bde42b 100644
--- a/include/RAJA/policy/hip/policy.hpp
+++ b/include/RAJA/policy/hip/policy.hpp
@@ -785,11 +785,11 @@ using hip_exec_async = policy::hip::hip_exec<
     iteration_mapping::Direct, hip::global_x, true>;

 template 
-using hip_exec_occupancy = policy::hip::hip_exec<
+using hip_exec_occ_calc = policy::hip::hip_exec<
     iteration_mapping::StridedLoop, hip::global_x, Async>;

 template 
-using hip_exec_occupancy_async = policy::hip::hip_exec<
+using hip_exec_occ_calc_async = policy::hip::hip_exec<
     iteration_mapping::StridedLoop, hip::global_x, true>;

From 0865243d0ea0c2c2fa5a8d3dff8d9ac9ba215502 Mon Sep 17 00:00:00 2001
From: Jason Burmark
Date: Thu, 3 Aug 2023 08:43:58 -0700
Subject: [PATCH 27/34] fix docs

---
 docs/sphinx/user_guide/feature/policies.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/sphinx/user_guide/feature/policies.rst b/docs/sphinx/user_guide/feature/policies.rst
index 760d6ee7de..ed6808f6e6 100644
--- a/docs/sphinx/user_guide/feature/policies.rst
+++ b/docs/sphinx/user_guide/feature/policies.rst
@@ -286,7 +286,7 @@ policies have the prefix ``hip_``.
                                                         Note that the thread-block
                                                         size must be provided,
                                                         there is no default.
- cuda/hip_exec_occupancy                  forall        Execute loop iterations
+ cuda/hip_exec_occ_calc                   forall        Execute loop iterations
                                                         mapped to global threads via
                                                         grid striding with multiple
                                                         iterations per global thread

From 282fb01bf78f233b7a9cc6be40b105fd4e96c7ad Mon Sep 17 00:00:00 2001
From: Jason Burmark
Date: Thu, 3 Aug 2023 08:49:17 -0700
Subject: [PATCH 28/34] Add occ_calc and grid forall policy tests

---
 test/include/RAJA_test-forall-execpol.hpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/test/include/RAJA_test-forall-execpol.hpp b/test/include/RAJA_test-forall-execpol.hpp
index 85c48e96d9..520f1af413 100644
--- a/test/include/RAJA_test-forall-execpol.hpp
+++ b/test/include/RAJA_test-forall-execpol.hpp
@@ -120,7 +120,8 @@ using OpenMPTargetForallAtomicExecPols = OpenMPTargetForallExecPols;
 #if defined(RAJA_ENABLE_CUDA)
 using CudaForallExecPols = camp::list< RAJA::cuda_exec<128>,
-                                       RAJA::cuda_exec<256>,
+                                       RAJA::cuda_exec_occ_calc<256>,
+                                       RAJA::cuda_exec_grid<256, 64>,
                                        RAJA::cuda_exec_explicit<256,2> >;

 using CudaForallReduceExecPols = CudaForallExecPols;
@@ -131,7 +132,8 @@ using CudaForallAtomicExecPols = CudaForallExecPols;
 #if defined(RAJA_ENABLE_HIP)
 using HipForallExecPols = camp::list< RAJA::hip_exec<128>,
-                                      RAJA::hip_exec<256> >;
+                                      RAJA::hip_exec_occ_calc<256>,
+                                      RAJA::hip_exec_grid<256, 64> >;

 using HipForallReduceExecPols = HipForallExecPols;

From 27358456564d571c2b96b1f1d0e4b80961f865ef Mon Sep 17 00:00:00 2001
From: Jason Burmark
Date: Thu, 3 Aug 2023 09:06:09 -0700
Subject: [PATCH 29/34] Change grid policies to use strided loop

add documentation for grid forall policies
---
 docs/sphinx/user_guide/feature/policies.rst | 10 ++++++++++
 include/RAJA/policy/cuda/policy.hpp         |  8 ++++----
 include/RAJA/policy/hip/policy.hpp          |  4 ++--
 3 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/docs/sphinx/user_guide/feature/policies.rst b/docs/sphinx/user_guide/feature/policies.rst
index ed6808f6e6..2f707c5efd 100644
--- a/docs/sphinx/user_guide/feature/policies.rst
+++ b/docs/sphinx/user_guide/feature/policies.rst
@@ -286,6 +286,16 @@ policies have the prefix ``hip_``.
                                                         Note that the thread-block
                                                         size must be provided,
                                                         there is no default.
+ cuda/hip_exec_grid                       forall,       Execute loop iterations
+                                                        mapped to global threads via
+                                                        grid striding with multiple
+                                                        iterations per global thread
+                                                        in a GPU kernel launched
+                                                        with given thread-block
+                                                        size and grid size.
+                                                        Note that the thread-block
+                                                        size and grid size must be
+                                                        provided, there is no default.
 cuda/hip_exec_occ_calc                   forall        Execute loop iterations
                                                        mapped to global threads via
                                                        grid striding with multiple
                                                        iterations per global thread
                                                        in a GPU kernel launched
                                                        with given thread-block
                                                        size and grid size bounded
                                                        by the maximum occupancy of
                                                        the kernel. Note that the
                                                        thread-block size must
                                                        be provided, there is no
                                                        default. Note this can improve
                                                        reducer performance in kernels
                                                        with large iteration counts.
diff --git a/include/RAJA/policy/cuda/policy.hpp b/include/RAJA/policy/cuda/policy.hpp
index 58424edb88..fec6d70b7b 100644
--- a/include/RAJA/policy/cuda/policy.hpp
+++ b/include/RAJA/policy/cuda/policy.hpp
@@ -775,19 +775,19 @@ using global_z = IndexGlobal;
 // policies usable with forall, scan, and sort
 template 
 using cuda_exec_grid_explicit = policy::cuda::cuda_exec_explicit<
-    iteration_mapping::Direct, cuda::global_x, BLOCKS_PER_SM, Async>;
+    iteration_mapping::StridedLoop, cuda::global_x, BLOCKS_PER_SM, Async>;

 template 
 using cuda_exec_grid_explicit_async = policy::cuda::cuda_exec_explicit<
-    iteration_mapping::Direct, cuda::global_x, BLOCKS_PER_SM, true>;
+    iteration_mapping::StridedLoop, cuda::global_x, BLOCKS_PER_SM, true>;

 template 
 using cuda_exec_grid = policy::cuda::cuda_exec_explicit<
-    iteration_mapping::Direct, cuda::global_x, policy::cuda::MIN_BLOCKS_PER_SM, Async>;
+    iteration_mapping::StridedLoop, cuda::global_x, policy::cuda::MIN_BLOCKS_PER_SM, Async>;

 template 
 using cuda_exec_grid_async = policy::cuda::cuda_exec_explicit<
-    iteration_mapping::Direct, cuda::global_x, policy::cuda::MIN_BLOCKS_PER_SM, true>;
+    iteration_mapping::StridedLoop, cuda::global_x, policy::cuda::MIN_BLOCKS_PER_SM, true>;

 template 
 using cuda_exec_explicit = policy::cuda::cuda_exec_explicit<
diff --git a/include/RAJA/policy/hip/policy.hpp b/include/RAJA/policy/hip/policy.hpp
index 2000bde42b..f943b74461 100644
--- a/include/RAJA/policy/hip/policy.hpp
+++ b/include/RAJA/policy/hip/policy.hpp
@@ -770,11 +770,11 @@ using global_z = IndexGlobal;
 // policies usable with forall, scan, and sort
 template 
 using hip_exec_grid = policy::hip::hip_exec<
-    iteration_mapping::Direct, hip::global_x, Async>;
+    iteration_mapping::StridedLoop, hip::global_x, Async>;

 template 
 using hip_exec_grid_async = policy::hip::hip_exec<
-    iteration_mapping::Direct, hip::global_x, true>;
+    iteration_mapping::StridedLoop, hip::global_x, true>;

 template 
 using hip_exec = policy::hip::hip_exec<
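With iteration_mapping::StridedLoop, the grid policies launch a fixed grid and each global thread covers multiple iterations. For illustration, the equivalent hand-written CUDA pattern (kernel and array names are not from the patch):

```cpp
// A grid-stride loop: a fixed grid of gridDim.x * blockDim.x global threads,
// each stepping through the iteration space by the total thread count.
__global__ void strided_copy(double* dst, const double* src, int N)
{
  int stride = blockDim.x * gridDim.x;
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += stride) {
    dst[i] = src[i];
  }
}
```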
From d347ece4627350ba58635fd63820255e018d2224 Mon Sep 17 00:00:00 2001
From: Jason Burmark
Date: Fri, 4 Aug 2023 10:57:30 -0700
Subject: [PATCH 30/34] Use camp default stream in DeviceZeroedAllocators

---
 include/RAJA/policy/cuda/MemUtils_CUDA.hpp | 4 +++-
 include/RAJA/policy/hip/MemUtils_HIP.hpp   | 4 +++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/include/RAJA/policy/cuda/MemUtils_CUDA.hpp b/include/RAJA/policy/cuda/MemUtils_CUDA.hpp
index 697ed225a0..df9922b06d 100644
--- a/include/RAJA/policy/cuda/MemUtils_CUDA.hpp
+++ b/include/RAJA/policy/cuda/MemUtils_CUDA.hpp
@@ -95,9 +95,11 @@ struct DeviceZeroedAllocator {
   // returns a valid pointer on success, nullptr on failure
   void* malloc(size_t nbytes)
   {
+    auto res = ::camp::resources::Cuda::get_default();
     void* ptr;
     cudaErrchk(cudaMalloc(&ptr, nbytes));
-    cudaErrchk(cudaMemset(ptr, 0, nbytes));
+    cudaErrchk(cudaMemsetAsync(ptr, 0, nbytes, res.get_stream()));
+    cudaErrchk(cudaStreamSynchronize(res.get_stream()));
     return ptr;
   }

diff --git a/include/RAJA/policy/hip/MemUtils_HIP.hpp b/include/RAJA/policy/hip/MemUtils_HIP.hpp
index bc6a454782..6c856114ff 100644
--- a/include/RAJA/policy/hip/MemUtils_HIP.hpp
+++ b/include/RAJA/policy/hip/MemUtils_HIP.hpp
@@ -96,9 +96,11 @@ struct DeviceZeroedAllocator {
   // returns a valid pointer on success, nullptr on failure
   void* malloc(size_t nbytes)
   {
+    auto res = ::camp::resources::Hip::get_default();
     void* ptr;
     hipErrchk(hipMalloc(&ptr, nbytes));
-    hipErrchk(hipMemset(ptr, 0, nbytes));
+    hipErrchk(hipMemsetAsync(ptr, 0, nbytes, res.get_stream()));
+    hipErrchk(hipStreamSynchronize(res.get_stream()));
     return ptr;
   }

From d6bed3599a73eb13a1db00e7df4bfde1641b1e21 Mon Sep 17 00:00:00 2001
From: Jason Burmark
Date: Fri, 4 Aug 2023 10:57:53 -0700
Subject: [PATCH 31/34] Avoid stream 0 in WorkGroup Dispatcher

---
 include/RAJA/policy/cuda/WorkGroup/Dispatcher.hpp | 11 +++++++----
 include/RAJA/policy/hip/WorkGroup/Dispatcher.hpp  | 12 +++++++-----
 2 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/include/RAJA/policy/cuda/WorkGroup/Dispatcher.hpp b/include/RAJA/policy/cuda/WorkGroup/Dispatcher.hpp
index 6ae361149d..0c63d2dbc8 100644
--- a/include/RAJA/policy/cuda/WorkGroup/Dispatcher.hpp
+++ b/include/RAJA/policy/cuda/WorkGroup/Dispatcher.hpp
@@ -20,6 +20,8 @@
 #include "RAJA/config.hpp"

+#include "camp/resource.hpp"
+
 #include "RAJA/policy/cuda/policy.hpp"

 #include "RAJA/pattern/WorkGroup/Dispatcher.hpp"
@@ -77,11 +79,12 @@ inline auto get_value(Factory&& factory)
   using value_type = typename std::decay_t::value_type;

   const std::lock_guard lock(get_value_mutex());
+  auto res = ::camp::resources::Cuda::get_default();
   auto ptr = static_cast(get_cached_value_ptr(sizeof(value_type)));
-  get_value_global><<<1,1>>>(
-      ptr, std::forward(factory));
-  cudaErrchk(cudaGetLastError());
-  cudaErrchk(cudaDeviceSynchronize());
+  auto func = reinterpret_cast(&get_value_global>);
+  void *args[] = {(void*)&ptr, (void*)&factory};
+  cudaErrchk(cudaLaunchKernel(func, 1, 1, args, 0, res.get_stream()));
+  cudaErrchk(cudaStreamSynchronize(res.get_stream()));
   return *ptr;
 }

diff --git a/include/RAJA/policy/hip/WorkGroup/Dispatcher.hpp b/include/RAJA/policy/hip/WorkGroup/Dispatcher.hpp
index 2f2bec7f0e..dffb773740 100644
--- a/include/RAJA/policy/hip/WorkGroup/Dispatcher.hpp
+++ b/include/RAJA/policy/hip/WorkGroup/Dispatcher.hpp
@@ -20,6 +20,8 @@
 #include "RAJA/config.hpp"

+#include "camp/resource.hpp"
+
 #include "RAJA/policy/hip/policy.hpp"

 #include "RAJA/pattern/WorkGroup/Dispatcher.hpp"
@@ -77,12 +79,12 @@ inline auto get_value(Factory&& factory)
   using value_type = typename std::decay_t::value_type;

   const std::lock_guard lock(get_value_mutex());
+  auto res = ::camp::resources::Hip::get_default();
   auto ptr = static_cast(get_cached_value_ptr(sizeof(value_type)));
-  auto func = get_value_global>;
-  hipLaunchKernelGGL(func, dim3(1), dim3(1), 0, 0,
-      ptr, std::forward(factory));
-  hipErrchk(hipGetLastError());
-  hipErrchk(hipDeviceSynchronize());
+  auto func = reinterpret_cast(&get_value_global>);
+  void *args[] = {(void*)&ptr, (void*)&factory};
+  hipErrchk(hipLaunchKernel(func, 1, 1, args, 0, res.get_stream()));
+  hipErrchk(hipStreamSynchronize(res.get_stream()));
   return *ptr;
 }
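Both patches above follow the same pattern: submit work to the stream of the camp default resource and synchronize only that stream, rather than using stream 0 with device-wide synchronization. A standalone sketch of the pattern, assuming CUDA (the kernel is illustrative):

```cpp
#include <cuda_runtime.h>
#include "camp/resource.hpp"

__global__ void init_value(int* out) { *out = 1; }  // illustrative kernel

void launch_on_default_resource(int* device_out)
{
  // Use the camp default resource's stream instead of stream 0 so this work
  // does not serialize against other streams on the device.
  auto res = ::camp::resources::Cuda::get_default();
  void* args[] = {(void*)&device_out};
  cudaLaunchKernel((const void*)&init_value, dim3(1), dim3(1), args,
                   0, res.get_stream());
  // Wait only on this stream, not on the whole device.
  cudaStreamSynchronize(res.get_stream());
}
```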
From f0569d421b70527615998edbb8b384fbec4fe891 Mon Sep 17 00:00:00 2001
From: Rich Hornung
Date: Fri, 4 Aug 2023 14:33:48 -0700
Subject: [PATCH 32/34] Update scripts to build with Intel compilers

---
 .../lc-builds/toss3/icpc_X_gcc8headers.cmake |  2 +-
 .../lc-builds/toss4/icpc-classic_X.cmake     | 18 ++++++
 scripts/lc-builds/toss3_icpc.sh              |  2 +-
 scripts/lc-builds/toss4_icpc-classic.sh      | 63 +++++++++++++++++++
 4 files changed, 83 insertions(+), 2 deletions(-)
 create mode 100755 host-configs/lc-builds/toss4/icpc-classic_X.cmake
 create mode 100755 scripts/lc-builds/toss4_icpc-classic.sh

diff --git a/host-configs/lc-builds/toss3/icpc_X_gcc8headers.cmake b/host-configs/lc-builds/toss3/icpc_X_gcc8headers.cmake
index b49e950244..eb4c5b949a 100755
--- a/host-configs/lc-builds/toss3/icpc_X_gcc8headers.cmake
+++ b/host-configs/lc-builds/toss3/icpc_X_gcc8headers.cmake
@@ -7,7 +7,7 @@

 set(RAJA_COMPILER "RAJA_COMPILER_ICC" CACHE STRING "")

-set(COMMON_FLAGS "-gxx-name=/usr/tce/packages/gcc/gcc-8.1.0/bin/g++")
+set(COMMON_FLAGS "-gxx-name=/usr/tce/packages/gcc/gcc-8.3.1/bin/g++")

 set(CMAKE_CXX_FLAGS_RELEASE "${COMMON_FLAGS} -O3 -march=native -ansi-alias -diag-disable cpu-dispatch" CACHE STRING "")
 set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${COMMON_FLAGS} -O3 -g -march=native -ansi-alias -diag-disable cpu-dispatch" CACHE STRING "")
diff --git a/host-configs/lc-builds/toss4/icpc-classic_X.cmake b/host-configs/lc-builds/toss4/icpc-classic_X.cmake
new file mode 100755
index 0000000000..762c36db44
--- /dev/null
+++ b/host-configs/lc-builds/toss4/icpc-classic_X.cmake
@@ -0,0 +1,18 @@
+###############################################################################
+# Copyright (c) 2016-23, Lawrence Livermore National Security, LLC
+# and RAJA project contributors. See the RAJA/LICENSE file for details.
+#
+# SPDX-License-Identifier: (BSD-3-Clause)
+###############################################################################
+
+set(RAJA_COMPILER "RAJA_COMPILER_ICC" CACHE STRING "")
+
+set(COMMON_FLAGS "-gxx-name=/usr/tce/packages/gcc/gcc-10.3.1/bin/g++")
+
+set(CMAKE_CXX_FLAGS_RELEASE "${COMMON_FLAGS} -O3 -march=native -ansi-alias -diag-disable cpu-dispatch" CACHE STRING "")
+set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${COMMON_FLAGS} -O3 -g -march=native -ansi-alias -diag-disable cpu-dispatch" CACHE STRING "")
+set(CMAKE_CXX_FLAGS_DEBUG "${COMMON_FLAGS} -O0 -g" CACHE STRING "")
+
+set(RAJA_DATA_ALIGN 64 CACHE STRING "")
+
+set(RAJA_HOST_CONFIG_LOADED On CACHE BOOL "")
diff --git a/scripts/lc-builds/toss3_icpc.sh b/scripts/lc-builds/toss3_icpc.sh
index 5a87d4568c..42f2177f9d 100755
--- a/scripts/lc-builds/toss3_icpc.sh
+++ b/scripts/lc-builds/toss3_icpc.sh
@@ -19,7 +19,7 @@ shift 1

 COMP_MAJOR_VER=${COMP_VER:0:2}
 GCC_HEADER_VER=7
-USE_TBB=On
+USE_TBB=Off

 if [ ${COMP_MAJOR_VER} -gt 18 ]
 then
diff --git a/scripts/lc-builds/toss4_icpc-classic.sh b/scripts/lc-builds/toss4_icpc-classic.sh
new file mode 100755
index 0000000000..0c08f93914
--- /dev/null
+++ b/scripts/lc-builds/toss4_icpc-classic.sh
@@ -0,0 +1,63 @@
+#!/usr/bin/env bash
+
+###############################################################################
+# Copyright (c) 2016-23, Lawrence Livermore National Security, LLC
+# and RAJA project contributors. See the RAJA/LICENSE file for details.
+#
+# SPDX-License-Identifier: (BSD-3-Clause)
+###############################################################################
+
+if [ "$1" == "" ]; then
+  echo
+  echo "You must pass a compiler version number to script. For example,"
+  echo "    toss4_icpc-classic.sh 19.1.2"
+  exit
+fi
+
+COMP_VER=$1
+shift 1
+
+USE_TBB=Off
+
+BUILD_SUFFIX=lc_toss4-icpc-classic-${COMP_VER}
+
+echo
+echo "Creating build directory build_${BUILD_SUFFIX} and generating configuration in it"
+echo "Configuration extra arguments:"
+echo "   $@"
+echo
+
+rm -rf build_${BUILD_SUFFIX} 2>/dev/null
+mkdir build_${BUILD_SUFFIX} && cd build_${BUILD_SUFFIX}
+
+module load cmake/3.21.1
+
+##
+# CMake option -DRAJA_ENABLE_FORCEINLINE_RECURSIVE=Off used to speed up compile
+# times at a potential cost of slower 'forall' execution.
+##
+
+cmake \
+  -DCMAKE_BUILD_TYPE=Release \
+  -DCMAKE_CXX_COMPILER=/usr/tce/packages/intel-classic/intel-classic-${COMP_VER}/bin/icpc \
+  -DCMAKE_C_COMPILER=/usr/tce/packages/intel-classic/intel-classic-${COMP_VER}/bin/icc \
+  -DBLT_CXX_STD=c++14 \
+  -C ../host-configs/lc-builds/toss4/icpc-classic_X.cmake \
+  -DRAJA_ENABLE_FORCEINLINE_RECURSIVE=Off \
+  -DENABLE_OPENMP=On \
+  -DRAJA_ENABLE_TBB=${USE_TBB} \
+  -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \
+  "$@" \
+  ..
+
+echo
+echo "***********************************************************************"
+echo
+echo "cd into directory build_${BUILD_SUFFIX} and run make to build RAJA"
+echo
+echo "  Please note that you may need to add some intel openmp libraries to your"
+echo "  LD_LIBRARY_PATH to run with openmp."
+echo
+echo "  LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:/usr/tce/packages/intel/intel-${COMP_VER}/compiler/lib/intel64_lin"
+echo
+echo "***********************************************************************"

From b5f7f20e72fe2bea5a7b1ce1fb3181cdf071b5d2 Mon Sep 17 00:00:00 2001
From: Rich Hornung
Date: Mon, 14 Aug 2023 12:23:50 -0700
Subject: [PATCH 33/34] Update version number

---
 CMakeLists.txt | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 24856b7eeb..4814246f12 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -14,9 +14,9 @@ endif()
 include(CMakeDependentOption)

 # Set version number
-set(RAJA_VERSION_MAJOR 2022)
-set(RAJA_VERSION_MINOR 10)
-set(RAJA_VERSION_PATCHLEVEL 5)
+set(RAJA_VERSION_MAJOR 2023)
+set(RAJA_VERSION_MINOR 06)
+set(RAJA_VERSION_PATCHLEVEL 1)

 if (RAJA_LOADED AND (NOT RAJA_LOADED STREQUAL "${RAJA_VERSION_MAJOR}.${RAJA_VERSION_MINOR}.${RAJA_VERSION_PATCHLEVEL}"))
   message(FATAL_ERROR "You are mixing RAJA versions. Loaded is ${RAJA_LOADED}, expected ${RAJA_VERSION_MAJOR}.${RAJA_VERSION_MINOR}.${RAJA_VERSION_PATCHLEVEL}")

From e7ee7f8ee7c432b9e81f488678300680f93bf941 Mon Sep 17 00:00:00 2001
From: Rich Hornung
Date: Mon, 14 Aug 2023 12:24:08 -0700
Subject: [PATCH 34/34] Add release notes.

---
 RELEASE_NOTES.md | 27 ++++++++++++++++++++++++++-
 1 file changed, 26 insertions(+), 1 deletion(-)

diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md
index 7390020bcc..d4c291b2de 100644
--- a/RELEASE_NOTES.md
+++ b/RELEASE_NOTES.md
@@ -6,7 +6,7 @@
 [comment]: # (# SPDX-License-Identifier: BSD-3-Clause)
 [comment]: # (#################################################################)

-Version vxx.yy.zz -- Release date 20yy-mm-dd
+Version YYYY.MM.PP -- Release date 20yy-mm-dd
 ============================================

 This release contains ...

 Notable changes include:

   * New features / API changes:
+
   * Build changes/improvements:
+
+
   * Bug fixes/improvements:
+
+
+Version 2023.06.1 -- Release date 2023-08-16
+============================================
+
+This release contains various smaller RAJA improvements.
+
+Notable changes include:
+
+  * New features / API changes:
+    * Add compile time block size optimization for new reduction interface.
+    * Changed default stream usage for Workgroup constructs to use the
+      stream associated with the default (camp) resource. Previously, we were
+      using stream zero. Specifically, this change affects where we memset
+      memory in the zeroed device memory pool and where we get device function
+      pointers for WorkGroup.
+
+  * Build changes/improvements:
     * RAJA_ENABLE_OPENMP_TASK CMake option added to enable/disable
       algorithm options based on OpenMP task construct.
       Currently, this only applies to RAJA's OpenMP sort implementation.
       The default is 'Off'. The option allows users to choose a task
       implementation if they wish.
+    * Resolve several compiler warnings.

   * Bug fixes/improvements:
+    * Fix compilation of GPU occupancy calculator and use common types for
+      HIP and CUDA backends in the occupancy calculator, kernel policies,
+      and kernel launch helper routines.
+    * Fix direct cudaMalloc/hipMalloc calls and memory leaks.


Version 2023.06.0 -- Release date 2023-07-06
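For reference, the "compile time block size optimization for new reduction interface" item refers to the experimental, params-style reductions; a minimal sketch of that interface, where a compile-time block size policy lets the reduction be specialized for the block size (array name and extent are illustrative):

```cpp
#include "RAJA/RAJA.hpp"

void sum_example(const double* a, int N)
{
  double total = 0.0;
  // cuda_exec<256> fixes the block size at compile time, allowing the
  // reduction implementation to specialize for a 256-thread block.
  RAJA::forall<RAJA::cuda_exec<256>>(
      RAJA::RangeSegment(0, N),
      RAJA::expt::Reduce<RAJA::operators::plus>(&total),
      [=] RAJA_DEVICE (int i, double& t) { t += a[i]; });
}
```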