LLNL · trws · Jul 11, 2022 · Jul 11, 2022 · Jul 11, 2022 · Jul 12, 2022
diff --git a/include/RAJA/config.hpp.in b/include/RAJA/config.hpp.in
@@ -253,6 +253,21 @@ namespace RAJA {
 
 #if defined(RAJA_ENABLE_CUDA) && defined(__CUDACC__)
 #define RAJA_CUDA_ACTIVE
+
+// Compiler numeric identifiers
+#define RAJA_CUDA_COMPILER_NVCC -1
+#define RAJA_CUDA_COMPILER_CLANG 1
+#define RAJA_CUDA_COMPILER_NVCXX 2
+
+#if defined(__NVCOMPILER_CUDA__) // it's NVC++
+#define RAJA_CUDA_COMPILER RAJA_CUDA_COMPILER_NVCXX
+#elif defined(__NVCC__) // it's NVCC
+#define RAJA_CUDA_COMPILER RAJA_CUDA_COMPILER_NVCC
+#elif defined(__clang__) && defined(__CUDA__) // clang cuda
+#define RAJA_CUDA_COMPILER RAJA_CUDA_COMPILER_CLANG
+#else
+#error Unknown compiler is claiming to be a CUDA compiler
+#endif
 #endif // RAJA_ENABLE_CUDA && __CUDACC__
 
 #if defined(RAJA_ENABLE_HIP) && defined(__HIPCC__)

diff --git a/include/RAJA/policy/cuda/reduce.hpp b/include/RAJA/policy/cuda/reduce.hpp
@@ -50,6 +50,47 @@
 #include "RAJA/policy/cuda/policy.hpp"
 #include "RAJA/policy/cuda/raja_cudaerrchk.hpp"
 
+#if RAJA_CUDA_COMPILER == RAJA_CUDA_COMPILER_CLANG // use this for clang cuda
+#pragma push_macro("__MAKE_SYNC_SHUFFLES")
+#define __MAKE_SYNC_SHUFFLES(__FnName, __IntIntrinsic, __FloatIntrinsic,       \
+                             __Mask, __Type)                                   \
+  inline __device__ int __FnName(unsigned int __mask, int __val,               \
+                                 __Type __offset, int __width = warpSize) {    \
+    return __IntIntrinsic(__mask, __val, __offset,                             \
+                          ((warpSize - __width) << 8) | (__Mask));             \
+  }                                                                            \
+  inline __device__ long long __FnName(unsigned int __mask, long long __val,   \
+                                       __Type __offset,                        \
+                                       int __width = warpSize) {               \
+    struct __Bits {                                                            \
+      int __a, __b;                                                            \
+    };                                                                         \
+    _Static_assert(sizeof(__val) == sizeof(__Bits));                           \
+    _Static_assert(sizeof(__Bits) == 2 * sizeof(int));                         \
+    __Bits __tmp;                                                              \
+    memcpy(&__tmp, &__val, sizeof(__val));                                     \
+    __tmp.__a = ::__FnName(__mask, __tmp.__a, __offset, __width);              \
+    __tmp.__b = ::__FnName(__mask, __tmp.__b, __offset, __width);              \
+    long long __ret;                                                           \
+    memcpy(&__ret, &__tmp, sizeof(__tmp));                                     \
+    return __ret;                                                              \
+  }                                                                            \
+  inline __device__ unsigned long long __FnName(                               \
+      unsigned int __mask, unsigned long long __val, __Type __offset,          \
+      int __width = warpSize) {                                                \
+    return static_cast<unsigned long long>(::__FnName(                         \
+        __mask, static_cast<long long>(__val), __offset, __width));   \
+  }
+
+__MAKE_SYNC_SHUFFLES(__shfl_sync_fixed, __nvvm_shfl_sync_idx_i32,
+                     __nvvm_shfl_sync_idx_f32, 0x1f, int);
+// We use 0 rather than 31 as our mask, because shfl.up applies to lanes >=
+// maxLane.
+__MAKE_SYNC_SHUFFLES(__shfl_xor_sync_fixed, __nvvm_shfl_sync_bfly_i32,
+                     __nvvm_shfl_sync_bfly_f32, 0x1f, int);
+#pragma pop_macro("__MAKE_SYNC_SHUFFLES")
+#endif
+
 namespace RAJA
 {
 
@@ -243,7 +284,11 @@ RAJA_DEVICE RAJA_INLINE long long shfl_xor_sync<long long>(long long var, int la
 template <>
 RAJA_DEVICE RAJA_INLINE unsigned long long shfl_xor_sync<unsigned long long>(unsigned long long var, int laneMask)
 {
+#if RAJA_CUDA_COMPILER == RAJA_CUDA_COMPILER_CLANG // use this for clang cuda
+  return ::__shfl_xor_sync_fixed(0xffffffffu, var, laneMask);
+#else
   return ::__shfl_xor_sync(0xffffffffu, var, laneMask);
+#endif
 }
 
 template <>
@@ -310,7 +355,11 @@ RAJA_DEVICE RAJA_INLINE long long shfl_sync<long long>(long long var, int srcLan
 template <>
 RAJA_DEVICE RAJA_INLINE unsigned long long shfl_sync<unsigned long long>(unsigned long long var, int srcLane)
 {
+#if RAJA_CUDA_COMPILER == RAJA_CUDA_COMPILER_CLANG // use this for clang cuda
+  return ::__shfl_sync_fixed(0xffffffffu, var, srcLane);
+#else
   return ::__shfl_sync(0xffffffffu, var, srcLane);
+#endif
 }
 
 template <>
@@ -389,7 +438,7 @@ RAJA_DEVICE RAJA_INLINE T warp_allreduce(T val)
   T temp = val;
 
   for (int i = 1; i < policy::cuda::WARP_SIZE; i *= 2) {
-    T rhs = __shfl_xor_sync(0xffffffff, temp, i);
+    T rhs = shfl_xor_sync(temp, i);
     Combiner{}(temp, rhs);
   }
 

diff --git a/test/integration/plugin/CMakeLists.txt b/test/integration/plugin/CMakeLists.txt
@@ -39,13 +39,21 @@ foreach( BACKEND ${PLUGIN_BACKENDS} )
 
     configure_file( test-plugin-workgroup.cpp.in
                     test-plugin-workgroup-${DISPATCHER}-${BACKEND}.cpp )
+  if(${BACKEND} STREQUAL "Hip")
+      raja_add_test( NAME test-plugin-workgroup-Known-Hip-Failure-${BACKEND}
+                 SOURCES ${CMAKE_CURRENT_BINARY_DIR}/test-plugin-workgroup-${BACKEND}.cpp
+                         plugin_to_test.cpp )
 
-    raja_add_test( NAME test-plugin-workgroup-${DISPATCHER}-${BACKEND}
-               SOURCES ${CMAKE_CURRENT_BINARY_DIR}/test-plugin-workgroup-${DISPATCHER}-${BACKEND}.cpp
-                       plugin_to_test.cpp )
+      target_include_directories(test-plugin-workgroup-Known-Hip-Failure-${BACKEND}.exe
+                               PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/tests)
+  else()
+      raja_add_test( NAME test-plugin-workgroup-${BACKEND}
+                 SOURCES ${CMAKE_CURRENT_BINARY_DIR}/test-plugin-workgroup-${BACKEND}.cpp
+                         plugin_to_test.cpp )
 
-    target_include_directories(test-plugin-workgroup-${DISPATCHER}-${BACKEND}.exe
-                             PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/tests)
+      target_include_directories(test-plugin-workgroup-${BACKEND}.exe
+                               PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/tests)
+  endif()
 
   endforeach()
 endforeach()

diff --git a/test/unit/algorithm/CMakeLists.txt b/test/unit/algorithm/CMakeLists.txt
@@ -64,7 +64,8 @@ macro(RAJA_GENERATE_ALGORITHM_UTIL_SORT_TESTS SORT_BACKEND_in SORT_SIZE_in UTIL_
                     test-algorithm-util-sort-${UTIL_SORT}-${SORT_BACKEND}.cpp )
 
     #Some tests are known to fail for Hip, mark those tests (Will not be run in Gitlab CI)
-    if(${SORT_BACKEND} STREQUAL "Hip" AND (${UTIL_SORT} STREQUAL "Heap" OR 
+    if((${SORT_BACKEND} STREQUAL "Hip" OR RAJA_ENABLE_CLANG_CUDA)
+      AND (${UTIL_SORT} STREQUAL "Heap" OR 
 	                                   ${UTIL_SORT} STREQUAL "Insertion" OR
 					   ${UTIL_SORT} STREQUAL "Intro"))
         raja_add_test( NAME test-algorithm-util-sort-Known-Hip-Failure-${UTIL_SORT}-${SORT_BACKEND}