Merge pull request #1405 from LLNL/rc-v2022.10.4
Rc v2022.10.4
artv3 committed Dec 15, 2022
2 parents a83a448 + 887c9e0 commit c2a6b17
Showing 5 changed files with 31 additions and 11 deletions.
2 changes: 1 addition & 1 deletion CMakeLists.txt
@@ -16,7 +16,7 @@ include(CMakeDependentOption)
# Set version number
set(RAJA_VERSION_MAJOR 2022)
set(RAJA_VERSION_MINOR 10)
-set(RAJA_VERSION_PATCHLEVEL 3)
+set(RAJA_VERSION_PATCHLEVEL 4)

if (RAJA_LOADED AND (NOT RAJA_LOADED STREQUAL "${RAJA_VERSION_MAJOR}.${RAJA_VERSION_MINOR}.${RAJA_VERSION_PATCHLEVEL}"))
message(FATAL_ERROR "You are mixing RAJA versions. Loaded is ${RAJA_LOADED}, expected ${RAJA_VERSION_MAJOR}.${RAJA_VERSION_MINOR}.${RAJA_VERSION_PATCHLEVEL}")
7 changes: 7 additions & 0 deletions RELEASE_NOTES.md
@@ -19,6 +19,13 @@ Notable changes include:

* Bug fixes/improvements:

Version 2022.10.4 -- Release date 2022-12-15
============================================

This release fixes an issue that was found after the v2022.10.3 release.

* Fixes a device alignment bug in workgroups which led to missing symbol errors
  with the AMD clang compiler.

Version 2022.10.3 -- Release date 2022-12-01
============================================
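For context on the fix noted above, here is a minimal standalone sketch of how a host/device disagreement over alignof(std::max_align_t) surfaces as missing symbols (illustrative only, not code from this PR; Holder, FragileHolder, StableHolder, and kMaxAlign are hypothetical names): under CUDA/HIP, long double is demoted to double in the device pass, so a template instantiated on that alignment mangles differently on host and device.

// Illustrative sketch; compiles as ordinary C++.
#include <cstddef>

template <size_t Align>
struct Holder {
  alignas(Align) unsigned char storage[64];
};

// The host pass may see Holder<16> while the device pass sees Holder<8>;
// the mangled names differ, so a device symbol referenced from host-side
// launch code may simply not exist -> missing symbol errors at link/load.
using FragileHolder = Holder<alignof(std::max_align_t)>;

// The fix in this release: pin one constant for both passes.
constexpr size_t kMaxAlign = 16;  // plays the role of RAJA_MAX_ALIGN
using StableHolder = Holder<kMaxAlign>;

int main() {
  FragileHolder f;
  StableHolder s;
  (void)f; (void)s;
  return 0;
}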
2 changes: 1 addition & 1 deletion docs/conf.py
@@ -88,7 +88,7 @@
# The short X.Y version.
version = u'2022.10'
# The full version, including alpha/beta/rc tags.
-release = u'2022.10.3'
+release = u'2022.10.4'

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
26 changes: 20 additions & 6 deletions include/RAJA/config.hpp.in
@@ -32,6 +32,7 @@
#define RAJA_config_HPP

#include <utility>
#include <cstddef>
#include <type_traits>

#if defined(_MSVC_LANG)
@@ -239,6 +240,15 @@ static_assert(RAJA_HAS_SOME_CXX14,
#define RAJA_PRAGMA(x) _Pragma(RAJA_STRINGIFY(x))
#endif


/* NOTE: Below we define RAJA_MAX_ALIGN for each compiler. Currently it is set to 16 bytes
for all cases except MSVC. Previously it was set by alignof(std::max_align_t), which, in Clang,
is based on sizeof(long double). This causes an inconsistency, as CUDA/HIP long doubles
are demoted to doubles, making alignof(std::max_align_t) return 8 bytes on the device and
16 bytes on the host. We therefore set a standard size and ensure validity through a
static_assert.
*/

namespace RAJA {

#if defined(RAJA_ENABLE_OPENMP) && !defined(__HIP_DEVICE_COMPILE__)
@@ -374,7 +384,7 @@ const int DATA_ALIGN = @RAJA_DATA_ALIGN@;
//
// Configuration options for Intel compilers
//

#define RAJA_MAX_ALIGN 16
#if defined (RAJA_ENABLE_FORCEINLINE_RECURSIVE)
#define RAJA_FORCEINLINE_RECURSIVE RAJA_PRAGMA(forceinline recursive)
#else
@@ -387,6 +397,7 @@ const int DATA_ALIGN = @RAJA_DATA_ALIGN@;
#define RAJA_INLINE inline __attribute__((always_inline))
#endif


#define RAJA_UNROLL RAJA_PRAGMA(unroll)
#define RAJA_UNROLL_COUNT(N) RAJA_PRAGMA(unroll(N))

@@ -412,9 +423,9 @@ const int DATA_ALIGN = @RAJA_DATA_ALIGN@;
//
// Configuration options for GNU compilers
//
#define RAJA_MAX_ALIGN 16
#define RAJA_FORCEINLINE_RECURSIVE
#define RAJA_INLINE inline __attribute__((always_inline))

#if !defined(__NVCC__)
#define RAJA_UNROLL RAJA_PRAGMA(GCC unroll 10000)
#define RAJA_UNROLL_COUNT(N) RAJA_PRAGMA(GCC unroll N)
@@ -446,11 +457,11 @@ const int DATA_ALIGN = @RAJA_DATA_ALIGN@;
//
// Configuration options for xlc compiler (i.e., bgq/sequoia).
//
#define RAJA_MAX_ALIGN 16
#define RAJA_FORCEINLINE_RECURSIVE
#define RAJA_INLINE inline __attribute__((always_inline))
#define RAJA_UNROLL
#define RAJA_UNROLL_COUNT(N)

// FIXME: alignx is breaking CUDA+xlc
#if defined(RAJA_ENABLE_CUDA)
#define RAJA_ALIGN_DATA(d) d
@@ -476,12 +487,11 @@ const int DATA_ALIGN = @RAJA_DATA_ALIGN@;
//
// Configuration options for clang compilers
//
#define RAJA_MAX_ALIGN 16
#define RAJA_FORCEINLINE_RECURSIVE
#define RAJA_INLINE inline __attribute__((always_inline))
#define RAJA_UNROLL RAJA_PRAGMA(clang loop unroll(enable))
#define RAJA_UNROLL_COUNT(N) RAJA_PRAGMA(clang loop unroll_count(N))


// note that neither nvcc nor the Apple Clang compiler currently supports
// the __builtin_assume_aligned attribute
#if defined(RAJA_ENABLE_CUDA) || defined(__APPLE__)
@@ -514,7 +524,7 @@ const int DATA_ALIGN = @RAJA_DATA_ALIGN@;

// This is the same as undefined compiler, but squelches the warning message
#elif defined(RAJA_COMPILER_MSVC)

#define RAJA_MAX_ALIGN alignof(std::max_align_t)
#define RAJA_FORCEINLINE_RECURSIVE
#define RAJA_INLINE inline
#define RAJA_ALIGN_DATA(d) d
@@ -526,6 +536,7 @@ const int DATA_ALIGN = @RAJA_DATA_ALIGN@;
#else

#pragma message("RAJA_COMPILER unknown, using default empty macros.")
#define RAJA_MAX_ALIGN 16
#define RAJA_FORCEINLINE_RECURSIVE
#define RAJA_INLINE inline
#define RAJA_ALIGN_DATA(d) d
@@ -536,6 +547,9 @@ const int DATA_ALIGN = @RAJA_DATA_ALIGN@;

#endif

static_assert(RAJA_MAX_ALIGN >= alignof(std::max_align_t) && (RAJA_MAX_ALIGN/alignof(std::max_align_t))*alignof(std::max_align_t) == RAJA_MAX_ALIGN,
"Inconsistent RAJA_MAX_ALIGN size");

#cmakedefine RAJA_HAVE_POSIX_MEMALIGN
#cmakedefine RAJA_HAVE_ALIGNED_ALLOC
#cmakedefine RAJA_HAVE_MM_MALLOC
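Taken together, the config.hpp.in changes reduce to a small pattern, sketched standalone below (a sketch under assumptions: plain C++ without CMake substitution; MY_MAX_ALIGN is a hypothetical stand-in for RAJA_MAX_ALIGN). Since practical alignments are powers of two, the >= clause already implies the multiple-of clause; the latter is defensive.

// Standalone sketch of the per-compiler define plus trailing static_assert.
#include <cstddef>

#if defined(_MSC_VER)
// MSVC keeps the language's own notion of maximal alignment.
#define MY_MAX_ALIGN alignof(std::max_align_t)
#else
// Everyone else gets a fixed 16 bytes so host and device passes agree.
#define MY_MAX_ALIGN 16
#endif

// Reject any configuration where the chosen value under-aligns relative
// to the platform's fundamental alignment.
static_assert(MY_MAX_ALIGN >= alignof(std::max_align_t) &&
                  (MY_MAX_ALIGN / alignof(std::max_align_t)) *
                          alignof(std::max_align_t) == MY_MAX_ALIGN,
              "Inconsistent MY_MAX_ALIGN size");

// Example use: a buffer suitably aligned for any fundamental type.
alignas(MY_MAX_ALIGN) static unsigned char scratch[256];

int main() { (void)scratch; return 0; }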
5 changes: 2 additions & 3 deletions include/RAJA/pattern/WorkGroup/WorkStruct.hpp
@@ -45,7 +45,7 @@ struct WorkStruct;
* sizeof(GenericWorkStruct) <= sizeof(WorkStruct<size>)
*/
template < typename Dispatcher_T >
-using GenericWorkStruct = WorkStruct<alignof(std::max_align_t), Dispatcher_T>;
+using GenericWorkStruct = WorkStruct<RAJA_MAX_ALIGN, Dispatcher_T>;

template < size_t size, Platform platform, typename dispatch_policy, typename DispatcherID, typename ... CallArgs >
struct WorkStruct<size, Dispatcher<platform, dispatch_policy, DispatcherID, CallArgs...>>
@@ -71,7 +71,6 @@ struct WorkStruct<size, Dispatcher<platform, dispatch_policy, DispatcherID, Call
"WorkStruct and GenericWorkStruct must have obj at the same offset");
static_assert(sizeof(value_type) <= sizeof(true_value_type),
"WorkStruct must not be smaller than GenericWorkStruct");

true_value_type* value_ptr = static_cast<true_value_type*>(ptr);

value_ptr->dispatcher = dispatcher;
@@ -112,7 +111,7 @@ struct WorkStruct<size, Dispatcher<platform, dispatch_policy, DispatcherID, Call

const dispatcher_type* dispatcher;
typename dispatcher_type::invoker_type invoke;
-typename std::aligned_storage<size, alignof(std::max_align_t)>::type obj;
+typename std::aligned_storage<size, RAJA_MAX_ALIGN>::type obj;
};

} // namespace detail
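To see why the alignment parameter of obj matters, here is a reduced sketch of the same type-erasure idiom (hypothetical names, not RAJA's API; it handles only trivially copyable callables, whereas RAJA's WorkStruct also carries a dispatcher): the callable is placement-constructed into a fixed-size buffer, so the buffer must be at least as aligned as anything stored in it.

// Reduced sketch of the WorkStruct storage idiom; ErasedCall is a
// hypothetical stand-in, not RAJA's API.
#include <cstddef>
#include <cstdio>
#include <new>
#include <type_traits>

constexpr size_t kMaxAlign = 16;  // plays the role of RAJA_MAX_ALIGN

template <size_t Size>
struct ErasedCall {
  void (*invoke)(void*);
  // Equivalent in spirit to std::aligned_storage<Size, kMaxAlign>::type.
  alignas(kMaxAlign) unsigned char obj[Size];

  template <typename F>
  static ErasedCall make(F f) {
    static_assert(sizeof(F) <= Size, "callable too large for the buffer");
    static_assert(alignof(F) <= kMaxAlign, "buffer under-aligned for callable");
    static_assert(std::is_trivially_copyable<F>::value,
                  "sketch only handles trivially copyable callables");
    ErasedCall e;
    ::new (static_cast<void*>(e.obj)) F(f);  // placement-construct into obj
    e.invoke = [](void* p) { (*static_cast<F*>(p))(); };
    return e;
  }

  void operator()() { invoke(obj); }
};

int main() {
  auto call = ErasedCall<32>::make([] { std::puts("invoked"); });
  call();  // prints "invoked"
  return 0;
}

Had obj kept alignof(std::max_align_t), the host and device instantiations of WorkStruct could disagree on layout and mangled name, which is precisely the bug this release fixes.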
