diff --git a/CMakeLists.txt b/CMakeLists.txt index ff6600cd13..25ee8f6dd3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -16,7 +16,7 @@ include(CMakeDependentOption) # Set version number set(RAJA_VERSION_MAJOR 2022) set(RAJA_VERSION_MINOR 10) -set(RAJA_VERSION_PATCHLEVEL 3) +set(RAJA_VERSION_PATCHLEVEL 4) if (RAJA_LOADED AND (NOT RAJA_LOADED STREQUAL "${RAJA_VERSION_MAJOR}.${RAJA_VERSION_MINOR}.${RAJA_VERSION_PATCHLEVEL}")) message(FATAL_ERROR "You are mixing RAJA versions. Loaded is ${RAJA_LOADED}, expected ${RAJA_VERSION_MAJOR}.${RAJA_VERSION_MINOR}.${RAJA_VERSION_PATCHLEVEL}") diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md index f34f191608..cc4b1edf26 100644 --- a/RELEASE_NOTES.md +++ b/RELEASE_NOTES.md @@ -19,6 +19,13 @@ Notable changes include: * Bug fixes/improvements: +Version 2022.10.4 -- Release date 2022-12-14 +============================================ + +This release fixes an issue that was found after the v2022.10.3 release. + + * Fixes device alignment bug in workgroups which led to missing symbol errors + with the AMD clang compiler. Version 2022.10.3 -- Release date 2022-12-01 ============================================ diff --git a/docs/conf.py b/docs/conf.py index 51e0336b1a..5a111497d9 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -88,7 +88,7 @@ # The short X.Y version. version = u'2022.10' # The full version, including alpha/beta/rc tags. -release = u'2022.10.3' +release = u'2022.10.4' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. 
diff --git a/include/RAJA/config.hpp.in b/include/RAJA/config.hpp.in index 404d8beebf..037ec13e2a 100644 --- a/include/RAJA/config.hpp.in +++ b/include/RAJA/config.hpp.in @@ -32,6 +32,7 @@ #define RAJA_config_HPP #include <utility> +#include <cstddef> #include <type_traits> #if defined(_MSVC_LANG) @@ -239,6 +240,15 @@ static_assert(RAJA_HAS_SOME_CXX14, #define RAJA_PRAGMA(x) _Pragma(RAJA_STRINGIFY(x)) #endif + +/* NOTE: Below we define RAJA_MAX_ALIGN for each compiler, currently it is set as 16 bytes +for all cases, except MSVC. Previously this was set by alignof(std::max_align_t) which, in Clang, +is based on the sizeof(long double). This causes an inconsistency as CUDA/HIP long doubles +are demoted to doubles causing alignof(std::max_align_t) to return 8 bytes on the device and +16 bytes on the host. We therefore set a standard size and ensure validity through a +static_assert. +*/ + namespace RAJA { #if defined(RAJA_ENABLE_OPENMP) && !defined(__HIP_DEVICE_COMPILE__) @@ -374,7 +384,7 @@ const int DATA_ALIGN = @RAJA_DATA_ALIGN@; // // Configuration options for Intel compilers // - +#define RAJA_MAX_ALIGN 16 #if defined (RAJA_ENABLE_FORCEINLINE_RECURSIVE) #define RAJA_FORCEINLINE_RECURSIVE RAJA_PRAGMA(forceinline recursive) #else @@ -387,6 +397,7 @@ const int DATA_ALIGN = @RAJA_DATA_ALIGN@; #define RAJA_INLINE inline __attribute__((always_inline)) #endif + #define RAJA_UNROLL RAJA_PRAGMA(unroll) #define RAJA_UNROLL_COUNT(N) RAJA_PRAGMA(unroll(N)) @@ -412,9 +423,9 @@ const int DATA_ALIGN = @RAJA_DATA_ALIGN@; // // Configuration options for GNU compilers // +#define RAJA_MAX_ALIGN 16 #define RAJA_FORCEINLINE_RECURSIVE #define RAJA_INLINE inline __attribute__((always_inline)) - #if !defined(__NVCC__) #define RAJA_UNROLL RAJA_PRAGMA(GCC unroll 10000) #define RAJA_UNROLL_COUNT(N) RAJA_PRAGMA(GCC unroll N) @@ -446,11 +457,11 @@ const int DATA_ALIGN = @RAJA_DATA_ALIGN@; // // Configuration options for xlc compiler (i.e., bgq/sequoia). 
// +#define RAJA_MAX_ALIGN 16 #define RAJA_FORCEINLINE_RECURSIVE #define RAJA_INLINE inline __attribute__((always_inline)) #define RAJA_UNROLL #define RAJA_UNROLL_COUNT(N) - // FIXME: alignx is breaking CUDA+xlc #if defined(RAJA_ENABLE_CUDA) #define RAJA_ALIGN_DATA(d) d @@ -476,12 +487,11 @@ const int DATA_ALIGN = @RAJA_DATA_ALIGN@; // // Configuration options for clang compilers // +#define RAJA_MAX_ALIGN 16 #define RAJA_FORCEINLINE_RECURSIVE #define RAJA_INLINE inline __attribute__((always_inline)) #define RAJA_UNROLL RAJA_PRAGMA(clang loop unroll(enable)) #define RAJA_UNROLL_COUNT(N) RAJA_PRAGMA(clang loop unroll_count(N)) - - // note that neither nvcc nor Apple Clang compiler currently doesn't support // the __builtin_assume_aligned attribute #if defined(RAJA_ENABLE_CUDA) || defined(__APPLE__) @@ -514,7 +524,7 @@ const int DATA_ALIGN = @RAJA_DATA_ALIGN@; // This is the same as undefined compiler, but squelches the warning message #elif defined(RAJA_COMPILER_MSVC) - +#define RAJA_MAX_ALIGN alignof(std::max_align_t) #define RAJA_FORCEINLINE_RECURSIVE #define RAJA_INLINE inline #define RAJA_ALIGN_DATA(d) d @@ -526,6 +536,7 @@ const int DATA_ALIGN = @RAJA_DATA_ALIGN@; #else #pragma message("RAJA_COMPILER unknown, using default empty macros.") +#define RAJA_MAX_ALIGN 16 #define RAJA_FORCEINLINE_RECURSIVE #define RAJA_INLINE inline #define RAJA_ALIGN_DATA(d) d @@ -536,6 +547,9 @@ const int DATA_ALIGN = @RAJA_DATA_ALIGN@; #endif +static_assert(RAJA_MAX_ALIGN >= alignof(std::max_align_t) && (RAJA_MAX_ALIGN/alignof(std::max_align_t))*alignof(std::max_align_t) == RAJA_MAX_ALIGN, + "Inconsistent RAJA_MAX_ALIGN size"); + #cmakedefine RAJA_HAVE_POSIX_MEMALIGN #cmakedefine RAJA_HAVE_ALIGNED_ALLOC #cmakedefine RAJA_HAVE_MM_MALLOC diff --git a/include/RAJA/pattern/WorkGroup/WorkStruct.hpp b/include/RAJA/pattern/WorkGroup/WorkStruct.hpp index 6bdd56a3c3..c24e09fc53 100644 --- a/include/RAJA/pattern/WorkGroup/WorkStruct.hpp +++ b/include/RAJA/pattern/WorkGroup/WorkStruct.hpp @@ 
-45,7 +45,7 @@ struct WorkStruct; * sizeof(GenericWorkStruct) <= sizeof(WorkStruct) */ template < typename Dispatcher_T > -using GenericWorkStruct = WorkStruct<alignof(std::max_align_t), Dispatcher_T>; +using GenericWorkStruct = WorkStruct<RAJA_MAX_ALIGN, Dispatcher_T>; template < size_t size, Platform platform, typename dispatch_policy, typename DispatcherID, typename ... CallArgs > struct WorkStruct<size, WorkGroupDispatcher<platform, dispatch_policy, DispatcherID, CallArgs...>> @@ -71,7 +71,6 @@ struct WorkStruct(ptr); value_ptr->dispatcher = dispatcher; @@ -112,7 +111,7 @@ struct WorkStruct - typename std::aligned_storage<size, alignof(std::max_align_t)>::type obj; + typename std::aligned_storage<size, RAJA_MAX_ALIGN>::type obj; }; } // namespace detail