Merge pull request #1371 from LLNL/v2022.10.2-RC
V2022.10.2 rc
rhornung67 committed Nov 7, 2022
2 parents 2176ef1 + 44e7af9 commit 54a0aaa
Showing 38 changed files with 243 additions and 82 deletions.
21 changes: 8 additions & 13 deletions CMakeLists.txt
@@ -16,7 +16,7 @@ include(CMakeDependentOption)
# Set version number
set(RAJA_VERSION_MAJOR 2022)
set(RAJA_VERSION_MINOR 10)
set(RAJA_VERSION_PATCHLEVEL 1)
set(RAJA_VERSION_PATCHLEVEL 2)

if (RAJA_LOADED AND (NOT RAJA_LOADED STREQUAL "${RAJA_VERSION_MAJOR}.${RAJA_VERSION_MINOR}.${RAJA_VERSION_PATCHLEVEL}"))
message(FATAL_ERROR "You are mixing RAJA versions. Loaded is ${RAJA_LOADED}, expected ${RAJA_VERSION_MAJOR}.${RAJA_VERSION_MINOR}.${RAJA_VERSION_PATCHLEVEL}")
@@ -118,28 +118,23 @@ include(cmake/SetupBasics.cmake)
include(cmake/SetupPackages.cmake)

if (RAJA_ENABLE_CUDA)
if (DEFINED CUDA_ARCH)
if (CUDA_ARCH MATCHES "^sm_*")
if ("${CUDA_ARCH}" STRLESS "sm_35")
message( FATAL_ERROR "RAJA requires minimum CUDA compute architecture of sm_35")
endif()
endif()
if (CUDA_ARCH MATCHES "^compute_*")
if ("${CUDA_ARCH}" STRLESS "compute_35")
message( FATAL_ERROR "RAJA requires minimum CUDA compute architecture of compute_35")
endif()
if (DEFINED CMAKE_CUDA_ARCHITECTURES)
if ("${CMAKE_CUDA_ARCHITECTURES}" STRLESS "35")
message( FATAL_ERROR "RAJA requires minimum CUDA compute architecture of 35")
endif()
else()
message(STATUS "CUDA compute architecture set to RAJA default sm_35 since it was not specified")
set(CUDA_ARCH "sm_35" CACHE STRING "Set CUDA_ARCH to RAJA minimum supported" FORCE)
message(STATUS "CUDA compute architecture set to RAJA default 35 since it was not specified")
set(CMAKE_CUDA_ARCHITECTURES "35" CACHE STRING "Set CMAKE_CUDA_ARCHITECTURES to RAJA minimum supported" FORCE)
endif()
message(STATUS "CMAKE_CUDA_ARCHITECTURES set to ${CMAKE_CUDA_ARCHITECTURES}")
if ( (CMAKE_CXX_COMPILER_ID MATCHES GNU) AND (CMAKE_SYSTEM_PROCESSOR MATCHES ppc64le) )
if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 8.0)
set (CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler -mno-float128")
endif ()
endif ()
endif()


# Setup vendor-specific compiler flags
include(cmake/SetupCompilers.cmake)
# Macros for building executables and libraries
28 changes: 26 additions & 2 deletions RELEASE_NOTES.md
@@ -20,6 +20,29 @@ Notable changes include:
* Bug fixes/improvements:


Version 2022.10.2 -- Release date 2022-11-08
============================================

This release fixes a few issues that were found after the v2022.10.1 patch
release and updates a few things. Sorry for the churn, folks.

Notable changes include:

* Update desul submodule to commit e4b65e00.

* The CUDA compute architecture must now be set using the
  'CMAKE_CUDA_ARCHITECTURES' CMake variable. For example, passing
  '-DCMAKE_CUDA_ARCHITECTURES=70' to CMake selects the 'sm_70' architecture.
  Using '-DCUDA_ARCH=sm_*' will no longer do the right thing. Please
  see the RAJA User Guide for more information.
* A linking bug was fixed related to usage of the new RAJA::KernelName
  capability (a usage sketch follows this list).
* A compilation bug was fixed in the new reduction interface support for
OpenMP target offload.
* An issue was fixed in AVX compiler checking logic for RAJA vectorization
intrinsics capabilities.
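
As an illustrative sketch only (not part of this diff), the kernel-naming
capability is typically passed to RAJA::forall alongside the new reduction
parameters. The policy, namespaces, and helper names below are assumptions
drawn from the expt params headers (the note above spells the capability
RAJA::KernelName) and should be checked against your RAJA version:

    #include "RAJA/RAJA.hpp"

    // Sum an array with the new reduction-parameter interface and attach a
    // kernel name that profiling/annotation hooks can pick up.
    void named_sum(const double* x, int N, double& result)
    {
      double sum = 0.0;
      RAJA::forall<RAJA::seq_exec>(
          RAJA::RangeSegment(0, N),
          RAJA::expt::Reduce<RAJA::operators::plus>(&sum),
          RAJA::expt::KernelName("named_sum"),
          [=](int i, double& local_sum) { local_sum += x[i]; });
      result = sum;
    }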


Version 2022.10.1 -- Release date 2022-10-31
============================================

@@ -55,8 +78,9 @@ Notable changes include:
code and then select which to use at run time. There is no discussion
of this in the RAJA User Guide yet. However, there are a couple of
example codes in files RAJA/examples/*dynamic-forall*.cpp.
* The RAJA::launch framework has been moved out of the experimental namespace, into the RAJA:: namespace, which introduces an API change.
* Add support for all RAJA segment types in the RAJA::launch framework.
* The RAJA::launch framework has been moved out of the experimental
  namespace into the RAJA:: namespace, which introduces an API change
  (a usage sketch follows this section of the notes).
* Add support for all RAJA segment types in the RAJA::launch framework.
* Add SYCL back-end support for RAJA::launch and dynamic shared memory
for all back-ends in RAJA::launch. These changes introduce API changes.
* Add additional policies to WorkGroup construct that allow for different
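
As context for the RAJA::launch item above (a sketch under assumptions, not
code from this commit), a minimal host-side use of the non-experimental API
looks roughly like this; the policy and parameter type names should be
verified against the RAJA User Guide:

    #include "RAJA/RAJA.hpp"

    using launch_policy = RAJA::LaunchPolicy<RAJA::seq_launch_t>;
    using loop_policy   = RAJA::LoopPolicy<RAJA::loop_exec>;

    // Scale an array inside a sequential launch region; swapping the launch
    // and loop policies retargets the same lambda to a device back-end.
    void scale(double* x, double a, int N)
    {
      RAJA::launch<launch_policy>(
          RAJA::LaunchParams(RAJA::Teams(1), RAJA::Threads(1)),
          [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) {
            RAJA::loop<loop_policy>(ctx, RAJA::RangeSegment(0, N),
                                    [&](int i) { x[i] *= a; });
          });
    }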
2 changes: 1 addition & 1 deletion cmake/SetupCompilers.cmake
@@ -43,7 +43,7 @@ endif()

if (RAJA_ENABLE_CUDA)
set(CMAKE_CUDA_STANDARD 14)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -restrict -arch ${CUDA_ARCH} --expt-extended-lambda --expt-relaxed-constexpr -Xcudafe \"--display_error_number\"")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -restrict --expt-extended-lambda --expt-relaxed-constexpr -Xcudafe \"--display_error_number\"")

if (NOT RAJA_HOST_CONFIG_LOADED)
set(CMAKE_CUDA_FLAGS_RELEASE "-O2")
2 changes: 1 addition & 1 deletion docs/conf.py
@@ -88,7 +88,7 @@
# The short X.Y version.
version = u'2022.10'
# The full version, including alpha/beta/rc tags.
release = u'2022.10.1'
release = u'2022.10.2'

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
1 change: 0 additions & 1 deletion docs/sphinx/user_guide/config_options.rst
@@ -206,7 +206,6 @@ Other programming model specific compilation options are also available:
must be on too!)
RAJA_ENABLE_EXTERNAL_CUB Off
RAJA_ENABLE_NV_TOOLS_EXT Off
CUDA_ARCH sm_35 (based on hardware support)
RAJA_ENABLE_EXTERNAL_ROCPRIM Off
RAJA_ENABLE_ROCTX Off
====================================== =================================
16 changes: 9 additions & 7 deletions docs/sphinx/user_guide/getting_started.rst
@@ -229,8 +229,9 @@ compiler options to nvcc.
``-Xcompiler`` directive to properly propagate.

To set the CUDA compute architecture, which should be chosen based on the
NVIDIA GPU hardware you are using, you can use the ``CUDA_ARCH`` CMake
variable. For example, the CMake option ``-DCUDA_ARCH=sm_70`` will tell the
NVIDIA GPU hardware you are using, you can use the ``CMAKE_CUDA_ARCHITECTURES``
CMake variable. For example, the CMake option
``-DCMAKE_CUDA_ARCHITECTURES=70`` will tell the
compiler to use the `sm_70` SASS architecture in its second stage of
compilation. The compiler will pick the PTX architecture to use in the first
stage of compilation that is suitable for the SASS architecture you specify.
@@ -244,12 +245,13 @@ appropriate nvcc options in the ``CMAKE_CUDA_FLAGS_*`` variables.
implemented inside RAJA. This is described in
:ref:`feat-atomics-label`.

* If you do not specify a value for ``CUDA_ARCH``, it will be set to
`sm_35` by default and CMake will emit a status message
indicating this choice was made.
* If you do not specify a value for ``CMAKE_CUDA_ARCHITECTURES``,
it will be set to `35` by default and CMake will emit a status
message indicating this choice was made.

* If you give a ``CUDA_ARCH`` value less than `sm_35` (e.g., `sm_30`),
CMake will report this as an error and stop processing.
* If you give a ``CMAKE_CUDA_ARCHITECTURES`` value less than `35`
(e.g., `30`), CMake will report this as an error and stop
processing.

Also, RAJA relies on the CUB CUDA utilities library, mentioned earlier, for
some CUDA back-end functionality. The CUB version included in the CUDA toolkit
2 changes: 1 addition & 1 deletion include/RAJA/pattern/params/kernel_name.hpp
@@ -18,7 +18,7 @@ namespace detail

} // namespace detail

auto KernelName(const char * n)
inline auto KernelName(const char * n)
{
return detail::KernelName(n);
}
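
For context (not part of the diff): the added inline keyword matters because
this free function is defined in a header that many translation units include.
A minimal sketch of the failure mode the change fixes:

    // header.hpp -- a function *defined* in a header must be declared inline,
    // otherwise every .cpp that includes the header emits its own external
    // definition and the link fails with "multiple definition" errors.
    inline int answer() { return 42; }   // OK: inline permits repeated definitions
    // int answer() { return 42; }       // breaks once two .cpp files include this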
4 changes: 2 additions & 2 deletions include/RAJA/policy/desul/atomic.hpp
@@ -87,7 +87,7 @@ RAJA_INLINE T atomicInc(AtomicPolicy, T volatile *acc, T val)
{
// See:
// http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomicinc
return desul::atomic_wrapping_fetch_inc(const_cast<T*>(acc),
return desul::atomic_fetch_inc_mod(const_cast<T*>(acc),
val,
raja_default_desul_order{},
raja_default_desul_scope{});
@@ -110,7 +110,7 @@ RAJA_INLINE T atomicDec(AtomicPolicy, T volatile *acc, T val)
{
// See:
// http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomicdec
return desul::atomic_wrapping_fetch_dec(const_cast<T*>(acc),
return desul::atomic_fetch_dec_mod(const_cast<T*>(acc),
val,
raja_default_desul_order{},
raja_default_desul_scope{});
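
The two wrappers above forward to desul's renamed wrapping atomics. As a plain
C++ reference model only (not the RAJA or desul implementation), the intended
semantics mirror CUDA's atomicInc/atomicDec:

    #include <cstdio>

    // Non-atomic model: increment wraps to 0 once the old value reaches the
    // bound; decrement wraps to the bound when the old value is 0 (or above it).
    unsigned wrapping_inc(unsigned* acc, unsigned val)
    {
      unsigned old = *acc;
      *acc = (old >= val) ? 0u : old + 1u;
      return old;
    }

    unsigned wrapping_dec(unsigned* acc, unsigned val)
    {
      unsigned old = *acc;
      *acc = (old == 0u || old > val) ? val : old - 1u;
      return old;
    }

    int main()
    {
      unsigned c = 9u;
      wrapping_inc(&c, 9u);                // c wraps to 0
      wrapping_dec(&c, 9u);                // c wraps back to 9
      std::printf("c = %u\n", c);          // prints "c = 9"
    }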
6 changes: 3 additions & 3 deletions include/RAJA/policy/openmp/params/forall.hpp
@@ -11,7 +11,7 @@
#define RAJA_OMP_DECLARE_REDUCTION_COMBINE \
_Pragma(" omp declare reduction( combine \
: typename std::remove_reference<decltype(f_params)>::type \
: RAJA::expt::ParamMultiplexer::combine<EXEC_POL>(omp_out, omp_in) ) \
: RAJA::expt::ParamMultiplexer::combine<EXEC_POL>(omp_out, omp_in) ) ")\
//initializer(omp_priv = omp_in) ")

namespace RAJA
@@ -38,7 +38,7 @@ namespace expt
ForallParam&& f_params)
{
using EXEC_POL = typename std::decay<decltype(p)>::type;
RAJA::expt::ParamMultiplexer::init<ExecPol>(f_params);
RAJA::expt::ParamMultiplexer::init<EXEC_POL>(f_params);
RAJA_OMP_DECLARE_REDUCTION_COMBINE;

RAJA_EXTRACT_BED_IT(iter);
@@ -47,7 +47,7 @@ namespace expt
RAJA::expt::invoke_body(f_params, loop_body, begin_it[i]);
}

RAJA::expt::ParamMultiplexer::resolve<ExecPol>(f_params);
RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(f_params);
}

//
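
The f_params object threaded through this file carries reducers created with
the new reduction interface. A usage sketch that would exercise this OpenMP
path (an assumption-based example, not code from this commit):

    #include "RAJA/RAJA.hpp"

    // Dot product with the new reduction-parameter interface on the OpenMP
    // back-end; partial results are merged by the declare-reduction combine.
    void dot(const double* a, const double* b, int N, double& result)
    {
      double sum = 0.0;
      RAJA::forall<RAJA::omp_parallel_for_exec>(
          RAJA::RangeSegment(0, N),
          RAJA::expt::Reduce<RAJA::operators::plus>(&sum),
          [=](int i, double& local_sum) { local_sum += a[i] * b[i]; });
      result = sum;
    }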
2 changes: 1 addition & 1 deletion include/RAJA/policy/openmp_target/forall.hpp
@@ -76,7 +76,7 @@ forall_impl(resources::Omp omp_res,
auto i = distance_it;

#pragma omp target teams distribute parallel for num_teams(numteams) \
schedule(static, 1) map(to : body,begin_it) redcution(combine: f_params)
schedule(static, 1) map(to : body,begin_it) reduction(combine: f_params)
for (i = 0; i < distance_it; ++i) {
Body ib = body;
RAJA::expt::invoke_body(f_params, ib, begin_it[i]);
4 changes: 2 additions & 2 deletions include/RAJA/policy/tensor/arch/avx512/avx512_int64.hpp
@@ -126,7 +126,7 @@ namespace expt
RAJA_INLINE
self_type &load_packed(element_type const *ptr){
// AVX512F
#if (defined(__GNUC__) && ((__GNUC__ >= 7) && (__GNUC__ <= 10))) \
#if (defined(__GNUC__) && ((__GNUC__ >= 7) && (__GNUC__ <= 10))) || \
(!defined(SYCL_LANGUAGE_VERSION) && defined(__INTEL_LLVM_COMPILER)) // Check for oneapi's icpx.
m_value = _mm512_maskz_loadu_epi64(~0, ptr); // May cause slowdown due to looping over 8 bytes, one at a time.
#else
@@ -185,7 +185,7 @@ namespace expt
RAJA_INLINE
self_type const &store_packed(element_type *ptr) const{
// AVX512F
#if (defined(__GNUC__) && ((__GNUC__ >= 7) && (__GNUC__ <= 10))) \
#if (defined(__GNUC__) && ((__GNUC__ >= 7) && (__GNUC__ <= 10))) || \
(!defined(SYCL_LANGUAGE_VERSION) && defined(__INTEL_LLVM_COMPILER)) // Check for oneapi's icpx.
_mm512_mask_storeu_epi64(ptr, ~0, m_value); // May cause slowdown due to looping over 8 bytes, one at a time.
#else
3 changes: 2 additions & 1 deletion scripts/lc-builds/blueos_clang.sh
@@ -22,7 +22,7 @@ shift 1
BUILD_SUFFIX=lc_blueos-clang-${COMP_VER}

echo
echo "Creating build directory ${BUILD_SUFFIX} and generating configuration in it"
echo "Creating build directory build_${BUILD_SUFFIX} and generating configuration in it"
echo "Configuration extra arguments:"
echo " $@"
echo
@@ -35,6 +35,7 @@ module load cmake/3.20.2
cmake \
-DCMAKE_BUILD_TYPE=Release \
-DCMAKE_CXX_COMPILER=/usr/tce/packages/clang/clang-${COMP_VER}/bin/clang++ \
-DBLT_CXX_STD=c++14 \
-C ../host-configs/lc-builds/blueos/clang_X.cmake \
-DENABLE_OPENMP=On \
-DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \
3 changes: 2 additions & 1 deletion scripts/lc-builds/blueos_clang_omptarget.sh
@@ -22,7 +22,7 @@ shift 1
BUILD_SUFFIX=lc_blueos-clang-${COMP_VER}_omptarget

echo
echo "Creating build directory ${BUILD_SUFFIX} and generating configuration in it"
echo "Creating build directory build_${BUILD_SUFFIX} and generating configuration in it"
echo "Configuration extra arguments:"
echo " $@"
echo
@@ -35,6 +35,7 @@ module load cmake/3.20.2
cmake \
-DCMAKE_BUILD_TYPE=Release \
-DCMAKE_CXX_COMPILER=/usr/tce/packages/clang/clang-${COMP_VER}/bin/clang++ \
-DBLT_CXX_STD=c++14 \
-C ../host-configs/lc-builds/blueos/clang_X.cmake \
-DENABLE_OPENMP=On \
-DENABLE_CUDA=Off \
3 changes: 2 additions & 1 deletion scripts/lc-builds/blueos_clangcuda.sh
@@ -27,7 +27,7 @@ shift 3
BUILD_SUFFIX=lc_blueos-clangcuda${COMP_CLANG_VER}_cuda${TOOLKIT_CUDA_VER}-${CUDA_ARCH}

echo
echo "Creating build directory ${BUILD_SUFFIX} and generating configuration in it"
echo "Creating build directory build_${BUILD_SUFFIX} and generating configuration in it"
echo "Configuration extra arguments:"
echo " $@"
echo
@@ -43,6 +43,7 @@ cmake \
-DCMAKE_CXX_COMPILER=/usr/tce/packages/clang/clang-${COMP_CLANG_VER}/bin/clang++ \
-DCMAKE_C_COMPILER=/usr/tce/packages/clang/clang-${COMP_CLANG_VER}/bin/clang \
-DCUDA_TOOLKIT_ROOT_DIR=/usr/tce/packages/cuda/cuda-${TOOLKIT_CUDA_VER} \
-DBLT_CXX_STD=c++14 \
-C ../host-configs/lc-builds/blueos/clangcuda_X.cmake \
-DENABLE_OPENMP=Off \
-DENABLE_CLANG_CUDA=On \
3 changes: 2 additions & 1 deletion scripts/lc-builds/blueos_gcc.sh
@@ -20,7 +20,7 @@ shift 1
BUILD_SUFFIX=lc_blueos-gcc-${COMP_VER}

echo
echo "Creating build directory ${BUILD_SUFFIX} and generating configuration in it"
echo "Creating build directory build_${BUILD_SUFFIX} and generating configuration in it"
echo "Configuration extra arguments:"
echo " $@"
echo
@@ -33,6 +33,7 @@ module load cmake/3.20.2
cmake \
-DCMAKE_BUILD_TYPE=Release \
-DCMAKE_CXX_COMPILER=/usr/tce/packages/gcc/gcc-${COMP_VER}/bin/g++ \
-DBLT_CXX_STD=c++14 \
-C ../host-configs/lc-builds/blueos/gcc_X.cmake \
-DENABLE_OPENMP=On \
-DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \
11 changes: 6 additions & 5 deletions scripts/lc-builds/blueos_nvcc_clang.sh
@@ -11,11 +11,11 @@ if [[ $# -lt 3 ]]; then
echo
echo "You must pass 3 arguments to the script (in this order): "
echo " 1) compiler version number for nvcc"
echo " 2) CUDA compute architecture"
echo " 3) compiler version number for clang. "
echo " 2) CUDA compute architecture (number only, not 'sm_70' for example)"
echo " 3) compiler version number for clang"
echo
echo "For example: "
echo " blueos_nvcc_clang.sh 10.2.89 sm_70 10.0.1"
echo " blueos_nvcc_clang.sh 10.2.89 70 10.0.1"
exit
fi

@@ -27,7 +27,7 @@ shift 3
BUILD_SUFFIX=lc_blueos-nvcc${COMP_NVCC_VER}-${COMP_ARCH}-clang${COMP_CLANG_VER}

echo
echo "Creating build directory ${BUILD_SUFFIX} and generating configuration in it"
echo "Creating build directory build_${BUILD_SUFFIX} and generating configuration in it"
echo "Configuration extra arguments:"
echo " $@"
echo
@@ -40,12 +40,13 @@ module load cmake/3.20.2
cmake \
-DCMAKE_BUILD_TYPE=Release \
-DCMAKE_CXX_COMPILER=/usr/tce/packages/clang/clang-${COMP_CLANG_VER}/bin/clang++ \
-DBLT_CXX_STD=c++14 \
-C ../host-configs/lc-builds/blueos/nvcc_clang_X.cmake \
-DENABLE_OPENMP=On \
-DENABLE_CUDA=On \
-DCUDA_TOOLKIT_ROOT_DIR=/usr/tce/packages/cuda/cuda-${COMP_NVCC_VER} \
-DCMAKE_CUDA_COMPILER=/usr/tce/packages/cuda/cuda-${COMP_NVCC_VER}/bin/nvcc \
-DCUDA_ARCH=${COMP_ARCH} \
-DCMAKE_CUDA_ARCHITECTURES=${COMP_ARCH} \
-DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \
"$@" \
..
11 changes: 6 additions & 5 deletions scripts/lc-builds/blueos_nvcc_gcc.sh
@@ -11,11 +11,11 @@ if [[ $# -lt 3 ]]; then
echo
echo "You must pass 3 arguments to the script (in this order): "
echo " 1) compiler version number for nvcc"
echo " 2) CUDA compute architecture"
echo " 3) compiler version number for gcc. "
echo " 2) CUDA compute architecture (number only, not 'sm_70' for example)"
echo " 3) compiler version number for gcc"
echo
echo "For example: "
echo " blueos_nvcc_gcc.sh 10.2.89 sm_70 8.3.1"
echo " blueos_nvcc_gcc.sh 10.2.89 70 8.3.1"
exit
fi

@@ -27,7 +27,7 @@ shift 3
BUILD_SUFFIX=lc_blueos-nvcc${COMP_NVCC_VER}-${COMP_ARCH}-gcc${COMP_GCC_VER}

echo
echo "Creating build directory ${BUILD_SUFFIX} and generating configuration in it"
echo "Creating build directory build_${BUILD_SUFFIX} and generating configuration in it"
echo "Configuration extra arguments:"
echo " $@"
echo
@@ -40,12 +40,13 @@ module load cmake/3.20.2
cmake \
-DCMAKE_BUILD_TYPE=Release \
-DCMAKE_CXX_COMPILER=/usr/tce/packages/gcc/gcc-${COMP_GCC_VER}/bin/g++ \
-DBLT_CXX_STD=c++14 \
-C ../host-configs/lc-builds/blueos/nvcc_gcc_X.cmake \
-DENABLE_OPENMP=On \
-DENABLE_CUDA=On \
-DCUDA_TOOLKIT_ROOT_DIR=/usr/tce/packages/cuda/cuda-${COMP_NVCC_VER} \
-DCMAKE_CUDA_COMPILER=/usr/tce/packages/cuda/cuda-${COMP_NVCC_VER}/bin/nvcc \
-DCUDA_ARCH=${COMP_ARCH} \
-DCMAKE_CUDA_ARCHITECTURES=${COMP_ARCH} \
-DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \
"$@" \
..
