From c03ead175d6e2c0b4aafa4f12e2c1ecdb763e026 Mon Sep 17 00:00:00 2001
From: "Adrien M. BERNEDE" <a.bernede@woptim.dev>
Date: Mon, 19 Sep 2022 14:33:50 +0200
Subject: [PATCH 01/94] Use raw Spack packages for update (from spack@develop)

---
 scripts/spack_packages/camp/package.py |  92 ++--
 scripts/spack_packages/raja/package.py | 556 +++++++++----------------
 2 files changed, 249 insertions(+), 399 deletions(-)

diff --git a/scripts/spack_packages/camp/package.py b/scripts/spack_packages/camp/package.py
index 3bff14ef61..8f40f6b295 100644
--- a/scripts/spack_packages/camp/package.py
+++ b/scripts/spack_packages/camp/package.py
@@ -8,6 +8,14 @@
 from spack.package import *
 
 
+def hip_repair_options(options, spec):
+    # there is only one dir like this, but the version component is unknown
+    options.append(
+        "-DHIP_CLANG_INCLUDE_PATH="
+        + glob.glob("{}/lib/clang/*/include".format(spec["llvm-amdgpu"].prefix))[0]
+    )
+
+
 class Camp(CMakePackage, CudaPackage, ROCmPackage):
     """
     Compiler agnostic metaprogramming library providing concepts,
@@ -15,69 +23,61 @@ class Camp(CMakePackage, CudaPackage, ROCmPackage):
     """
 
     homepage = "https://github.com/LLNL/camp"
-    git      = "https://github.com/LLNL/camp.git"
-    url      = "https://github.com/LLNL/camp/archive/v0.1.0.tar.gz"
+    git = "https://github.com/LLNL/camp.git"
+    url = "https://github.com/LLNL/camp/archive/v0.1.0.tar.gz"
 
-    maintainers = ['trws']
+    maintainers = ["trws"]
 
-    version('main', branch='main', submodules='True')
-    version('2022.03.0', sha256='e9090d5ee191ea3a8e36b47a8fe78f3ac95d51804f1d986d931e85b8f8dad721')
-    version('0.3.0', sha256='129431a049ca5825443038ad5a37a86ba6d09b2618d5fe65d35f83136575afdb')
-    version('0.2.3', sha256='58a0f3bd5eadb588d7dc83f3d050aff8c8db639fc89e8d6553f9ce34fc2421a7')
-    version('0.2.2', sha256='194d38b57e50e3494482a7f94940b27f37a2bee8291f2574d64db342b981d819')
-    version('0.1.0', sha256='fd4f0f2a60b82a12a1d9f943f8893dc6fe770db493f8fae5ef6f7d0c439bebcc')
+    version("main", branch="main", submodules="True")
+    version("2022.03.2", sha256="bc4aaeacfe8f2912e28f7a36fc731ab9e481bee15f2c6daf0cb208eed3f201eb")
+    version("2022.03.0", sha256="e9090d5ee191ea3a8e36b47a8fe78f3ac95d51804f1d986d931e85b8f8dad721")
+    version("0.3.0", sha256="129431a049ca5825443038ad5a37a86ba6d09b2618d5fe65d35f83136575afdb")
+    version("0.2.3", sha256="58a0f3bd5eadb588d7dc83f3d050aff8c8db639fc89e8d6553f9ce34fc2421a7")
+    version("0.2.2", sha256="194d38b57e50e3494482a7f94940b27f37a2bee8291f2574d64db342b981d819")
+    version("0.1.0", sha256="fd4f0f2a60b82a12a1d9f943f8893dc6fe770db493f8fae5ef6f7d0c439bebcc")
 
     # TODO: figure out gtest dependency and then set this default True.
-    variant('tests', default=False, description='Build tests')
-    variant('openmp', default=False, description='Build with OpenMP support')
+    variant("tests", default=False, description="Build tests")
+    variant("openmp", default=False, description="Build OpenMP support")
 
-    depends_on('cub', when='+cuda')
+    depends_on("cub", when="+cuda")
 
-    depends_on('blt')
+    depends_on("blt")
 
     def cmake_args(self):
         spec = self.spec
 
         options = []
 
-        options.append("-DBLT_SOURCE_DIR={0}".format(spec['blt'].prefix))
+        options.append("-DBLT_SOURCE_DIR={0}".format(spec["blt"].prefix))
 
-        if '+cuda' in spec:
-            options.extend([
-                '-DENABLE_CUDA=ON',
-                '-DCUDA_TOOLKIT_ROOT_DIR=%s' % (spec['cuda'].prefix)])
+        options.append("-DENABLE_OPENMP=" + ("On" if "+openmp" in spec else "Off"))
+        if "+cuda" in spec:
+            options.extend(
+                ["-DENABLE_CUDA=ON", "-DCUDA_TOOLKIT_ROOT_DIR=%s" % (spec["cuda"].prefix)]
+            )
 
-            if not spec.satisfies('cuda_arch=none'):
-                cuda_arch = spec.variants['cuda_arch'].value
-                options.append('-DCMAKE_CUDA_ARCHITECTURES={0}'.format(cuda_arch[0]))
-                options.append('-DCUDA_ARCH=sm_{0}'.format(cuda_arch[0]))
-                flag = '-arch sm_{0}'.format(cuda_arch[0])
-                options.append('-DCMAKE_CUDA_FLAGS:STRING={0}'.format(flag))
+            if not spec.satisfies("cuda_arch=none"):
+                cuda_arch = spec.variants["cuda_arch"].value
+                options.append("-DCMAKE_CUDA_ARCHITECTURES={0}".format(cuda_arch[0]))
+                options.append("-DCUDA_ARCH=sm_{0}".format(cuda_arch[0]))
+                flag = "-arch sm_{0}".format(cuda_arch[0])
+                options.append("-DCMAKE_CUDA_FLAGS:STRING={0}".format(flag))
         else:
-            options.append('-DENABLE_CUDA=OFF')
-
-        if '+rocm' in spec:
-            options.extend([
-                '-DENABLE_HIP=ON',
-                '-DHIP_ROOT_DIR={0}'.format(spec['hip'].prefix)
-            ])
-            archs = self.spec.variants['amdgpu_target'].value
-            if archs != 'none':
+            options.append("-DENABLE_CUDA=OFF")
+
+        if "+rocm" in spec:
+            options.extend(["-DENABLE_HIP=ON", "-DHIP_ROOT_DIR={0}".format(spec["hip"].prefix)])
+
+            hip_repair_options(options, spec)
+
+            archs = self.spec.variants["amdgpu_target"].value
+            if archs != "none":
                 arch_str = ",".join(archs)
-                options.append(
-                    '-DHIP_HIPCC_FLAGS=--amdgpu-target={0}'.format(arch_str)
-                )
-            # there is only one dir like this, but the version component is unknown
-            options.append(
-                "-DHIP_CLANG_INCLUDE_PATH=" + glob.glob(
-                    "{}/lib/clang/*/include".format(spec['llvm-amdgpu'].prefix)
-                )[0]
-            )
+                options.append("-DHIP_HIPCC_FLAGS=--amdgpu-target={0}".format(arch_str))
         else:
-            options.append('-DENABLE_HIP=OFF')
-
-        options.append(self.define_from_variant('ENABLE_TESTS', 'tests'))
-        options.append(self.define_from_variant('ENABLE_OPENMP', 'openmp'))
+            options.append("-DENABLE_HIP=OFF")
 
+        options.append(self.define_from_variant("ENABLE_TESTS", "tests"))
 
         return options
diff --git a/scripts/spack_packages/raja/package.py b/scripts/spack_packages/raja/package.py
index 35b138e447..4850ed1c54 100644
--- a/scripts/spack_packages/raja/package.py
+++ b/scripts/spack_packages/raja/package.py
@@ -1,382 +1,232 @@
-# Copyright 2013-2020 Lawrence Livermore National Security, LLC and other
+# Copyright 2013-2022 Lawrence Livermore National Security, LLC and other
 # Spack Project Developers. See the top-level COPYRIGHT file for details.
 #
 # SPDX-License-Identifier: (Apache-2.0 OR MIT)
 
-
-from spack import *
-
-import glob
-import os
 import socket
 
-from os import environ as env
-from os.path import join as pjoin
-
-import re
-
-def cmake_cache_entry(name, value, comment=""):
-    """Generate a string for a cmake cache variable"""
-
-    return 'set(%s "%s" CACHE PATH "%s")\n\n' % (name,value,comment)
-
-
-def cmake_cache_string(name, string, comment=""):
-    """Generate a string for a cmake cache variable"""
-
-    return 'set(%s "%s" CACHE STRING "%s")\n\n' % (name,string,comment)
-
-
-def cmake_cache_option(name, boolean_value, comment=""):
-    """Generate a string for a cmake configuration option"""
-
-    value = "ON" if boolean_value else "OFF"
-    return 'set(%s %s CACHE BOOL "%s")\n\n' % (name,value,comment)
-
-
-def get_spec_path(spec, package_name, path_replacements = {}, use_bin = False) :
-    """Extracts the prefix path for the given spack package
-       path_replacements is a dictionary with string replacements for the path.
-    """
-
-    if not use_bin:
-        path = spec[package_name].prefix
-    else:
-        path = spec[package_name].prefix.bin
-
-    path = os.path.realpath(path)
-
-    for key in path_replacements:
-        path = path.replace(key, path_replacements[key])
-
-    return path
-
-
-class Raja(CMakePackage, CudaPackage, ROCmPackage):
-    """RAJA Performance Portability Abstractions for C++ HPC Applications."""
-
-    homepage = "https://github.com/LLNL/RAJA"
-    git      = "https://github.com/LLNL/RAJA.git"
-    tags     = ['radiuss', 'e4s']
-
-    maintainers = ['davidbeckingsale']
-
-    version('develop', branch='develop', submodules='True')
-    version('main',  branch='main',  submodules='True')
-    version('0.14.1', tag='v0.14.1', submodules="True")
-    version('0.14.0', tag='v0.14.0', submodules="True")
-    version('0.13.0', tag='v0.13.0', submodules="True")
-    version('0.12.1', tag='v0.12.1', submodules="True")
-    version('0.12.0', tag='v0.12.0', submodules="True")
-    version('0.11.0', tag='v0.11.0', submodules="True")
-    version('0.10.1', tag='v0.10.1', submodules="True")
-    version('0.10.0', tag='v0.10.0', submodules="True")
-    version('0.9.0', tag='v0.9.0', submodules="True")
-    version('0.8.0', tag='v0.8.0', submodules="True")
-    version('0.7.0', tag='v0.7.0', submodules="True")
-    version('0.6.0', tag='v0.6.0', submodules="True")
-    version('0.5.3', tag='v0.5.3', submodules="True")
-    version('0.5.2', tag='v0.5.2', submodules="True")
-    version('0.5.1', tag='v0.5.1', submodules="True")
-    version('0.5.0', tag='v0.5.0', submodules="True")
-    version('0.4.1', tag='v0.4.1', submodules="True")
-    version('0.4.0', tag='v0.4.0', submodules="True")
-
-    variant('openmp', default=True, description='Build OpenMP backend')
-    variant('shared', default=False, description='Build Shared Libs')
-    variant('libcpp', default=False, description='Uses libc++ instead of libstdc++')
-    variant('tests', default='basic', values=('none', 'basic', 'benchmarks'),
-            multi=False, description='Tests to run')
-    variant('desul', default=False, description='Build Desul Atomics backend')
-
-    depends_on('cmake@3.9:', type='build')
-
-    depends_on('blt@0.4.1', type='build', when='@main')
-    depends_on('blt@0.4.1:', type='build')
-
-    depends_on('camp')
-    depends_on('camp@main') # TODO: remove this ASAP
-    depends_on('camp+rocm', when='+rocm')
-    depends_on('camp+openmp', when='+openmp')
-    for val in ROCmPackage.amdgpu_targets:
-        depends_on('camp amdgpu_target=%s' % val, when='amdgpu_target=%s' % val)
-
-    depends_on('camp+cuda', when='+cuda')
-    for sm_ in CudaPackage.cuda_arch_values:
-        depends_on('camp cuda_arch={0}'.format(sm_),
-                   when='cuda_arch={0}'.format(sm_))
-
-    conflicts('+openmp', when='+rocm')
-    depends_on('rocprim', when='+rocm')
+from spack.package import *
+from spack.pkg.builtin.camp import hip_repair_options
+
+
+class Raja(CachedCMakePackage, CudaPackage, ROCmPackage):
+    """RAJA Parallel Framework."""
+
+    homepage = "https://software.llnl.gov/RAJA/"
+    git = "https://github.com/LLNL/RAJA.git"
+    tags = ["radiuss", "e4s"]
+
+    maintainers = ["davidbeckingsale"]
+
+    version("develop", branch="develop", submodules=False)
+    version("main", branch="main", submodules=False)
+    version("2022.03.0", tag="v2022.03.0", submodules=False)
+    version("0.14.0", tag="v0.14.0", submodules="True")
+    version("0.13.0", tag="v0.13.0", submodules="True")
+    version("0.12.1", tag="v0.12.1", submodules="True")
+    version("0.12.0", tag="v0.12.0", submodules="True")
+    version("0.11.0", tag="v0.11.0", submodules="True")
+    version("0.10.1", tag="v0.10.1", submodules="True")
+    version("0.10.0", tag="v0.10.0", submodules="True")
+    version("0.9.0", tag="v0.9.0", submodules="True")
+    version("0.8.0", tag="v0.8.0", submodules="True")
+    version("0.7.0", tag="v0.7.0", submodules="True")
+    version("0.6.0", tag="v0.6.0", submodules="True")
+    version("0.5.3", tag="v0.5.3", submodules="True")
+    version("0.5.2", tag="v0.5.2", submodules="True")
+    version("0.5.1", tag="v0.5.1", submodules="True")
+    version("0.5.0", tag="v0.5.0", submodules="True")
+    version("0.4.1", tag="v0.4.1", submodules="True")
+    version("0.4.0", tag="v0.4.0", submodules="True")
+
+    # export targets when building pre-2.4.0 release with BLT 0.4.0+
+    patch(
+        "https://github.com/LLNL/RAJA/commit/eca1124ee4af380d6613adc6012c307d1fd4176b.patch?full_index=1",
+        sha256="12bb78c00b6683ad3e7fd4e3f87f9776bae074b722431b79696bc862816735ef",
+        when="@:0.13.0 ^blt@0.4:",
+    )
+
+    variant("openmp", default=True, description="Build OpenMP backend")
+    variant("shared", default=True, description="Build Shared Libs")
+    variant("examples", default=True, description="Build examples.")
+    variant("exercises", default=True, description="Build exercises.")
+    # TODO: figure out gtest dependency and then set this default True
+    # and remove the +tests conflict below.
+    variant("tests", default=False, description="Build tests")
+
+    depends_on("blt")
+    depends_on("blt@0.5.0:", type="build", when="@0.14.1:")
+    depends_on("blt@0.4.1", type="build", when="@0.14.0")
+    depends_on("blt@0.4.0:", type="build", when="@0.13.0")
+    depends_on("blt@0.3.6:", type="build", when="@:0.12.0")
+
+    depends_on("camp@0.2.2:0.2.3", when="@0.14.0")
+    depends_on("camp@0.1.0", when="@0.10.0:0.13.0")
+    depends_on("camp@2022.03.2:", when="@2022.03.0:")
+    depends_on("camp@main", when="@main")
+    depends_on("camp@main", when="@develop")
+    depends_on("camp+openmp", when="+openmp")
+
+    depends_on("cmake@:3.20", when="+rocm", type="build")
+    depends_on("cmake@3.14:", when="@2022.03.0:")
+
+    depends_on("llvm-openmp", when="+openmp %apple-clang")
+
+    depends_on("rocprim", when="+rocm")
+    with when("+rocm @0.12.0:"):
+        depends_on("camp+rocm")
+        for arch in ROCmPackage.amdgpu_targets:
+            depends_on(
+                "camp+rocm amdgpu_target={0}".format(arch), when="amdgpu_target={0}".format(arch)
+            )
+        conflicts("+openmp")
 
-    phases = ['hostconfig', 'cmake', 'build', 'install']
+    with when("+cuda @0.12.0:"):
+        depends_on("camp+cuda")
+        for sm_ in CudaPackage.cuda_arch_values:
+            depends_on("camp +cuda cuda_arch={0}".format(sm_), when="cuda_arch={0}".format(sm_))
 
     def _get_sys_type(self, spec):
-        sys_type = str(spec.architecture)
-        # if on llnl systems, we can use the SYS_TYPE
+        sys_type = spec.architecture
         if "SYS_TYPE" in env:
             sys_type = env["SYS_TYPE"]
         return sys_type
 
-    def _get_host_config_path(self, spec):
-        var=''
-        if '+cuda' in spec:
-            var= '-'.join([var,'cuda'])
-        if '+libcpp' in spec:
-            var='-'.join([var,'libcpp'])
-
-        host_config_path = "hc-%s-%s-%s%s-%s.cmake" % (socket.gethostname().rstrip('1234567890'),
-                                               self._get_sys_type(spec),
-                                               spec.compiler,
-                                               var,
-                                               spec.dag_hash())
-        dest_dir = self.stage.source_path
-        host_config_path = os.path.abspath(pjoin(dest_dir, host_config_path))
-        return host_config_path
-
-    def hostconfig(self, spec, prefix, py_site_pkgs_dir=None):
-        """
-        This method creates a 'host-config' file that specifies
-        all of the options used to configure and build Umpire.
-
-        For more details about 'host-config' files see:
-            http://software.llnl.gov/conduit/building.html
-
-        Note:
-          The `py_site_pkgs_dir` arg exists to allow a package that
-          subclasses this package provide a specific site packages
-          dir when calling this function. `py_site_pkgs_dir` should
-          be an absolute path or `None`.
-
-          This is necessary because the spack `site_packages_dir`
-          var will not exist in the base class. For more details
-          on this issue see: https://github.com/spack/spack/issues/6261
-        """
-
-        #######################
-        # Compiler Info
-        #######################
-        c_compiler = env["SPACK_CC"]
-        cpp_compiler = env["SPACK_CXX"]
-
-        # Even though we don't have fortran code in our project we sometimes
-        # use the Fortran compiler to determine which libstdc++ to use
-        f_compiler = ""
-        if "SPACK_FC" in env.keys():
-            # even if this is set, it may not exist
-            # do one more sanity check
-            if os.path.isfile(env["SPACK_FC"]):
-                f_compiler = env["SPACK_FC"]
-
-        #######################################################################
-        # By directly fetching the names of the actual compilers we appear
-        # to doing something evil here, but this is necessary to create a
-        # 'host config' file that works outside of the spack install env.
-        #######################################################################
-
-        sys_type = self._get_sys_type(spec)
-
-        ##############################################
-        # Find and record what CMake is used
-        ##############################################
-
-        cmake_exe = spec['cmake'].command.path
-        cmake_exe = os.path.realpath(cmake_exe)
-
-        host_config_path = self._get_host_config_path(spec)
-        cfg = open(host_config_path, "w")
-        cfg.write("###################\n".format("#" * 60))
-        cfg.write("# Generated host-config - Edit at own risk!\n")
-        cfg.write("###################\n".format("#" * 60))
-        cfg.write("# Copyright 2016-22, Lawrence Livermore National Security, LLC\n")
-        cfg.write("# and RAJA project contributors. See the RAJA/LICENSE file\n")
-        cfg.write("# for details.\n")
-        cfg.write("#\n")
-        cfg.write("# SPDX-License-Identifier: (BSD-3-Clause) \n")
-        cfg.write("###################\n\n".format("#" * 60))
-
-        cfg.write("#------------------\n".format("-" * 60))
-        cfg.write("# SYS_TYPE: {0}\n".format(sys_type))
-        cfg.write("# Compiler Spec: {0}\n".format(spec.compiler))
-        cfg.write("# CMake executable path: %s\n" % cmake_exe)
-        cfg.write("#------------------\n\n".format("-" * 60))
-
-        cfg.write(cmake_cache_string("CMAKE_BUILD_TYPE", spec.variants['build_type'].value))
-
-        #######################
-        # Compiler Settings
-        #######################
-
-        cfg.write("#------------------\n".format("-" * 60))
-        cfg.write("# Compilers\n")
-        cfg.write("#------------------\n\n".format("-" * 60))
-        cfg.write(cmake_cache_entry("CMAKE_C_COMPILER", c_compiler))
-        cfg.write(cmake_cache_entry("CMAKE_CXX_COMPILER", cpp_compiler))
-
-        # use global spack compiler flags
-        cflags = ' '.join(spec.compiler_flags['cflags'])
-        if "+libcpp" in spec:
-            cflags += ' '.join([cflags,"-DGTEST_HAS_CXXABI_H_=0"])
-        if cflags:
-            cfg.write(cmake_cache_entry("CMAKE_C_FLAGS", cflags))
-
-        cxxflags = ' '.join(spec.compiler_flags['cxxflags'])
-        if "+libcpp" in spec:
-            cxxflags += ' '.join([cxxflags,"-stdlib=libc++ -DGTEST_HAS_CXXABI_H_=0"])
-        if cxxflags:
-            cfg.write(cmake_cache_entry("CMAKE_CXX_FLAGS", cxxflags))
-
-        # TODO (bernede1@llnl.gov): Is this useful for RAJA?
-        if ("gfortran" in f_compiler) and ("clang" in cpp_compiler):
-            libdir = pjoin(os.path.dirname(
-                           os.path.dirname(f_compiler)), "lib")
-            flags = ""
-            for _libpath in [libdir, libdir + "64"]:
-                if os.path.exists(_libpath):
-                    flags += " -Wl,-rpath,{0}".format(_libpath)
-            description = ("Adds a missing libstdc++ rpath")
-            #if flags:
-            #    cfg.write(cmake_cache_string("BLT_EXE_LINKER_FLAGS", flags,
-            #                                description))
-
-        gcc_toolchain_regex = re.compile("--gcc-toolchain=(.*)")
-        gcc_name_regex = re.compile(".*gcc-name.*")
-
-        using_toolchain = list(filter(gcc_toolchain_regex.match, spec.compiler_flags['cxxflags']))
-        if(using_toolchain):
-          gcc_toolchain_path = gcc_toolchain_regex.match(using_toolchain[0])
-        using_gcc_name = list(filter(gcc_name_regex.match, spec.compiler_flags['cxxflags']))
-        compilers_using_toolchain = ["pgi", "xl", "icpc"]
-        if any(compiler in cpp_compiler for compiler in compilers_using_toolchain):
-            if using_toolchain or using_gcc_name:
-                cfg.write(cmake_cache_entry("BLT_CMAKE_IMPLICIT_LINK_DIRECTORIES_EXCLUDE",
-                "/usr/tce/packages/gcc/gcc-4.9.3/lib64;/usr/tce/packages/gcc/gcc-4.9.3/gnu/lib64/gcc/powerpc64le-unknown-linux-gnu/4.9.3;/usr/tce/packages/gcc/gcc-4.9.3/gnu/lib64;/usr/tce/packages/gcc/gcc-4.9.3/lib64/gcc/x86_64-unknown-linux-gnu/4.9.3"))
-
-        compilers_using_cxx14 = ["intel-17", "intel-18", "xl"]
-        if any(compiler in cpp_compiler for compiler in compilers_using_cxx14):
-            cfg.write(cmake_cache_entry("BLT_CXX_STD", "c++14"))
+    @property
+    def cache_name(self):
+        hostname = socket.gethostname()
+        if "SYS_TYPE" in env:
+            hostname = hostname.rstrip("1234567890")
+        return "{0}-{1}-{2}@{3}.cmake".format(
+            hostname,
+            self._get_sys_type(self.spec),
+            self.spec.compiler.name,
+            self.spec.compiler.version,
+        )
+
+    def initconfig_hardware_entries(self):
+        spec = self.spec
+        entries = super(Raja, self).initconfig_hardware_entries()
+
+        entries.append(cmake_cache_option("ENABLE_OPENMP", "+openmp" in spec))
 
         if "+cuda" in spec:
-            cfg.write("#------------------{0}\n".format("-" * 60))
-            cfg.write("# Cuda\n")
-            cfg.write("#------------------{0}\n\n".format("-" * 60))
-
-            cfg.write(cmake_cache_option("ENABLE_CUDA", True))
-
-            cudatoolkitdir = spec['cuda'].prefix
-            cfg.write(cmake_cache_entry("CUDA_TOOLKIT_ROOT_DIR",
-                                        cudatoolkitdir))
-            cudacompiler = "${CUDA_TOOLKIT_ROOT_DIR}/bin/nvcc"
-            cfg.write(cmake_cache_entry("CMAKE_CUDA_COMPILER",
-                                        cudacompiler))
-
-            if ("xl" in cpp_compiler):
-                cfg.write(cmake_cache_entry("CMAKE_CUDA_FLAGS", "-Xcompiler -O3 -Xcompiler -qxlcompatmacros -Xcompiler -qalias=noansi " + 
-                                            "-Xcompiler -qsmp=omp -Xcompiler -qhot -Xcompiler -qnoeh -Xcompiler -qsuppress=1500-029 " +
-                                            "-Xcompiler -qsuppress=1500-036 -Xcompiler -qsuppress=1500-030"))
-                cuda_release_flags = "-O3"
-                cuda_reldebinf_flags = "-O3 -g"
-                cuda_debug_flags = "-O0 -g"
-
-                cfg.write(cmake_cache_string("BLT_CXX_STD", "c++14"))
-            elif ("gcc" in cpp_compiler):
-                cuda_release_flags = "-O3 -Xcompiler -Ofast -Xcompiler -finline-functions -Xcompiler -finline-limit=20000"
-                cuda_reldebinf_flags = "-O3 -g -Xcompiler -Ofast -Xcompiler -finline-functions -Xcompiler -finline-limit=20000"
-                cuda_debug_flags = "-O0 -g -Xcompiler -O0 -Xcompiler -finline-functions -Xcompiler -finline-limit=20000"
-            else:
-                cuda_release_flags = "-O3 -Xcompiler -Ofast -Xcompiler -finline-functions"
-                cuda_reldebinf_flags = "-O3 -g -Xcompiler -Ofast -Xcompiler -finline-functions"
-                cuda_debug_flags = "-O0 -g -Xcompiler -O0 -Xcompiler -finline-functions"
-   
-            cfg.write(cmake_cache_string("CMAKE_CUDA_FLAGS_RELEASE", cuda_release_flags))
-            cfg.write(cmake_cache_string("CMAKE_CUDA_FLAGS_RELWITHDEBINFO", cuda_reldebinf_flags))
-            cfg.write(cmake_cache_string("CMAKE_CUDA_FLAGS_DEBUG", cuda_debug_flags))
-
-            if not spec.satisfies('cuda_arch=none'):
-                cuda_arch = spec.variants['cuda_arch'].value
-                cfg.write(cmake_cache_string("CUDA_ARCH", 'sm_{0}'.format(cuda_arch[0])))
+            entries.append(cmake_cache_option("ENABLE_CUDA", True))
 
+            if not spec.satisfies("cuda_arch=none"):
+                cuda_arch = spec.variants["cuda_arch"].value
+                entries.append(cmake_cache_string("CUDA_ARCH", "sm_{0}".format(cuda_arch[0])))
+                entries.append(
+                    cmake_cache_string("CMAKE_CUDA_ARCHITECTURES", "{0}".format(cuda_arch[0]))
+                )
         else:
-            cfg.write(cmake_cache_option("ENABLE_CUDA", False))
+            entries.append(cmake_cache_option("ENABLE_CUDA", False))
 
         if "+rocm" in spec:
-            cfg.write("#------------------{0}\n".format("-" * 60))
-            cfg.write("# HIP\n")
-            cfg.write("#------------------{0}\n\n".format("-" * 60))
-
-            cfg.write(cmake_cache_option("ENABLE_HIP", True))
-
-            hip_root = spec['hip'].prefix
-            rocm_root = hip_root + "/.."
-            hip_arch = spec.variants['amdgpu_target'].value
-            cfg.write(cmake_cache_entry("HIP_ROOT_DIR",
-                                        hip_root))
-            # there is only one dir like this, but the version component is unknown
-            cfg.write(
-                cmake_cache_path(
-                    "HIP_CLANG_INCLUDE_PATH",
-                    glob.glob(
-                        "{}/lib/clang/*/include".format(spec['llvm-amdgpu'].prefix)
-                    )[0]
+            entries.append(cmake_cache_option("ENABLE_HIP", True))
+            entries.append(cmake_cache_path("HIP_ROOT_DIR", "{0}".format(spec["hip"].prefix)))
+            hip_repair_options(entries, spec)
+            archs = self.spec.variants["amdgpu_target"].value
+            if archs != "none":
+                arch_str = ",".join(archs)
+                entries.append(
+                    cmake_cache_string("HIP_HIPCC_FLAGS", "--amdgpu-target={0}".format(arch_str))
                 )
-            )
-            cfg.write(cmake_cache_entry("ROCM_ROOT_DIR",
-                                        rocm_root))
-            cfg.write(cmake_cache_entry("HIP_PATH",
-                                        rocm_root + '/llvm/bin'))
-            cfg.write(cmake_cache_entry("CMAKE_HIP_ARCHITECTURES", hip_arch[0]))
-
-            if ('%gcc' in spec) or (using_toolchain):
-                if ('%gcc' in spec):
-                    gcc_bin = os.path.dirname(self.compiler.cxx)
-                    gcc_prefix = join_path(gcc_bin, '..')
-                else:
-                    gcc_prefix = gcc_toolchain_path.group(1)
-                cfg.write(cmake_cache_entry("HIP_CLANG_FLAGS",
-                "--gcc-toolchain={0}".format(gcc_prefix))) 
-                cfg.write(cmake_cache_entry("CMAKE_EXE_LINKER_FLAGS",
-                " -Wl,-rpath {}/lib64".format(gcc_prefix)))
-
         else:
-            cfg.write(cmake_cache_option("ENABLE_HIP", False))
-
-        cfg.write("#------------------{0}\n".format("-" * 60))
-        cfg.write("# Other\n")
-        cfg.write("#------------------{0}\n\n".format("-" * 60))
-
-        cfg.write(cmake_cache_string("RAJA_RANGE_ALIGN", "4"))
-        cfg.write(cmake_cache_string("RAJA_RANGE_MIN_LENGTH", "32"))
-        cfg.write(cmake_cache_string("RAJA_DATA_ALIGN", "64"))
-
-        cfg.write(cmake_cache_option("RAJA_HOST_CONFIG_LOADED", True))
+            entries.append(cmake_cache_option("ENABLE_HIP", False))
 
-        # shared vs static libs
-        cfg.write(cmake_cache_option("BUILD_SHARED_LIBS","+shared" in spec))
-        cfg.write(cmake_cache_option("ENABLE_OPENMP","+openmp" in spec))
-        cfg.write(cmake_cache_option("RAJA_ENABLE_DESUL_ATOMICS","+desul" in spec))
+        return entries
 
-        if "+desul" in spec:
-            cfg.write(cmake_cache_string("BLT_CXX_STD","c++14"))
-            if "+cuda" in spec:
-                cfg.write(cmake_cache_string("CMAKE_CUDA_STANDARD", "14"))
-
-        cfg.write(cmake_cache_option("ENABLE_BENCHMARKS", 'tests=benchmarks' in spec))
-        cfg.write(cmake_cache_option("ENABLE_TESTS", not 'tests=none' in spec or self.run_tests))
-        cfg.write(cmake_cache_string("camp_DIR", spec['camp'].prefix))
+    def initconfig_package_entries(self):
+        spec = self.spec
+        entries = []
+
+        option_prefix = "RAJA_" if spec.satisfies("@0.14.0:") else ""
+
+        entries.append(cmake_cache_path("BLT_SOURCE_DIR", spec["blt"].prefix))
+        if "camp" in self.spec:
+            entries.append(cmake_cache_path("camp_DIR", spec["camp"].prefix))
+        entries.append(cmake_cache_option("BUILD_SHARED_LIBS", "+shared" in spec))
+        entries.append(
+            cmake_cache_option("{}ENABLE_EXAMPLES".format(option_prefix), "+examples" in spec)
+        )
+        if spec.satisfies("@0.14.0:"):
+            entries.append(
+                cmake_cache_option(
+                    "{}ENABLE_EXERCISES".format(option_prefix), "+exercises" in spec
+                )
+            )
+        else:
+            entries.append(cmake_cache_option("ENABLE_EXERCISES", "+exercises" in spec))
 
-        #######################
-        # Close and save
-        #######################
-        cfg.write("\n")
-        cfg.close()
+        # Work around spack adding -march=ppc64le to SPACK_TARGET_ARGS which
+        # is used by the spack compiler wrapper.  This can go away when BLT
+        # removes -Werror from GTest flags
+        if self.spec.satisfies("%clang target=ppc64le:") or not self.run_tests:
+            entries.append(cmake_cache_option("ENABLE_TESTS", False))
+        else:
+            entries.append(cmake_cache_option("ENABLE_TESTS", True))
 
-        print("OUT: host-config file {0}".format(host_config_path))
+        return entries
 
     def cmake_args(self):
-        spec = self.spec
-        host_config_path = self._get_host_config_path(spec)
-
         options = []
-        options.extend(['-C', host_config_path])
-
         return options
+
+    @property
+    def build_relpath(self):
+        """Relative path to the cmake build subdirectory."""
+        return join_path("..", self.build_dirname)
+
+    @run_after("install")
+    def setup_build_tests(self):
+        """Copy the build test files after the package is installed to a
+        relative install test subdirectory for use during `spack test run`."""
+        # Now copy the relative files
+        self.cache_extra_test_sources(self.build_relpath)
+
+        # Ensure the path exists since relying on a relative path at the
+        # same level as the normal stage source path.
+        mkdirp(self.install_test_root)
+
+    @property
+    def _extra_tests_path(self):
+        # TODO: The tests should be converted to re-build and run examples
+        # TODO: using the installed libraries.
+        return join_path(self.install_test_root, self.build_relpath, "bin")
+
+    def _test_examples(self):
+        """Perform very basic checks on a subset of copied examples."""
+        checks = [
+            (
+                "ex5_line-of-sight_solution",
+                [r"RAJA sequential", r"RAJA OpenMP", r"result -- PASS"],
+            ),
+            (
+                "ex6_stencil-offset-layout_solution",
+                [r"RAJA Views \(permuted\)", r"result -- PASS"],
+            ),
+            (
+                "ex8_tiled-matrix-transpose_solution",
+                [r"parallel top inner loop", r"collapsed inner loops", r"result -- PASS"],
+            ),
+            ("kernel-dynamic-tile", [r"Running index", r"(24,24)"]),
+            ("plugin-example", [r"Launching host kernel for the 10 time"]),
+            ("tut_batched-matrix-multiply", [r"result -- PASS"]),
+            ("wave-eqn", [r"Max Error = 2", r"Evolved solution to time"]),
+        ]
+        for exe, expected in checks:
+            reason = "test: checking output of {0} for {1}".format(exe, expected)
+            self.run_test(
+                exe,
+                [],
+                expected,
+                installed=False,
+                purpose=reason,
+                skip_missing=True,
+                work_dir=self._extra_tests_path,
+            )
+
+    def test(self):
+        """Perform smoke tests."""
+        self._test_examples()

From 891f5a4e3eccc9ead6b78349e2a110d5faba3e39 Mon Sep 17 00:00:00 2001
From: "Adrien M. BERNEDE" <a.bernede@woptim.dev>
Date: Mon, 19 Sep 2022 14:39:57 +0200
Subject: [PATCH 02/94] Update uberenv config with correct configuration phase

---
 .uberenv_config.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.uberenv_config.json b/.uberenv_config.json
index 2fc700f855..dc692a9045 100644
--- a/.uberenv_config.json
+++ b/.uberenv_config.json
@@ -1,7 +1,7 @@
 {
 "package_name" : "raja",
 "package_version" : "develop",
-"package_final_phase" : "hostconfig",
+"package_final_phase" : "initconfig",
 "package_source_dir" : "../..",
 "spack_url": "https://github.com/spack/spack.git",
 "spack_branch": "v0.18.1",

From df4c5d686c4b5ae724e8d827604413a4a355addf Mon Sep 17 00:00:00 2001
From: "Adrien M. BERNEDE" <a.bernede@woptim.dev>
Date: Mon, 19 Sep 2022 14:50:20 +0200
Subject: [PATCH 03/94] Retrieve CMake from hostconfig file (same as in umpire)

---
 scripts/gitlab/build_and_test.sh | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/scripts/gitlab/build_and_test.sh b/scripts/gitlab/build_and_test.sh
index eda0c550ef..34acbeae37 100755
--- a/scripts/gitlab/build_and_test.sh
+++ b/scripts/gitlab/build_and_test.sh
@@ -127,8 +127,7 @@ fi
 build_dir="${build_root}/build_${hostconfig//.cmake/}"
 install_dir="${build_root}/install_${hostconfig//.cmake/}"
 
-# TODO: This is from Umpire, could it work with RAJA ?
-#cmake_exe=`grep 'CMake executable' ${hostconfig_path} | cut -d ':' -f 2 | xargs`
+cmake_exe=`grep 'CMake executable' ${hostconfig_path} | cut -d ':' -f 2 | xargs`
 
 # Build
 if [[ "${option}" != "--deps-only" && "${option}" != "--test-only" ]]
@@ -161,17 +160,14 @@ then
     then
         module unload rocm
     fi
-
-    module load cmake/3.20.2 || module load cmake/3.19.2 || module load cmake/3.21.1
-
-    cmake \
+    $cmake_exe \
       -C ${hostconfig_path} \
       -DCMAKE_INSTALL_PREFIX=${install_dir} \
       ${project_dir}
-    if ! cmake --build . -j ${core_counts[$truehostname]}
+    if ! $cmake_exe --build . -j ${core_counts[$truehostname]}
     then
         echo "ERROR: compilation failed, building with verbose output..."
-        cmake --build . --verbose -j 1
+        $cmake_exe --build . --verbose -j 1
     else
         make install
     fi
@@ -236,8 +232,8 @@ then
 
         cd ${install_dir}/examples/RAJA/using-with-cmake
         mkdir build && cd build
-        if ! cmake -C ../host-config.cmake ..; then
-        echo "ERROR: running cmake for using-with-cmake test" && exit 1
+        if ! $cmake_exe -C ../host-config.cmake ..; then
+        echo "ERROR: running $cmake_exe for using-with-cmake test" && exit 1
         fi
 
         if ! make; then

From bf65fd7fde93a46fbf0c641149125e8d887d2977 Mon Sep 17 00:00:00 2001
From: "Adrien M. BERNEDE" <a.bernede@woptim.dev>
Date: Mon, 19 Sep 2022 15:26:20 +0200
Subject: [PATCH 04/94] add desul option

---
 scripts/spack_packages/raja/package.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/scripts/spack_packages/raja/package.py b/scripts/spack_packages/raja/package.py
index 4850ed1c54..08aa43fad1 100644
--- a/scripts/spack_packages/raja/package.py
+++ b/scripts/spack_packages/raja/package.py
@@ -53,6 +53,7 @@ class Raja(CachedCMakePackage, CudaPackage, ROCmPackage):
     # TODO: figure out gtest dependency and then set this default True
     # and remove the +tests conflict below.
     variant("tests", default=False, description="Build tests")
+    variant("desul", default=False, description='Build Desul Atomics backend')
 
     depends_on("blt")
     depends_on("blt@0.5.0:", type="build", when="@0.14.1:")
@@ -143,6 +144,13 @@ def initconfig_package_entries(self):
 
         option_prefix = "RAJA_" if spec.satisfies("@0.14.0:") else ""
 
+        entries.append(cmake_cache_option("RAJA_ENABLE_DESUL_ATOMICS", "+desul" in spec))
+
+        if "+desul" in spec:
+            entries.append(cmake_cache_string("BLT_CXX_STD","c++14"))
+            if "+cuda" in spec:
+                entries.append(cmake_cache_string("CMAKE_CUDA_STANDARD", "14"))
+
         entries.append(cmake_cache_path("BLT_SOURCE_DIR", spec["blt"].prefix))
         if "camp" in self.spec:
             entries.append(cmake_cache_path("camp_DIR", spec["camp"].prefix))

From 1f7e5b73ba0d94dcfa71010c1ca169e0d1bb8079 Mon Sep 17 00:00:00 2001
From: "Adrien M. BERNEDE" <a.bernede@woptim.dev>
Date: Mon, 19 Sep 2022 18:31:51 +0200
Subject: [PATCH 05/94] add libcpp variant

---
 scripts/spack_packages/raja/package.py | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/scripts/spack_packages/raja/package.py b/scripts/spack_packages/raja/package.py
index 08aa43fad1..82039dd49d 100644
--- a/scripts/spack_packages/raja/package.py
+++ b/scripts/spack_packages/raja/package.py
@@ -53,7 +53,8 @@ class Raja(CachedCMakePackage, CudaPackage, ROCmPackage):
     # TODO: figure out gtest dependency and then set this default True
     # and remove the +tests conflict below.
     variant("tests", default=False, description="Build tests")
-    variant("desul", default=False, description='Build Desul Atomics backend')
+    variant("libcpp", default=False, description="Uses libc++ instead of libstdc++")
+    variant("desul", default=False, description="Build Desul Atomics backend")
 
     depends_on("blt")
     depends_on("blt@0.5.0:", type="build", when="@0.14.1:")
@@ -94,6 +95,7 @@ def _get_sys_type(self, spec):
         return sys_type
 
     @property
+    # TODO: name cache file conditionally to cuda and libcpp variants
     def cache_name(self):
         hostname = socket.gethostname()
         if "SYS_TYPE" in env:
@@ -146,6 +148,19 @@ def initconfig_package_entries(self):
 
         entries.append(cmake_cache_option("RAJA_ENABLE_DESUL_ATOMICS", "+desul" in spec))
 
+        # use global spack compiler flags
+        cflags = " ".join(spec.compiler_flags["cflags"])
+        if "+libcpp" in spec:
+            cflags += " ".join([cflags,"-DGTEST_HAS_CXXABI_H_=0"])
+        if cflags:
+            entries.append(cmake_cache_entry("CMAKE_C_FLAGS", cflags))
+
+        cxxflags = " ".join(spec.compiler_flags["cxxflags"])
+        if "+libcpp" in spec:
+            cxxflags += " ".join([cxxflags,"-stdlib=libc++ -DGTEST_HAS_CXXABI_H_=0"])
+        if cxxflags:
+            entries.append(cmake_cache_entry("CMAKE_CXX_FLAGS", cxxflags))
+
         if "+desul" in spec:
             entries.append(cmake_cache_string("BLT_CXX_STD","c++14"))
             if "+cuda" in spec:

From 3d39e66612b55429e7308f2945aa0063d3295c8a Mon Sep 17 00:00:00 2001
From: "Adrien M. BERNEDE" <a.bernede@woptim.dev>
Date: Mon, 19 Sep 2022 18:54:27 +0200
Subject: [PATCH 06/94] Fix HIP option: resulted in wrong CMake Cache file

---
 scripts/spack_packages/camp/package.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/scripts/spack_packages/camp/package.py b/scripts/spack_packages/camp/package.py
index 8f40f6b295..864fafbd70 100644
--- a/scripts/spack_packages/camp/package.py
+++ b/scripts/spack_packages/camp/package.py
@@ -10,10 +10,7 @@
 
 def hip_repair_options(options, spec):
     # there is only one dir like this, but the version component is unknown
-    options.append(
-        "-DHIP_CLANG_INCLUDE_PATH="
-        + glob.glob("{}/lib/clang/*/include".format(spec["llvm-amdgpu"].prefix))[0]
-    )
+    entries.append(cmake_cache_path("HIP_CLANG_INCLUDE_PATH", glob.glob("{}/lib/clang/*/include".format(spec["llvm-amdgpu"].prefix))[0]))
 
 
 class Camp(CMakePackage, CudaPackage, ROCmPackage):

From 7af22d56a7454fbab6e66c188ffd11b639e98df6 Mon Sep 17 00:00:00 2001
From: "Adrien M. BERNEDE" <a.bernede@woptim.dev>
Date: Mon, 19 Sep 2022 19:05:32 +0200
Subject: [PATCH 07/94] Revert "Fix HIP option: resulted in wrong CMake Cache
 file"

This reverts commit 4dd80d0a7d71df946245679d32880867724dae13.
---
 scripts/spack_packages/camp/package.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/scripts/spack_packages/camp/package.py b/scripts/spack_packages/camp/package.py
index 864fafbd70..8f40f6b295 100644
--- a/scripts/spack_packages/camp/package.py
+++ b/scripts/spack_packages/camp/package.py
@@ -10,7 +10,10 @@
 
 def hip_repair_options(options, spec):
     # there is only one dir like this, but the version component is unknown
-    entries.append(cmake_cache_path("HIP_CLANG_INCLUDE_PATH", glob.glob("{}/lib/clang/*/include".format(spec["llvm-amdgpu"].prefix))[0]))
+    options.append(
+        "-DHIP_CLANG_INCLUDE_PATH="
+        + glob.glob("{}/lib/clang/*/include".format(spec["llvm-amdgpu"].prefix))[0]
+    )
 
 
 class Camp(CMakePackage, CudaPackage, ROCmPackage):

From 3c27b36c2704f4782200b3087a0efd26ccf4664f Mon Sep 17 00:00:00 2001
From: "Adrien M. BERNEDE" <a.bernede@woptim.dev>
Date: Mon, 19 Sep 2022 19:06:04 +0200
Subject: [PATCH 08/94] Fix hip-repaired-option usage: resulted in wrong CMake
 Cache file

---
 scripts/spack_packages/raja/package.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/scripts/spack_packages/raja/package.py b/scripts/spack_packages/raja/package.py
index 82039dd49d..1da2d666a4 100644
--- a/scripts/spack_packages/raja/package.py
+++ b/scripts/spack_packages/raja/package.py
@@ -4,11 +4,15 @@
 # SPDX-License-Identifier: (Apache-2.0 OR MIT)
 
 import socket
+import glob
 
 from spack.package import *
-from spack.pkg.builtin.camp import hip_repair_options
 
 
+def hip_repair_entries(entries, spec):
+    # there is only one dir like this, but the version component is unknown
+    entries.append(cmake_cache_path("HIP_CLANG_INCLUDE_PATH", glob.glob("{}/lib/clang/*/include".format(spec["llvm-amdgpu"].prefix))[0]))
+
 class Raja(CachedCMakePackage, CudaPackage, ROCmPackage):
     """RAJA Parallel Framework."""
 
@@ -128,7 +132,7 @@ def initconfig_hardware_entries(self):
         if "+rocm" in spec:
             entries.append(cmake_cache_option("ENABLE_HIP", True))
             entries.append(cmake_cache_path("HIP_ROOT_DIR", "{0}".format(spec["hip"].prefix)))
-            hip_repair_options(entries, spec)
+            hip_repair_entries(entries, spec)
             archs = self.spec.variants["amdgpu_target"].value
             if archs != "none":
                 arch_str = ",".join(archs)

From fd38921999bfe345729d9dd7d709576162031586 Mon Sep 17 00:00:00 2001
From: "Adrien M. BERNEDE" <a.bernede@woptim.dev>
Date: Mon, 19 Sep 2022 19:18:56 +0200
Subject: [PATCH 09/94] Activate tests variant in all specs

---
 .gitlab/corona-build-and-test-extra.yml |  2 +-
 .gitlab/custom-jobs-and-variables.yml   |  6 +++---
 .gitlab/lassen-build-and-test-extra.yml | 14 +++++++-------
 .gitlab/ruby-build-and-test-extra.yml   |  8 ++++----
 4 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/.gitlab/corona-build-and-test-extra.yml b/.gitlab/corona-build-and-test-extra.yml
index a94300f85b..cbe0c4a9f0 100644
--- a/.gitlab/corona-build-and-test-extra.yml
+++ b/.gitlab/corona-build-and-test-extra.yml
@@ -23,6 +23,6 @@
 
 rocm_5_1_1_clang_13_0_0_desul_atomics:
   variables:
-    SPEC: " +rocm ~openmp +desul amdgpu_target=gfx906 %clang@13.0.0 ^hip@5.1.1 ^blt@develop"
+    SPEC: " +rocm ~openmp +tests +desul amdgpu_target=gfx906 %clang@13.0.0 ^hip@5.1.1 ^blt@develop"
   extends: .build_and_test_on_corona
 
diff --git a/.gitlab/custom-jobs-and-variables.yml b/.gitlab/custom-jobs-and-variables.yml
index b615686b5a..1bb569fcad 100644
--- a/.gitlab/custom-jobs-and-variables.yml
+++ b/.gitlab/custom-jobs-and-variables.yml
@@ -18,7 +18,7 @@ variables:
 # Arguments for job level allocation
   RUBY_BUILD_AND_TEST_JOB_ALLOC: "--time=45 --nodes=1"
 # Project specific variants for ruby
-  PROJECT_RUBY_VARIANTS: "+openmp "
+  PROJECT_RUBY_VARIANTS: "+openmp +tests"
 # Project specific deps for ruby
   PROJECT_RUBY_DEPS: ""
 
@@ -28,7 +28,7 @@ variables:
 # Arguments for job level allocation
   CORONA_BUILD_AND_TEST_JOB_ALLOC: "--time-limit=45m --nodes=1"
 # Project specific variants for corona
-  PROJECT_CORONA_VARIANTS: "~openmp "
+  PROJECT_CORONA_VARIANTS: "~openmp +tests"
 # Project specific deps for corona
   PROJECT_CORONA_DEPS: "^blt@develop "
 
@@ -37,7 +37,7 @@ variables:
 # Arguments for job level allocation
   LASSEN_BUILD_AND_TEST_JOB_ALLOC: "1 -W 60"
 # Project specific variants for lassen
-  PROJECT_LASSEN_VARIANTS: "+openmp "
+  PROJECT_LASSEN_VARIANTS: "+openmp +tests"
 # Project specific deps for lassen
   PROJECT_LASSEN_DEPS: ""
 
diff --git a/.gitlab/lassen-build-and-test-extra.yml b/.gitlab/lassen-build-and-test-extra.yml
index 16a1230515..6dbf25ea2f 100644
--- a/.gitlab/lassen-build-and-test-extra.yml
+++ b/.gitlab/lassen-build-and-test-extra.yml
@@ -89,7 +89,7 @@ xl_16_1_1_12_gcc_8_3_1_cuda_11_1_0:
 
 clang_14_0_5:
   variables:
-    SPEC: " +openmp %clang@14.0.5"
+    SPEC: " +openmp +tests %clang@14.0.5"
   extends: .build_and_test_on_lassen
 
 ##########
@@ -98,18 +98,18 @@ clang_14_0_5:
 
 clang_12_0_1_cuda_11_5_0:
   variables:
-    SPEC: " +openmp +cuda cuda_arch=70 %clang@12.0.1 ^cuda@11.5.0"
+    SPEC: " +openmp +tests +cuda cuda_arch=70 %clang@12.0.1 ^cuda@11.5.0"
   extends: .build_and_test_on_lassen
 
 gcc_8_3_1_cuda_11_1_0:
   variables:
-    SPEC: " +openmp +cuda %gcc@8.3.1 cuda_arch=70 ^cuda@11.1.0"
+    SPEC: " +openmp +tests +cuda %gcc@8.3.1 cuda_arch=70 ^cuda@11.1.0"
   extends: .build_and_test_on_lassen
 
 gcc_8_3_1_cuda_11_5_0_ats_disabled:
   extends: .build_and_test_on_lassen
   variables:
-    SPEC: " +openmp +cuda %gcc@8.3.1 cuda_arch=70 ^cuda@11.5.0"
+    SPEC: " +openmp +tests +cuda %gcc@8.3.1 cuda_arch=70 ^cuda@11.5.0"
     LASSEN_BUILD_AND_TEST_JOB_ALLOC: "1 --atsdisable -W 60"
 
 ##########
@@ -118,16 +118,16 @@ gcc_8_3_1_cuda_11_5_0_ats_disabled:
 
 clang_13_0_1_libcpp:
   variables:
-    SPEC: " +openmp %clang@13.0.1+libcpp"
+    SPEC: " +openmp +tests %clang@13.0.1+libcpp"
   extends: .build_and_test_on_lassen
 
 clang_14_0_5_asan:
   variables:
-    SPEC: " +openmp %clang@14.0.5 cxxflags=-fsanitize=address"
+    SPEC: " +openmp +tests %clang@14.0.5 cxxflags=-fsanitize=address"
     ASAN_OPTIONS: "detect_leaks=1"
   extends: .build_and_test_on_lassen
 
 gcc_8_3_1_cuda_10_1_168_desul_atomics:
   variables:
-    SPEC: " +openmp +cuda +desul %gcc@8.3.1 cuda_arch=70 ^cuda@10.1.168"
+    SPEC: " +openmp +tests +cuda +desul %gcc@8.3.1 cuda_arch=70 ^cuda@10.1.168"
   extends: .build_and_test_on_lassen
diff --git a/.gitlab/ruby-build-and-test-extra.yml b/.gitlab/ruby-build-and-test-extra.yml
index 9bebc62530..1aaceec243 100644
--- a/.gitlab/ruby-build-and-test-extra.yml
+++ b/.gitlab/ruby-build-and-test-extra.yml
@@ -35,24 +35,24 @@ pgi_20_1_gcc_local_8_3_1:
 
 clang_9_0_0_openmp_off:
   variables:
-    SPEC: " ~openmp %clang@9.0.0"
+    SPEC: " ~openmp +tests %clang@9.0.0"
   extends: .build_and_test_on_ruby
 
 gcc_8_1_0_openmp_default:
   variables:
-    SPEC: " %gcc@8.1.0"
+    SPEC: " +tests %gcc@8.1.0"
     RUBY_BUILD_AND_TEST_JOB_ALLOC: "--time=60 --nodes=1"
   extends: .build_and_test_on_ruby
 
 icpc_19_1_0:
   variables:
-    SPEC: " +openmp %intel@19.1.0"
+    SPEC: " +openmp +tests %intel@19.1.0"
     RUBY_BUILD_AND_TEST_JOB_ALLOC: "--time=40 --nodes=1"
   extends: .build_and_test_on_ruby
 
 # OTHERS
 clang_10_0_1_gcc_8_3_1_desul_atomics:
   variables:
-    SPEC: " +openmp +desul %clang@10.0.1 cxxflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 cflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1"
+    SPEC: " +openmp +tests +desul %clang@10.0.1 cxxflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 cflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1"
   extends: .build_and_test_on_ruby
 

From 6fa1fe6c69e9722469784bddfbd030cbd5ef3f1c Mon Sep 17 00:00:00 2001
From: "Adrien M. BERNEDE" <a.bernede@woptim.dev>
Date: Mon, 19 Sep 2022 19:21:12 +0200
Subject: [PATCH 10/94] Fix add desul option: wrong function name

---
 scripts/spack_packages/raja/package.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scripts/spack_packages/raja/package.py b/scripts/spack_packages/raja/package.py
index 1da2d666a4..f5dba192e8 100644
--- a/scripts/spack_packages/raja/package.py
+++ b/scripts/spack_packages/raja/package.py
@@ -157,13 +157,13 @@ def initconfig_package_entries(self):
         if "+libcpp" in spec:
             cflags += " ".join([cflags,"-DGTEST_HAS_CXXABI_H_=0"])
         if cflags:
-            entries.append(cmake_cache_entry("CMAKE_C_FLAGS", cflags))
+            entries.append(cmake_cache_option("CMAKE_C_FLAGS", cflags))
 
         cxxflags = " ".join(spec.compiler_flags["cxxflags"])
         if "+libcpp" in spec:
             cxxflags += " ".join([cxxflags,"-stdlib=libc++ -DGTEST_HAS_CXXABI_H_=0"])
         if cxxflags:
-            entries.append(cmake_cache_entry("CMAKE_CXX_FLAGS", cxxflags))
+            entries.append(cmake_cache_option("CMAKE_CXX_FLAGS", cxxflags))
 
         if "+desul" in spec:
             entries.append(cmake_cache_string("BLT_CXX_STD","c++14"))

From aa98a157a7f1473316379f148ccf62fd329d99dc Mon Sep 17 00:00:00 2001
From: "Adrien M. BERNEDE" <a.bernede@woptim.dev>
Date: Mon, 19 Sep 2022 21:31:02 +0200
Subject: [PATCH 11/94] Fix add libcpp: wrong cmake cache function

---
 scripts/spack_packages/raja/package.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scripts/spack_packages/raja/package.py b/scripts/spack_packages/raja/package.py
index f5dba192e8..189e85e41d 100644
--- a/scripts/spack_packages/raja/package.py
+++ b/scripts/spack_packages/raja/package.py
@@ -157,13 +157,13 @@ def initconfig_package_entries(self):
         if "+libcpp" in spec:
             cflags += " ".join([cflags,"-DGTEST_HAS_CXXABI_H_=0"])
         if cflags:
-            entries.append(cmake_cache_option("CMAKE_C_FLAGS", cflags))
+            entries.append(cmake_cache_string("CMAKE_C_FLAGS", cflags))
 
         cxxflags = " ".join(spec.compiler_flags["cxxflags"])
         if "+libcpp" in spec:
             cxxflags += " ".join([cxxflags,"-stdlib=libc++ -DGTEST_HAS_CXXABI_H_=0"])
         if cxxflags:
-            entries.append(cmake_cache_option("CMAKE_CXX_FLAGS", cxxflags))
+            entries.append(cmake_cache_string("CMAKE_CXX_FLAGS", cxxflags))
 
         if "+desul" in spec:
             entries.append(cmake_cache_string("BLT_CXX_STD","c++14"))

From 59b20b07998502635d31150a0ca2d8d8e713d938 Mon Sep 17 00:00:00 2001
From: "Adrien M. BERNEDE" <a.bernede@woptim.dev>
Date: Mon, 19 Sep 2022 22:22:12 +0200
Subject: [PATCH 12/94] Let +tests variant actually do activate tests

---
 scripts/spack_packages/raja/package.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/spack_packages/raja/package.py b/scripts/spack_packages/raja/package.py
index 189e85e41d..e8b923aed2 100644
--- a/scripts/spack_packages/raja/package.py
+++ b/scripts/spack_packages/raja/package.py
@@ -189,7 +189,7 @@ def initconfig_package_entries(self):
         # Work around spack adding -march=ppc64le to SPACK_TARGET_ARGS which
         # is used by the spack compiler wrapper.  This can go away when BLT
         # removes -Werror from GTest flags
-        if self.spec.satisfies("%clang target=ppc64le:") or not self.run_tests:
+        if self.spec.satisfies("%clang target=ppc64le:") or ( not self.run_tests and not "+tests" in spec):
             entries.append(cmake_cache_option("ENABLE_TESTS", False))
         else:
             entries.append(cmake_cache_option("ENABLE_TESTS", True))

From ebc9f1ec96605b0d59a4029e0765e9c4911a8c57 Mon Sep 17 00:00:00 2001
From: "Adrien M. BERNEDE" <a.bernede@woptim.dev>
Date: Tue, 20 Sep 2022 14:49:31 +0200
Subject: [PATCH 13/94] Do not build shared libs

---
 .gitlab/custom-jobs-and-variables.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.gitlab/custom-jobs-and-variables.yml b/.gitlab/custom-jobs-and-variables.yml
index 1bb569fcad..cc838751d9 100644
--- a/.gitlab/custom-jobs-and-variables.yml
+++ b/.gitlab/custom-jobs-and-variables.yml
@@ -18,7 +18,7 @@ variables:
 # Arguments for job level allocation
   RUBY_BUILD_AND_TEST_JOB_ALLOC: "--time=45 --nodes=1"
 # Project specific variants for ruby
-  PROJECT_RUBY_VARIANTS: "+openmp +tests"
+  PROJECT_RUBY_VARIANTS: "~shared +openmp +tests"
 # Project specific deps for ruby
   PROJECT_RUBY_DEPS: ""
 
@@ -28,7 +28,7 @@ variables:
 # Arguments for job level allocation
   CORONA_BUILD_AND_TEST_JOB_ALLOC: "--time-limit=45m --nodes=1"
 # Project specific variants for corona
-  PROJECT_CORONA_VARIANTS: "~openmp +tests"
+  PROJECT_CORONA_VARIANTS: "~shared ~openmp +tests"
 # Project specific deps for corona
   PROJECT_CORONA_DEPS: "^blt@develop "
 
@@ -37,7 +37,7 @@ variables:
 # Arguments for job level allocation
   LASSEN_BUILD_AND_TEST_JOB_ALLOC: "1 -W 60"
 # Project specific variants for lassen
-  PROJECT_LASSEN_VARIANTS: "+openmp +tests"
+  PROJECT_LASSEN_VARIANTS: "~shared +openmp +tests"
 # Project specific deps for lassen
   PROJECT_LASSEN_DEPS: ""
 

From d17eaad5e67eed3a6b9211116f75a8791b7636be Mon Sep 17 00:00:00 2001
From: "Adrien M. BERNEDE" <a.bernede@woptim.dev>
Date: Tue, 20 Sep 2022 15:00:50 +0200
Subject: [PATCH 14/94] Add missing RAJA_HOST_CONFIG_LOADED to hostconfig file
 through spack package

---
 scripts/spack_packages/raja/package.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/scripts/spack_packages/raja/package.py b/scripts/spack_packages/raja/package.py
index e8b923aed2..122e9d41b8 100644
--- a/scripts/spack_packages/raja/package.py
+++ b/scripts/spack_packages/raja/package.py
@@ -194,6 +194,8 @@ def initconfig_package_entries(self):
         else:
             entries.append(cmake_cache_option("ENABLE_TESTS", True))
 
+        entries.append(cmake_cache_option("RAJA_HOST_CONFIG_LOADED", True))
+
         return entries
 
     def cmake_args(self):

From af976e204ad06536e294d999ce890f30dff63432 Mon Sep 17 00:00:00 2001
From: "Adrien M. BERNEDE" <a.bernede@woptim.dev>
Date: Tue, 20 Sep 2022 15:34:40 +0200
Subject: [PATCH 15/94] Add missing CMAKE_HIP_ARCHITECTURE option to hostconfig
 through spack package

---
 scripts/spack_packages/raja/package.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/scripts/spack_packages/raja/package.py b/scripts/spack_packages/raja/package.py
index 122e9d41b8..7a1f3b1f73 100644
--- a/scripts/spack_packages/raja/package.py
+++ b/scripts/spack_packages/raja/package.py
@@ -139,6 +139,9 @@ def initconfig_hardware_entries(self):
                 entries.append(
                     cmake_cache_string("HIP_HIPCC_FLAGS", "--amdgpu-target={0}".format(arch_str))
                 )
+                entries.append(
+                    cmake_cache_string("CMAKE_HIP_ARCHITECTURES", arch_str)
+                )
         else:
             entries.append(cmake_cache_option("ENABLE_HIP", False))
 

From 0fe3d49c27e38a109b3e9bef7553a4966ba50b34 Mon Sep 17 00:00:00 2001
From: "Adrien M. BERNEDE" <a.bernede@woptim.dev>
Date: Tue, 20 Sep 2022 15:52:22 +0200
Subject: [PATCH 16/94] WARN: Test that lassen tests pass, but remove a
 workaround potentially needed with regular spack builds

---
 scripts/spack_packages/raja/package.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/scripts/spack_packages/raja/package.py b/scripts/spack_packages/raja/package.py
index 7a1f3b1f73..365c7743ee 100644
--- a/scripts/spack_packages/raja/package.py
+++ b/scripts/spack_packages/raja/package.py
@@ -189,10 +189,13 @@ def initconfig_package_entries(self):
         else:
             entries.append(cmake_cache_option("ENABLE_EXERCISES", "+exercises" in spec))
 
-        # Work around spack adding -march=ppc64le to SPACK_TARGET_ARGS which
-        # is used by the spack compiler wrapper.  This can go away when BLT
-        # removes -Werror from GTest flags
-        if self.spec.satisfies("%clang target=ppc64le:") or ( not self.run_tests and not "+tests" in spec):
+        ### #TODO: Treat the workaround when building tests with spack wrapper
+        ### #      For now, removing it to test CI, which builds tests outside of wrapper.
+        ### # Work around spack adding -march=ppc64le to SPACK_TARGET_ARGS which
+        ### # is used by the spack compiler wrapper.  This can go away when BLT
+        ### # removes -Werror from GTest flags
+        ### if self.spec.satisfies("%clang target=ppc64le:") or ( not self.run_tests and not "+tests" in spec):
+        if not self.run_tests and not "+tests" in spec:
             entries.append(cmake_cache_option("ENABLE_TESTS", False))
         else:
             entries.append(cmake_cache_option("ENABLE_TESTS", True))

From eb97f73f836afa6d49491d3b1938a395880c9bfe Mon Sep 17 00:00:00 2001
From: "Adrien M. BERNEDE" <a.bernede@woptim.dev>
Date: Mon, 3 Oct 2022 10:39:44 +0200
Subject: [PATCH 17/94] Add missing ~shared variant to extra specs

---
 .gitlab/corona-build-and-test-extra.yml |  2 +-
 .gitlab/lassen-build-and-test-extra.yml | 14 +++++++-------
 .gitlab/ruby-build-and-test-extra.yml   |  8 ++++----
 3 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/.gitlab/corona-build-and-test-extra.yml b/.gitlab/corona-build-and-test-extra.yml
index cbe0c4a9f0..29624d700f 100644
--- a/.gitlab/corona-build-and-test-extra.yml
+++ b/.gitlab/corona-build-and-test-extra.yml
@@ -23,6 +23,6 @@
 
 rocm_5_1_1_clang_13_0_0_desul_atomics:
   variables:
-    SPEC: " +rocm ~openmp +tests +desul amdgpu_target=gfx906 %clang@13.0.0 ^hip@5.1.1 ^blt@develop"
+    SPEC: " ~shared +rocm ~openmp +tests +desul amdgpu_target=gfx906 %clang@13.0.0 ^hip@5.1.1 ^blt@develop"
   extends: .build_and_test_on_corona
 
diff --git a/.gitlab/lassen-build-and-test-extra.yml b/.gitlab/lassen-build-and-test-extra.yml
index 6dbf25ea2f..ab28bf01da 100644
--- a/.gitlab/lassen-build-and-test-extra.yml
+++ b/.gitlab/lassen-build-and-test-extra.yml
@@ -89,7 +89,7 @@ xl_16_1_1_12_gcc_8_3_1_cuda_11_1_0:
 
 clang_14_0_5:
   variables:
-    SPEC: " +openmp +tests %clang@14.0.5"
+    SPEC: " ~shared +openmp +tests %clang@14.0.5"
   extends: .build_and_test_on_lassen
 
 ##########
@@ -98,18 +98,18 @@ clang_14_0_5:
 
 clang_12_0_1_cuda_11_5_0:
   variables:
-    SPEC: " +openmp +tests +cuda cuda_arch=70 %clang@12.0.1 ^cuda@11.5.0"
+    SPEC: " ~shared +openmp +tests +cuda cuda_arch=70 %clang@12.0.1 ^cuda@11.5.0"
   extends: .build_and_test_on_lassen
 
 gcc_8_3_1_cuda_11_1_0:
   variables:
-    SPEC: " +openmp +tests +cuda %gcc@8.3.1 cuda_arch=70 ^cuda@11.1.0"
+    SPEC: " ~shared +openmp +tests +cuda %gcc@8.3.1 cuda_arch=70 ^cuda@11.1.0"
   extends: .build_and_test_on_lassen
 
 gcc_8_3_1_cuda_11_5_0_ats_disabled:
   extends: .build_and_test_on_lassen
   variables:
-    SPEC: " +openmp +tests +cuda %gcc@8.3.1 cuda_arch=70 ^cuda@11.5.0"
+    SPEC: " ~shared +openmp +tests +cuda %gcc@8.3.1 cuda_arch=70 ^cuda@11.5.0"
     LASSEN_BUILD_AND_TEST_JOB_ALLOC: "1 --atsdisable -W 60"
 
 ##########
@@ -118,16 +118,16 @@ gcc_8_3_1_cuda_11_5_0_ats_disabled:
 
 clang_13_0_1_libcpp:
   variables:
-    SPEC: " +openmp +tests %clang@13.0.1+libcpp"
+    SPEC: " ~shared +openmp +tests %clang@13.0.1+libcpp"
   extends: .build_and_test_on_lassen
 
 clang_14_0_5_asan:
   variables:
-    SPEC: " +openmp +tests %clang@14.0.5 cxxflags=-fsanitize=address"
+    SPEC: " ~shared +openmp +tests %clang@14.0.5 cxxflags=-fsanitize=address"
     ASAN_OPTIONS: "detect_leaks=1"
   extends: .build_and_test_on_lassen
 
 gcc_8_3_1_cuda_10_1_168_desul_atomics:
   variables:
-    SPEC: " +openmp +tests +cuda +desul %gcc@8.3.1 cuda_arch=70 ^cuda@10.1.168"
+    SPEC: " ~shared +openmp +tests +cuda +desul %gcc@8.3.1 cuda_arch=70 ^cuda@10.1.168"
   extends: .build_and_test_on_lassen
diff --git a/.gitlab/ruby-build-and-test-extra.yml b/.gitlab/ruby-build-and-test-extra.yml
index 1aaceec243..33faa56442 100644
--- a/.gitlab/ruby-build-and-test-extra.yml
+++ b/.gitlab/ruby-build-and-test-extra.yml
@@ -35,24 +35,24 @@ pgi_20_1_gcc_local_8_3_1:
 
 clang_9_0_0_openmp_off:
   variables:
-    SPEC: " ~openmp +tests %clang@9.0.0"
+    SPEC: " ~shared ~openmp +tests %clang@9.0.0"
   extends: .build_and_test_on_ruby
 
 gcc_8_1_0_openmp_default:
   variables:
-    SPEC: " +tests %gcc@8.1.0"
+    SPEC: " ~shared +tests %gcc@8.1.0"
     RUBY_BUILD_AND_TEST_JOB_ALLOC: "--time=60 --nodes=1"
   extends: .build_and_test_on_ruby
 
 icpc_19_1_0:
   variables:
-    SPEC: " +openmp +tests %intel@19.1.0"
+    SPEC: " ~shared +openmp +tests %intel@19.1.0"
     RUBY_BUILD_AND_TEST_JOB_ALLOC: "--time=40 --nodes=1"
   extends: .build_and_test_on_ruby
 
 # OTHERS
 clang_10_0_1_gcc_8_3_1_desul_atomics:
   variables:
-    SPEC: " +openmp +tests +desul %clang@10.0.1 cxxflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 cflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1"
+    SPEC: " ~shared +openmp +tests +desul %clang@10.0.1 cxxflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 cflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1"
   extends: .build_and_test_on_ruby
 

From 254857868c93e3ca8a2855d62fe7787722aa268a Mon Sep 17 00:00:00 2001
From: "Adrien M. BERNEDE" <a.bernede@woptim.dev>
Date: Mon, 3 Oct 2022 19:58:58 +0200
Subject: [PATCH 18/94] Fix incompatible tests variant specification

---
 .gitlab/lassen-build-and-test-extra.yml | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/.gitlab/lassen-build-and-test-extra.yml b/.gitlab/lassen-build-and-test-extra.yml
index ab28bf01da..85ced723c9 100644
--- a/.gitlab/lassen-build-and-test-extra.yml
+++ b/.gitlab/lassen-build-and-test-extra.yml
@@ -67,6 +67,12 @@ xl_16_1_1_12_cuda_11_1_1:
   allow_failure: true
   extends: .build_and_test_on_lassen
 
+# Overriding shared spec: Do not run test on that config. This spec will be removed soon.
+xl_16_1_1_12_gcc_8_3_1_cuda_11_0_2:
+  variables:
+    SPEC: "~shared +openmp +cuda ~tests %xl@16.1.1.12 cxxflags=\"--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1\" cflags=\"--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1\" ^cuda@11.0.2 ${PROJECT_LASSEN_DEPS}"
+  extends: .build_and_test_on_lassen
+
 # Overriding shared spec: Longer allocation + Extra flags + Allow failure + Updated cuda
 xl_16_1_1_12_gcc_8_3_1_cuda_11_1_0:
   variables:

From cc0810ace3fd9f77fc8daf7f9805fa3e59879eae Mon Sep 17 00:00:00 2001
From: "Adrien M. BERNEDE" <a.bernede@woptim.dev>
Date: Mon, 10 Oct 2022 10:56:23 +0200
Subject: [PATCH 19/94] Fix attempt: specify cuda arch

---
 .gitlab/lassen-build-and-test-extra.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.gitlab/lassen-build-and-test-extra.yml b/.gitlab/lassen-build-and-test-extra.yml
index 85ced723c9..9fdf3ccec2 100644
--- a/.gitlab/lassen-build-and-test-extra.yml
+++ b/.gitlab/lassen-build-and-test-extra.yml
@@ -67,16 +67,16 @@ xl_16_1_1_12_cuda_11_1_1:
   allow_failure: true
   extends: .build_and_test_on_lassen
 
-# Overriding shared spec: Do not run test on that config. This spec will be removed soon.
+# Overriding shared spec: Do not run test on that config + specify cuda arch. This spec will be removed soon.
 xl_16_1_1_12_gcc_8_3_1_cuda_11_0_2:
   variables:
-    SPEC: "~shared +openmp +cuda ~tests %xl@16.1.1.12 cxxflags=\"--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1\" cflags=\"--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1\" ^cuda@11.0.2 ${PROJECT_LASSEN_DEPS}"
+    SPEC: "~shared +openmp +cuda ~tests %xl@16.1.1.12 cxxflags=\"--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1\" cflags=\"--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1\" cuda_arch=70 ^cuda@11.0.2 ${PROJECT_LASSEN_DEPS}"
   extends: .build_and_test_on_lassen
 
 # Overriding shared spec: Longer allocation + Extra flags + Allow failure + Updated cuda
 xl_16_1_1_12_gcc_8_3_1_cuda_11_1_0:
   variables:
-    SPEC: " ${PROJECT_LASSEN_VARIANTS} +cuda %xl@16.1.1.12 cxxflags=\"--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 -qthreaded -std=c++14 -O3 -qstrict -qxlcompatmacros -qlanglvl=extended0x -qalias=noansi -qhot -qpic -qsmp=omp -qsuppress=1500-029 -qsuppress=1500-036\" cflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 cuda_arch=70 ^cuda@11.1.0 ^cmake@3.14.5 ${PROJECT_LASSEN_DEPS}"
+    SPEC: " ${PROJECT_LASSEN_VARIANTS} +cuda %xl@16.1.1.12 cxxflags=\"--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 -qthreaded -std=c++14 -O3 -qstrict -qxlcompatmacros -qlanglvl=extended0x -qalias=noansi -qhot -qpic -qsmp=omp -qsuppress=1500-029 -qsuppress=1500-036\" cflags=\"--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1\" cuda_arch=70 ^cuda@11.1.0 ^cmake@3.14.5 ${PROJECT_LASSEN_DEPS}"
     LASSEN_BUILD_AND_TEST_JOB_ALLOC: "1 -W 120"
   allow_failure: true
   extends: .build_and_test_on_lassen

From 3c0c12aef84fa2bbb71d0444d14b0ea777650035 Mon Sep 17 00:00:00 2001
From: "Adrien M. BERNEDE" <a.bernede@woptim.dev>
Date: Wed, 2 Nov 2022 22:33:50 +0100
Subject: [PATCH 20/94] Update radiuss-spack-configs to get newer cmake on
 corona

---
 blt                           | 2 +-
 scripts/radiuss-spack-configs | 2 +-
 tpl/camp                      | 2 +-
 tpl/desul                     | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/blt b/blt
index 97ea54d892..e35f490a8a 160000
--- a/blt
+++ b/blt
@@ -1 +1 @@
-Subproject commit 97ea54d892b4b1d56736830575c3db62e3d7674d
+Subproject commit e35f490a8a8b1689e99b5f4308b5251f97eb36cf
diff --git a/scripts/radiuss-spack-configs b/scripts/radiuss-spack-configs
index 1ce0f4421c..6f4b566f03 160000
--- a/scripts/radiuss-spack-configs
+++ b/scripts/radiuss-spack-configs
@@ -1 +1 @@
-Subproject commit 1ce0f4421cfe6be4200ae9aa8abd113e09ee4c2d
+Subproject commit 6f4b566f031a6dd6669c1dfbabf008ad57759055
diff --git a/tpl/camp b/tpl/camp
index 9a6b8216a9..3a7486edb8 160000
--- a/tpl/camp
+++ b/tpl/camp
@@ -1 +1 @@
-Subproject commit 9a6b8216a9c5f6d8f05a77fc1402fa7e91043d5c
+Subproject commit 3a7486edb8b1c50ce36ecace56384d32a1009e4f
diff --git a/tpl/desul b/tpl/desul
index e4b65e00a8..ac4eb0229a 160000
--- a/tpl/desul
+++ b/tpl/desul
@@ -1 +1 @@
-Subproject commit e4b65e00a8f26cfc7b59cf5f2fb75a24f69111ab
+Subproject commit ac4eb0229a75a715b4d80f64ffff56c578ef6f41

From ccee7575447726d4e884c2f84af2c26d406e6591 Mon Sep 17 00:00:00 2001
From: "Adrien M. BERNEDE" <a.bernede@woptim.dev>
Date: Thu, 3 Nov 2022 10:39:02 +0100
Subject: [PATCH 21/94] Update radiuss-spack-configs to fix cmake versions

---
 scripts/radiuss-spack-configs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/radiuss-spack-configs b/scripts/radiuss-spack-configs
index 6f4b566f03..8d955b4a49 160000
--- a/scripts/radiuss-spack-configs
+++ b/scripts/radiuss-spack-configs
@@ -1 +1 @@
-Subproject commit 6f4b566f031a6dd6669c1dfbabf008ad57759055
+Subproject commit 8d955b4a49406a12dc44f1a9baab7ab7fa9c68df

From f4eac778a19a678e664198077d20f0e9c0a5ddbc Mon Sep 17 00:00:00 2001
From: "Adrien M. BERNEDE" <a.bernede@woptim.dev>
Date: Thu, 3 Nov 2022 11:39:37 +0100
Subject: [PATCH 22/94] Add new constraint on CMake version in RAJA package

---
 scripts/spack_packages/raja/package.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/scripts/spack_packages/raja/package.py b/scripts/spack_packages/raja/package.py
index 365c7743ee..1288f0a67c 100644
--- a/scripts/spack_packages/raja/package.py
+++ b/scripts/spack_packages/raja/package.py
@@ -73,8 +73,9 @@ class Raja(CachedCMakePackage, CudaPackage, ROCmPackage):
     depends_on("camp@main", when="@develop")
     depends_on("camp+openmp", when="+openmp")
 
-    depends_on("cmake@:3.20", when="+rocm", type="build")
+    depends_on("cmake@3.23:", when="@2022.10.0:")
     depends_on("cmake@3.14:", when="@2022.03.0:")
+    depends_on("cmake@:3.20", when="@2022.03.0:2022.03 +rocm", type="build")
 
     depends_on("llvm-openmp", when="+openmp %apple-clang")
 

From 294e25dee79e542692b8ed3a88236110a6af8ac5 Mon Sep 17 00:00:00 2001
From: "Adrien M. BERNEDE" <a.bernede@woptim.dev>
Date: Thu, 3 Nov 2022 12:17:16 +0100
Subject: [PATCH 23/94] More accurate CMake version requirements is RAJA
 package

---
 scripts/spack_packages/raja/package.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/scripts/spack_packages/raja/package.py b/scripts/spack_packages/raja/package.py
index 1288f0a67c..6f1d2196cc 100644
--- a/scripts/spack_packages/raja/package.py
+++ b/scripts/spack_packages/raja/package.py
@@ -73,8 +73,9 @@ class Raja(CachedCMakePackage, CudaPackage, ROCmPackage):
     depends_on("camp@main", when="@develop")
     depends_on("camp+openmp", when="+openmp")
 
-    depends_on("cmake@3.23:", when="@2022.10.0:")
-    depends_on("cmake@3.14:", when="@2022.03.0:")
+    depends_on("cmake@3.20:", when="@2022.10.0:", type="build")
+    depends_on("cmake@3.23:", when="@2022.10.0: +rocm", type="build")
+    depends_on("cmake@3.14:", when="@2022.03.0:", type="build")
     depends_on("cmake@:3.20", when="@2022.03.0:2022.03 +rocm", type="build")
 
     depends_on("llvm-openmp", when="+openmp %apple-clang")

From 43b4432939e0ad7dc4f60789ac0f6984b0cbbe14 Mon Sep 17 00:00:00 2001
From: "Adrien M. BERNEDE" <a.bernede@woptim.dev>
Date: Thu, 3 Nov 2022 14:26:35 +0100
Subject: [PATCH 24/94] Add latest releases

---
 scripts/spack_packages/raja/package.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/scripts/spack_packages/raja/package.py b/scripts/spack_packages/raja/package.py
index 6f1d2196cc..73be8b9674 100644
--- a/scripts/spack_packages/raja/package.py
+++ b/scripts/spack_packages/raja/package.py
@@ -24,6 +24,9 @@ class Raja(CachedCMakePackage, CudaPackage, ROCmPackage):
 
     version("develop", branch="develop", submodules=False)
     version("main", branch="main", submodules=False)
+    version("2022.10.1", tag="v2022.10.1", submodules=False)
+    version("2022.10.0", tag="v2022.10.0", submodules=False)
+    version("2022.03.1", tag="v2022.03.1", submodules=False)
     version("2022.03.0", tag="v2022.03.0", submodules=False)
     version("0.14.0", tag="v0.14.0", submodules="True")
     version("0.13.0", tag="v0.13.0", submodules="True")

From d73cdd016201d5b841a24aa0f1a447ece1d4d3b2 Mon Sep 17 00:00:00 2001
From: Jason Burmark <burmark1@llnl.gov>
Date: Thu, 3 Nov 2022 13:52:52 -0700
Subject: [PATCH 25/94] Fix use of privatizer in tbb forall

---
 include/RAJA/policy/tbb/forall.hpp | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/include/RAJA/policy/tbb/forall.hpp b/include/RAJA/policy/tbb/forall.hpp
index de811a4251..3f8bd966db 100644
--- a/include/RAJA/policy/tbb/forall.hpp
+++ b/include/RAJA/policy/tbb/forall.hpp
@@ -99,9 +99,9 @@ forall_impl(resources::Host host_res,
       [=](const brange& r, ForallParam fp) {
         using RAJA::internal::thread_privatize;
         auto privatizer = thread_privatize(loop_body);
-        auto body = privatizer.get_priv();
+        auto& body = privatizer.get_priv();
         for (auto i = r.begin(); i != r.end(); ++i)
-          expt::invoke_body(fp, loop_body, b[i]);
+          expt::invoke_body(fp, body, b[i]);
         return fp;
       },
 
@@ -138,7 +138,7 @@ forall_impl(resources::Host host_res,
   ::tbb::parallel_for(brange(0, dist, p.grain_size), [=](const brange& r) {
     using RAJA::internal::thread_privatize;
     auto privatizer = thread_privatize(loop_body);
-    auto body = privatizer.get_priv();
+    auto& body = privatizer.get_priv();
     for (auto i = r.begin(); i != r.end(); ++i)
       body(b[i]);
   });
@@ -196,9 +196,9 @@ forall_impl(resources::Host host_res,
       [=](const brange& r, ForallParam fp) {
         using RAJA::internal::thread_privatize;
         auto privatizer = thread_privatize(loop_body);
-        auto body = privatizer.get_priv();
+        auto& body = privatizer.get_priv();
         for (auto i = r.begin(); i != r.end(); ++i)
-          expt::invoke_body(fp, loop_body, b[i]);
+          expt::invoke_body(fp, body, b[i]);
         return fp;
       },
 
@@ -240,7 +240,7 @@ forall_impl(resources::Host host_res,
       [=](const brange& r) {
         using RAJA::internal::thread_privatize;
         auto privatizer = thread_privatize(loop_body);
-        auto body = privatizer.get_priv();
+        auto& body = privatizer.get_priv();
         for (auto i = r.begin(); i != r.end(); ++i)
           body(b[i]);
       },

From 5f5f11f81c595d218a9792b5012fc82f7c943bdb Mon Sep 17 00:00:00 2001
From: Jason Burmark <burmark1@llnl.gov>
Date: Thu, 3 Nov 2022 13:53:30 -0700
Subject: [PATCH 26/94] Add toss4 gcc and icpx scripts

---
 host-configs/lc-builds/toss4/gcc_X.cmake  | 16 +++++++
 host-configs/lc-builds/toss4/icpx_X.cmake | 16 +++++++
 scripts/lc-builds/toss4_gcc.sh            | 41 ++++++++++++++++++
 scripts/lc-builds/toss4_icpx.sh           | 51 +++++++++++++++++++++++
 4 files changed, 124 insertions(+)
 create mode 100755 host-configs/lc-builds/toss4/gcc_X.cmake
 create mode 100755 host-configs/lc-builds/toss4/icpx_X.cmake
 create mode 100755 scripts/lc-builds/toss4_gcc.sh
 create mode 100755 scripts/lc-builds/toss4_icpx.sh

diff --git a/host-configs/lc-builds/toss4/gcc_X.cmake b/host-configs/lc-builds/toss4/gcc_X.cmake
new file mode 100755
index 0000000000..017fabca22
--- /dev/null
+++ b/host-configs/lc-builds/toss4/gcc_X.cmake
@@ -0,0 +1,16 @@
+###############################################################################
+# Copyright (c) 2016-22, Lawrence Livermore National Security, LLC
+# and RAJA project contributors. See the RAJA/LICENSE file for details.
+#
+# SPDX-License-Identifier: (BSD-3-Clause)
+###############################################################################
+
+set(RAJA_COMPILER "RAJA_COMPILER_GNU" CACHE STRING "")
+
+set(CMAKE_CXX_FLAGS_RELEASE "-Ofast -march=native" CACHE STRING "")
+set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-Ofast -march=native -g" CACHE STRING "")
+set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g" CACHE STRING "")
+
+set(RAJA_DATA_ALIGN 64 CACHE STRING "")
+
+set(RAJA_HOST_CONFIG_LOADED On CACHE BOOL "")
diff --git a/host-configs/lc-builds/toss4/icpx_X.cmake b/host-configs/lc-builds/toss4/icpx_X.cmake
new file mode 100755
index 0000000000..3b1e6cd194
--- /dev/null
+++ b/host-configs/lc-builds/toss4/icpx_X.cmake
@@ -0,0 +1,16 @@
+###############################################################################
+# Copyright (c) 2016-22, Lawrence Livermore National Security, LLC
+# and RAJA project contributors. See the RAJA/LICENSE file for details.
+#
+# SPDX-License-Identifier: (BSD-3-Clause)
+###############################################################################
+
+set(RAJA_COMPILER "RAJA_COMPILER_ICC" CACHE STRING "")
+
+set(CMAKE_CXX_FLAGS_RELEASE "-O3 -march=native" CACHE STRING "")
+set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -march=native" CACHE STRING "")
+set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g" CACHE STRING "")
+
+set(RAJA_DATA_ALIGN 64 CACHE STRING "")
+
+set(RAJA_HOST_CONFIG_LOADED On CACHE BOOL "")
diff --git a/scripts/lc-builds/toss4_gcc.sh b/scripts/lc-builds/toss4_gcc.sh
new file mode 100755
index 0000000000..e33fd2ed83
--- /dev/null
+++ b/scripts/lc-builds/toss4_gcc.sh
@@ -0,0 +1,41 @@
+#!/usr/bin/env bash
+
+###############################################################################
+# Copyright (c) 2016-22, Lawrence Livermore National Security, LLC
+# and RAJA project contributors. See the RAJA/LICENSE file for details.
+#
+# SPDX-License-Identifier: (BSD-3-Clause)
+###############################################################################
+
+if [ "$1" == "" ]; then
+  echo
+  echo "You must pass a compiler version number to script. For example,"
+  echo "    toss4_gcc.sh 10.3.1"
+  exit
+fi
+
+COMP_VER=$1
+shift 1
+
+BUILD_SUFFIX=lc_toss4-gcc-${COMP_VER}
+
+echo
+echo "Creating build directory ${BUILD_SUFFIX} and generating configuration in it"
+echo "Configuration extra arguments:"
+echo "   $@"
+echo
+
+rm -rf build_${BUILD_SUFFIX} 2>/dev/null
+mkdir build_${BUILD_SUFFIX} && cd build_${BUILD_SUFFIX}
+
+module load cmake/3.21.1
+
+cmake \
+  -DCMAKE_BUILD_TYPE=Release \
+  -DCMAKE_CXX_COMPILER=/usr/tce/packages/gcc/gcc-${COMP_VER}/bin/g++ \
+  -DBLT_CXX_STD=c++14 \
+  -C ../host-configs/lc-builds/toss4/gcc_X.cmake \
+  -DENABLE_OPENMP=On \
+  -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \
+  "$@" \
+  ..
diff --git a/scripts/lc-builds/toss4_icpx.sh b/scripts/lc-builds/toss4_icpx.sh
new file mode 100755
index 0000000000..dcbe916968
--- /dev/null
+++ b/scripts/lc-builds/toss4_icpx.sh
@@ -0,0 +1,51 @@
+#!/usr/bin/env bash
+
+###############################################################################
+# Copyright (c) 2016-22, Lawrence Livermore National Security, LLC
+# and RAJA project contributors. See the RAJA/LICENSE file for details.
+#
+# SPDX-License-Identifier: (BSD-3-Clause)
+###############################################################################
+
+if [ "$1" == "" ]; then
+  echo
+  echo "You must pass a compiler version number to script. For example,"
+  echo "    toss4_icpx.sh 2022.1.0"
+  exit
+fi
+
+COMP_VER=$1
+shift 1
+
+USE_TBB=On
+
+BUILD_SUFFIX=lc_toss4-icpx-${COMP_VER}
+
+echo
+echo "Creating build directory ${BUILD_SUFFIX} and generating configuration in it"
+echo "Configuration extra arguments:"
+echo "   $@"
+echo
+
+rm -rf build_${BUILD_SUFFIX} 2>/dev/null
+mkdir build_${BUILD_SUFFIX} && cd build_${BUILD_SUFFIX}
+
+module load cmake/3.21.1
+
+#
+# Note: we are using the intel-tce install path as the vanilla intel install
+# path is not in /usr/tce/packages
+#
+
+cmake \
+  -DBLT_CXX_STD=c++14 \
+  -DCMAKE_BUILD_TYPE=Release \
+  -DCMAKE_CXX_COMPILER=/usr/tce/packages/intel-tce/intel-${COMP_VER}/bin/icpx \
+  -DCMAKE_C_COMPILER=/usr/tce/packages/intel-tce/intel-${COMP_VER}/bin/icx \
+  -DBLT_CXX_STD=c++14 \
+  -C ../host-configs/lc-builds/toss4/icpx_X.cmake \
+  -DENABLE_OPENMP=On \
+  -DRAJA_ENABLE_TBB=${USE_TBB} \
+  -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \
+  "$@" \
+  ..

From ea35c6b3a61cb2fa482d00ccc2cef24162f90bc6 Mon Sep 17 00:00:00 2001
From: Jason Burmark <burmark1@llnl.gov>
Date: Thu, 3 Nov 2022 16:48:17 -0700
Subject: [PATCH 27/94] Add toss4 icpc script

---
 host-configs/lc-builds/toss4/icpc_X.cmake | 16 +++++++
 scripts/lc-builds/toss4_gcc.sh            |  2 +-
 scripts/lc-builds/toss4_icpc.sh           | 52 +++++++++++++++++++++++
 scripts/lc-builds/toss4_icpx.sh           |  2 +-
 4 files changed, 70 insertions(+), 2 deletions(-)
 create mode 100755 host-configs/lc-builds/toss4/icpc_X.cmake
 create mode 100755 scripts/lc-builds/toss4_icpc.sh

diff --git a/host-configs/lc-builds/toss4/icpc_X.cmake b/host-configs/lc-builds/toss4/icpc_X.cmake
new file mode 100755
index 0000000000..9810b093f6
--- /dev/null
+++ b/host-configs/lc-builds/toss4/icpc_X.cmake
@@ -0,0 +1,16 @@
+###############################################################################
+# Copyright (c) 2016-22, Lawrence Livermore National Security, LLC
+# and RAJA project contributors. See the RAJA/LICENSE file for details.
+#
+# SPDX-License-Identifier: (BSD-3-Clause)
+###############################################################################
+
+set(RAJA_COMPILER "RAJA_COMPILER_ICC" CACHE STRING "")
+
+set(CMAKE_CXX_FLAGS_RELEASE "-O3 -march=native -ansi-alias -diag-disable cpu-dispatch" CACHE STRING "")
+set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -march=native -ansi-alias -diag-disable cpu-dispatch" CACHE STRING "")
+set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g" CACHE STRING "")
+
+set(RAJA_DATA_ALIGN 64 CACHE STRING "")
+
+set(RAJA_HOST_CONFIG_LOADED On CACHE BOOL "")
diff --git a/scripts/lc-builds/toss4_gcc.sh b/scripts/lc-builds/toss4_gcc.sh
index e33fd2ed83..532d2ff130 100755
--- a/scripts/lc-builds/toss4_gcc.sh
+++ b/scripts/lc-builds/toss4_gcc.sh
@@ -20,7 +20,7 @@ shift 1
 BUILD_SUFFIX=lc_toss4-gcc-${COMP_VER}
 
 echo
-echo "Creating build directory ${BUILD_SUFFIX} and generating configuration in it"
+echo "Creating build directory build_${BUILD_SUFFIX} and generating configuration in it"
 echo "Configuration extra arguments:"
 echo "   $@"
 echo
diff --git a/scripts/lc-builds/toss4_icpc.sh b/scripts/lc-builds/toss4_icpc.sh
new file mode 100755
index 0000000000..e4d091a6ae
--- /dev/null
+++ b/scripts/lc-builds/toss4_icpc.sh
@@ -0,0 +1,52 @@
+#!/usr/bin/env bash
+
+###############################################################################
+# Copyright (c) 2016-22, Lawrence Livermore National Security, LLC
+# and RAJA project contributors. See the RAJA/LICENSE file for details.
+#
+# SPDX-License-Identifier: (BSD-3-Clause)
+###############################################################################
+
+if [ "$1" == "" ]; then
+  echo
+  echo "You must pass a compiler version number to script. For example,"
+  echo "    toss4_icpc.sh 2021.6.0"
+  exit
+fi
+
+COMP_VER=$1
+shift 1
+
+USE_TBB=On
+
+BUILD_SUFFIX=lc_toss4-icpc-${COMP_VER}
+
+echo
+echo "Creating build directory build_${BUILD_SUFFIX} and generating configuration in it"
+echo "Configuration extra arguments:"
+echo "   $@"
+echo
+
+rm -rf build_${BUILD_SUFFIX} 2>/dev/null
+mkdir build_${BUILD_SUFFIX} && cd build_${BUILD_SUFFIX}
+
+module load cmake/3.21.1
+
+##
+# CMake option -DRAJA_ENABLE_FORCEINLINE_RECURSIVE=Off used to speed up compile
+# times at a potential cost of slower 'forall' execution.
+##
+
+cmake \
+  -DBLT_CXX_STD=c++14 \
+  -DCMAKE_BUILD_TYPE=Release \
+  -DCMAKE_CXX_COMPILER=/usr/tce/packages/intel-classic/intel-classic-${COMP_VER}/bin/icpc \
+  -DCMAKE_C_COMPILER=/usr/tce/packages/intel-classic/intel-classic-${COMP_VER}/bin/icc \
+  -DBLT_CXX_STD=c++14 \
+  -C ../host-configs/lc-builds/toss4/icpc_X.cmake \
+  -DRAJA_ENABLE_FORCEINLINE_RECURSIVE=Off \
+  -DENABLE_OPENMP=On \
+  -DRAJA_ENABLE_TBB=${USE_TBB} \
+  -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \
+  "$@" \
+  ..
diff --git a/scripts/lc-builds/toss4_icpx.sh b/scripts/lc-builds/toss4_icpx.sh
index dcbe916968..37181a4efd 100755
--- a/scripts/lc-builds/toss4_icpx.sh
+++ b/scripts/lc-builds/toss4_icpx.sh
@@ -22,7 +22,7 @@ USE_TBB=On
 BUILD_SUFFIX=lc_toss4-icpx-${COMP_VER}
 
 echo
-echo "Creating build directory ${BUILD_SUFFIX} and generating configuration in it"
+echo "Creating build directory build_${BUILD_SUFFIX} and generating configuration in it"
 echo "Configuration extra arguments:"
 echo "   $@"
 echo

From 31691cd27de76672ad8bdfe3d93e61b556e7eebb Mon Sep 17 00:00:00 2001
From: "Adrien M. BERNEDE" <a.bernede@woptim.dev>
Date: Fri, 4 Nov 2022 15:20:01 +0100
Subject: [PATCH 28/94] add latest camp version + update blt dependency

---
 scripts/spack_packages/camp/package.py | 1 +
 scripts/spack_packages/raja/package.py | 1 +
 2 files changed, 2 insertions(+)

diff --git a/scripts/spack_packages/camp/package.py b/scripts/spack_packages/camp/package.py
index 8f40f6b295..c0aae50c4e 100644
--- a/scripts/spack_packages/camp/package.py
+++ b/scripts/spack_packages/camp/package.py
@@ -29,6 +29,7 @@ class Camp(CMakePackage, CudaPackage, ROCmPackage):
     maintainers = ["trws"]
 
     version("main", branch="main", submodules="True")
+    version('2022.10.0', sha256='3561c3ef00bbcb61fe3183c53d49b110e54910f47e7fc689ad9ccce57e55d6b8')
     version("2022.03.2", sha256="bc4aaeacfe8f2912e28f7a36fc731ab9e481bee15f2c6daf0cb208eed3f201eb")
     version("2022.03.0", sha256="e9090d5ee191ea3a8e36b47a8fe78f3ac95d51804f1d986d931e85b8f8dad721")
     version("0.3.0", sha256="129431a049ca5825443038ad5a37a86ba6d09b2618d5fe65d35f83136575afdb")
diff --git a/scripts/spack_packages/raja/package.py b/scripts/spack_packages/raja/package.py
index 73be8b9674..e91687bdb1 100644
--- a/scripts/spack_packages/raja/package.py
+++ b/scripts/spack_packages/raja/package.py
@@ -64,6 +64,7 @@ class Raja(CachedCMakePackage, CudaPackage, ROCmPackage):
     variant("desul", default=False, description="Build Desul Atomics backend")
 
     depends_on("blt")
+    depends_on("blt@0.5.2:", type="build", when="@2022.10.0:")
     depends_on("blt@0.5.0:", type="build", when="@0.14.1:")
     depends_on("blt@0.4.1", type="build", when="@0.14.0")
     depends_on("blt@0.4.0:", type="build", when="@0.13.0")

From ec96ec3bde53aa6856edff66cc22f422ac8de4dc Mon Sep 17 00:00:00 2001
From: "Adrien M. BERNEDE" <a.bernede@woptim.dev>
Date: Fri, 4 Nov 2022 15:23:17 +0100
Subject: [PATCH 29/94] Minor cosmetic change

---
 scripts/spack_packages/camp/package.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/spack_packages/camp/package.py b/scripts/spack_packages/camp/package.py
index c0aae50c4e..446f8e8ded 100644
--- a/scripts/spack_packages/camp/package.py
+++ b/scripts/spack_packages/camp/package.py
@@ -29,7 +29,7 @@ class Camp(CMakePackage, CudaPackage, ROCmPackage):
     maintainers = ["trws"]
 
     version("main", branch="main", submodules="True")
-    version('2022.10.0', sha256='3561c3ef00bbcb61fe3183c53d49b110e54910f47e7fc689ad9ccce57e55d6b8')
+    version("2022.10.0", sha256="3561c3ef00bbcb61fe3183c53d49b110e54910f47e7fc689ad9ccce57e55d6b8")
     version("2022.03.2", sha256="bc4aaeacfe8f2912e28f7a36fc731ab9e481bee15f2c6daf0cb208eed3f201eb")
     version("2022.03.0", sha256="e9090d5ee191ea3a8e36b47a8fe78f3ac95d51804f1d986d931e85b8f8dad721")
     version("0.3.0", sha256="129431a049ca5825443038ad5a37a86ba6d09b2618d5fe65d35f83136575afdb")

From 18516a8c1aa095190ed846986d5299c4041143e3 Mon Sep 17 00:00:00 2001
From: "Adrien M. BERNEDE" <a.bernede@woptim.dev>
Date: Fri, 4 Nov 2022 16:50:21 +0100
Subject: [PATCH 30/94] Cosmetic sync from Umpire

---
 scripts/spack_packages/camp/package.py | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/scripts/spack_packages/camp/package.py b/scripts/spack_packages/camp/package.py
index 446f8e8ded..f5563ffe19 100644
--- a/scripts/spack_packages/camp/package.py
+++ b/scripts/spack_packages/camp/package.py
@@ -39,7 +39,7 @@ class Camp(CMakePackage, CudaPackage, ROCmPackage):
 
     # TODO: figure out gtest dependency and then set this default True.
     variant("tests", default=False, description="Build tests")
-    variant("openmp", default=False, description="Build OpenMP support")
+    variant("openmp", default=False, description="Build with OpenMP support")
 
     depends_on("cub", when="+cuda")
 
@@ -52,11 +52,11 @@ def cmake_args(self):
 
         options.append("-DBLT_SOURCE_DIR={0}".format(spec["blt"].prefix))
 
-        options.append("-DENABLE_OPENMP=" + ("On" if "+openmp" in spec else "Off"))
         if "+cuda" in spec:
-            options.extend(
-                ["-DENABLE_CUDA=ON", "-DCUDA_TOOLKIT_ROOT_DIR=%s" % (spec["cuda"].prefix)]
-            )
+            options.extend([
+                "-DENABLE_CUDA=ON",
+                "-DCUDA_TOOLKIT_ROOT_DIR=%s" % (spec["cuda"].prefix)
+            ])
 
             if not spec.satisfies("cuda_arch=none"):
                 cuda_arch = spec.variants["cuda_arch"].value
@@ -68,7 +68,10 @@ def cmake_args(self):
             options.append("-DENABLE_CUDA=OFF")
 
         if "+rocm" in spec:
-            options.extend(["-DENABLE_HIP=ON", "-DHIP_ROOT_DIR={0}".format(spec["hip"].prefix)])
+            options.extend([
+                "-DENABLE_HIP=ON",
+                "-DHIP_ROOT_DIR={0}".format(spec["hip"].prefix)
+            ])
 
             hip_repair_options(options, spec)
 
@@ -79,6 +82,7 @@ def cmake_args(self):
         else:
             options.append("-DENABLE_HIP=OFF")
 
+        options.append(self.define_from_variant("ENABLE_OPENMP", "openmp"))
         options.append(self.define_from_variant("ENABLE_TESTS", "tests"))
 
         return options

From fe27236878e91f3d609a2f16077d0293f20591fe Mon Sep 17 00:00:00 2001
From: "Adrien M. BERNEDE" <a.bernede@woptim.dev>
Date: Fri, 4 Nov 2022 17:06:51 +0100
Subject: [PATCH 31/94] Add newest BLT package to get blt@5.2.0

---
 scripts/spack_packages/blt/package.py | 40 +++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)
 create mode 100644 scripts/spack_packages/blt/package.py

diff --git a/scripts/spack_packages/blt/package.py b/scripts/spack_packages/blt/package.py
new file mode 100644
index 0000000000..87fb122dff
--- /dev/null
+++ b/scripts/spack_packages/blt/package.py
@@ -0,0 +1,40 @@
+# Copyright 2013-2022 Lawrence Livermore National Security, LLC and other
+# Spack Project Developers. See the top-level COPYRIGHT file for details.
+#
+# SPDX-License-Identifier: (Apache-2.0 OR MIT)
+
+from spack.package import *
+
+
+class Blt(Package):
+    """BLT is a streamlined CMake-based foundation for Building, Linking and
+    Testing large-scale high performance computing (HPC) applications."""
+
+    homepage = "https://github.com/LLNL/blt"
+    url = "https://github.com/LLNL/blt/archive/v0.4.0.tar.gz"
+    git = "https://github.com/LLNL/blt.git"
+    tags = ["radiuss"]
+
+    maintainers = ["white238", "davidbeckingsale"]
+
+    version("develop", branch="develop")
+    version("main", branch="main")
+    # Note: 0.4.0+ contains a breaking change to BLT created targets
+    #  if you export targets this could cause problems in downstream
+    #  projects if not handled properly. More info here:
+    #  https://llnl-blt.readthedocs.io/en/develop/tutorial/exporting_targets.html
+    version("0.5.2", sha256="95b924cfbb2bddd9b1a92e96603b2fd485a19721d59ddf8ff50baefc1714d7ea")
+    version("0.5.1", sha256="ff7e87eefc48704a0721b66174612b945955adaa0a56aa69dd0473074fa4badf")
+    version("0.5.0", sha256="5f680ef922d0e0a7ff1b1a5fc8aa107cd4f543ad888cbc9b12639bea72a6ab1f")
+    version("0.4.1", sha256="16cc3e067ddcf48b99358107e5035a17549f52dcc701a35cd18a9d9f536826c1")
+    version("0.4.0", sha256="f3bc45d28b9b2eb6df43b75d4f6f89a1557d73d012da7b75bac1be0574767193")
+    version("0.3.6", sha256="6276317c29e7ff8524fbea47d9288ddb40ac06e9f9da5e878bf9011e2c99bf71")
+    version("0.3.5", sha256="68a1c224bb9203461ae6f5ab0ff3c50b4a58dcce6c2d2799489a1811f425fb84")
+    version("0.3.0", sha256="bb917a67cb7335d6721c997ba9c5dca70506006d7bba5e0e50033dd0836481a5")
+    version("0.2.5", sha256="3a000f60194e47b3e5623cc528cbcaf88f7fea4d9620b3c7446ff6658dc582a5")
+    version("0.2.0", sha256="c0cadf1269c2feb189e398a356e3c49170bc832df95e5564e32bdbb1eb0fa1b3")
+
+    depends_on("cmake", type="run")
+
+    def install(self, spec, prefix):
+        install_tree(".", prefix)

From efb6f9a1efbe6bc59634377acdda62c6d4842fb6 Mon Sep 17 00:00:00 2001
From: Jason Burmark <burmark1@llnl.gov>
Date: Fri, 4 Nov 2022 11:38:23 -0700
Subject: [PATCH 32/94] Remove redundant BLT_CXX_STD

---
 scripts/lc-builds/toss3_icpc.sh | 1 -
 scripts/lc-builds/toss4_icpc.sh | 1 -
 scripts/lc-builds/toss4_icpx.sh | 1 -
 3 files changed, 3 deletions(-)

diff --git a/scripts/lc-builds/toss3_icpc.sh b/scripts/lc-builds/toss3_icpc.sh
index 5f2474bb44..7e0487dacd 100755
--- a/scripts/lc-builds/toss3_icpc.sh
+++ b/scripts/lc-builds/toss3_icpc.sh
@@ -50,7 +50,6 @@ module load cmake/3.20.2
 ##
 
 cmake \
-  -DBLT_CXX_STD=c++14 \
   -DCMAKE_BUILD_TYPE=Release \
   -DCMAKE_CXX_COMPILER=/usr/tce/packages/intel/intel-${COMP_VER}/bin/icpc \
   -DCMAKE_C_COMPILER=/usr/tce/packages/intel/intel-${COMP_VER}/bin/icc \
diff --git a/scripts/lc-builds/toss4_icpc.sh b/scripts/lc-builds/toss4_icpc.sh
index e4d091a6ae..bc1f9427cf 100755
--- a/scripts/lc-builds/toss4_icpc.sh
+++ b/scripts/lc-builds/toss4_icpc.sh
@@ -38,7 +38,6 @@ module load cmake/3.21.1
 ##
 
 cmake \
-  -DBLT_CXX_STD=c++14 \
   -DCMAKE_BUILD_TYPE=Release \
   -DCMAKE_CXX_COMPILER=/usr/tce/packages/intel-classic/intel-classic-${COMP_VER}/bin/icpc \
   -DCMAKE_C_COMPILER=/usr/tce/packages/intel-classic/intel-classic-${COMP_VER}/bin/icc \
diff --git a/scripts/lc-builds/toss4_icpx.sh b/scripts/lc-builds/toss4_icpx.sh
index 37181a4efd..7c2a3c4561 100755
--- a/scripts/lc-builds/toss4_icpx.sh
+++ b/scripts/lc-builds/toss4_icpx.sh
@@ -38,7 +38,6 @@ module load cmake/3.21.1
 #
 
 cmake \
-  -DBLT_CXX_STD=c++14 \
   -DCMAKE_BUILD_TYPE=Release \
   -DCMAKE_CXX_COMPILER=/usr/tce/packages/intel-tce/intel-${COMP_VER}/bin/icpx \
   -DCMAKE_C_COMPILER=/usr/tce/packages/intel-tce/intel-${COMP_VER}/bin/icx \

From 337ea9ec0b5f003570cffcb7d69511fb64227cfd Mon Sep 17 00:00:00 2001
From: Jason Burmark <burmark1@llnl.gov>
Date: Fri, 4 Nov 2022 11:49:34 -0700
Subject: [PATCH 33/94] Add icpc omp shared lib message

---
 scripts/lc-builds/toss4_icpc.sh | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/scripts/lc-builds/toss4_icpc.sh b/scripts/lc-builds/toss4_icpc.sh
index bc1f9427cf..90d72d92be 100755
--- a/scripts/lc-builds/toss4_icpc.sh
+++ b/scripts/lc-builds/toss4_icpc.sh
@@ -49,3 +49,15 @@ cmake \
   -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \
   "$@" \
   ..
+
+echo
+echo "***********************************************************************"
+echo
+echo "cd into directory build_${BUILD_SUFFIX} and run make to build RAJA"
+echo
+echo "  Please note that you may need to add some intel openmp libraries to your"
+echo "  LD_LIBRARY_PATH to run with openmp."
+echo
+echo "    LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:/usr/tce/packages/intel-classic-tce/intel-classic-2021.6.0/compiler/lib/intel64_lin"
+echo
+echo "***********************************************************************"

From 8ebec2caede03fc16a9e74411113f2321728e1ca Mon Sep 17 00:00:00 2001
From: Jason Burmark <burmark1@llnl.gov>
Date: Fri, 4 Nov 2022 15:32:14 -0700
Subject: [PATCH 34/94] use tce wrappers with icpc

---
 scripts/lc-builds/toss4_icpc.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scripts/lc-builds/toss4_icpc.sh b/scripts/lc-builds/toss4_icpc.sh
index 90d72d92be..4c46e80f9c 100755
--- a/scripts/lc-builds/toss4_icpc.sh
+++ b/scripts/lc-builds/toss4_icpc.sh
@@ -39,8 +39,8 @@ module load cmake/3.21.1
 
 cmake \
   -DCMAKE_BUILD_TYPE=Release \
-  -DCMAKE_CXX_COMPILER=/usr/tce/packages/intel-classic/intel-classic-${COMP_VER}/bin/icpc \
-  -DCMAKE_C_COMPILER=/usr/tce/packages/intel-classic/intel-classic-${COMP_VER}/bin/icc \
+  -DCMAKE_CXX_COMPILER=/usr/tce/packages/intel-classic-tce/intel-classic-${COMP_VER}/bin/icpc \
+  -DCMAKE_C_COMPILER=/usr/tce/packages/intel-classic-tce/intel-classic-${COMP_VER}/bin/icc \
   -DBLT_CXX_STD=c++14 \
   -C ../host-configs/lc-builds/toss4/icpc_X.cmake \
   -DRAJA_ENABLE_FORCEINLINE_RECURSIVE=Off \

From a7130048ba26bf0be1e72ec194b199700a546d46 Mon Sep 17 00:00:00 2001
From: "Adrien M. BERNEDE" <a.bernede@woptim.dev>
Date: Mon, 7 Nov 2022 15:46:26 +0100
Subject: [PATCH 35/94] Update camp package with latest spack develop + update
 camp requirement

---
 scripts/spack_packages/camp/package.py | 10 ++++++++++
 scripts/spack_packages/raja/package.py |  1 +
 2 files changed, 11 insertions(+)

diff --git a/scripts/spack_packages/camp/package.py b/scripts/spack_packages/camp/package.py
index f5563ffe19..4e25af8022 100644
--- a/scripts/spack_packages/camp/package.py
+++ b/scripts/spack_packages/camp/package.py
@@ -16,6 +16,16 @@ def hip_repair_options(options, spec):
     )
 
 
+def hip_repair_cache(options, spec):
+    # there is only one dir like this, but the version component is unknown
+    options.append(
+        cmake_cache_path(
+            "HIP_CLANG_INCLUDE_PATH",
+            glob.glob("{}/lib/clang/*/include".format(spec["llvm-amdgpu"].prefix))[0],
+        )
+    )
+
+
 class Camp(CMakePackage, CudaPackage, ROCmPackage):
     """
     Compiler agnostic metaprogramming library providing concepts,
diff --git a/scripts/spack_packages/raja/package.py b/scripts/spack_packages/raja/package.py
index e91687bdb1..557afbe422 100644
--- a/scripts/spack_packages/raja/package.py
+++ b/scripts/spack_packages/raja/package.py
@@ -72,6 +72,7 @@ class Raja(CachedCMakePackage, CudaPackage, ROCmPackage):
 
     depends_on("camp@0.2.2:0.2.3", when="@0.14.0")
     depends_on("camp@0.1.0", when="@0.10.0:0.13.0")
+    depends_on("camp@2022.10.0:", when="@2022.10.0:")
     depends_on("camp@2022.03.2:", when="@2022.03.0:")
     depends_on("camp@main", when="@main")
     depends_on("camp@main", when="@develop")

From 77229afe7c98561361127d3defdf323cea228bb0 Mon Sep 17 00:00:00 2001
From: "Adrien M. BERNEDE" <a.bernede@woptim.dev>
Date: Mon, 7 Nov 2022 16:08:34 +0100
Subject: [PATCH 36/94] Share utility function definition with camp

---
 scripts/spack_packages/raja/package.py | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/scripts/spack_packages/raja/package.py b/scripts/spack_packages/raja/package.py
index 557afbe422..16e1996a97 100644
--- a/scripts/spack_packages/raja/package.py
+++ b/scripts/spack_packages/raja/package.py
@@ -7,12 +7,9 @@
 import glob
 
 from spack.package import *
+from spack.pkg.builtin.camp import hip_repair_cache
 
 
-def hip_repair_entries(entries, spec):
-    # there is only one dir like this, but the version component is unknown
-    entries.append(cmake_cache_path("HIP_CLANG_INCLUDE_PATH", glob.glob("{}/lib/clang/*/include".format(spec["llvm-amdgpu"].prefix))[0]))
-
 class Raja(CachedCMakePackage, CudaPackage, ROCmPackage):
     """RAJA Parallel Framework."""
 
@@ -139,7 +136,7 @@ def initconfig_hardware_entries(self):
         if "+rocm" in spec:
             entries.append(cmake_cache_option("ENABLE_HIP", True))
             entries.append(cmake_cache_path("HIP_ROOT_DIR", "{0}".format(spec["hip"].prefix)))
-            hip_repair_entries(entries, spec)
+            hip_repair_cache(entries, spec)
             archs = self.spec.variants["amdgpu_target"].value
             if archs != "none":
                 arch_str = ",".join(archs)

From ab01fa1dee8895c97cbd101fbd8d6f0529b9056f Mon Sep 17 00:00:00 2001
From: Kenny Weiss <kennyweiss@users.noreply.github.com>
Date: Mon, 7 Nov 2022 18:56:00 -0800
Subject: [PATCH 37/94] Exports `roctx` when present

---
 cmake/SetupPackages.cmake | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/cmake/SetupPackages.cmake b/cmake/SetupPackages.cmake
index 45dadbee4b..9e057f3a50 100644
--- a/cmake/SetupPackages.cmake
+++ b/cmake/SetupPackages.cmake
@@ -101,13 +101,16 @@ if (RAJA_ENABLE_HIP AND RAJA_ENABLE_ROCTX)
   include(FindRoctracer)
   blt_import_library(NAME roctx
                      INCLUDES ${ROCTX_INCLUDE_DIRS}
-                     LIBRARIES ${ROCTX_LIBRARIES})
+                     LIBRARIES ${ROCTX_LIBRARIES}
+                     EXPORTABLE ON
+                     TREAT_INCLUDES_AS_SYSTEM ON)
 endif ()
 
 set(TPL_DEPS)
 blt_list_append(TO TPL_DEPS ELEMENTS nvtoolsext IF RAJA_ENABLE_NV_TOOLS_EXT)
 blt_list_append(TO TPL_DEPS ELEMENTS cub IF RAJA_ENABLE_EXTERNAL_CUB)
 blt_list_append(TO TPL_DEPS ELEMENTS rocPRIM IF RAJA_ENABLE_EXTERNAL_ROCPRIM)
+blt_list_append(TO TPL_DEPS ELEMENTS roctx IF RAJA_ENABLE_ROCTX)
 
 set(RAJA_NEEDS_BLT_TPLS False)
 if (RAJA_ENABLE_CUDA OR RAJA_ENABLE_HIP OR RAJA_ENABLE_OPENMP OR RAJA_ENABLE_MPI)

From b799a5e52edf77e2c3aed104066de7f23f39ce1a Mon Sep 17 00:00:00 2001
From: "Adrien M. BERNEDE" <a.bernede@woptim.dev>
Date: Thu, 10 Nov 2022 11:06:29 +0100
Subject: [PATCH 38/94] Adds a proper compiler entry + attempt to use hip CXX
 compiler

---
 scripts/spack_packages/raja/package.py | 52 +++++++++++++++++++-------
 1 file changed, 38 insertions(+), 14 deletions(-)

diff --git a/scripts/spack_packages/raja/package.py b/scripts/spack_packages/raja/package.py
index 16e1996a97..113d8e5956 100644
--- a/scripts/spack_packages/raja/package.py
+++ b/scripts/spack_packages/raja/package.py
@@ -108,13 +108,50 @@ def cache_name(self):
         hostname = socket.gethostname()
         if "SYS_TYPE" in env:
             hostname = hostname.rstrip("1234567890")
-        return "{0}-{1}-{2}@{3}.cmake".format(
+        return "{0}-{1}-{2}@{3}#{4}.cmake".format(
             hostname,
             self._get_sys_type(self.spec),
             self.spec.compiler.name,
             self.spec.compiler.version,
+            self.spec.dag_hash
         )
 
+    def initconfig_compiler_entries(self):
+        spec = self.spec
+        # Default entries are already defined in CachedCMakePackage, inherit them:
+        entries = super(Raja, self).initconfig_compiler_entries()
+
+        # Switch to hip as a CPP compiler.
+        # adrienbernede-22-11:
+        #   This was only done in upstream Spack raja package.
+        #   I could not find the equivalent logic in Spack source, so keeping it.
+        if "+rocm" in spec:
+            entries.insert(0, cmake_cache_path("CMAKE_CXX_COMPILER", spec["hip"].hipcc))
+
+        # Override CachedCMakePackage CMAKE_C_FLAGS and CMAKE_CXX_FLAGS add
+        # +libcpp specific flags
+        flags = spec.compiler_flags
+
+        # use global spack compiler flags
+        cppflags = " ".join(flags["cppflags"])
+        if cppflags:
+            # avoid always ending up with " " with no flags defined
+            cppflags += " "
+
+        cflags = cppflags + " ".join(flags["cflags"])
+        if "+libcpp" in spec:
+            cflags += " ".join([cflags,"-DGTEST_HAS_CXXABI_H_=0"])
+        if cflags:
+            entries.append(cmake_cache_string("CMAKE_C_FLAGS", cflags))
+
+        cxxflags = cppflags + " ".join(flags["cxxflags"])
+        if "+libcpp" in spec:
+            cxxflags += " ".join([cxxflags,"-stdlib=libc++ -DGTEST_HAS_CXXABI_H_=0"])
+        if cxxflags:
+            entries.append(cmake_cache_string("CMAKE_CXX_FLAGS", cxxflags))
+
+        return entries
+
     def initconfig_hardware_entries(self):
         spec = self.spec
         entries = super(Raja, self).initconfig_hardware_entries()
@@ -159,19 +196,6 @@ def initconfig_package_entries(self):
 
         entries.append(cmake_cache_option("RAJA_ENABLE_DESUL_ATOMICS", "+desul" in spec))
 
-        # use global spack compiler flags
-        cflags = " ".join(spec.compiler_flags["cflags"])
-        if "+libcpp" in spec:
-            cflags += " ".join([cflags,"-DGTEST_HAS_CXXABI_H_=0"])
-        if cflags:
-            entries.append(cmake_cache_string("CMAKE_C_FLAGS", cflags))
-
-        cxxflags = " ".join(spec.compiler_flags["cxxflags"])
-        if "+libcpp" in spec:
-            cxxflags += " ".join([cxxflags,"-stdlib=libc++ -DGTEST_HAS_CXXABI_H_=0"])
-        if cxxflags:
-            entries.append(cmake_cache_string("CMAKE_CXX_FLAGS", cxxflags))
-
         if "+desul" in spec:
             entries.append(cmake_cache_string("BLT_CXX_STD","c++14"))
             if "+cuda" in spec:

From d76633a08ff7397653da0d5536f617d646335236 Mon Sep 17 00:00:00 2001
From: "Adrien M. BERNEDE" <a.bernede@woptim.dev>
Date: Thu, 10 Nov 2022 11:23:22 +0100
Subject: [PATCH 39/94] Fix call to get dag_hash

---
 scripts/spack_packages/raja/package.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/spack_packages/raja/package.py b/scripts/spack_packages/raja/package.py
index 113d8e5956..5798a961e7 100644
--- a/scripts/spack_packages/raja/package.py
+++ b/scripts/spack_packages/raja/package.py
@@ -113,7 +113,7 @@ def cache_name(self):
             self._get_sys_type(self.spec),
             self.spec.compiler.name,
             self.spec.compiler.version,
-            self.spec.dag_hash
+            self.spec.dag_hash(8)
         )
 
     def initconfig_compiler_entries(self):

From c04d7425825ab5460369f439252c6f7568d2fc8c Mon Sep 17 00:00:00 2001
From: "Adrien M. BERNEDE" <a.bernede@woptim.dev>
Date: Thu, 10 Nov 2022 11:33:39 +0100
Subject: [PATCH 40/94] Fix "#" not allowed

---
 scripts/spack_packages/raja/package.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/spack_packages/raja/package.py b/scripts/spack_packages/raja/package.py
index 5798a961e7..e414fa70bd 100644
--- a/scripts/spack_packages/raja/package.py
+++ b/scripts/spack_packages/raja/package.py
@@ -108,7 +108,7 @@ def cache_name(self):
         hostname = socket.gethostname()
         if "SYS_TYPE" in env:
             hostname = hostname.rstrip("1234567890")
-        return "{0}-{1}-{2}@{3}#{4}.cmake".format(
+        return "{0}-{1}-{2}@{3}-{4}.cmake".format(
             hostname,
             self._get_sys_type(self.spec),
             self.spec.compiler.name,

From 18c14e2162d7d901cff1a66118e40f9251efcb4a Mon Sep 17 00:00:00 2001
From: "Adrien M. BERNEDE" <a.bernede@woptim.dev>
Date: Thu, 10 Nov 2022 15:35:18 +0100
Subject: [PATCH 41/94] Cosmetic reordering + add build type

---
 scripts/spack_packages/raja/package.py | 22 ++++++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/scripts/spack_packages/raja/package.py b/scripts/spack_packages/raja/package.py
index e414fa70bd..875e63107d 100644
--- a/scripts/spack_packages/raja/package.py
+++ b/scripts/spack_packages/raja/package.py
@@ -194,6 +194,24 @@ def initconfig_package_entries(self):
 
         option_prefix = "RAJA_" if spec.satisfies("@0.14.0:") else ""
 
+        # TPL locations
+        entries.append("#------------------{0}".format("-" * 60))
+        entries.append("# TPLs")
+        entries.append("#------------------{0}\n".format("-" * 60))
+
+        entries.append(cmake_cache_path("BLT_SOURCE_DIR", spec["blt"].prefix))
+        if "camp" in self.spec:
+            entries.append(cmake_cache_path("camp_DIR", spec["camp"].prefix))
+
+        # Build options
+        entries.append("#------------------{0}".format("-" * 60))
+        entries.append("# Build Options")
+        entries.append("#------------------{0}\n".format("-" * 60))
+
+        entries.append(cmake_cache_string(
+            "CMAKE_BUILD_TYPE", spec.variants["build_type"].value))
+        entries.append(cmake_cache_option("BUILD_SHARED_LIBS", "+shared" in spec))
+
         entries.append(cmake_cache_option("RAJA_ENABLE_DESUL_ATOMICS", "+desul" in spec))
 
         if "+desul" in spec:
@@ -201,10 +219,6 @@ def initconfig_package_entries(self):
             if "+cuda" in spec:
                 entries.append(cmake_cache_string("CMAKE_CUDA_STANDARD", "14"))
 
-        entries.append(cmake_cache_path("BLT_SOURCE_DIR", spec["blt"].prefix))
-        if "camp" in self.spec:
-            entries.append(cmake_cache_path("camp_DIR", spec["camp"].prefix))
-        entries.append(cmake_cache_option("BUILD_SHARED_LIBS", "+shared" in spec))
         entries.append(
             cmake_cache_option("{}ENABLE_EXAMPLES".format(option_prefix), "+examples" in spec)
         )

From af59e4f074e967c132e33a00f2636b90c15144d3 Mon Sep 17 00:00:00 2001
From: Jason Burmark <burmark1@llnl.gov>
Date: Fri, 11 Nov 2022 10:37:12 -0800
Subject: [PATCH 42/94] Add forall_param_pack where missing

---
 include/RAJA/policy/openmp_target/kernel/For.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/RAJA/policy/openmp_target/kernel/For.hpp b/include/RAJA/policy/openmp_target/kernel/For.hpp
index c5d60d01a9..08f64e227a 100644
--- a/include/RAJA/policy/openmp_target/kernel/For.hpp
+++ b/include/RAJA/policy/openmp_target/kernel/For.hpp
@@ -57,7 +57,7 @@ struct StatementExecutor<statement::For<ArgumentId, omp_target_parallel_for_exec
     using len_t = decltype(len);
 
     auto r = resources::Omp::get_default();
-    forall_impl(r, omp_target_parallel_for_exec<N>{}, TypedRangeSegment<len_t>(0, len), for_wrapper);
+    forall_impl(r, omp_target_parallel_for_exec<N>{}, TypedRangeSegment<len_t>(0, len), for_wrapper, RAJA::expt::get_empty_forall_param_pack());
   }
 };
 

From 3196e70f638fafdcab206d1f7a0ec3570428d597 Mon Sep 17 00:00:00 2001
From: Arturo Vargas <vargas45@llnl.gov>
Date: Mon, 14 Nov 2022 10:56:04 -0800
Subject: [PATCH 43/94] initial try to add chai support to launch

---
 include/RAJA/policy/cuda/launch.hpp | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/include/RAJA/policy/cuda/launch.hpp b/include/RAJA/policy/cuda/launch.hpp
index 9221925176..0f5b79aa50 100644
--- a/include/RAJA/policy/cuda/launch.hpp
+++ b/include/RAJA/policy/cuda/launch.hpp
@@ -69,11 +69,23 @@ struct LaunchExecute<RAJA::cuda_launch_t<async, 1>> {
                           static_cast<cuda_dim_member_t>(params.threads.value[1]),
                           static_cast<cuda_dim_member_t>(params.threads.value[2]) };
 
+    std::cout<<"launching <RAJA::cuda_launch_t<async, 1>"<<std::endl;
     // Only launch kernel if we have something to iterate over
     constexpr cuda_dim_member_t zero = 0;
     if ( gridSize.x  > zero && gridSize.y  > zero && gridSize.z  > zero &&
          blockSize.x > zero && blockSize.y > zero && blockSize.z > zero ) {
 
+
+      util::PluginContext context{util::make_context<RAJA::cuda_launch_t<async,1>>()};
+      util::callPreCapturePlugins(context);
+
+      using RAJA::util::trigger_updates_before;
+      trigger_updates_before(body_in);
+
+      util::callPostCapturePlugins(context);
+
+      util::callPreLaunchPlugins(context);
+
       RAJA_FT_BEGIN;
       {
         //
@@ -89,6 +101,9 @@ struct LaunchExecute<RAJA::cuda_launch_t<async, 1>> {
       }
 
       RAJA_FT_END;
+
+      util::callPostLaunchPlugins(context);
+
     }
 
   };

From 14c9f3f23e1f6f74b6840f5f13ca44cdca6db94d Mon Sep 17 00:00:00 2001
From: Jason Burmark <burmark1@llnl.gov>
Date: Mon, 14 Nov 2022 12:16:18 -0800
Subject: [PATCH 44/94] Use compile time blocksize in forall

---
 include/RAJA/policy/cuda/forall.hpp | 10 ++++++----
 include/RAJA/policy/hip/forall.hpp  | 10 ++++++----
 2 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/include/RAJA/policy/cuda/forall.hpp b/include/RAJA/policy/cuda/forall.hpp
index bc0936c353..ce278c4ee2 100644
--- a/include/RAJA/policy/cuda/forall.hpp
+++ b/include/RAJA/policy/cuda/forall.hpp
@@ -81,15 +81,17 @@ cuda_dim_t getGridDim(cuda_dim_member_t len, cuda_dim_t blockDim)
  *
  ******************************************************************************
  */
+template <size_t BlockSize>
 __device__ __forceinline__ unsigned int getGlobalIdx_1D_1D()
 {
   unsigned int blockId = blockIdx.x;
-  unsigned int threadId = blockId * blockDim.x + threadIdx.x;
+  unsigned int threadId = blockId * BlockSize + threadIdx.x;
   return threadId;
 }
+template <size_t BlockSize>
 __device__ __forceinline__ unsigned int getGlobalNumThreads_1D_1D()
 {
-  unsigned int numThreads = blockDim.x * gridDim.x;
+  unsigned int numThreads = BlockSize * gridDim.x;
   return numThreads;
 }
 
@@ -144,7 +146,7 @@ __launch_bounds__(BlockSize, BlocksPerSM) __global__
   using RAJA::internal::thread_privatize;
   auto privatizer = thread_privatize(loop_body);
   auto& body = privatizer.get_priv();
-  auto ii = static_cast<IndexType>(getGlobalIdx_1D_1D());
+  auto ii = static_cast<IndexType>(getGlobalIdx_1D_1D<BlockSize>());
   if (ii < length) {
     body(idx[ii]);
   }
@@ -167,7 +169,7 @@ __launch_bounds__(BlockSize, 1) __global__
   using RAJA::internal::thread_privatize;
   auto privatizer = thread_privatize(loop_body);
   auto& body = privatizer.get_priv();
-  auto ii = static_cast<IndexType>(getGlobalIdx_1D_1D());
+  auto ii = static_cast<IndexType>(getGlobalIdx_1D_1D<BlockSize>());
   if ( ii < length )
   {
     RAJA::expt::invoke_body( f_params, body, idx[ii] );
diff --git a/include/RAJA/policy/hip/forall.hpp b/include/RAJA/policy/hip/forall.hpp
index ab3109b354..1aa447633e 100644
--- a/include/RAJA/policy/hip/forall.hpp
+++ b/include/RAJA/policy/hip/forall.hpp
@@ -77,15 +77,17 @@ hip_dim_t getGridDim(hip_dim_member_t len, hip_dim_t blockDim)
  *
  ******************************************************************************
  */
+template <size_t BlockSize>
 __device__ __forceinline__ unsigned int getGlobalIdx_1D_1D()
 {
   unsigned int blockId = blockIdx.x;
-  unsigned int threadId = blockId * blockDim.x + threadIdx.x;
+  unsigned int threadId = blockId * BlockSize + threadIdx.x;
   return threadId;
 }
+template <size_t BlockSize>
 __device__ __forceinline__ unsigned int getGlobalNumThreads_1D_1D()
 {
-  unsigned int numThreads = blockDim.x * gridDim.x;
+  unsigned int numThreads = BlockSize * gridDim.x;
   return numThreads;
 }
 
@@ -139,7 +141,7 @@ __launch_bounds__(BlockSize, 1) __global__
   using RAJA::internal::thread_privatize;
   auto privatizer = thread_privatize(loop_body);
   auto& body = privatizer.get_priv();
-  auto ii = static_cast<IndexType>(getGlobalIdx_1D_1D());
+  auto ii = static_cast<IndexType>(getGlobalIdx_1D_1D<BlockSize>());
   if (ii < length) {
     body(idx[ii]);
   }
@@ -161,7 +163,7 @@ __launch_bounds__(BlockSize, 1) __global__
   using RAJA::internal::thread_privatize;
   auto privatizer = thread_privatize(loop_body);
   auto& body = privatizer.get_priv();
-  auto ii = static_cast<IndexType>(getGlobalIdx_1D_1D());
+  auto ii = static_cast<IndexType>(getGlobalIdx_1D_1D<BlockSize>());
   if ( ii < length )
   {
     RAJA::expt::invoke_body( f_params, body, idx[ii] );

From 1d38664ee5ea35bf405689c275a7b96e2114089e Mon Sep 17 00:00:00 2001
From: Arturo Vargas <vargas45@llnl.gov>
Date: Mon, 14 Nov 2022 12:58:33 -0800
Subject: [PATCH 45/94] reorg of plugin calls

---
 include/RAJA/policy/cuda/launch.hpp | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/include/RAJA/policy/cuda/launch.hpp b/include/RAJA/policy/cuda/launch.hpp
index 0f5b79aa50..308641e80e 100644
--- a/include/RAJA/policy/cuda/launch.hpp
+++ b/include/RAJA/policy/cuda/launch.hpp
@@ -76,16 +76,6 @@ struct LaunchExecute<RAJA::cuda_launch_t<async, 1>> {
          blockSize.x > zero && blockSize.y > zero && blockSize.z > zero ) {
 
 
-      util::PluginContext context{util::make_context<RAJA::cuda_launch_t<async,1>>()};
-      util::callPreCapturePlugins(context);
-
-      using RAJA::util::trigger_updates_before;
-      trigger_updates_before(body_in);
-
-      util::callPostCapturePlugins(context);
-
-      util::callPreLaunchPlugins(context);
-
       RAJA_FT_BEGIN;
       {
         //
@@ -93,17 +83,27 @@ struct LaunchExecute<RAJA::cuda_launch_t<async, 1>> {
         //
         BODY body = RAJA::cuda::make_launch_body(gridSize, blockSize, params.shared_mem_size, cuda_res, std::forward<BODY_IN>(body_in));
 
+        util::PluginContext context{util::make_context<RAJA::cuda_launch_t<async,1>>()};
+        util::callPreCapturePlugins(context);
+
+        using RAJA::util::trigger_updates_before;
+        trigger_updates_before(body);
+
+        util::callPostCapturePlugins(context);
+
+        util::callPreLaunchPlugins(context);
+
         //
         // Launch the kernel
         //
         void *args[] = {(void*)&body};
         RAJA::cuda::launch((const void*)func, gridSize, blockSize, args, params.shared_mem_size, cuda_res, async, kernel_name);
+
+        util::callPostLaunchPlugins(context);
       }
 
       RAJA_FT_END;
 
-      util::callPostLaunchPlugins(context);
-
     }
 
   };

From d8e1edfde04deca756e93c1c38788aa934cd9c1f Mon Sep 17 00:00:00 2001
From: Arturo Vargas <vargas45@llnl.gov>
Date: Mon, 14 Nov 2022 13:07:30 -0800
Subject: [PATCH 46/94] working version of launch + chai

---
 include/RAJA/policy/cuda/launch.hpp | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/include/RAJA/policy/cuda/launch.hpp b/include/RAJA/policy/cuda/launch.hpp
index 308641e80e..0cf23eb9ad 100644
--- a/include/RAJA/policy/cuda/launch.hpp
+++ b/include/RAJA/policy/cuda/launch.hpp
@@ -69,7 +69,6 @@ struct LaunchExecute<RAJA::cuda_launch_t<async, 1>> {
                           static_cast<cuda_dim_member_t>(params.threads.value[1]),
                           static_cast<cuda_dim_member_t>(params.threads.value[2]) };
 
-    std::cout<<"launching <RAJA::cuda_launch_t<async, 1>"<<std::endl;
     // Only launch kernel if we have something to iterate over
     constexpr cuda_dim_member_t zero = 0;
     if ( gridSize.x  > zero && gridSize.y  > zero && gridSize.z  > zero &&
@@ -83,11 +82,14 @@ struct LaunchExecute<RAJA::cuda_launch_t<async, 1>> {
         //
         BODY body = RAJA::cuda::make_launch_body(gridSize, blockSize, params.shared_mem_size, cuda_res, std::forward<BODY_IN>(body_in));
 
+        //
+        // Configure plugins
+        //
         util::PluginContext context{util::make_context<RAJA::cuda_launch_t<async,1>>()};
         util::callPreCapturePlugins(context);
 
         using RAJA::util::trigger_updates_before;
-        trigger_updates_before(body);
+        auto p_body = trigger_updates_before(body);
 
         util::callPostCapturePlugins(context);
 
@@ -96,7 +98,7 @@ struct LaunchExecute<RAJA::cuda_launch_t<async, 1>> {
         //
         // Launch the kernel
         //
-        void *args[] = {(void*)&body};
+        void *args[] = {(void*)&p_body};
         RAJA::cuda::launch((const void*)func, gridSize, blockSize, args, params.shared_mem_size, cuda_res, async, kernel_name);
 
         util::callPostLaunchPlugins(context);

From 3643233a3a994477d3c3b2a0b928995d885cc44a Mon Sep 17 00:00:00 2001
From: Arturo Vargas <vargas45@llnl.gov>
Date: Mon, 14 Nov 2022 13:46:46 -0800
Subject: [PATCH 47/94] plugin support for cuda and hip

---
 include/RAJA/policy/cuda/#launch.hpp# | 1202 +++++++++++++++++++++++++
 include/RAJA/policy/cuda/launch.hpp   |   63 +-
 include/RAJA/policy/hip/launch.hpp    |   68 +-
 3 files changed, 1319 insertions(+), 14 deletions(-)
 create mode 100644 include/RAJA/policy/cuda/#launch.hpp#

diff --git a/include/RAJA/policy/cuda/#launch.hpp# b/include/RAJA/policy/cuda/#launch.hpp#
new file mode 100644
index 0000000000..d98dd20c5b
--- /dev/null
+++ b/include/RAJA/policy/cuda/#launch.hpp#
@@ -0,0 +1,1202 @@
+/*!
+ ******************************************************************************
+ *
+ * \file
+ *
+ * \brief   RAJA header file containing user interface for RAJA::launch::cuda
+ *
+ ******************************************************************************
+ */
+
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2016-22, Lawrence Livermore National Security, LLC
+// and RAJA project contributors. See the RAJA/LICENSE file for details.
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+#ifndef RAJA_pattern_launch_cuda_HPP
+#define RAJA_pattern_launch_cuda_HPP
+
+#include "RAJA/pattern/launch/launch_core.hpp"
+#include "RAJA/pattern/detail/privatizer.hpp"
+#include "RAJA/policy/cuda/policy.hpp"
+#include "RAJA/policy/cuda/MemUtils_CUDA.hpp"
+#include "RAJA/policy/cuda/raja_cudaerrchk.hpp"
+#include "RAJA/util/resource.hpp"
+
+namespace RAJA
+{
+
+template <typename BODY>
+__global__ void launch_global_fcn(BODY body_in)
+{
+
+  LaunchContext ctx;
+
+  using RAJA::internal::thread_privatize;
+  auto privatizer = thread_privatize(body_in);
+  auto& body = privatizer.get_priv();
+
+  //Set pointer to shared memory
+  extern __shared__ char raja_shmem_ptr[];
+  ctx.shared_mem_ptr = raja_shmem_ptr;
+
+  body(ctx);
+}
+
+template <bool async>
+struct LaunchExecute<RAJA::cuda_launch_t<async, 1>> {
+// cuda_launch_t num_threads set to 1, but not used in launch of kernel
+
+  template <typename BODY_IN>
+  static void exec(const LaunchParams &params, const char *kernel_name, BODY_IN &&body_in)
+  {
+
+    using BODY = camp::decay<BODY_IN>;
+
+    auto func = launch_global_fcn<BODY>;
+
+    resources::Cuda cuda_res = resources::Cuda::get_default();
+    //
+    // Compute the number of blocks and threads
+    //
+    cuda_dim_t gridSize{ static_cast<cuda_dim_member_t>(params.teams.value[0]),
+                         static_cast<cuda_dim_member_t>(params.teams.value[1]),
+                         static_cast<cuda_dim_member_t>(params.teams.value[2]) };
+
+    cuda_dim_t blockSize{ static_cast<cuda_dim_member_t>(params.threads.value[0]),
+                          static_cast<cuda_dim_member_t>(params.threads.value[1]),
+                          static_cast<cuda_dim_member_t>(params.threads.value[2]) };
+
+    // Only launch kernel if we have something to iterate over
+    constexpr cuda_dim_member_t zero = 0;
+    if ( gridSize.x  > zero && gridSize.y  > zero && gridSize.z  > zero &&
+         blockSize.x > zero && blockSize.y > zero && blockSize.z > zero ) {
+
+
+      RAJA_FT_BEGIN;
+      {
+        //
+        // Privatize the loop_body, using make_launch_body to setup reductions
+        //
+        BODY body = RAJA::cuda::make_launch_body(gridSize, blockSize, params.shared_mem_size, cuda_res, std::forward<BODY_IN>(body_in));
+
+        //
+        // Configure plugins
+        //
+        util::PluginContext context{util::make_context<RAJA::cuda_launch_t<async,1>>()};
+        util::callPreCapturePlugins(context);
+
+        using RAJA::util::trigger_updates_before;
+        auto p_body = trigger_updates_before(body);
+
+        util::callPostCapturePlugins(context);
+
+        util::callPreLaunchPlugins(context);
+
+        //
+        // Launch the kernel
+        //
+        void *args[] = {(void*)&p_body};
+        RAJA::cuda::launch((const void*)func, gridSize, blockSize, args, params.shared_mem_size, cuda_res, async, kernel_name);
+
+        util::callPostLaunchPlugins(context);
+      }
+
+      RAJA_FT_END;
+
+    }
+
+  };
+
+  template <typename BODY_IN>
+  static resources::EventProxy<resources::Resource>
+  exec(RAJA::resources::Resource res, const LaunchParams &params, const char *kernel_name, BODY_IN &&body_in)
+  {
+
+    using BODY = camp::decay<BODY_IN>;
+
+    auto func = launch_global_fcn<BODY>;
+
+    // Get the concrete resource
+    resources::Cuda cuda_res = res.get<RAJA::resources::Cuda>();
+
+    //
+    // Compute the number of blocks and threads
+    //
+
+    cuda_dim_t gridSize{ static_cast<cuda_dim_member_t>(params.teams.value[0]),
+                         static_cast<cuda_dim_member_t>(params.teams.value[1]),
+                         static_cast<cuda_dim_member_t>(params.teams.value[2]) };
+
+    cuda_dim_t blockSize{ static_cast<cuda_dim_member_t>(params.threads.value[0]),
+                          static_cast<cuda_dim_member_t>(params.threads.value[1]),
+                          static_cast<cuda_dim_member_t>(params.threads.value[2]) };
+
+    // Only launch kernel if we have something to iterate over
+    constexpr cuda_dim_member_t zero = 0;
+    if ( gridSize.x  > zero && gridSize.y  > zero && gridSize.z  > zero &&
+         blockSize.x > zero && blockSize.y > zero && blockSize.z > zero ) {
+
+      RAJA_FT_BEGIN;
+
+      {
+        //
+        // Privatize the loop_body, using make_launch_body to setup reductions
+        //
+        BODY body = RAJA::cuda::make_launch_body(
+            gridSize, blockSize, params.shared_mem_size, cuda_res, std::forward<BODY_IN>(body_in));
+
+        //
+        // Configure plugins
+        //
+        util::PluginContext context{util::make_context<RAJA::cuda_launch_t<async,1>>()};
+        util::callPreCapturePlugins(context);
+
+        using RAJA::util::trigger_updates_before;
+        auto p_body = trigger_updates_before(body);
+
+        util::callPostCapturePlugins(context);
+
+        util::callPreLaunchPlugins(context);
+
+        //
+        // Launch the kernel
+        //
+        void *args[] = {(void*)&p_body};
+        RAJA::cuda::launch((const void*)func, gridSize, blockSize, args, params.shared_mem_size, cuda_res, async, kernel_name);
+
+        util::callPostLaunchPlugins(context);
+      }
+
+      RAJA_FT_END;
+    }
+
+    return resources::EventProxy<resources::Resource>(res);
+  }
+
+};
+
+
+template <typename BODY, int num_threads, size_t BLOCKS_PER_SM>
+__launch_bounds__(num_threads, BLOCKS_PER_SM) __global__
+    void launch_global_fcn_fixed(BODY body_in)
+{
+
+  LaunchContext ctx;
+
+  using RAJA::internal::thread_privatize;
+  auto privatizer = thread_privatize(body_in);
+  auto& body = privatizer.get_priv();
+
+  //Set pointer to shared memory
+  extern __shared__ char raja_shmem_ptr[];
+  ctx.shared_mem_ptr = raja_shmem_ptr;
+
+  body(ctx);
+}
+
+template <bool async, int nthreads, size_t BLOCKS_PER_SM>
+struct LaunchExecute<RAJA::policy::cuda::cuda_launch_explicit_t<async, nthreads, BLOCKS_PER_SM>> {
+
+  template <typename BODY_IN>
+  static void exec(const LaunchParams &params, const char *kernel_name, BODY_IN &&body_in)
+  {
+
+    using BODY = camp::decay<BODY_IN>;
+
+    auto func = launch_global_fcn_fixed<BODY, nthreads, BLOCKS_PER_SM>;
+
+    resources::Cuda cuda_res = resources::Cuda::get_default();
+
+    //
+    // Compute the number of blocks and threads
+    //
+
+    cuda_dim_t gridSize{ static_cast<cuda_dim_member_t>(params.teams.value[0]),
+                         static_cast<cuda_dim_member_t>(params.teams.value[1]),
+                         static_cast<cuda_dim_member_t>(params.teams.value[2]) };
+
+    cuda_dim_t blockSize{ static_cast<cuda_dim_member_t>(params.threads.value[0]),
+                          static_cast<cuda_dim_member_t>(params.threads.value[1]),
+                          static_cast<cuda_dim_member_t>(params.threads.value[2]) };
+
+    // Only launch kernel if we have something to iterate over
+    constexpr cuda_dim_member_t zero = 0;
+    if ( gridSize.x  > zero && gridSize.y  > zero && gridSize.z  > zero &&
+         blockSize.x > zero && blockSize.y > zero && blockSize.z > zero ) {
+
+      RAJA_FT_BEGIN;
+
+      {
+        //
+        // Privatize the loop_body, using make_launch_body to setup reductions
+        //
+        BODY body = RAJA::cuda::make_launch_body(
+            gridSize, blockSize, params.shared_mem_size, cuda_res, std::forward<BODY_IN>(body_in));
+
+        //
+        // Configure plugins
+        //
+        util::PluginContext context{util::make_context<RAJA::policy::cuda::cuda_launch_explicit_t<async, nthreads, BLOCKS_PER_SM>>()};
+        util::callPreCapturePlugins(context);
+
+        using RAJA::util::trigger_updates_before;
+        auto p_body = trigger_updates_before(body);
+
+        util::callPostCapturePlugins(context);
+
+        util::callPreLaunchPlugins(context);
+
+        //
+        // Launch the kernel
+        //
+        void *args[] = {(void*)&p_body};
+        RAJA::cuda::launch((const void*)func, gridSize, blockSize, args, params.shared_mem_size, cuda_res, async, kernel_name);
+
+        util::callPostLaunchPlugins(context);
+      }
+
+      RAJA_FT_END;
+    }
+
+  }
+}
+
+  template <typename BODY_IN>
+  static resources::EventProxy<resources::Resource>
+  exec(RAJA::resources::Resource res, const LaunchParams &params, const char *kernel_name, BODY_IN &&body_in)
+  {
+
+    using BODY = camp::decay<BODY_IN>;
+
+    auto func = launch_global_fcn_fixed<BODY, nthreads, BLOCKS_PER_SM>;
+
+    //Get the concrete resource
+    resources::Cuda cuda_res = res.get<RAJA::resources::Cuda>();
+
+    //
+    // Compute the number of blocks and threads
+    //
+
+    cuda_dim_t gridSize{ static_cast<cuda_dim_member_t>(params.teams.value[0]),
+                         static_cast<cuda_dim_member_t>(params.teams.value[1]),
+                         static_cast<cuda_dim_member_t>(params.teams.value[2]) };
+
+    cuda_dim_t blockSize{ static_cast<cuda_dim_member_t>(params.threads.value[0]),
+                          static_cast<cuda_dim_member_t>(params.threads.value[1]),
+                          static_cast<cuda_dim_member_t>(params.threads.value[2]) };
+
+    // Only launch kernel if we have something to iterate over
+    constexpr cuda_dim_member_t zero = 0;
+    if ( gridSize.x  > zero && gridSize.y  > zero && gridSize.z  > zero &&
+         blockSize.x > zero && blockSize.y > zero && blockSize.z > zero ) {
+
+      RAJA_FT_BEGIN;
+
+      {
+        //
+        // Privatize the loop_body, using make_launch_body to setup reductions
+        //
+        BODY body = RAJA::cuda::make_launch_body(
+            gridSize, blockSize, params.shared_mem_size, cuda_res, std::forward<BODY_IN>(body_in));
+
+
+        //
+        // Configure plugins
+        //
+        util::PluginContext context{util::make_context<RAJA::policy::cuda::cuda_launch_explicit_t<async, nthreads, BLOCKS_PER_SM>>()};
+        util::callPreCapturePlugins(context);
+
+        using RAJA::util::trigger_updates_before;
+        auto p_body = trigger_updates_before(body);
+
+        util::callPostCapturePlugins(context);
+
+        util::callPreLaunchPlugins(context);
+
+        //
+        // Launch the kernel
+        //
+        void *args[] = {(void*)&p_body};
+        RAJA::cuda::launch((const void*)func, gridSize, blockSize, args, params.shared_mem_size, cuda_res, async, kernel_name);
+
+        util::callPostLaunchPlugins(context);
+      }
+
+      RAJA_FT_END;
+    }
+
+    return resources::EventProxy<resources::Resource>(res);
+  }
+
+};
+
+/*
+   CUDA global thread mapping
+*/
+template<int ... DIM>
+struct cuda_global_thread;
+
+using cuda_global_thread_x = cuda_global_thread<0>;
+using cuda_global_thread_y = cuda_global_thread<1>;
+using cuda_global_thread_z = cuda_global_thread<2>;
+
+template <typename SEGMENT, int DIM>
+struct LoopExecute<cuda_global_thread<DIM>, SEGMENT> {
+
+  template <typename BODY>
+  static RAJA_INLINE RAJA_DEVICE void exec(
+      LaunchContext const RAJA_UNUSED_ARG(&ctx),
+      SEGMENT const &segment,
+      BODY const &body)
+  {
+
+    const int len = segment.end() - segment.begin();
+    {
+      const int tx = internal::get_cuda_dim<DIM>(threadIdx) +
+        internal::get_cuda_dim<DIM>(blockDim)*internal::get_cuda_dim<DIM>(blockIdx);
+
+      if (tx < len) body(*(segment.begin() + tx));
+    }
+  }
+};
+
+using cuda_global_thread_xy = cuda_global_thread<0,1>;
+using cuda_global_thread_xz = cuda_global_thread<0,2>;
+using cuda_global_thread_yx = cuda_global_thread<1,0>;
+using cuda_global_thread_yz = cuda_global_thread<1,2>;
+using cuda_global_thread_zx = cuda_global_thread<2,0>;
+using cuda_global_thread_zy = cuda_global_thread<2,1>;
+
+template <typename SEGMENT, int DIM0, int DIM1>
+struct LoopExecute<cuda_global_thread<DIM0, DIM1>, SEGMENT> {
+
+  template <typename BODY>
+  static RAJA_INLINE RAJA_DEVICE void exec(
+      LaunchContext const RAJA_UNUSED_ARG(&ctx),
+      SEGMENT const &segment0,
+      SEGMENT const &segment1,
+      BODY const &body)
+  {
+    const int len1 = segment1.end() - segment1.begin();
+    const int len0 = segment0.end() - segment0.begin();
+    {
+      const int tx = internal::get_cuda_dim<DIM0>(threadIdx) +
+        internal::get_cuda_dim<DIM0>(blockDim)*internal::get_cuda_dim<DIM0>(blockIdx);
+
+      const int ty = internal::get_cuda_dim<DIM1>(threadIdx) +
+        internal::get_cuda_dim<DIM1>(blockDim)*internal::get_cuda_dim<DIM1>(blockIdx);
+
+      if (tx < len0 && ty < len1)
+        body(*(segment0.begin() + tx), *(segment1.begin() + ty));
+    }
+  }
+};
+
+using cuda_global_thread_xyz = cuda_global_thread<0,1,2>;
+using cuda_global_thread_xzy = cuda_global_thread<0,2,1>;
+using cuda_global_thread_yxz = cuda_global_thread<1,0,2>;
+using cuda_global_thread_yzx = cuda_global_thread<1,2,0>;
+using cuda_global_thread_zxy = cuda_global_thread<2,0,1>;
+using cuda_global_thread_zyx = cuda_global_thread<2,1,0>;
+
+template <typename SEGMENT, int DIM0, int DIM1, int DIM2>
+struct LoopExecute<cuda_global_thread<DIM0, DIM1, DIM2>, SEGMENT> {
+
+  template <typename BODY>
+  static RAJA_INLINE RAJA_DEVICE void exec(
+      LaunchContext const RAJA_UNUSED_ARG(&ctx),
+      SEGMENT const &segment0,
+      SEGMENT const &segment1,
+      SEGMENT const &segment2,
+      BODY const &body)
+  {
+    const int len2 = segment2.end() - segment2.begin();
+    const int len1 = segment1.end() - segment1.begin();
+    const int len0 = segment0.end() - segment0.begin();
+    {
+      const int tx = internal::get_cuda_dim<DIM0>(threadIdx) +
+        internal::get_cuda_dim<DIM0>(blockDim)*internal::get_cuda_dim<DIM0>(blockIdx);
+
+      const int ty = internal::get_cuda_dim<DIM1>(threadIdx) +
+        internal::get_cuda_dim<DIM1>(blockDim)*internal::get_cuda_dim<DIM1>(blockIdx);
+
+      const int tz = internal::get_cuda_dim<DIM2>(threadIdx) +
+        internal::get_cuda_dim<DIM2>(blockDim)*internal::get_cuda_dim<DIM2>(blockIdx);
+
+      if (tx < len0 && ty < len1 && tz < len2)
+        body(*(segment0.begin() + tx),
+             *(segment1.begin() + ty),
+             *(segment1.begin() + ty));
+    }
+  }
+};
+
+/*
+Reshape threads in a block into a 1D iteration space
+*/
+template<int ... dim>
+struct cuda_flatten_block_threads_direct{};
+
+using cuda_flatten_block_threads_xy_direct = cuda_flatten_block_threads_direct<0,1>;
+using cuda_flatten_block_threads_xz_direct = cuda_flatten_block_threads_direct<0,2>;
+using cuda_flatten_block_threads_yx_direct = cuda_flatten_block_threads_direct<1,0>;
+using cuda_flatten_block_threads_yz_direct = cuda_flatten_block_threads_direct<1,2>;
+using cuda_flatten_block_threads_zx_direct = cuda_flatten_block_threads_direct<2,0>;
+using cuda_flatten_block_threads_zy_direct = cuda_flatten_block_threads_direct<2,1>;
+
+using cuda_flatten_block_threads_xyz_direct = cuda_flatten_block_threads_direct<0,1,2>;
+using cuda_flatten_block_threads_xzy_direct = cuda_flatten_block_threads_direct<0,2,1>;
+using cuda_flatten_block_threads_yxz_direct = cuda_flatten_block_threads_direct<1,0,2>;
+using cuda_flatten_block_threads_yzx_direct = cuda_flatten_block_threads_direct<1,2,0>;
+using cuda_flatten_block_threads_zxy_direct = cuda_flatten_block_threads_direct<2,0,1>;
+using cuda_flatten_block_threads_zyx_direct = cuda_flatten_block_threads_direct<2,1,0>;
+
+template<int ... dim>
+struct cuda_flatten_block_threads_loop{};
+
+using cuda_flatten_block_threads_xy_loop = cuda_flatten_block_threads_loop<0,1>;
+using cuda_flatten_block_threads_xz_loop = cuda_flatten_block_threads_loop<0,2>;
+using cuda_flatten_block_threads_yx_loop = cuda_flatten_block_threads_loop<1,0>;
+using cuda_flatten_block_threads_yz_loop = cuda_flatten_block_threads_loop<1,2>;
+using cuda_flatten_block_threads_zx_loop = cuda_flatten_block_threads_loop<2,0>;
+using cuda_flatten_block_threads_zy_loop = cuda_flatten_block_threads_loop<2,1>;
+
+using cuda_flatten_block_threads_xyz_loop = cuda_flatten_block_threads_loop<0,1,2>;
+using cuda_flatten_block_threads_xzy_loop = cuda_flatten_block_threads_loop<0,2,1>;
+using cuda_flatten_block_threads_yxz_loop = cuda_flatten_block_threads_loop<1,0,2>;
+using cuda_flatten_block_threads_yzx_loop = cuda_flatten_block_threads_loop<1,2,0>;
+using cuda_flatten_block_threads_zxy_loop = cuda_flatten_block_threads_loop<2,0,1>;
+using cuda_flatten_block_threads_zyx_loop = cuda_flatten_block_threads_loop<2,1,0>;
+
+template<typename SEGMENT, int DIM0, int DIM1>
+struct LoopExecute<cuda_flatten_block_threads_direct<DIM0, DIM1>, SEGMENT>
+{
+  template<typename BODY>
+  static RAJA_INLINE RAJA_DEVICE void exec(
+      LaunchContext const RAJA_UNUSED_ARG(&ctx),
+      SEGMENT const &segment,
+      BODY const &body)
+  {
+
+    const int len = segment.end() - segment.begin();
+    {
+      const int tx = internal::get_cuda_dim<DIM0>(threadIdx);
+      const int ty = internal::get_cuda_dim<DIM1>(threadIdx);
+      const int bx = internal::get_cuda_dim<DIM0>(blockDim);
+      const int tid = tx + bx*ty;
+
+      if (tid < len) body(*(segment.begin() + tid));
+    }
+  }
+};
+
+template<typename SEGMENT, int DIM0, int DIM1>
+struct LoopExecute<cuda_flatten_block_threads_loop<DIM0, DIM1>, SEGMENT>
+{
+  template<typename BODY>
+  static RAJA_INLINE RAJA_DEVICE void exec(
+      LaunchContext const RAJA_UNUSED_ARG(&ctx),
+      SEGMENT const &segment,
+      BODY const &body)
+  {
+    const int len = segment.end() - segment.begin();
+
+    const int tx = internal::get_cuda_dim<DIM0>(threadIdx);
+    const int ty = internal::get_cuda_dim<DIM1>(threadIdx);
+
+    const int bx = internal::get_cuda_dim<DIM0>(blockDim);
+    const int by = internal::get_cuda_dim<DIM1>(blockDim);
+
+    for(int tid = tx + bx*ty; tid < len; tid += bx*by) {
+      body(*(segment.begin() + tid));
+    }
+
+  }
+};
+
+template<typename SEGMENT, int DIM0, int DIM1, int DIM2>
+struct LoopExecute<cuda_flatten_block_threads_direct<DIM0, DIM1, DIM2>, SEGMENT>
+{
+  template<typename BODY>
+  static RAJA_INLINE RAJA_DEVICE void exec(
+      LaunchContext const RAJA_UNUSED_ARG(&ctx),
+      SEGMENT const &segment,
+      BODY const &body)
+  {
+    const int len = segment.end() - segment.begin();
+    {
+      const int tx = internal::get_cuda_dim<DIM0>(threadIdx);
+      const int ty = internal::get_cuda_dim<DIM1>(threadIdx);
+      const int tz = internal::get_cuda_dim<DIM2>(threadIdx);
+      const int bx = internal::get_cuda_dim<DIM0>(blockDim);
+      const int by = internal::get_cuda_dim<DIM1>(blockDim);
+      const int tid = tx + bx*(ty + by*tz);
+
+      if (tid < len) body(*(segment.begin() + tid));
+    }
+  }
+};
+
+template<typename SEGMENT, int DIM0, int DIM1, int DIM2>
+struct LoopExecute<cuda_flatten_block_threads_loop<DIM0, DIM1, DIM2>, SEGMENT>
+{
+  template<typename BODY>
+  static RAJA_INLINE RAJA_DEVICE void exec(
+      LaunchContext const RAJA_UNUSED_ARG(&ctx),
+      SEGMENT const &segment,
+      BODY const &body)
+  {
+    const int len = segment.end() - segment.begin();
+
+    const int tx = internal::get_cuda_dim<DIM0>(threadIdx);
+    const int ty = internal::get_cuda_dim<DIM1>(threadIdx);
+    const int tz = internal::get_cuda_dim<DIM2>(threadIdx);
+    const int bx = internal::get_cuda_dim<DIM0>(blockDim);
+    const int by = internal::get_cuda_dim<DIM1>(blockDim);
+    const int bz = internal::get_cuda_dim<DIM2>(blockDim);
+
+    for(int tid = tx + bx*(ty + by*tz); tid < len; tid += bx*by*bz) {
+      body(*(segment.begin() + tid));
+    }
+
+  }
+};
+
+
+/*
+  CUDA thread loops with block strides
+*/
+template <typename SEGMENT, int DIM>
+struct LoopExecute<cuda_thread_xyz_loop<DIM>, SEGMENT> {
+
+  template <typename BODY>
+  static RAJA_INLINE RAJA_DEVICE void exec(
+      LaunchContext const RAJA_UNUSED_ARG(&ctx),
+      SEGMENT const &segment,
+      BODY const &body)
+  {
+
+    const int len = segment.end() - segment.begin();
+
+    for (int tx = internal::get_cuda_dim<DIM>(threadIdx);
+         tx < len;
+         tx += internal::get_cuda_dim<DIM>(blockDim) )
+    {
+      body(*(segment.begin() + tx));
+    }
+  }
+};
+
+/*
+  CUDA thread direct mappings
+*/
+template <typename SEGMENT, int DIM>
+struct LoopExecute<cuda_thread_xyz_direct<DIM>, SEGMENT> {
+
+  template <typename BODY>
+  static RAJA_INLINE RAJA_DEVICE void exec(
+      LaunchContext const RAJA_UNUSED_ARG(&ctx),
+      SEGMENT const &segment,
+      BODY const &body)
+  {
+
+    const int len = segment.end() - segment.begin();
+    {
+      const int tx = internal::get_cuda_dim<DIM>(threadIdx);
+      if (tx < len) body(*(segment.begin() + tx));
+    }
+  }
+};
+
+
+/*
+  CUDA block loops with grid strides
+*/
+template <typename SEGMENT, int DIM>
+struct LoopExecute<cuda_block_xyz_loop<DIM>, SEGMENT> {
+
+  template <typename BODY>
+  static RAJA_INLINE RAJA_DEVICE void exec(
+      LaunchContext const RAJA_UNUSED_ARG(&ctx),
+      SEGMENT const &segment,
+      BODY const &body)
+  {
+
+    const int len = segment.end() - segment.begin();
+
+    for (int bx = internal::get_cuda_dim<DIM>(blockIdx);
+         bx < len;
+         bx += internal::get_cuda_dim<DIM>(gridDim) ) {
+      body(*(segment.begin() + bx));
+    }
+  }
+};
+
+/*
+  CUDA block direct mappings
+*/
+template <typename SEGMENT, int DIM>
+struct LoopExecute<cuda_block_xyz_direct<DIM>, SEGMENT> {
+
+  template <typename BODY>
+  static RAJA_INLINE RAJA_DEVICE void exec(
+      LaunchContext const RAJA_UNUSED_ARG(&ctx),
+      SEGMENT const &segment,
+      BODY const &body)
+  {
+
+    const int len = segment.end() - segment.begin();
+    {
+      const int bx = internal::get_cuda_dim<DIM>(blockIdx);
+      if (bx < len) body(*(segment.begin() + bx));
+    }
+  }
+};
+
+/*
+  CUDA thread loops with block strides + Return Index
+*/
+template <typename SEGMENT, int DIM>
+struct LoopICountExecute<cuda_thread_xyz_loop<DIM>, SEGMENT> {
+
+  template <typename BODY>
+  static RAJA_INLINE RAJA_DEVICE void exec(
+      LaunchContext const RAJA_UNUSED_ARG(&ctx),
+      SEGMENT const &segment,
+      BODY const &body)
+  {
+
+    const int len = segment.end() - segment.begin();
+
+    for (int tx = internal::get_cuda_dim<DIM>(threadIdx);
+         tx < len;
+         tx += internal::get_cuda_dim<DIM>(blockDim) )
+    {
+      body(*(segment.begin() + tx), tx);
+    }
+  }
+};
+
+/*
+  CUDA thread direct mappings
+*/
+template <typename SEGMENT, int DIM>
+struct LoopICountExecute<cuda_thread_xyz_direct<DIM>, SEGMENT> {
+
+  template <typename BODY>
+  static RAJA_INLINE RAJA_DEVICE void exec(
+      LaunchContext const RAJA_UNUSED_ARG(&ctx),
+      SEGMENT const &segment,
+      BODY const &body)
+  {
+
+    const int len = segment.end() - segment.begin();
+    {
+      const int tx = internal::get_cuda_dim<DIM>(threadIdx);
+      if (tx < len) body(*(segment.begin() + tx), tx);
+    }
+  }
+};
+
+/*
+  CUDA block loops with grid strides
+*/
+template <typename SEGMENT, int DIM>
+struct LoopICountExecute<cuda_block_xyz_loop<DIM>, SEGMENT> {
+
+  template <typename BODY>
+  static RAJA_INLINE RAJA_DEVICE void exec(
+      LaunchContext const RAJA_UNUSED_ARG(&ctx),
+      SEGMENT const &segment,
+      BODY const &body)
+  {
+
+    const int len = segment.end() - segment.begin();
+
+    for (int bx = internal::get_cuda_dim<DIM>(blockIdx);
+         bx < len;
+         bx += internal::get_cuda_dim<DIM>(gridDim) ) {
+      body(*(segment.begin() + bx), bx);
+    }
+  }
+};
+
+/*
+  CUDA block direct mappings
+*/
+template <typename SEGMENT, int DIM>
+struct LoopICountExecute<cuda_block_xyz_direct<DIM>, SEGMENT> {
+
+  template <typename BODY>
+  static RAJA_INLINE RAJA_DEVICE void exec(
+      LaunchContext const RAJA_UNUSED_ARG(&ctx),
+      SEGMENT const &segment,
+      BODY const &body)
+  {
+
+    const int len = segment.end() - segment.begin();
+    {
+      const int bx = internal::get_cuda_dim<DIM>(blockIdx);
+      if (bx < len) body(*(segment.begin() + bx), bx);
+    }
+  }
+};
+
+// perfectly nested cuda direct policies
+using cuda_block_xy_nested_direct = cuda_block_xyz_direct<0,1>;
+using cuda_block_xz_nested_direct = cuda_block_xyz_direct<0,2>;
+using cuda_block_yx_nested_direct = cuda_block_xyz_direct<1,0>;
+using cuda_block_yz_nested_direct = cuda_block_xyz_direct<1,2>;
+using cuda_block_zx_nested_direct = cuda_block_xyz_direct<2,0>;
+using cuda_block_zy_nested_direct = cuda_block_xyz_direct<2,1>;
+
+using cuda_block_xyz_nested_direct = cuda_block_xyz_direct<0,1,2>;
+using cuda_block_xzy_nested_direct = cuda_block_xyz_direct<0,2,1>;
+using cuda_block_yxz_nested_direct = cuda_block_xyz_direct<1,0,2>;
+using cuda_block_yzx_nested_direct = cuda_block_xyz_direct<1,2,0>;
+using cuda_block_zxy_nested_direct = cuda_block_xyz_direct<2,0,1>;
+using cuda_block_zyx_nested_direct = cuda_block_xyz_direct<2,1,0>;
+
+template <typename SEGMENT, int DIM0, int DIM1>
+struct LoopExecute<cuda_block_xyz_direct<DIM0, DIM1>, SEGMENT> {
+
+  template <typename BODY>
+  static RAJA_INLINE RAJA_DEVICE void exec(
+      LaunchContext const RAJA_UNUSED_ARG(&ctx),
+      SEGMENT const &segment0,
+      SEGMENT const &segment1,
+      BODY const &body)
+  {
+    const int len1 = segment1.end() - segment1.begin();
+    const int len0 = segment0.end() - segment0.begin();
+    {
+      const int tx = internal::get_cuda_dim<DIM0>(blockIdx);
+      const int ty = internal::get_cuda_dim<DIM1>(blockIdx);
+      if (tx < len0 && ty < len1)
+        body(*(segment0.begin() + tx), *(segment1.begin() + ty));
+    }
+  }
+};
+
+template <typename SEGMENT, int DIM0, int DIM1, int DIM2>
+struct LoopExecute<cuda_block_xyz_direct<DIM0, DIM1, DIM2>, SEGMENT> {
+
+  template <typename BODY>
+  static RAJA_INLINE RAJA_DEVICE void exec(
+      LaunchContext const RAJA_UNUSED_ARG(&ctx),
+      SEGMENT const &segment0,
+      SEGMENT const &segment1,
+      SEGMENT const &segment2,
+      BODY const &body)
+  {
+    const int len2 = segment2.end() - segment2.begin();
+    const int len1 = segment1.end() - segment1.begin();
+    const int len0 = segment0.end() - segment0.begin();
+    {
+      const int tx = internal::get_cuda_dim<DIM0>(blockIdx);
+      const int ty = internal::get_cuda_dim<DIM1>(blockIdx);
+      const int tz = internal::get_cuda_dim<DIM2>(blockIdx);
+      if (tx < len0 && ty < len1 && tz < len2)
+        body(*(segment0.begin() + tx),
+             *(segment1.begin() + ty),
+             *(segment2.begin() + tz));
+    }
+  }
+};
+
+/*
+  Perfectly nested cuda direct policies
+  Return local index
+*/
+template <typename SEGMENT, int DIM0, int DIM1>
+struct LoopICountExecute<cuda_block_xyz_direct<DIM0, DIM1>, SEGMENT> {
+
+  template <typename BODY>
+  static RAJA_INLINE RAJA_DEVICE void exec(
+      LaunchContext const RAJA_UNUSED_ARG(&ctx),
+      SEGMENT const &segment0,
+      SEGMENT const &segment1,
+      BODY const &body)
+  {
+    const int len1 = segment1.end() - segment1.begin();
+    const int len0 = segment0.end() - segment0.begin();
+    {
+      const int tx = internal::get_cuda_dim<DIM0>(blockIdx);
+      const int ty = internal::get_cuda_dim<DIM1>(blockIdx);
+      if (tx < len0 && ty < len1)
+        body(*(segment0.begin() + tx), *(segment1.begin() + ty),
+             tx, ty);
+    }
+  }
+};
+
+template <typename SEGMENT, int DIM0, int DIM1, int DIM2>
+struct LoopICountExecute<cuda_block_xyz_direct<DIM0, DIM1, DIM2>, SEGMENT> {
+
+  template <typename BODY>
+  static RAJA_INLINE RAJA_DEVICE void exec(
+      LaunchContext const RAJA_UNUSED_ARG(&ctx),
+      SEGMENT const &segment0,
+      SEGMENT const &segment1,
+      SEGMENT const &segment2,
+      BODY const &body)
+  {
+    const int len2 = segment2.end() - segment2.begin();
+    const int len1 = segment1.end() - segment1.begin();
+    const int len0 = segment0.end() - segment0.begin();
+    {
+      const int tx = internal::get_cuda_dim<DIM0>(blockIdx);
+      const int ty = internal::get_cuda_dim<DIM1>(blockIdx);
+      const int tz = internal::get_cuda_dim<DIM2>(blockIdx);
+      if (tx < len0 && ty < len1 && tz < len2)
+        body(*(segment0.begin() + tx),
+             *(segment1.begin() + ty),
+             *(segment2.begin() + tz), tx, ty, tz);
+    }
+  }
+};
+
+// perfectly nested cuda loop policies
+using cuda_block_xy_nested_loop = cuda_block_xyz_loop<0,1>;
+using cuda_block_xz_nested_loop = cuda_block_xyz_loop<0,2>;
+using cuda_block_yx_nested_loop = cuda_block_xyz_loop<1,0>;
+using cuda_block_yz_nested_loop = cuda_block_xyz_loop<1,2>;
+using cuda_block_zx_nested_loop = cuda_block_xyz_loop<2,0>;
+using cuda_block_zy_nested_loop = cuda_block_xyz_loop<2,1>;
+
+using cuda_block_xyz_nested_loop = cuda_block_xyz_loop<0,1,2>;
+using cuda_block_xzy_nested_loop = cuda_block_xyz_loop<0,2,1>;
+using cuda_block_yxz_nested_loop = cuda_block_xyz_loop<1,0,2>;
+using cuda_block_yzx_nested_loop = cuda_block_xyz_loop<1,2,0>;
+using cuda_block_zxy_nested_loop = cuda_block_xyz_loop<2,0,1>;
+using cuda_block_zyx_nested_loop = cuda_block_xyz_loop<2,1,0>;
+
+template <typename SEGMENT, int DIM0, int DIM1>
+struct LoopExecute<cuda_block_xyz_loop<DIM0, DIM1>, SEGMENT> {
+
+  template <typename BODY>
+  static RAJA_INLINE RAJA_DEVICE void exec(
+      LaunchContext const RAJA_UNUSED_ARG(&ctx),
+      SEGMENT const &segment0,
+      SEGMENT const &segment1,
+      BODY const &body)
+  {
+    const int len1 = segment1.end() - segment1.begin();
+    const int len0 = segment0.end() - segment0.begin();
+    {
+
+      for (int bx = internal::get_cuda_dim<DIM0>(blockIdx);
+           bx < len0;
+           bx += internal::get_cuda_dim<DIM0>(gridDim))
+      {
+        for (int by = internal::get_cuda_dim<DIM1>(blockIdx);
+             by < len1;
+             by += internal::get_cuda_dim<DIM1>(gridDim))
+        {
+
+          body(*(segment0.begin() + bx), *(segment1.begin() + by));
+        }
+      }
+    }
+  }
+};
+
+template <typename SEGMENT, int DIM0, int DIM1, int DIM2>
+struct LoopExecute<cuda_block_xyz_loop<DIM0, DIM1, DIM2>, SEGMENT> {
+
+  template <typename BODY>
+  static RAJA_INLINE RAJA_DEVICE void exec(
+      LaunchContext const RAJA_UNUSED_ARG(&ctx),
+      SEGMENT const &segment0,
+      SEGMENT const &segment1,
+      SEGMENT const &segment2,
+      BODY const &body)
+  {
+    const int len2 = segment2.end() - segment2.begin();
+    const int len1 = segment1.end() - segment1.begin();
+    const int len0 = segment0.end() - segment0.begin();
+
+    for (int bx = internal::get_cuda_dim<DIM0>(blockIdx);
+         bx < len0;
+         bx += internal::get_cuda_dim<DIM0>(gridDim))
+    {
+
+      for (int by = internal::get_cuda_dim<DIM1>(blockIdx);
+           by < len1;
+           by += internal::get_cuda_dim<DIM1>(gridDim))
+      {
+
+        for (int bz = internal::get_cuda_dim<DIM2>(blockIdx);
+             bz < len2;
+             bz += internal::get_cuda_dim<DIM2>(gridDim))
+        {
+
+          body(*(segment0.begin() + bx),
+               *(segment1.begin() + by),
+               *(segment2.begin() + bz));
+        }
+      }
+    }
+  }
+};
+
+/*
+  perfectly nested cuda loop policies + returns local index
+*/
+template <typename SEGMENT, int DIM0, int DIM1>
+struct LoopICountExecute<cuda_block_xyz_loop<DIM0, DIM1>, SEGMENT> {
+
+  template <typename BODY>
+  static RAJA_INLINE RAJA_DEVICE void exec(
+      LaunchContext const RAJA_UNUSED_ARG(&ctx),
+      SEGMENT const &segment0,
+      SEGMENT const &segment1,
+      BODY const &body)
+  {
+    const int len1 = segment1.end() - segment1.begin();
+    const int len0 = segment0.end() - segment0.begin();
+    {
+
+      for (int bx = internal::get_cuda_dim<DIM0>(blockIdx);
+           bx < len0;
+           bx += internal::get_cuda_dim<DIM0>(gridDim))
+      {
+        for (int by = internal::get_cuda_dim<DIM1>(blockIdx);
+             by < len1;
+             by += internal::get_cuda_dim<DIM1>(gridDim))
+        {
+
+          body(*(segment0.begin() + bx), *(segment1.begin() + by), bx, by);
+        }
+      }
+    }
+  }
+};
+
+
+template <typename SEGMENT, int DIM0, int DIM1, int DIM2>
+struct LoopICountExecute<cuda_block_xyz_loop<DIM0, DIM1, DIM2>, SEGMENT> {
+
+  template <typename BODY>
+  static RAJA_INLINE RAJA_DEVICE void exec(
+      LaunchContext const RAJA_UNUSED_ARG(&ctx),
+      SEGMENT const &segment0,
+      SEGMENT const &segment1,
+      SEGMENT const &segment2,
+      BODY const &body)
+  {
+    const int len2 = segment2.end() - segment2.begin();
+    const int len1 = segment1.end() - segment1.begin();
+    const int len0 = segment0.end() - segment0.begin();
+
+    for (int bx = internal::get_cuda_dim<DIM0>(blockIdx);
+         bx < len0;
+         bx += internal::get_cuda_dim<DIM0>(gridDim))
+    {
+
+      for (int by = internal::get_cuda_dim<DIM1>(blockIdx);
+           by < len1;
+           by += internal::get_cuda_dim<DIM1>(gridDim))
+      {
+
+        for (int bz = internal::get_cuda_dim<DIM2>(blockIdx);
+             bz < len2;
+             bz += internal::get_cuda_dim<DIM2>(gridDim))
+        {
+
+          body(*(segment0.begin() + bx),
+               *(segment1.begin() + by),
+               *(segment2.begin() + bz), bx, by, bz);
+        }
+      }
+    }
+  }
+};
+
+
+template <typename SEGMENT, int DIM>
+struct TileExecute<cuda_thread_xyz_loop<DIM>, SEGMENT> {
+
+  template <typename TILE_T, typename BODY>
+  static RAJA_INLINE RAJA_DEVICE void exec(
+      LaunchContext const RAJA_UNUSED_ARG(&ctx),
+      TILE_T tile_size,
+      SEGMENT const &segment,
+      BODY const &body)
+  {
+
+    const int len = segment.end() - segment.begin();
+
+    for (int tx = internal::get_cuda_dim<DIM>(threadIdx) * tile_size;
+         tx < len;
+         tx += internal::get_cuda_dim<DIM>(blockDim) * tile_size)
+    {
+      body(segment.slice(tx, tile_size));
+    }
+  }
+};
+
+
+template <typename SEGMENT, int DIM>
+struct TileExecute<cuda_thread_xyz_direct<DIM>, SEGMENT> {
+
+  template <typename TILE_T, typename BODY>
+  static RAJA_INLINE RAJA_DEVICE void exec(
+      LaunchContext const RAJA_UNUSED_ARG(&ctx),
+      TILE_T tile_size,
+      SEGMENT const &segment,
+      BODY const &body)
+  {
+
+    const int len = segment.end() - segment.begin();
+
+    int tx = internal::get_cuda_dim<DIM>(threadIdx) * tile_size;
+    if(tx < len)
+    {
+      body(segment.slice(tx, tile_size));
+    }
+  }
+};
+
+
+template <typename SEGMENT, int DIM>
+struct TileExecute<cuda_block_xyz_loop<DIM>, SEGMENT> {
+
+  template <typename TILE_T, typename BODY>
+  static RAJA_INLINE RAJA_DEVICE void exec(
+      LaunchContext const RAJA_UNUSED_ARG(&ctx),
+      TILE_T tile_size,
+      SEGMENT const &segment,
+      BODY const &body)
+  {
+
+    const int len = segment.end() - segment.begin();
+
+    for (int tx = internal::get_cuda_dim<DIM>(blockIdx) * tile_size;
+
+         tx < len;
+
+         tx += internal::get_cuda_dim<DIM>(gridDim) * tile_size)
+    {
+      body(segment.slice(tx, tile_size));
+    }
+  }
+};
+
+
+template <typename SEGMENT, int DIM>
+struct TileExecute<cuda_block_xyz_direct<DIM>, SEGMENT> {
+
+  template <typename TILE_T, typename BODY>
+  static RAJA_INLINE RAJA_DEVICE void exec(
+      LaunchContext const RAJA_UNUSED_ARG(&ctx),
+      TILE_T tile_size,
+      SEGMENT const &segment,
+      BODY const &body)
+  {
+
+    const int len = segment.end() - segment.begin();
+
+    int tx = internal::get_cuda_dim<DIM>(blockIdx) * tile_size;
+    if(tx < len){
+      body(segment.slice(tx, tile_size));
+    }
+  }
+};
+
+//Tile execute + return index
+template <typename SEGMENT, int DIM>
+struct TileICountExecute<cuda_thread_xyz_loop<DIM>, SEGMENT> {
+
+  template <typename TILE_T, typename BODY>
+  static RAJA_INLINE RAJA_DEVICE void exec(
+      LaunchContext const RAJA_UNUSED_ARG(&ctx),
+      TILE_T tile_size,
+      SEGMENT const &segment,
+      BODY const &body)
+  {
+
+    const int len = segment.end() - segment.begin();
+
+    for (int tx = internal::get_cuda_dim<DIM>(threadIdx) * tile_size;
+         tx < len;
+         tx += internal::get_cuda_dim<DIM>(blockDim) * tile_size)
+    {
+      body(segment.slice(tx, tile_size), tx/tile_size);
+    }
+  }
+};
+
+
+template <typename SEGMENT, int DIM>
+struct TileICountExecute<cuda_thread_xyz_direct<DIM>, SEGMENT> {
+
+  template <typename TILE_T, typename BODY>
+  static RAJA_INLINE RAJA_DEVICE void exec(
+      LaunchContext const RAJA_UNUSED_ARG(&ctx),
+      TILE_T tile_size,
+      SEGMENT const &segment,
+      BODY const &body)
+  {
+
+    const int len = segment.end() - segment.begin();
+
+    int tx = internal::get_cuda_dim<DIM>(threadIdx) * tile_size;
+    if(tx < len)
+    {
+      body(segment.slice(tx, tile_size), tx/tile_size);
+    }
+  }
+};
+
+
+template <typename SEGMENT, int DIM>
+struct TileICountExecute<cuda_block_xyz_loop<DIM>, SEGMENT> {
+
+  template <typename TILE_T, typename BODY>
+  static RAJA_INLINE RAJA_DEVICE void exec(
+      LaunchContext const RAJA_UNUSED_ARG(&ctx),
+      TILE_T tile_size,
+      SEGMENT const &segment,
+      BODY const &body)
+  {
+
+    const int len = segment.end() - segment.begin();
+
+    for (int bx = internal::get_cuda_dim<DIM>(blockIdx) * tile_size;
+
+         bx < len;
+
+         bx += internal::get_cuda_dim<DIM>(gridDim) * tile_size)
+    {
+      body(segment.slice(bx, tile_size), bx/tile_size);
+    }
+  }
+};
+
+
+template <typename SEGMENT, int DIM>
+struct TileICountExecute<cuda_block_xyz_direct<DIM>, SEGMENT> {
+
+  template <typename TILE_T, typename BODY>
+  static RAJA_INLINE RAJA_DEVICE void exec(
+      LaunchContext const RAJA_UNUSED_ARG(&ctx),
+      TILE_T tile_size,
+      SEGMENT const &segment,
+      BODY const &body)
+  {
+
+    const int len = segment.end() - segment.begin();
+
+    int bx = internal::get_cuda_dim<DIM>(blockIdx) * tile_size;
+    if(bx < len){
+      body(segment.slice(bx, tile_size), bx/tile_size);
+    }
+  }
+};
+
+}  // namespace RAJA
+#endif
diff --git a/include/RAJA/policy/cuda/launch.hpp b/include/RAJA/policy/cuda/launch.hpp
index 0cf23eb9ad..6c22ced4a9 100644
--- a/include/RAJA/policy/cuda/launch.hpp
+++ b/include/RAJA/policy/cuda/launch.hpp
@@ -148,13 +148,26 @@ struct LaunchExecute<RAJA::cuda_launch_t<async, 1>> {
         BODY body = RAJA::cuda::make_launch_body(
             gridSize, blockSize, params.shared_mem_size, cuda_res, std::forward<BODY_IN>(body_in));
 
+        //
+        // Configure plugins
+        //
+        util::PluginContext context{util::make_context<RAJA::cuda_launch_t<async,1>>()};
+        util::callPreCapturePlugins(context);
+
+        using RAJA::util::trigger_updates_before;
+        auto p_body = trigger_updates_before(body);
+
+        util::callPostCapturePlugins(context);
+
+        util::callPreLaunchPlugins(context);
+
         //
         // Launch the kernel
         //
-        void *args[] = {(void*)&body};
-        {
-          RAJA::cuda::launch((const void*)func, gridSize, blockSize, args, params.shared_mem_size, cuda_res, async, kernel_name);
-        }
+        void *args[] = {(void*)&p_body};
+        RAJA::cuda::launch((const void*)func, gridSize, blockSize, args, params.shared_mem_size, cuda_res, async, kernel_name);
+
+        util::callPostLaunchPlugins(context);
       }
 
       RAJA_FT_END;
@@ -223,15 +236,31 @@ struct LaunchExecute<RAJA::policy::cuda::cuda_launch_explicit_t<async, nthreads,
         BODY body = RAJA::cuda::make_launch_body(
             gridSize, blockSize, params.shared_mem_size, cuda_res, std::forward<BODY_IN>(body_in));
 
+        //
+        // Configure plugins
+        //
+        util::PluginContext context{util::make_context<RAJA::policy::cuda::cuda_launch_explicit_t<async, nthreads, BLOCKS_PER_SM>>()};
+        util::callPreCapturePlugins(context);
+
+        using RAJA::util::trigger_updates_before;
+        auto p_body = trigger_updates_before(body);
+
+        util::callPostCapturePlugins(context);
+
+        util::callPreLaunchPlugins(context);
+
         //
         // Launch the kernel
         //
-        void *args[] = {(void*)&body};
+        void *args[] = {(void*)&p_body};
         RAJA::cuda::launch((const void*)func, gridSize, blockSize, args, params.shared_mem_size, cuda_res, async, kernel_name);
+
+        util::callPostLaunchPlugins(context);
       }
 
       RAJA_FT_END;
-      }
+
+    }
   }
 
   template <typename BODY_IN>
@@ -272,13 +301,27 @@ struct LaunchExecute<RAJA::policy::cuda::cuda_launch_explicit_t<async, nthreads,
         BODY body = RAJA::cuda::make_launch_body(
             gridSize, blockSize, params.shared_mem_size, cuda_res, std::forward<BODY_IN>(body_in));
 
+
+        //
+        // Configure plugins
+        //
+        util::PluginContext context{util::make_context<RAJA::policy::cuda::cuda_launch_explicit_t<async, nthreads, BLOCKS_PER_SM>>()};
+        util::callPreCapturePlugins(context);
+
+        using RAJA::util::trigger_updates_before;
+        auto p_body = trigger_updates_before(body);
+
+        util::callPostCapturePlugins(context);
+
+        util::callPreLaunchPlugins(context);
+
         //
         // Launch the kernel
         //
-        void *args[] = {(void*)&body};
-        {
-          RAJA::cuda::launch((const void*)func, gridSize, blockSize, args, params.shared_mem_size, cuda_res, async, kernel_name);
-        }
+        void *args[] = {(void*)&p_body};
+        RAJA::cuda::launch((const void*)func, gridSize, blockSize, args, params.shared_mem_size, cuda_res, async, kernel_name);
+
+        util::callPostLaunchPlugins(context);
       }
 
       RAJA_FT_END;
diff --git a/include/RAJA/policy/hip/launch.hpp b/include/RAJA/policy/hip/launch.hpp
index edeb6a91b6..d8d3164841 100644
--- a/include/RAJA/policy/hip/launch.hpp
+++ b/include/RAJA/policy/hip/launch.hpp
@@ -82,11 +82,26 @@ struct LaunchExecute<RAJA::hip_launch_t<async, 0>> {
         BODY body = RAJA::hip::make_launch_body(
             gridSize, blockSize, params.shared_mem_size, hip_res, std::forward<BODY_IN>(body_in));
 
+        //
+        // Configure plugins
+        //
+        util::PluginContext context{util::make_context<RAJA::hip_launch_t<async,0>>()};
+        util::callPreCapturePlugins(context);
+
+        using RAJA::util::trigger_updates_before;
+        auto p_body = trigger_updates_before(body);
+
+        util::callPostCapturePlugins(context);
+
+        util::callPreLaunchPlugins(context);
+
         //
         // Launch the kernel
         //
-        void *args[] = {(void*)&body};
+        void *args[] = {(void*)&p_body};
         RAJA::hip::launch((const void*)func, gridSize, blockSize, args, params.shared_mem_size, hip_res, async, kernel_name);
+
+        util::callPostLaunchPlugins(context);
       }
 
       RAJA_FT_END;
@@ -130,11 +145,26 @@ struct LaunchExecute<RAJA::hip_launch_t<async, 0>> {
         BODY body = RAJA::hip::make_launch_body(
             gridSize, blockSize, params.shared_mem_size, hip_res, std::forward<BODY_IN>(body_in));
 
+        //
+        // Configure plugins
+        //
+        util::PluginContext context{util::make_context<RAJA::cuda_launch_t<async,1>>()};
+        util::callPreCapturePlugins(context);
+
+        using RAJA::util::trigger_updates_before;
+        auto p_body = trigger_updates_before(body);
+
+        util::callPostCapturePlugins(context);
+
+        util::callPreLaunchPlugins(context);
+
         //
         // Launch the kernel
         //
-        void *args[] = {(void*)&body};
+        void *args[] = {(void*)&p_body};
         RAJA::hip::launch((const void*)func, gridSize, blockSize, args, params.shared_mem_size, hip_res, async, kernel_name);
+
+        util::callPostLaunchPlugins(context);
       }
 
       RAJA_FT_END;
@@ -200,11 +230,26 @@ struct LaunchExecute<RAJA::hip_launch_t<async, nthreads>> {
         BODY body = RAJA::hip::make_launch_body(
             gridSize, blockSize, params.shared_mem_size, hip_res, std::forward<BODY_IN>(body_in));
 
+        //
+        // Configure plugins
+        //
+        util::PluginContext context{util::make_context<RAJA::hip_launch_t<async, nthreads>>()};
+        util::callPreCapturePlugins(context);
+
+        using RAJA::util::trigger_updates_before;
+        auto p_body = trigger_updates_before(body);
+
+        util::callPostCapturePlugins(context);
+
+        util::callPreLaunchPlugins(context);
+
         //
         // Launch the kernel
         //
-        void *args[] = {(void*)&body};
+        void *args[] = {(void*)&p_body};
         RAJA::hip::launch((const void*)func, gridSize, blockSize, args, params.shared_mem_size, hip_res, async, kernel_name);
+
+        util::callPostLaunchPlugins(context);
       }
 
       RAJA_FT_END;
@@ -248,11 +293,26 @@ struct LaunchExecute<RAJA::hip_launch_t<async, nthreads>> {
         BODY body = RAJA::hip::make_launch_body(
             gridSize, blockSize, params.shared_mem_size, hip_res, std::forward<BODY_IN>(body_in));
 
+        //
+        // Configure plugins
+        //
+        util::PluginContext context{util::make_context<RAJA::hip_launch_t<async, nthreads>>()};
+        util::callPreCapturePlugins(context);
+
+        using RAJA::util::trigger_updates_before;
+        auto p_body = trigger_updates_before(body);
+
+        util::callPostCapturePlugins(context);
+
+        util::callPreLaunchPlugins(context);
+
         //
         // Launch the kernel
         //
-        void *args[] = {(void*)&body};
+        void *args[] = {(void*)&p_body};
         RAJA::hip::launch((const void*)func, gridSize, blockSize, args, params.shared_mem_size, hip_res, async, kernel_name);
+
+        util::callPostLaunchPlugins(context);
       }
 
       RAJA_FT_END;

From efc103d1d284028021a1b84a50da065fded0c657 Mon Sep 17 00:00:00 2001
From: Arturo Vargas <vargas45@llnl.gov>
Date: Mon, 14 Nov 2022 14:53:30 -0800
Subject: [PATCH 48/94] update omp seq launch to support plugins

---
 include/RAJA/policy/loop/launch.hpp   | 37 ++++++++++++++++++++++-----
 include/RAJA/policy/openmp/launch.hpp | 36 ++++++++++++++++++++++++--
 2 files changed, 65 insertions(+), 8 deletions(-)

diff --git a/include/RAJA/policy/loop/launch.hpp b/include/RAJA/policy/loop/launch.hpp
index 50d5ff1c0a..06b2d6332a 100644
--- a/include/RAJA/policy/loop/launch.hpp
+++ b/include/RAJA/policy/loop/launch.hpp
@@ -47,10 +47,24 @@ struct LaunchExecute<RAJA::seq_launch_t> {
 
     ctx.shared_mem_ptr = (char*) malloc(params.shared_mem_size);
 
-    body(ctx);
+    //
+    // Configure plugins
+    //
+    util::PluginContext context{util::make_context<RAJA::seq_launch_t>()};
+    util::callPreCapturePlugins(context);
+
+    using RAJA::util::trigger_updates_before;
+    auto p_body = trigger_updates_before(body);
+
+    util::callPostCapturePlugins(context);
+
+    util::callPreLaunchPlugins(context);
+
+    p_body(ctx);
 
     free(ctx.shared_mem_ptr);
     ctx.shared_mem_ptr = nullptr;
+    util::callPostLaunchPlugins(context);
   }
 
   template <typename BODY>
@@ -59,15 +73,26 @@ struct LaunchExecute<RAJA::seq_launch_t> {
   {
 
     LaunchContext ctx;
+    ctx.shared_mem_ptr = (char*) malloc(params.shared_mem_size);
 
-    char *kernel_local_mem = new char[params.shared_mem_size];
-    ctx.shared_mem_ptr = kernel_local_mem;
+    //
+    // Configure plugins
+    //
+    util::PluginContext context{util::make_context<RAJA::seq_launch_t>()};
+    util::callPreCapturePlugins(context);
 
-    body(ctx);
+    using RAJA::util::trigger_updates_before;
+    auto p_body = trigger_updates_before(body);
 
-    delete[] kernel_local_mem;
-    ctx.shared_mem_ptr = nullptr;
+    util::callPostCapturePlugins(context);
+
+    util::callPreLaunchPlugins(context);
+
+    p_body(ctx);
 
+    free(ctx.shared_mem_ptr);
+    ctx.shared_mem_ptr = nullptr;
+    util::callPostLaunchPlugins(context);
     return resources::EventProxy<resources::Resource>(res);
   }
 
diff --git a/include/RAJA/policy/openmp/launch.hpp b/include/RAJA/policy/openmp/launch.hpp
index 7ec9a7c5ed..191125c33d 100644
--- a/include/RAJA/policy/openmp/launch.hpp
+++ b/include/RAJA/policy/openmp/launch.hpp
@@ -32,12 +32,26 @@ struct LaunchExecute<RAJA::omp_launch_t> {
   template <typename BODY>
   static void exec(LaunchParams const &params, const char *, BODY const &body)
   {
+
+    //
+    // Configure plugins
+    //
+    util::PluginContext context{util::make_context<RAJA::omp_launch_t>()};
+    util::callPreCapturePlugins(context);
+
+    using RAJA::util::trigger_updates_before;
+    auto p_body = trigger_updates_before(body);
+
+    util::callPostCapturePlugins(context);
+
+    util::callPreLaunchPlugins(context);
+
     RAJA::region<RAJA::omp_parallel_region>([&]() {
 
         LaunchContext ctx;
 
         using RAJA::internal::thread_privatize;
-        auto loop_body = thread_privatize(body);
+        auto loop_body = thread_privatize(p_body);
 
         ctx.shared_mem_ptr = (char*) malloc(params.shared_mem_size);
 
@@ -46,18 +60,34 @@ struct LaunchExecute<RAJA::omp_launch_t> {
         free(ctx.shared_mem_ptr);
         ctx.shared_mem_ptr = nullptr;
     });
+
+    util::callPostLaunchPlugins(context);
   }
 
   template <typename BODY>
   static resources::EventProxy<resources::Resource>
   exec(RAJA::resources::Resource res, LaunchParams const &params, const char *, BODY const &body)
   {
+
+    //
+    // Configure plugins
+    //
+    util::PluginContext context{util::make_context<RAJA::omp_launch_t>()};
+    util::callPreCapturePlugins(context);
+
+    using RAJA::util::trigger_updates_before;
+    auto p_body = trigger_updates_before(body);
+
+    util::callPostCapturePlugins(context);
+
+    util::callPreLaunchPlugins(context);
+
     RAJA::region<RAJA::omp_parallel_region>([&]() {
 
         LaunchContext ctx;
 
         using RAJA::internal::thread_privatize;
-        auto loop_body = thread_privatize(body);
+        auto loop_body = thread_privatize(p_body);
 
         ctx.shared_mem_ptr = (char*) malloc(params.shared_mem_size);
 
@@ -67,6 +97,8 @@ struct LaunchExecute<RAJA::omp_launch_t> {
         ctx.shared_mem_ptr = nullptr;
     });
 
+    util::callPostLaunchPlugins(context);
+
     return resources::EventProxy<resources::Resource>(res);
   }
 

From 6a1980f543b0da220a379073dd0ddcb46940ea6d Mon Sep 17 00:00:00 2001
From: Arturo Vargas <vargas45@llnl.gov>
Date: Mon, 14 Nov 2022 14:54:09 -0800
Subject: [PATCH 49/94] remove scratch file

---
 include/RAJA/policy/cuda/#launch.hpp# | 1202 -------------------------
 1 file changed, 1202 deletions(-)
 delete mode 100644 include/RAJA/policy/cuda/#launch.hpp#

diff --git a/include/RAJA/policy/cuda/#launch.hpp# b/include/RAJA/policy/cuda/#launch.hpp#
deleted file mode 100644
index d98dd20c5b..0000000000
--- a/include/RAJA/policy/cuda/#launch.hpp#
+++ /dev/null
@@ -1,1202 +0,0 @@
-/*!
- ******************************************************************************
- *
- * \file
- *
- * \brief   RAJA header file containing user interface for RAJA::launch::cuda
- *
- ******************************************************************************
- */
-
-//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-// Copyright (c) 2016-22, Lawrence Livermore National Security, LLC
-// and RAJA project contributors. See the RAJA/LICENSE file for details.
-//
-// SPDX-License-Identifier: (BSD-3-Clause)
-//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
-
-#ifndef RAJA_pattern_launch_cuda_HPP
-#define RAJA_pattern_launch_cuda_HPP
-
-#include "RAJA/pattern/launch/launch_core.hpp"
-#include "RAJA/pattern/detail/privatizer.hpp"
-#include "RAJA/policy/cuda/policy.hpp"
-#include "RAJA/policy/cuda/MemUtils_CUDA.hpp"
-#include "RAJA/policy/cuda/raja_cudaerrchk.hpp"
-#include "RAJA/util/resource.hpp"
-
-namespace RAJA
-{
-
-template <typename BODY>
-__global__ void launch_global_fcn(BODY body_in)
-{
-
-  LaunchContext ctx;
-
-  using RAJA::internal::thread_privatize;
-  auto privatizer = thread_privatize(body_in);
-  auto& body = privatizer.get_priv();
-
-  //Set pointer to shared memory
-  extern __shared__ char raja_shmem_ptr[];
-  ctx.shared_mem_ptr = raja_shmem_ptr;
-
-  body(ctx);
-}
-
-template <bool async>
-struct LaunchExecute<RAJA::cuda_launch_t<async, 1>> {
-// cuda_launch_t num_threads set to 1, but not used in launch of kernel
-
-  template <typename BODY_IN>
-  static void exec(const LaunchParams &params, const char *kernel_name, BODY_IN &&body_in)
-  {
-
-    using BODY = camp::decay<BODY_IN>;
-
-    auto func = launch_global_fcn<BODY>;
-
-    resources::Cuda cuda_res = resources::Cuda::get_default();
-    //
-    // Compute the number of blocks and threads
-    //
-    cuda_dim_t gridSize{ static_cast<cuda_dim_member_t>(params.teams.value[0]),
-                         static_cast<cuda_dim_member_t>(params.teams.value[1]),
-                         static_cast<cuda_dim_member_t>(params.teams.value[2]) };
-
-    cuda_dim_t blockSize{ static_cast<cuda_dim_member_t>(params.threads.value[0]),
-                          static_cast<cuda_dim_member_t>(params.threads.value[1]),
-                          static_cast<cuda_dim_member_t>(params.threads.value[2]) };
-
-    // Only launch kernel if we have something to iterate over
-    constexpr cuda_dim_member_t zero = 0;
-    if ( gridSize.x  > zero && gridSize.y  > zero && gridSize.z  > zero &&
-         blockSize.x > zero && blockSize.y > zero && blockSize.z > zero ) {
-
-
-      RAJA_FT_BEGIN;
-      {
-        //
-        // Privatize the loop_body, using make_launch_body to setup reductions
-        //
-        BODY body = RAJA::cuda::make_launch_body(gridSize, blockSize, params.shared_mem_size, cuda_res, std::forward<BODY_IN>(body_in));
-
-        //
-        // Configure plugins
-        //
-        util::PluginContext context{util::make_context<RAJA::cuda_launch_t<async,1>>()};
-        util::callPreCapturePlugins(context);
-
-        using RAJA::util::trigger_updates_before;
-        auto p_body = trigger_updates_before(body);
-
-        util::callPostCapturePlugins(context);
-
-        util::callPreLaunchPlugins(context);
-
-        //
-        // Launch the kernel
-        //
-        void *args[] = {(void*)&p_body};
-        RAJA::cuda::launch((const void*)func, gridSize, blockSize, args, params.shared_mem_size, cuda_res, async, kernel_name);
-
-        util::callPostLaunchPlugins(context);
-      }
-
-      RAJA_FT_END;
-
-    }
-
-  };
-
-  template <typename BODY_IN>
-  static resources::EventProxy<resources::Resource>
-  exec(RAJA::resources::Resource res, const LaunchParams &params, const char *kernel_name, BODY_IN &&body_in)
-  {
-
-    using BODY = camp::decay<BODY_IN>;
-
-    auto func = launch_global_fcn<BODY>;
-
-    // Get the concrete resource
-    resources::Cuda cuda_res = res.get<RAJA::resources::Cuda>();
-
-    //
-    // Compute the number of blocks and threads
-    //
-
-    cuda_dim_t gridSize{ static_cast<cuda_dim_member_t>(params.teams.value[0]),
-                         static_cast<cuda_dim_member_t>(params.teams.value[1]),
-                         static_cast<cuda_dim_member_t>(params.teams.value[2]) };
-
-    cuda_dim_t blockSize{ static_cast<cuda_dim_member_t>(params.threads.value[0]),
-                          static_cast<cuda_dim_member_t>(params.threads.value[1]),
-                          static_cast<cuda_dim_member_t>(params.threads.value[2]) };
-
-    // Only launch kernel if we have something to iterate over
-    constexpr cuda_dim_member_t zero = 0;
-    if ( gridSize.x  > zero && gridSize.y  > zero && gridSize.z  > zero &&
-         blockSize.x > zero && blockSize.y > zero && blockSize.z > zero ) {
-
-      RAJA_FT_BEGIN;
-
-      {
-        //
-        // Privatize the loop_body, using make_launch_body to setup reductions
-        //
-        BODY body = RAJA::cuda::make_launch_body(
-            gridSize, blockSize, params.shared_mem_size, cuda_res, std::forward<BODY_IN>(body_in));
-
-        //
-        // Configure plugins
-        //
-        util::PluginContext context{util::make_context<RAJA::cuda_launch_t<async,1>>()};
-        util::callPreCapturePlugins(context);
-
-        using RAJA::util::trigger_updates_before;
-        auto p_body = trigger_updates_before(body);
-
-        util::callPostCapturePlugins(context);
-
-        util::callPreLaunchPlugins(context);
-
-        //
-        // Launch the kernel
-        //
-        void *args[] = {(void*)&p_body};
-        RAJA::cuda::launch((const void*)func, gridSize, blockSize, args, params.shared_mem_size, cuda_res, async, kernel_name);
-
-        util::callPostLaunchPlugins(context);
-      }
-
-      RAJA_FT_END;
-    }
-
-    return resources::EventProxy<resources::Resource>(res);
-  }
-
-};
-
-
-template <typename BODY, int num_threads, size_t BLOCKS_PER_SM>
-__launch_bounds__(num_threads, BLOCKS_PER_SM) __global__
-    void launch_global_fcn_fixed(BODY body_in)
-{
-
-  LaunchContext ctx;
-
-  using RAJA::internal::thread_privatize;
-  auto privatizer = thread_privatize(body_in);
-  auto& body = privatizer.get_priv();
-
-  //Set pointer to shared memory
-  extern __shared__ char raja_shmem_ptr[];
-  ctx.shared_mem_ptr = raja_shmem_ptr;
-
-  body(ctx);
-}
-
-template <bool async, int nthreads, size_t BLOCKS_PER_SM>
-struct LaunchExecute<RAJA::policy::cuda::cuda_launch_explicit_t<async, nthreads, BLOCKS_PER_SM>> {
-
-  template <typename BODY_IN>
-  static void exec(const LaunchParams &params, const char *kernel_name, BODY_IN &&body_in)
-  {
-
-    using BODY = camp::decay<BODY_IN>;
-
-    auto func = launch_global_fcn_fixed<BODY, nthreads, BLOCKS_PER_SM>;
-
-    resources::Cuda cuda_res = resources::Cuda::get_default();
-
-    //
-    // Compute the number of blocks and threads
-    //
-
-    cuda_dim_t gridSize{ static_cast<cuda_dim_member_t>(params.teams.value[0]),
-                         static_cast<cuda_dim_member_t>(params.teams.value[1]),
-                         static_cast<cuda_dim_member_t>(params.teams.value[2]) };
-
-    cuda_dim_t blockSize{ static_cast<cuda_dim_member_t>(params.threads.value[0]),
-                          static_cast<cuda_dim_member_t>(params.threads.value[1]),
-                          static_cast<cuda_dim_member_t>(params.threads.value[2]) };
-
-    // Only launch kernel if we have something to iterate over
-    constexpr cuda_dim_member_t zero = 0;
-    if ( gridSize.x  > zero && gridSize.y  > zero && gridSize.z  > zero &&
-         blockSize.x > zero && blockSize.y > zero && blockSize.z > zero ) {
-
-      RAJA_FT_BEGIN;
-
-      {
-        //
-        // Privatize the loop_body, using make_launch_body to setup reductions
-        //
-        BODY body = RAJA::cuda::make_launch_body(
-            gridSize, blockSize, params.shared_mem_size, cuda_res, std::forward<BODY_IN>(body_in));
-
-        //
-        // Configure plugins
-        //
-        util::PluginContext context{util::make_context<RAJA::policy::cuda::cuda_launch_explicit_t<async, nthreads, BLOCKS_PER_SM>>()};
-        util::callPreCapturePlugins(context);
-
-        using RAJA::util::trigger_updates_before;
-        auto p_body = trigger_updates_before(body);
-
-        util::callPostCapturePlugins(context);
-
-        util::callPreLaunchPlugins(context);
-
-        //
-        // Launch the kernel
-        //
-        void *args[] = {(void*)&p_body};
-        RAJA::cuda::launch((const void*)func, gridSize, blockSize, args, params.shared_mem_size, cuda_res, async, kernel_name);
-
-        util::callPostLaunchPlugins(context);
-      }
-
-      RAJA_FT_END;
-    }
-
-  }
-}
-
-  template <typename BODY_IN>
-  static resources::EventProxy<resources::Resource>
-  exec(RAJA::resources::Resource res, const LaunchParams &params, const char *kernel_name, BODY_IN &&body_in)
-  {
-
-    using BODY = camp::decay<BODY_IN>;
-
-    auto func = launch_global_fcn_fixed<BODY, nthreads, BLOCKS_PER_SM>;
-
-    //Get the concrete resource
-    resources::Cuda cuda_res = res.get<RAJA::resources::Cuda>();
-
-    //
-    // Compute the number of blocks and threads
-    //
-
-    cuda_dim_t gridSize{ static_cast<cuda_dim_member_t>(params.teams.value[0]),
-                         static_cast<cuda_dim_member_t>(params.teams.value[1]),
-                         static_cast<cuda_dim_member_t>(params.teams.value[2]) };
-
-    cuda_dim_t blockSize{ static_cast<cuda_dim_member_t>(params.threads.value[0]),
-                          static_cast<cuda_dim_member_t>(params.threads.value[1]),
-                          static_cast<cuda_dim_member_t>(params.threads.value[2]) };
-
-    // Only launch kernel if we have something to iterate over
-    constexpr cuda_dim_member_t zero = 0;
-    if ( gridSize.x  > zero && gridSize.y  > zero && gridSize.z  > zero &&
-         blockSize.x > zero && blockSize.y > zero && blockSize.z > zero ) {
-
-      RAJA_FT_BEGIN;
-
-      {
-        //
-        // Privatize the loop_body, using make_launch_body to setup reductions
-        //
-        BODY body = RAJA::cuda::make_launch_body(
-            gridSize, blockSize, params.shared_mem_size, cuda_res, std::forward<BODY_IN>(body_in));
-
-
-        //
-        // Configure plugins
-        //
-        util::PluginContext context{util::make_context<RAJA::policy::cuda::cuda_launch_explicit_t<async, nthreads, BLOCKS_PER_SM>>()};
-        util::callPreCapturePlugins(context);
-
-        using RAJA::util::trigger_updates_before;
-        auto p_body = trigger_updates_before(body);
-
-        util::callPostCapturePlugins(context);
-
-        util::callPreLaunchPlugins(context);
-
-        //
-        // Launch the kernel
-        //
-        void *args[] = {(void*)&p_body};
-        RAJA::cuda::launch((const void*)func, gridSize, blockSize, args, params.shared_mem_size, cuda_res, async, kernel_name);
-
-        util::callPostLaunchPlugins(context);
-      }
-
-      RAJA_FT_END;
-    }
-
-    return resources::EventProxy<resources::Resource>(res);
-  }
-
-};
-
-/*
-   CUDA global thread mapping
-*/
-template<int ... DIM>
-struct cuda_global_thread;
-
-using cuda_global_thread_x = cuda_global_thread<0>;
-using cuda_global_thread_y = cuda_global_thread<1>;
-using cuda_global_thread_z = cuda_global_thread<2>;
-
-template <typename SEGMENT, int DIM>
-struct LoopExecute<cuda_global_thread<DIM>, SEGMENT> {
-
-  template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment,
-      BODY const &body)
-  {
-
-    const int len = segment.end() - segment.begin();
-    {
-      const int tx = internal::get_cuda_dim<DIM>(threadIdx) +
-        internal::get_cuda_dim<DIM>(blockDim)*internal::get_cuda_dim<DIM>(blockIdx);
-
-      if (tx < len) body(*(segment.begin() + tx));
-    }
-  }
-};
-
-using cuda_global_thread_xy = cuda_global_thread<0,1>;
-using cuda_global_thread_xz = cuda_global_thread<0,2>;
-using cuda_global_thread_yx = cuda_global_thread<1,0>;
-using cuda_global_thread_yz = cuda_global_thread<1,2>;
-using cuda_global_thread_zx = cuda_global_thread<2,0>;
-using cuda_global_thread_zy = cuda_global_thread<2,1>;
-
-template <typename SEGMENT, int DIM0, int DIM1>
-struct LoopExecute<cuda_global_thread<DIM0, DIM1>, SEGMENT> {
-
-  template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment0,
-      SEGMENT const &segment1,
-      BODY const &body)
-  {
-    const int len1 = segment1.end() - segment1.begin();
-    const int len0 = segment0.end() - segment0.begin();
-    {
-      const int tx = internal::get_cuda_dim<DIM0>(threadIdx) +
-        internal::get_cuda_dim<DIM0>(blockDim)*internal::get_cuda_dim<DIM0>(blockIdx);
-
-      const int ty = internal::get_cuda_dim<DIM1>(threadIdx) +
-        internal::get_cuda_dim<DIM1>(blockDim)*internal::get_cuda_dim<DIM1>(blockIdx);
-
-      if (tx < len0 && ty < len1)
-        body(*(segment0.begin() + tx), *(segment1.begin() + ty));
-    }
-  }
-};
-
-using cuda_global_thread_xyz = cuda_global_thread<0,1,2>;
-using cuda_global_thread_xzy = cuda_global_thread<0,2,1>;
-using cuda_global_thread_yxz = cuda_global_thread<1,0,2>;
-using cuda_global_thread_yzx = cuda_global_thread<1,2,0>;
-using cuda_global_thread_zxy = cuda_global_thread<2,0,1>;
-using cuda_global_thread_zyx = cuda_global_thread<2,1,0>;
-
-template <typename SEGMENT, int DIM0, int DIM1, int DIM2>
-struct LoopExecute<cuda_global_thread<DIM0, DIM1, DIM2>, SEGMENT> {
-
-  template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment0,
-      SEGMENT const &segment1,
-      SEGMENT const &segment2,
-      BODY const &body)
-  {
-    const int len2 = segment2.end() - segment2.begin();
-    const int len1 = segment1.end() - segment1.begin();
-    const int len0 = segment0.end() - segment0.begin();
-    {
-      const int tx = internal::get_cuda_dim<DIM0>(threadIdx) +
-        internal::get_cuda_dim<DIM0>(blockDim)*internal::get_cuda_dim<DIM0>(blockIdx);
-
-      const int ty = internal::get_cuda_dim<DIM1>(threadIdx) +
-        internal::get_cuda_dim<DIM1>(blockDim)*internal::get_cuda_dim<DIM1>(blockIdx);
-
-      const int tz = internal::get_cuda_dim<DIM2>(threadIdx) +
-        internal::get_cuda_dim<DIM2>(blockDim)*internal::get_cuda_dim<DIM2>(blockIdx);
-
-      if (tx < len0 && ty < len1 && tz < len2)
-        body(*(segment0.begin() + tx),
-             *(segment1.begin() + ty),
-             *(segment1.begin() + ty));
-    }
-  }
-};
-
-/*
-Reshape threads in a block into a 1D iteration space
-*/
-template<int ... dim>
-struct cuda_flatten_block_threads_direct{};
-
-using cuda_flatten_block_threads_xy_direct = cuda_flatten_block_threads_direct<0,1>;
-using cuda_flatten_block_threads_xz_direct = cuda_flatten_block_threads_direct<0,2>;
-using cuda_flatten_block_threads_yx_direct = cuda_flatten_block_threads_direct<1,0>;
-using cuda_flatten_block_threads_yz_direct = cuda_flatten_block_threads_direct<1,2>;
-using cuda_flatten_block_threads_zx_direct = cuda_flatten_block_threads_direct<2,0>;
-using cuda_flatten_block_threads_zy_direct = cuda_flatten_block_threads_direct<2,1>;
-
-using cuda_flatten_block_threads_xyz_direct = cuda_flatten_block_threads_direct<0,1,2>;
-using cuda_flatten_block_threads_xzy_direct = cuda_flatten_block_threads_direct<0,2,1>;
-using cuda_flatten_block_threads_yxz_direct = cuda_flatten_block_threads_direct<1,0,2>;
-using cuda_flatten_block_threads_yzx_direct = cuda_flatten_block_threads_direct<1,2,0>;
-using cuda_flatten_block_threads_zxy_direct = cuda_flatten_block_threads_direct<2,0,1>;
-using cuda_flatten_block_threads_zyx_direct = cuda_flatten_block_threads_direct<2,1,0>;
-
-template<int ... dim>
-struct cuda_flatten_block_threads_loop{};
-
-using cuda_flatten_block_threads_xy_loop = cuda_flatten_block_threads_loop<0,1>;
-using cuda_flatten_block_threads_xz_loop = cuda_flatten_block_threads_loop<0,2>;
-using cuda_flatten_block_threads_yx_loop = cuda_flatten_block_threads_loop<1,0>;
-using cuda_flatten_block_threads_yz_loop = cuda_flatten_block_threads_loop<1,2>;
-using cuda_flatten_block_threads_zx_loop = cuda_flatten_block_threads_loop<2,0>;
-using cuda_flatten_block_threads_zy_loop = cuda_flatten_block_threads_loop<2,1>;
-
-using cuda_flatten_block_threads_xyz_loop = cuda_flatten_block_threads_loop<0,1,2>;
-using cuda_flatten_block_threads_xzy_loop = cuda_flatten_block_threads_loop<0,2,1>;
-using cuda_flatten_block_threads_yxz_loop = cuda_flatten_block_threads_loop<1,0,2>;
-using cuda_flatten_block_threads_yzx_loop = cuda_flatten_block_threads_loop<1,2,0>;
-using cuda_flatten_block_threads_zxy_loop = cuda_flatten_block_threads_loop<2,0,1>;
-using cuda_flatten_block_threads_zyx_loop = cuda_flatten_block_threads_loop<2,1,0>;
-
-template<typename SEGMENT, int DIM0, int DIM1>
-struct LoopExecute<cuda_flatten_block_threads_direct<DIM0, DIM1>, SEGMENT>
-{
-  template<typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment,
-      BODY const &body)
-  {
-
-    const int len = segment.end() - segment.begin();
-    {
-      const int tx = internal::get_cuda_dim<DIM0>(threadIdx);
-      const int ty = internal::get_cuda_dim<DIM1>(threadIdx);
-      const int bx = internal::get_cuda_dim<DIM0>(blockDim);
-      const int tid = tx + bx*ty;
-
-      if (tid < len) body(*(segment.begin() + tid));
-    }
-  }
-};
-
-template<typename SEGMENT, int DIM0, int DIM1>
-struct LoopExecute<cuda_flatten_block_threads_loop<DIM0, DIM1>, SEGMENT>
-{
-  template<typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment,
-      BODY const &body)
-  {
-    const int len = segment.end() - segment.begin();
-
-    const int tx = internal::get_cuda_dim<DIM0>(threadIdx);
-    const int ty = internal::get_cuda_dim<DIM1>(threadIdx);
-
-    const int bx = internal::get_cuda_dim<DIM0>(blockDim);
-    const int by = internal::get_cuda_dim<DIM1>(blockDim);
-
-    for(int tid = tx + bx*ty; tid < len; tid += bx*by) {
-      body(*(segment.begin() + tid));
-    }
-
-  }
-};
-
-template<typename SEGMENT, int DIM0, int DIM1, int DIM2>
-struct LoopExecute<cuda_flatten_block_threads_direct<DIM0, DIM1, DIM2>, SEGMENT>
-{
-  template<typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment,
-      BODY const &body)
-  {
-    const int len = segment.end() - segment.begin();
-    {
-      const int tx = internal::get_cuda_dim<DIM0>(threadIdx);
-      const int ty = internal::get_cuda_dim<DIM1>(threadIdx);
-      const int tz = internal::get_cuda_dim<DIM2>(threadIdx);
-      const int bx = internal::get_cuda_dim<DIM0>(blockDim);
-      const int by = internal::get_cuda_dim<DIM1>(blockDim);
-      const int tid = tx + bx*(ty + by*tz);
-
-      if (tid < len) body(*(segment.begin() + tid));
-    }
-  }
-};
-
-template<typename SEGMENT, int DIM0, int DIM1, int DIM2>
-struct LoopExecute<cuda_flatten_block_threads_loop<DIM0, DIM1, DIM2>, SEGMENT>
-{
-  template<typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment,
-      BODY const &body)
-  {
-    const int len = segment.end() - segment.begin();
-
-    const int tx = internal::get_cuda_dim<DIM0>(threadIdx);
-    const int ty = internal::get_cuda_dim<DIM1>(threadIdx);
-    const int tz = internal::get_cuda_dim<DIM2>(threadIdx);
-    const int bx = internal::get_cuda_dim<DIM0>(blockDim);
-    const int by = internal::get_cuda_dim<DIM1>(blockDim);
-    const int bz = internal::get_cuda_dim<DIM2>(blockDim);
-
-    for(int tid = tx + bx*(ty + by*tz); tid < len; tid += bx*by*bz) {
-      body(*(segment.begin() + tid));
-    }
-
-  }
-};
-
-
-/*
-  CUDA thread loops with block strides
-*/
-template <typename SEGMENT, int DIM>
-struct LoopExecute<cuda_thread_xyz_loop<DIM>, SEGMENT> {
-
-  template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment,
-      BODY const &body)
-  {
-
-    const int len = segment.end() - segment.begin();
-
-    for (int tx = internal::get_cuda_dim<DIM>(threadIdx);
-         tx < len;
-         tx += internal::get_cuda_dim<DIM>(blockDim) )
-    {
-      body(*(segment.begin() + tx));
-    }
-  }
-};
-
-/*
-  CUDA thread direct mappings
-*/
-template <typename SEGMENT, int DIM>
-struct LoopExecute<cuda_thread_xyz_direct<DIM>, SEGMENT> {
-
-  template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment,
-      BODY const &body)
-  {
-
-    const int len = segment.end() - segment.begin();
-    {
-      const int tx = internal::get_cuda_dim<DIM>(threadIdx);
-      if (tx < len) body(*(segment.begin() + tx));
-    }
-  }
-};
-
-
-/*
-  CUDA block loops with grid strides
-*/
-template <typename SEGMENT, int DIM>
-struct LoopExecute<cuda_block_xyz_loop<DIM>, SEGMENT> {
-
-  template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment,
-      BODY const &body)
-  {
-
-    const int len = segment.end() - segment.begin();
-
-    for (int bx = internal::get_cuda_dim<DIM>(blockIdx);
-         bx < len;
-         bx += internal::get_cuda_dim<DIM>(gridDim) ) {
-      body(*(segment.begin() + bx));
-    }
-  }
-};
-
-/*
-  CUDA block direct mappings
-*/
-template <typename SEGMENT, int DIM>
-struct LoopExecute<cuda_block_xyz_direct<DIM>, SEGMENT> {
-
-  template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment,
-      BODY const &body)
-  {
-
-    const int len = segment.end() - segment.begin();
-    {
-      const int bx = internal::get_cuda_dim<DIM>(blockIdx);
-      if (bx < len) body(*(segment.begin() + bx));
-    }
-  }
-};
-
-/*
-  CUDA thread loops with block strides + Return Index
-*/
-template <typename SEGMENT, int DIM>
-struct LoopICountExecute<cuda_thread_xyz_loop<DIM>, SEGMENT> {
-
-  template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment,
-      BODY const &body)
-  {
-
-    const int len = segment.end() - segment.begin();
-
-    for (int tx = internal::get_cuda_dim<DIM>(threadIdx);
-         tx < len;
-         tx += internal::get_cuda_dim<DIM>(blockDim) )
-    {
-      body(*(segment.begin() + tx), tx);
-    }
-  }
-};
-
-/*
-  CUDA thread direct mappings
-*/
-template <typename SEGMENT, int DIM>
-struct LoopICountExecute<cuda_thread_xyz_direct<DIM>, SEGMENT> {
-
-  template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment,
-      BODY const &body)
-  {
-
-    const int len = segment.end() - segment.begin();
-    {
-      const int tx = internal::get_cuda_dim<DIM>(threadIdx);
-      if (tx < len) body(*(segment.begin() + tx), tx);
-    }
-  }
-};
-
-/*
-  CUDA block loops with grid strides
-*/
-template <typename SEGMENT, int DIM>
-struct LoopICountExecute<cuda_block_xyz_loop<DIM>, SEGMENT> {
-
-  template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment,
-      BODY const &body)
-  {
-
-    const int len = segment.end() - segment.begin();
-
-    for (int bx = internal::get_cuda_dim<DIM>(blockIdx);
-         bx < len;
-         bx += internal::get_cuda_dim<DIM>(gridDim) ) {
-      body(*(segment.begin() + bx), bx);
-    }
-  }
-};
-
-/*
-  CUDA block direct mappings
-*/
-template <typename SEGMENT, int DIM>
-struct LoopICountExecute<cuda_block_xyz_direct<DIM>, SEGMENT> {
-
-  template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment,
-      BODY const &body)
-  {
-
-    const int len = segment.end() - segment.begin();
-    {
-      const int bx = internal::get_cuda_dim<DIM>(blockIdx);
-      if (bx < len) body(*(segment.begin() + bx), bx);
-    }
-  }
-};
-
-// perfectly nested cuda direct policies
-using cuda_block_xy_nested_direct = cuda_block_xyz_direct<0,1>;
-using cuda_block_xz_nested_direct = cuda_block_xyz_direct<0,2>;
-using cuda_block_yx_nested_direct = cuda_block_xyz_direct<1,0>;
-using cuda_block_yz_nested_direct = cuda_block_xyz_direct<1,2>;
-using cuda_block_zx_nested_direct = cuda_block_xyz_direct<2,0>;
-using cuda_block_zy_nested_direct = cuda_block_xyz_direct<2,1>;
-
-using cuda_block_xyz_nested_direct = cuda_block_xyz_direct<0,1,2>;
-using cuda_block_xzy_nested_direct = cuda_block_xyz_direct<0,2,1>;
-using cuda_block_yxz_nested_direct = cuda_block_xyz_direct<1,0,2>;
-using cuda_block_yzx_nested_direct = cuda_block_xyz_direct<1,2,0>;
-using cuda_block_zxy_nested_direct = cuda_block_xyz_direct<2,0,1>;
-using cuda_block_zyx_nested_direct = cuda_block_xyz_direct<2,1,0>;
-
-template <typename SEGMENT, int DIM0, int DIM1>
-struct LoopExecute<cuda_block_xyz_direct<DIM0, DIM1>, SEGMENT> {
-
-  template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment0,
-      SEGMENT const &segment1,
-      BODY const &body)
-  {
-    const int len1 = segment1.end() - segment1.begin();
-    const int len0 = segment0.end() - segment0.begin();
-    {
-      const int tx = internal::get_cuda_dim<DIM0>(blockIdx);
-      const int ty = internal::get_cuda_dim<DIM1>(blockIdx);
-      if (tx < len0 && ty < len1)
-        body(*(segment0.begin() + tx), *(segment1.begin() + ty));
-    }
-  }
-};
-
-template <typename SEGMENT, int DIM0, int DIM1, int DIM2>
-struct LoopExecute<cuda_block_xyz_direct<DIM0, DIM1, DIM2>, SEGMENT> {
-
-  template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment0,
-      SEGMENT const &segment1,
-      SEGMENT const &segment2,
-      BODY const &body)
-  {
-    const int len2 = segment2.end() - segment2.begin();
-    const int len1 = segment1.end() - segment1.begin();
-    const int len0 = segment0.end() - segment0.begin();
-    {
-      const int tx = internal::get_cuda_dim<DIM0>(blockIdx);
-      const int ty = internal::get_cuda_dim<DIM1>(blockIdx);
-      const int tz = internal::get_cuda_dim<DIM2>(blockIdx);
-      if (tx < len0 && ty < len1 && tz < len2)
-        body(*(segment0.begin() + tx),
-             *(segment1.begin() + ty),
-             *(segment2.begin() + tz));
-    }
-  }
-};
-
-/*
-  Perfectly nested cuda direct policies
-  Return local index
-*/
-template <typename SEGMENT, int DIM0, int DIM1>
-struct LoopICountExecute<cuda_block_xyz_direct<DIM0, DIM1>, SEGMENT> {
-
-  template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment0,
-      SEGMENT const &segment1,
-      BODY const &body)
-  {
-    const int len1 = segment1.end() - segment1.begin();
-    const int len0 = segment0.end() - segment0.begin();
-    {
-      const int tx = internal::get_cuda_dim<DIM0>(blockIdx);
-      const int ty = internal::get_cuda_dim<DIM1>(blockIdx);
-      if (tx < len0 && ty < len1)
-        body(*(segment0.begin() + tx), *(segment1.begin() + ty),
-             tx, ty);
-    }
-  }
-};
-
-template <typename SEGMENT, int DIM0, int DIM1, int DIM2>
-struct LoopICountExecute<cuda_block_xyz_direct<DIM0, DIM1, DIM2>, SEGMENT> {
-
-  template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment0,
-      SEGMENT const &segment1,
-      SEGMENT const &segment2,
-      BODY const &body)
-  {
-    const int len2 = segment2.end() - segment2.begin();
-    const int len1 = segment1.end() - segment1.begin();
-    const int len0 = segment0.end() - segment0.begin();
-    {
-      const int tx = internal::get_cuda_dim<DIM0>(blockIdx);
-      const int ty = internal::get_cuda_dim<DIM1>(blockIdx);
-      const int tz = internal::get_cuda_dim<DIM2>(blockIdx);
-      if (tx < len0 && ty < len1 && tz < len2)
-        body(*(segment0.begin() + tx),
-             *(segment1.begin() + ty),
-             *(segment2.begin() + tz), tx, ty, tz);
-    }
-  }
-};
-
-// perfectly nested cuda loop policies
-using cuda_block_xy_nested_loop = cuda_block_xyz_loop<0,1>;
-using cuda_block_xz_nested_loop = cuda_block_xyz_loop<0,2>;
-using cuda_block_yx_nested_loop = cuda_block_xyz_loop<1,0>;
-using cuda_block_yz_nested_loop = cuda_block_xyz_loop<1,2>;
-using cuda_block_zx_nested_loop = cuda_block_xyz_loop<2,0>;
-using cuda_block_zy_nested_loop = cuda_block_xyz_loop<2,1>;
-
-using cuda_block_xyz_nested_loop = cuda_block_xyz_loop<0,1,2>;
-using cuda_block_xzy_nested_loop = cuda_block_xyz_loop<0,2,1>;
-using cuda_block_yxz_nested_loop = cuda_block_xyz_loop<1,0,2>;
-using cuda_block_yzx_nested_loop = cuda_block_xyz_loop<1,2,0>;
-using cuda_block_zxy_nested_loop = cuda_block_xyz_loop<2,0,1>;
-using cuda_block_zyx_nested_loop = cuda_block_xyz_loop<2,1,0>;
-
-template <typename SEGMENT, int DIM0, int DIM1>
-struct LoopExecute<cuda_block_xyz_loop<DIM0, DIM1>, SEGMENT> {
-
-  template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment0,
-      SEGMENT const &segment1,
-      BODY const &body)
-  {
-    const int len1 = segment1.end() - segment1.begin();
-    const int len0 = segment0.end() - segment0.begin();
-    {
-
-      for (int bx = internal::get_cuda_dim<DIM0>(blockIdx);
-           bx < len0;
-           bx += internal::get_cuda_dim<DIM0>(gridDim))
-      {
-        for (int by = internal::get_cuda_dim<DIM1>(blockIdx);
-             by < len1;
-             by += internal::get_cuda_dim<DIM1>(gridDim))
-        {
-
-          body(*(segment0.begin() + bx), *(segment1.begin() + by));
-        }
-      }
-    }
-  }
-};
-
-template <typename SEGMENT, int DIM0, int DIM1, int DIM2>
-struct LoopExecute<cuda_block_xyz_loop<DIM0, DIM1, DIM2>, SEGMENT> {
-
-  template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment0,
-      SEGMENT const &segment1,
-      SEGMENT const &segment2,
-      BODY const &body)
-  {
-    const int len2 = segment2.end() - segment2.begin();
-    const int len1 = segment1.end() - segment1.begin();
-    const int len0 = segment0.end() - segment0.begin();
-
-    for (int bx = internal::get_cuda_dim<DIM0>(blockIdx);
-         bx < len0;
-         bx += internal::get_cuda_dim<DIM0>(gridDim))
-    {
-
-      for (int by = internal::get_cuda_dim<DIM1>(blockIdx);
-           by < len1;
-           by += internal::get_cuda_dim<DIM1>(gridDim))
-      {
-
-        for (int bz = internal::get_cuda_dim<DIM2>(blockIdx);
-             bz < len2;
-             bz += internal::get_cuda_dim<DIM2>(gridDim))
-        {
-
-          body(*(segment0.begin() + bx),
-               *(segment1.begin() + by),
-               *(segment2.begin() + bz));
-        }
-      }
-    }
-  }
-};
-
-/*
-  perfectly nested cuda loop policies + returns local index
-*/
-template <typename SEGMENT, int DIM0, int DIM1>
-struct LoopICountExecute<cuda_block_xyz_loop<DIM0, DIM1>, SEGMENT> {
-
-  template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment0,
-      SEGMENT const &segment1,
-      BODY const &body)
-  {
-    const int len1 = segment1.end() - segment1.begin();
-    const int len0 = segment0.end() - segment0.begin();
-    {
-
-      for (int bx = internal::get_cuda_dim<DIM0>(blockIdx);
-           bx < len0;
-           bx += internal::get_cuda_dim<DIM0>(gridDim))
-      {
-        for (int by = internal::get_cuda_dim<DIM1>(blockIdx);
-             by < len1;
-             by += internal::get_cuda_dim<DIM1>(gridDim))
-        {
-
-          body(*(segment0.begin() + bx), *(segment1.begin() + by), bx, by);
-        }
-      }
-    }
-  }
-};
-
-
-template <typename SEGMENT, int DIM0, int DIM1, int DIM2>
-struct LoopICountExecute<cuda_block_xyz_loop<DIM0, DIM1, DIM2>, SEGMENT> {
-
-  template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment0,
-      SEGMENT const &segment1,
-      SEGMENT const &segment2,
-      BODY const &body)
-  {
-    const int len2 = segment2.end() - segment2.begin();
-    const int len1 = segment1.end() - segment1.begin();
-    const int len0 = segment0.end() - segment0.begin();
-
-    for (int bx = internal::get_cuda_dim<DIM0>(blockIdx);
-         bx < len0;
-         bx += internal::get_cuda_dim<DIM0>(gridDim))
-    {
-
-      for (int by = internal::get_cuda_dim<DIM1>(blockIdx);
-           by < len1;
-           by += internal::get_cuda_dim<DIM1>(gridDim))
-      {
-
-        for (int bz = internal::get_cuda_dim<DIM2>(blockIdx);
-             bz < len2;
-             bz += internal::get_cuda_dim<DIM2>(gridDim))
-        {
-
-          body(*(segment0.begin() + bx),
-               *(segment1.begin() + by),
-               *(segment2.begin() + bz), bx, by, bz);
-        }
-      }
-    }
-  }
-};
-
-
-template <typename SEGMENT, int DIM>
-struct TileExecute<cuda_thread_xyz_loop<DIM>, SEGMENT> {
-
-  template <typename TILE_T, typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      TILE_T tile_size,
-      SEGMENT const &segment,
-      BODY const &body)
-  {
-
-    const int len = segment.end() - segment.begin();
-
-    for (int tx = internal::get_cuda_dim<DIM>(threadIdx) * tile_size;
-         tx < len;
-         tx += internal::get_cuda_dim<DIM>(blockDim) * tile_size)
-    {
-      body(segment.slice(tx, tile_size));
-    }
-  }
-};
-
-
-template <typename SEGMENT, int DIM>
-struct TileExecute<cuda_thread_xyz_direct<DIM>, SEGMENT> {
-
-  template <typename TILE_T, typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      TILE_T tile_size,
-      SEGMENT const &segment,
-      BODY const &body)
-  {
-
-    const int len = segment.end() - segment.begin();
-
-    int tx = internal::get_cuda_dim<DIM>(threadIdx) * tile_size;
-    if(tx < len)
-    {
-      body(segment.slice(tx, tile_size));
-    }
-  }
-};
-
-
-template <typename SEGMENT, int DIM>
-struct TileExecute<cuda_block_xyz_loop<DIM>, SEGMENT> {
-
-  template <typename TILE_T, typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      TILE_T tile_size,
-      SEGMENT const &segment,
-      BODY const &body)
-  {
-
-    const int len = segment.end() - segment.begin();
-
-    for (int tx = internal::get_cuda_dim<DIM>(blockIdx) * tile_size;
-
-         tx < len;
-
-         tx += internal::get_cuda_dim<DIM>(gridDim) * tile_size)
-    {
-      body(segment.slice(tx, tile_size));
-    }
-  }
-};
-
-
-template <typename SEGMENT, int DIM>
-struct TileExecute<cuda_block_xyz_direct<DIM>, SEGMENT> {
-
-  template <typename TILE_T, typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      TILE_T tile_size,
-      SEGMENT const &segment,
-      BODY const &body)
-  {
-
-    const int len = segment.end() - segment.begin();
-
-    int tx = internal::get_cuda_dim<DIM>(blockIdx) * tile_size;
-    if(tx < len){
-      body(segment.slice(tx, tile_size));
-    }
-  }
-};
-
-//Tile execute + return index
-template <typename SEGMENT, int DIM>
-struct TileICountExecute<cuda_thread_xyz_loop<DIM>, SEGMENT> {
-
-  template <typename TILE_T, typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      TILE_T tile_size,
-      SEGMENT const &segment,
-      BODY const &body)
-  {
-
-    const int len = segment.end() - segment.begin();
-
-    for (int tx = internal::get_cuda_dim<DIM>(threadIdx) * tile_size;
-         tx < len;
-         tx += internal::get_cuda_dim<DIM>(blockDim) * tile_size)
-    {
-      body(segment.slice(tx, tile_size), tx/tile_size);
-    }
-  }
-};
-
-
-template <typename SEGMENT, int DIM>
-struct TileICountExecute<cuda_thread_xyz_direct<DIM>, SEGMENT> {
-
-  template <typename TILE_T, typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      TILE_T tile_size,
-      SEGMENT const &segment,
-      BODY const &body)
-  {
-
-    const int len = segment.end() - segment.begin();
-
-    int tx = internal::get_cuda_dim<DIM>(threadIdx) * tile_size;
-    if(tx < len)
-    {
-      body(segment.slice(tx, tile_size), tx/tile_size);
-    }
-  }
-};
-
-
-template <typename SEGMENT, int DIM>
-struct TileICountExecute<cuda_block_xyz_loop<DIM>, SEGMENT> {
-
-  template <typename TILE_T, typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      TILE_T tile_size,
-      SEGMENT const &segment,
-      BODY const &body)
-  {
-
-    const int len = segment.end() - segment.begin();
-
-    for (int bx = internal::get_cuda_dim<DIM>(blockIdx) * tile_size;
-
-         bx < len;
-
-         bx += internal::get_cuda_dim<DIM>(gridDim) * tile_size)
-    {
-      body(segment.slice(bx, tile_size), bx/tile_size);
-    }
-  }
-};
-
-
-template <typename SEGMENT, int DIM>
-struct TileICountExecute<cuda_block_xyz_direct<DIM>, SEGMENT> {
-
-  template <typename TILE_T, typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      TILE_T tile_size,
-      SEGMENT const &segment,
-      BODY const &body)
-  {
-
-    const int len = segment.end() - segment.begin();
-
-    int bx = internal::get_cuda_dim<DIM>(blockIdx) * tile_size;
-    if(bx < len){
-      body(segment.slice(bx, tile_size), bx/tile_size);
-    }
-  }
-};
-
-}  // namespace RAJA
-#endif

From f4973ce0e0c3925da3a990d95cb125cce52c8239 Mon Sep 17 00:00:00 2001
From: Arturo Vargas <vargas45@llnl.gov>
Date: Mon, 14 Nov 2022 14:55:57 -0800
Subject: [PATCH 50/94] remove extra lines

---
 include/RAJA/policy/cuda/launch.hpp | 2 --
 1 file changed, 2 deletions(-)

diff --git a/include/RAJA/policy/cuda/launch.hpp b/include/RAJA/policy/cuda/launch.hpp
index 6c22ced4a9..0d84f01754 100644
--- a/include/RAJA/policy/cuda/launch.hpp
+++ b/include/RAJA/policy/cuda/launch.hpp
@@ -74,7 +74,6 @@ struct LaunchExecute<RAJA::cuda_launch_t<async, 1>> {
     if ( gridSize.x  > zero && gridSize.y  > zero && gridSize.z  > zero &&
          blockSize.x > zero && blockSize.y > zero && blockSize.z > zero ) {
 
-
       RAJA_FT_BEGIN;
       {
         //
@@ -105,7 +104,6 @@ struct LaunchExecute<RAJA::cuda_launch_t<async, 1>> {
       }
 
       RAJA_FT_END;
-
     }
 
   };

From 77c80cb6edfae6ac01d7e4a52625e2a63026750c Mon Sep 17 00:00:00 2001
From: Rich Hornung <hornung1@llnl.gov>
Date: Tue, 15 Nov 2022 13:13:53 -0800
Subject: [PATCH 51/94] Update build scripts to work with newer cmake version

---
 scripts/lc-builds/blueos_clang_omptarget.sh | 3 ++-
 scripts/lc-builds/blueos_xl_omptarget.sh    | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/scripts/lc-builds/blueos_clang_omptarget.sh b/scripts/lc-builds/blueos_clang_omptarget.sh
index e0aa7104a7..67a477288b 100755
--- a/scripts/lc-builds/blueos_clang_omptarget.sh
+++ b/scripts/lc-builds/blueos_clang_omptarget.sh
@@ -40,7 +40,8 @@ cmake \
   -DENABLE_OPENMP=On \
   -DENABLE_CUDA=Off \
   -DRAJA_ENABLE_TARGET_OPENMP=On \
-  -DOpenMP_CXX_FLAGS="-fopenmp;-fopenmp-targets=nvptx64-nvidia-cuda" \
+  -DBLT_OPENMP_COMPILE_FLAGS="-fopenmp;-fopenmp-targets=nvptx64-nvidia-cuda" \
+  -DBLT_OPENMP_LINK_FLAGS="-fopenmp;-fopenmp-targets=nvptx64-nvidia-cuda" \
   -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \
   "$@" \
   ..
diff --git a/scripts/lc-builds/blueos_xl_omptarget.sh b/scripts/lc-builds/blueos_xl_omptarget.sh
index 2b3fadcbb0..6eb7e162b2 100755
--- a/scripts/lc-builds/blueos_xl_omptarget.sh
+++ b/scripts/lc-builds/blueos_xl_omptarget.sh
@@ -37,7 +37,8 @@ cmake \
   -C ../host-configs/lc-builds/blueos/xl_X.cmake \
   -DENABLE_OPENMP=On \
   -DRAJA_ENABLE_TARGET_OPENMP=On \
-  -DOpenMP_CXX_FLAGS="-qoffload;-qsmp=omp;-qalias=noansi" \
+  -DBLT_OPENMP_COMPILE_FLAGS="-qoffload;-qsmp=omp;-qalias=noansi" \
+  -DBLT_OPENMP_LINK_FLAGS="-qoffload;-qsmp=omp;-qalias=noansi" \
   -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \
   "$@" \
   ..

From 289949c74bb0fe91e4bfb73cabc89cd4a682719c Mon Sep 17 00:00:00 2001
From: Arturo Vargas <vargas45@llnl.gov>
Date: Tue, 15 Nov 2022 14:32:08 -0800
Subject: [PATCH 52/94] integration test for plugin and launch

---
 test/include/RAJA_test-plugin-launchpol.hpp   | 35 ++++++++
 test/integration/plugin/CMakeLists.txt        | 11 +++
 .../plugin/test-plugin-launch.cpp.in          | 38 +++++++++
 .../plugin/tests/test-plugin-launch.hpp       | 85 +++++++++++++++++++
 4 files changed, 169 insertions(+)
 create mode 100644 test/include/RAJA_test-plugin-launchpol.hpp
 create mode 100644 test/integration/plugin/test-plugin-launch.cpp.in
 create mode 100644 test/integration/plugin/tests/test-plugin-launch.hpp

diff --git a/test/include/RAJA_test-plugin-launchpol.hpp b/test/include/RAJA_test-plugin-launchpol.hpp
new file mode 100644
index 0000000000..862f2018dd
--- /dev/null
+++ b/test/include/RAJA_test-plugin-launchpol.hpp
@@ -0,0 +1,35 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2016-22, Lawrence Livermore National Security, LLC
+// and RAJA project contributors. See the RAJA/LICENSE file for details.
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+//
+// Kernel execution policy lists used throughout plugin tests
+//
+
+#ifndef __RAJA_test_plugin_kernelpol_HPP__
+#define __RAJA_test_plugin_kernelpol_HPP__
+
+#include "RAJA/RAJA.hpp"
+
+#include "camp/list.hpp"
+
+// Sequential execution policy types
+using SequentialPluginLaunchExecPols = camp::list<RAJA::LaunchPolicy<RAJA::seq_launch_t>>;
+
+#if defined(RAJA_ENABLE_OPENMP)
+using OpenMPPluginLaunchExecPols = camp::list<RAJA::LaunchPolicy<RAJA::omp_launch_t>>;
+#endif
+
+#if defined(RAJA_ENABLE_CUDA)
+using CudaPluginLaunchExecPols = camp::list<RAJA::LaunchPolicy<RAJA::cuda_launch_t<false>>>;
+#endif
+
+#if defined(RAJA_ENABLE_HIP)
+using HipPluginLaunchExecPols = camp::list<RAJA::LaunchPolicy<RAJA::hip_launch_t<false>>>;
+
+#endif
+
+#endif  // __RAJA_test_plugin_kernelpol_HPP__
diff --git a/test/integration/plugin/CMakeLists.txt b/test/integration/plugin/CMakeLists.txt
index da41e2a0bd..0d8d56643b 100644
--- a/test/integration/plugin/CMakeLists.txt
+++ b/test/integration/plugin/CMakeLists.txt
@@ -32,6 +32,17 @@ foreach( BACKEND ${PLUGIN_BACKENDS} )
                                PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/tests)
 endforeach()
 
+foreach( BACKEND ${PLUGIN_BACKENDS} )
+  configure_file( test-plugin-launch.cpp.in
+                  test-plugin-launch-${BACKEND}.cpp )
+  raja_add_test( NAME test-plugin-launch-${BACKEND}
+                 SOURCES ${CMAKE_CURRENT_BINARY_DIR}/test-plugin-launch-${BACKEND}.cpp
+                         plugin_to_test.cpp )
+
+  target_include_directories(test-plugin-launch-${BACKEND}.exe
+                               PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/tests)
+endforeach()
+
 set(DISPATCHERS Direct IndirectFunction IndirectVirtual)
 
 foreach( BACKEND ${PLUGIN_BACKENDS} )
diff --git a/test/integration/plugin/test-plugin-launch.cpp.in b/test/integration/plugin/test-plugin-launch.cpp.in
new file mode 100644
index 0000000000..7df20f0ba4
--- /dev/null
+++ b/test/integration/plugin/test-plugin-launch.cpp.in
@@ -0,0 +1,38 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2016-22, Lawrence Livermore National Security, LLC
+// and RAJA project contributors. See the RAJA/LICENSE file for details.
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+//
+// test/include headers
+//
+#include "RAJA_test-base.hpp"
+#include "RAJA_test-camp.hpp"
+#include "RAJA_test-platform.hpp"
+
+#include "RAJA_test-plugin-launchpol.hpp"
+
+//
+// Header for tests in ./tests directory
+//
+// Note: CMake adds ./tests as an include dir for these tests.
+//
+#include "test-plugin-launch.hpp"
+
+
+//
+// Cartesian product of types used in parameterized tests
+//
+using @BACKEND@PluginLaunchTypes =
+  Test< camp::cartesian_product<@BACKEND@PluginLaunchExecPols,
+                                @BACKEND@ResourceList,
+                                @BACKEND@PlatformList > >::Types;
+
+//
+// Instantiate parameterized test
+//
+INSTANTIATE_TYPED_TEST_SUITE_P(@BACKEND@,
+                               PluginLaunchTest,
+                               @BACKEND@PluginLaunchTypes);
diff --git a/test/integration/plugin/tests/test-plugin-launch.hpp b/test/integration/plugin/tests/test-plugin-launch.hpp
new file mode 100644
index 0000000000..68f3dfbbaf
--- /dev/null
+++ b/test/integration/plugin/tests/test-plugin-launch.hpp
@@ -0,0 +1,85 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2016-22, Lawrence Livermore National Security, LLC
+// and RAJA project contributors. See the RAJA/LICENSE file for details.
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+///
+/// Header file containing basic integration tests for plugins with launch.
+///
+
+#ifndef __TEST_PLUGIN_LAUNCH_HPP__
+#define __TEST_PLUGIN_LAUNCH_HPP__
+
+#include "test-plugin.hpp"
+
+
+// Check that the plugin is called with the right Platform.
+// Check that the plugin is called the correct number of times,
+// once before and after each launch capture for the capture counter,
+// once before and after each launch invocation for the launch counter.
+
+// test with basic launch
+template <typename LaunchPolicy,
+          typename WORKING_RES,
+          RAJA::Platform PLATFORM>
+void PluginLaunchTestImpl()
+{
+  SetupPluginVars spv(WORKING_RES::get_default());
+
+  CounterData* data = plugin_test_resource->allocate<CounterData>(10);
+
+  for (int i = 0; i < 10; i++) {
+
+    PluginTestCallable p_callable{data};
+
+    RAJA::launch<LaunchPolicy>
+      (RAJA::LaunchParams(RAJA::Teams(1), RAJA::Threads(1)),
+      [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx)
+      {
+        p_callable(i);
+      });
+
+    CounterData loop_data;
+    plugin_test_resource->memcpy(&loop_data, &data[i], sizeof(CounterData));
+    ASSERT_EQ(loop_data.capture_platform_active, PLATFORM);
+    ASSERT_EQ(loop_data.capture_counter_pre,     i+1);
+    ASSERT_EQ(loop_data.capture_counter_post,    i);
+    ASSERT_EQ(loop_data.launch_platform_active, PLATFORM);
+    ASSERT_EQ(loop_data.launch_counter_pre,     i+1);
+    ASSERT_EQ(loop_data.launch_counter_post,    i);
+  }
+
+  CounterData plugin_data;
+  plugin_test_resource->memcpy(&plugin_data, plugin_test_data, sizeof(CounterData));
+  ASSERT_EQ(plugin_data.capture_platform_active, RAJA::Platform::undefined);
+  ASSERT_EQ(plugin_data.capture_counter_pre,     10);
+  ASSERT_EQ(plugin_data.capture_counter_post,    10);
+  ASSERT_EQ(plugin_data.launch_platform_active, RAJA::Platform::undefined);
+  ASSERT_EQ(plugin_data.launch_counter_pre,     10);
+  ASSERT_EQ(plugin_data.launch_counter_post,    10);
+
+  plugin_test_resource->deallocate(data);
+}
+
+
+TYPED_TEST_SUITE_P(PluginLaunchTest);
+template <typename T>
+class PluginLaunchTest : public ::testing::Test
+{
+};
+
+TYPED_TEST_P(PluginLaunchTest, PluginLaunch)
+{
+  using LaunchPolicy = typename camp::at<TypeParam, camp::num<0>>::type;
+  using ResType = typename camp::at<TypeParam, camp::num<1>>::type;
+  using PlatformHolder = typename camp::at<TypeParam, camp::num<2>>::type;
+
+  PluginLaunchTestImpl<LaunchPolicy, ResType, PlatformHolder::platform>( );
+}
+
+REGISTER_TYPED_TEST_SUITE_P(PluginLaunchTest,
+                            PluginLaunch);
+
+#endif  //__TEST_PLUGIN_LAUNCH_HPP__

From 9623c68717d112a52a8140fc53cfd4b1ae4668d1 Mon Sep 17 00:00:00 2001
From: Arturo Vargas <vargas45@llnl.gov>
Date: Tue, 15 Nov 2022 15:12:19 -0800
Subject: [PATCH 53/94] move plugin work to launch_core

---
 include/RAJA/pattern/launch/launch_core.hpp | 60 ++++++++++++++--
 include/RAJA/policy/cuda/launch.hpp         | 80 +++------------------
 include/RAJA/policy/loop/launch.hpp         | 37 ++--------
 3 files changed, 72 insertions(+), 105 deletions(-)

diff --git a/include/RAJA/pattern/launch/launch_core.hpp b/include/RAJA/pattern/launch/launch_core.hpp
index 12d6f16f6f..455ae733f8 100644
--- a/include/RAJA/pattern/launch/launch_core.hpp
+++ b/include/RAJA/pattern/launch/launch_core.hpp
@@ -227,8 +227,20 @@ void launch(LaunchParams const &params, const char *kernel_name, BODY const &bod
 {
   //Take the first policy as we assume the second policy is not user defined.
   //We rely on the user to pair launch and loop policies correctly.
+  util::PluginContext context{util::make_context<typename LAUNCH_POLICY::host_policy_t>()};
+  util::callPreCapturePlugins(context);
+
+  using RAJA::util::trigger_updates_before;
+  auto p_body = trigger_updates_before(body);
+
+  util::callPostCapturePlugins(context);
+
+  util::callPreLaunchPlugins(context);
+
   using launch_t = LaunchExecute<typename LAUNCH_POLICY::host_policy_t>;
-  launch_t::exec(params, kernel_name, body);
+  launch_t::exec(params, kernel_name, p_body);
+
+  util::callPostLaunchPlugins(context);
 }
 
 
@@ -242,22 +254,41 @@ void launch(ExecPlace place, LaunchParams const &params, BODY const &body)
 template <typename POLICY_LIST, typename BODY>
 void launch(ExecPlace place, const LaunchParams &params, const char *kernel_name, BODY const &body)
 {
+
+  //
+  //Configure plugins
+  //
+  util::PluginContext context{place == ExecPlace::HOST ?
+      util::make_context<typename POLICY_LIST::host_policy_t>()
+      : util::make_context<typename POLICY_LIST::device_policy_t>()};
+
+  util::callPreCapturePlugins(context);
+
+  using RAJA::util::trigger_updates_before;
+  auto p_body = trigger_updates_before(body);
+
+  util::callPostCapturePlugins(context);
+
+  util::callPreLaunchPlugins(context);
+
   switch (place) {
     case ExecPlace::HOST: {
       using launch_t = LaunchExecute<typename POLICY_LIST::host_policy_t>;
-      launch_t::exec(params, kernel_name, body);
+      launch_t::exec(params, kernel_name, p_body);
       break;
     }
 #ifdef RAJA_DEVICE_ACTIVE
   case ExecPlace::DEVICE: {
       using launch_t = LaunchExecute<typename POLICY_LIST::device_policy_t>;
-      launch_t::exec(params, kernel_name, body);
+      launch_t::exec(params, kernel_name, p_body);
       break;
     }
 #endif
     default:
       RAJA_ABORT_OR_THROW("Unknown launch place or device is not enabled");
   }
+
+  util::callPostLaunchPlugins(context);
 }
 
 // Helper function to retrieve a resource based on the run-time policy - if a device is active
@@ -297,21 +328,40 @@ launch(RAJA::resources::Resource res, LaunchParams const &params, const char *ke
     place = RAJA::ExecPlace::DEVICE;
   }
 
+  //
+  //Configure plugins
+  //
+  util::PluginContext context{place == ExecPlace::HOST ?
+      util::make_context<typename POLICY_LIST::host_policy_t>()
+      : util::make_context<typename POLICY_LIST::device_policy_t>()};
+
+  util::callPreCapturePlugins(context);
+
+  using RAJA::util::trigger_updates_before;
+  auto p_body = trigger_updates_before(body);
+
+  util::callPostCapturePlugins(context);
+
+  util::callPreLaunchPlugins(context);
+
   switch (place) {
     case ExecPlace::HOST: {
       using launch_t = LaunchExecute<typename POLICY_LIST::host_policy_t>;
-      return launch_t::exec(res, params, kernel_name, body); break;
+      return launch_t::exec(res, params, kernel_name, p_body); break;
     }
 #ifdef RAJA_DEVICE_ACTIVE
     case ExecPlace::DEVICE: {
       using launch_t = LaunchExecute<typename POLICY_LIST::device_policy_t>;
-      return launch_t::exec(res, params, kernel_name, body); break;
+      return launch_t::exec(res, params, kernel_name, p_body); break;
     }
 #endif
     default: {
       RAJA_ABORT_OR_THROW("Unknown launch place or device is not enabled");
     }
   }
+
+  util::callPostLaunchPlugins(context);
+
   //Should not get here;
   return resources::EventProxy<resources::Resource>(res);
 }
diff --git a/include/RAJA/policy/cuda/launch.hpp b/include/RAJA/policy/cuda/launch.hpp
index 0d84f01754..9221925176 100644
--- a/include/RAJA/policy/cuda/launch.hpp
+++ b/include/RAJA/policy/cuda/launch.hpp
@@ -81,26 +81,11 @@ struct LaunchExecute<RAJA::cuda_launch_t<async, 1>> {
         //
         BODY body = RAJA::cuda::make_launch_body(gridSize, blockSize, params.shared_mem_size, cuda_res, std::forward<BODY_IN>(body_in));
 
-        //
-        // Configure plugins
-        //
-        util::PluginContext context{util::make_context<RAJA::cuda_launch_t<async,1>>()};
-        util::callPreCapturePlugins(context);
-
-        using RAJA::util::trigger_updates_before;
-        auto p_body = trigger_updates_before(body);
-
-        util::callPostCapturePlugins(context);
-
-        util::callPreLaunchPlugins(context);
-
         //
         // Launch the kernel
         //
-        void *args[] = {(void*)&p_body};
+        void *args[] = {(void*)&body};
         RAJA::cuda::launch((const void*)func, gridSize, blockSize, args, params.shared_mem_size, cuda_res, async, kernel_name);
-
-        util::callPostLaunchPlugins(context);
       }
 
       RAJA_FT_END;
@@ -146,26 +131,13 @@ struct LaunchExecute<RAJA::cuda_launch_t<async, 1>> {
         BODY body = RAJA::cuda::make_launch_body(
             gridSize, blockSize, params.shared_mem_size, cuda_res, std::forward<BODY_IN>(body_in));
 
-        //
-        // Configure plugins
-        //
-        util::PluginContext context{util::make_context<RAJA::cuda_launch_t<async,1>>()};
-        util::callPreCapturePlugins(context);
-
-        using RAJA::util::trigger_updates_before;
-        auto p_body = trigger_updates_before(body);
-
-        util::callPostCapturePlugins(context);
-
-        util::callPreLaunchPlugins(context);
-
         //
         // Launch the kernel
         //
-        void *args[] = {(void*)&p_body};
-        RAJA::cuda::launch((const void*)func, gridSize, blockSize, args, params.shared_mem_size, cuda_res, async, kernel_name);
-
-        util::callPostLaunchPlugins(context);
+        void *args[] = {(void*)&body};
+        {
+          RAJA::cuda::launch((const void*)func, gridSize, blockSize, args, params.shared_mem_size, cuda_res, async, kernel_name);
+        }
       }
 
       RAJA_FT_END;
@@ -234,31 +206,15 @@ struct LaunchExecute<RAJA::policy::cuda::cuda_launch_explicit_t<async, nthreads,
         BODY body = RAJA::cuda::make_launch_body(
             gridSize, blockSize, params.shared_mem_size, cuda_res, std::forward<BODY_IN>(body_in));
 
-        //
-        // Configure plugins
-        //
-        util::PluginContext context{util::make_context<RAJA::policy::cuda::cuda_launch_explicit_t<async, nthreads, BLOCKS_PER_SM>>()};
-        util::callPreCapturePlugins(context);
-
-        using RAJA::util::trigger_updates_before;
-        auto p_body = trigger_updates_before(body);
-
-        util::callPostCapturePlugins(context);
-
-        util::callPreLaunchPlugins(context);
-
         //
         // Launch the kernel
         //
-        void *args[] = {(void*)&p_body};
+        void *args[] = {(void*)&body};
         RAJA::cuda::launch((const void*)func, gridSize, blockSize, args, params.shared_mem_size, cuda_res, async, kernel_name);
-
-        util::callPostLaunchPlugins(context);
       }
 
       RAJA_FT_END;
-
-    }
+      }
   }
 
   template <typename BODY_IN>
@@ -299,27 +255,13 @@ struct LaunchExecute<RAJA::policy::cuda::cuda_launch_explicit_t<async, nthreads,
         BODY body = RAJA::cuda::make_launch_body(
             gridSize, blockSize, params.shared_mem_size, cuda_res, std::forward<BODY_IN>(body_in));
 
-
-        //
-        // Configure plugins
-        //
-        util::PluginContext context{util::make_context<RAJA::policy::cuda::cuda_launch_explicit_t<async, nthreads, BLOCKS_PER_SM>>()};
-        util::callPreCapturePlugins(context);
-
-        using RAJA::util::trigger_updates_before;
-        auto p_body = trigger_updates_before(body);
-
-        util::callPostCapturePlugins(context);
-
-        util::callPreLaunchPlugins(context);
-
         //
         // Launch the kernel
         //
-        void *args[] = {(void*)&p_body};
-        RAJA::cuda::launch((const void*)func, gridSize, blockSize, args, params.shared_mem_size, cuda_res, async, kernel_name);
-
-        util::callPostLaunchPlugins(context);
+        void *args[] = {(void*)&body};
+        {
+          RAJA::cuda::launch((const void*)func, gridSize, blockSize, args, params.shared_mem_size, cuda_res, async, kernel_name);
+        }
       }
 
       RAJA_FT_END;
diff --git a/include/RAJA/policy/loop/launch.hpp b/include/RAJA/policy/loop/launch.hpp
index 06b2d6332a..50d5ff1c0a 100644
--- a/include/RAJA/policy/loop/launch.hpp
+++ b/include/RAJA/policy/loop/launch.hpp
@@ -47,24 +47,10 @@ struct LaunchExecute<RAJA::seq_launch_t> {
 
     ctx.shared_mem_ptr = (char*) malloc(params.shared_mem_size);
 
-    //
-    // Configure plugins
-    //
-    util::PluginContext context{util::make_context<RAJA::seq_launch_t>()};
-    util::callPreCapturePlugins(context);
-
-    using RAJA::util::trigger_updates_before;
-    auto p_body = trigger_updates_before(body);
-
-    util::callPostCapturePlugins(context);
-
-    util::callPreLaunchPlugins(context);
-
-    p_body(ctx);
+    body(ctx);
 
     free(ctx.shared_mem_ptr);
     ctx.shared_mem_ptr = nullptr;
-    util::callPostLaunchPlugins(context);
   }
 
   template <typename BODY>
@@ -73,26 +59,15 @@ struct LaunchExecute<RAJA::seq_launch_t> {
   {
 
     LaunchContext ctx;
-    ctx.shared_mem_ptr = (char*) malloc(params.shared_mem_size);
 
-    //
-    // Configure plugins
-    //
-    util::PluginContext context{util::make_context<RAJA::seq_launch_t>()};
-    util::callPreCapturePlugins(context);
+    char *kernel_local_mem = new char[params.shared_mem_size];
+    ctx.shared_mem_ptr = kernel_local_mem;
 
-    using RAJA::util::trigger_updates_before;
-    auto p_body = trigger_updates_before(body);
+    body(ctx);
 
-    util::callPostCapturePlugins(context);
-
-    util::callPreLaunchPlugins(context);
-
-    p_body(ctx);
-
-    free(ctx.shared_mem_ptr);
+    delete[] kernel_local_mem;
     ctx.shared_mem_ptr = nullptr;
-    util::callPostLaunchPlugins(context);
+
     return resources::EventProxy<resources::Resource>(res);
   }
 

From 954eb6aa1dc50638f5d29a628830d4fa3476c045 Mon Sep 17 00:00:00 2001
From: Arturo Vargas <vargas45@llnl.gov>
Date: Tue, 15 Nov 2022 15:15:06 -0800
Subject: [PATCH 54/94] Empty-Commit


From e6635d00750e20ad07ee1306a5ad8f6a01820533 Mon Sep 17 00:00:00 2001
From: Arturo Vargas <vargas45@llnl.gov>
Date: Tue, 15 Nov 2022 15:16:35 -0800
Subject: [PATCH 55/94] revert hip openmp launch  plugin work

---
 include/RAJA/policy/hip/launch.hpp    | 68 ++-------------------------
 include/RAJA/policy/openmp/launch.hpp | 36 +-------------
 2 files changed, 6 insertions(+), 98 deletions(-)

diff --git a/include/RAJA/policy/hip/launch.hpp b/include/RAJA/policy/hip/launch.hpp
index d8d3164841..edeb6a91b6 100644
--- a/include/RAJA/policy/hip/launch.hpp
+++ b/include/RAJA/policy/hip/launch.hpp
@@ -82,26 +82,11 @@ struct LaunchExecute<RAJA::hip_launch_t<async, 0>> {
         BODY body = RAJA::hip::make_launch_body(
             gridSize, blockSize, params.shared_mem_size, hip_res, std::forward<BODY_IN>(body_in));
 
-        //
-        // Configure plugins
-        //
-        util::PluginContext context{util::make_context<RAJA::hip_launch_t<async,0>>()};
-        util::callPreCapturePlugins(context);
-
-        using RAJA::util::trigger_updates_before;
-        auto p_body = trigger_updates_before(body);
-
-        util::callPostCapturePlugins(context);
-
-        util::callPreLaunchPlugins(context);
-
         //
         // Launch the kernel
         //
-        void *args[] = {(void*)&p_body};
+        void *args[] = {(void*)&body};
         RAJA::hip::launch((const void*)func, gridSize, blockSize, args, params.shared_mem_size, hip_res, async, kernel_name);
-
-        util::callPostLaunchPlugins(context);
       }
 
       RAJA_FT_END;
@@ -145,26 +130,11 @@ struct LaunchExecute<RAJA::hip_launch_t<async, 0>> {
         BODY body = RAJA::hip::make_launch_body(
             gridSize, blockSize, params.shared_mem_size, hip_res, std::forward<BODY_IN>(body_in));
 
-        //
-        // Configure plugins
-        //
-        util::PluginContext context{util::make_context<RAJA::cuda_launch_t<async,1>>()};
-        util::callPreCapturePlugins(context);
-
-        using RAJA::util::trigger_updates_before;
-        auto p_body = trigger_updates_before(body);
-
-        util::callPostCapturePlugins(context);
-
-        util::callPreLaunchPlugins(context);
-
         //
         // Launch the kernel
         //
-        void *args[] = {(void*)&p_body};
+        void *args[] = {(void*)&body};
         RAJA::hip::launch((const void*)func, gridSize, blockSize, args, params.shared_mem_size, hip_res, async, kernel_name);
-
-        util::callPostLaunchPlugins(context);
       }
 
       RAJA_FT_END;
@@ -230,26 +200,11 @@ struct LaunchExecute<RAJA::hip_launch_t<async, nthreads>> {
         BODY body = RAJA::hip::make_launch_body(
             gridSize, blockSize, params.shared_mem_size, hip_res, std::forward<BODY_IN>(body_in));
 
-        //
-        // Configure plugins
-        //
-        util::PluginContext context{util::make_context<RAJA::hip_launch_t<async, nthreads>>()};
-        util::callPreCapturePlugins(context);
-
-        using RAJA::util::trigger_updates_before;
-        auto p_body = trigger_updates_before(body);
-
-        util::callPostCapturePlugins(context);
-
-        util::callPreLaunchPlugins(context);
-
         //
         // Launch the kernel
         //
-        void *args[] = {(void*)&p_body};
+        void *args[] = {(void*)&body};
         RAJA::hip::launch((const void*)func, gridSize, blockSize, args, params.shared_mem_size, hip_res, async, kernel_name);
-
-        util::callPostLaunchPlugins(context);
       }
 
       RAJA_FT_END;
@@ -293,26 +248,11 @@ struct LaunchExecute<RAJA::hip_launch_t<async, nthreads>> {
         BODY body = RAJA::hip::make_launch_body(
             gridSize, blockSize, params.shared_mem_size, hip_res, std::forward<BODY_IN>(body_in));
 
-        //
-        // Configure plugins
-        //
-        util::PluginContext context{util::make_context<RAJA::hip_launch_t<async, nthreads>>()};
-        util::callPreCapturePlugins(context);
-
-        using RAJA::util::trigger_updates_before;
-        auto p_body = trigger_updates_before(body);
-
-        util::callPostCapturePlugins(context);
-
-        util::callPreLaunchPlugins(context);
-
         //
         // Launch the kernel
         //
-        void *args[] = {(void*)&p_body};
+        void *args[] = {(void*)&body};
         RAJA::hip::launch((const void*)func, gridSize, blockSize, args, params.shared_mem_size, hip_res, async, kernel_name);
-
-        util::callPostLaunchPlugins(context);
       }
 
       RAJA_FT_END;
diff --git a/include/RAJA/policy/openmp/launch.hpp b/include/RAJA/policy/openmp/launch.hpp
index 191125c33d..7ec9a7c5ed 100644
--- a/include/RAJA/policy/openmp/launch.hpp
+++ b/include/RAJA/policy/openmp/launch.hpp
@@ -32,26 +32,12 @@ struct LaunchExecute<RAJA::omp_launch_t> {
   template <typename BODY>
   static void exec(LaunchParams const &params, const char *, BODY const &body)
   {
-
-    //
-    // Configure plugins
-    //
-    util::PluginContext context{util::make_context<RAJA::omp_launch_t>()};
-    util::callPreCapturePlugins(context);
-
-    using RAJA::util::trigger_updates_before;
-    auto p_body = trigger_updates_before(body);
-
-    util::callPostCapturePlugins(context);
-
-    util::callPreLaunchPlugins(context);
-
     RAJA::region<RAJA::omp_parallel_region>([&]() {
 
         LaunchContext ctx;
 
         using RAJA::internal::thread_privatize;
-        auto loop_body = thread_privatize(p_body);
+        auto loop_body = thread_privatize(body);
 
         ctx.shared_mem_ptr = (char*) malloc(params.shared_mem_size);
 
@@ -60,34 +46,18 @@ struct LaunchExecute<RAJA::omp_launch_t> {
         free(ctx.shared_mem_ptr);
         ctx.shared_mem_ptr = nullptr;
     });
-
-    util::callPostLaunchPlugins(context);
   }
 
   template <typename BODY>
   static resources::EventProxy<resources::Resource>
   exec(RAJA::resources::Resource res, LaunchParams const &params, const char *, BODY const &body)
   {
-
-    //
-    // Configure plugins
-    //
-    util::PluginContext context{util::make_context<RAJA::omp_launch_t>()};
-    util::callPreCapturePlugins(context);
-
-    using RAJA::util::trigger_updates_before;
-    auto p_body = trigger_updates_before(body);
-
-    util::callPostCapturePlugins(context);
-
-    util::callPreLaunchPlugins(context);
-
     RAJA::region<RAJA::omp_parallel_region>([&]() {
 
         LaunchContext ctx;
 
         using RAJA::internal::thread_privatize;
-        auto loop_body = thread_privatize(p_body);
+        auto loop_body = thread_privatize(body);
 
         ctx.shared_mem_ptr = (char*) malloc(params.shared_mem_size);
 
@@ -97,8 +67,6 @@ struct LaunchExecute<RAJA::omp_launch_t> {
         ctx.shared_mem_ptr = nullptr;
     });
 
-    util::callPostLaunchPlugins(context);
-
     return resources::EventProxy<resources::Resource>(res);
   }
 

From 93dc420d9d0636fbcf0cb1966c6118552b8fa732 Mon Sep 17 00:00:00 2001
From: Arturo Vargas <vargas45@llnl.gov>
Date: Tue, 15 Nov 2022 15:50:49 -0800
Subject: [PATCH 56/94] guards for device

---
 include/RAJA/pattern/launch/launch_core.hpp | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/include/RAJA/pattern/launch/launch_core.hpp b/include/RAJA/pattern/launch/launch_core.hpp
index 455ae733f8..2a419293a3 100644
--- a/include/RAJA/pattern/launch/launch_core.hpp
+++ b/include/RAJA/pattern/launch/launch_core.hpp
@@ -258,9 +258,13 @@ void launch(ExecPlace place, const LaunchParams &params, const char *kernel_name
   //
   //Configure plugins
   //
+#ifdef RAJA_DEVICE_ACTIVE
   util::PluginContext context{place == ExecPlace::HOST ?
       util::make_context<typename POLICY_LIST::host_policy_t>()
       : util::make_context<typename POLICY_LIST::device_policy_t>()};
+#else
+  util::PluginContext context{util::make_context<typename POLICY_LIST::host_policy_t>()};
+#endif
 
   util::callPreCapturePlugins(context);
 
@@ -331,9 +335,13 @@ launch(RAJA::resources::Resource res, LaunchParams const &params, const char *ke
   //
   //Configure plugins
   //
+#ifdef RAJA_DEVICE_ACTIVE
   util::PluginContext context{place == ExecPlace::HOST ?
       util::make_context<typename POLICY_LIST::host_policy_t>()
       : util::make_context<typename POLICY_LIST::device_policy_t>()};
+#else
+  util::PluginContext context{util::make_context<typename POLICY_LIST::host_policy_t>()};
+#endif
 
   util::callPreCapturePlugins(context);
 

From 9297a85c6c245febee2988407d5d3dbd3242009e Mon Sep 17 00:00:00 2001
From: mdavis36 <davis291@llnl.gov>
Date: Wed, 16 Nov 2022 11:33:02 -0800
Subject: [PATCH 57/94] Export desul_atomics target in RAJA-config.cmake

---
 share/raja/cmake/RAJA-config.cmake.in | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/share/raja/cmake/RAJA-config.cmake.in b/share/raja/cmake/RAJA-config.cmake.in
index 3baa8b68a2..5763554f6b 100644
--- a/share/raja/cmake/RAJA-config.cmake.in
+++ b/share/raja/cmake/RAJA-config.cmake.in
@@ -53,6 +53,19 @@ if (NOT TARGET camp)
     @PACKAGE_CMAKE_INSTALL_PREFIX@/lib/cmake/camp)
 endif ()
 
+if (NOT TARGET desul_atomics)
+  set(RAJA_DESUL_DIR "@desul_DIR@")
+  if(NOT desul_DIR) 
+    set(desul_DIR ${RAJA_DESUL_DIR}) 
+  endif()
+
+  find_dependency(desul_atomics CONFIG NO_DEFAULT_PATH PATHS 
+    ${desul_DIR}
+    ${desul_DIR}/lib/cmake/desul
+    @PACKAGE_CMAKE_INSTALL_PREFIX@
+    @PACKAGE_CMAKE_INSTALL_PREFIX@/lib/cmake/desul)
+endif ()
+
 set(BLT_TGTS "${CMAKE_CURRENT_LIST_DIR}/bltTargets.cmake")
 if(EXISTS "${BLT_TGTS}")
 include("${BLT_TGTS}")

From 7c277ab6e31c89dab164a0adc80f0bb6a18bb11c Mon Sep 17 00:00:00 2001
From: Arturo Vargas <vargas45@llnl.gov>
Date: Thu, 17 Nov 2022 09:32:56 -0800
Subject: [PATCH 58/94] clean up pass

---
 include/RAJA/pattern/launch/launch_core.hpp   |  6 ++++--
 .../plugin/tests/test-plugin-launch.hpp       | 20 +++++++++++--------
 2 files changed, 16 insertions(+), 10 deletions(-)

diff --git a/include/RAJA/pattern/launch/launch_core.hpp b/include/RAJA/pattern/launch/launch_core.hpp
index 2a419293a3..12b850e370 100644
--- a/include/RAJA/pattern/launch/launch_core.hpp
+++ b/include/RAJA/pattern/launch/launch_core.hpp
@@ -326,7 +326,7 @@ launch(RAJA::resources::Resource res, LaunchParams const &params, const char *ke
 {
 
   ExecPlace place;
-  if(res.get_platform() == camp::resources::v1::Platform::host) {
+  if(res.get_platform() == RAJA::Platform::host) {
     place = RAJA::ExecPlace::HOST;
   }else{
     place = RAJA::ExecPlace::DEVICE;
@@ -370,7 +370,9 @@ launch(RAJA::resources::Resource res, LaunchParams const &params, const char *ke
 
   util::callPostLaunchPlugins(context);
 
-  //Should not get here;
+  RAJA_ABORT_OR_THROW("Unknown launch place");
+
+  //^^ RAJA will abort before getting here
   return resources::EventProxy<resources::Resource>(res);
 }
 
diff --git a/test/integration/plugin/tests/test-plugin-launch.hpp b/test/integration/plugin/tests/test-plugin-launch.hpp
index 68f3dfbbaf..44b3191265 100644
--- a/test/integration/plugin/tests/test-plugin-launch.hpp
+++ b/test/integration/plugin/tests/test-plugin-launch.hpp
@@ -32,14 +32,18 @@ void PluginLaunchTestImpl()
 
   for (int i = 0; i < 10; i++) {
 
-    PluginTestCallable p_callable{data};
-
-    RAJA::launch<LaunchPolicy>
-      (RAJA::LaunchParams(RAJA::Teams(1), RAJA::Threads(1)),
-      [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx)
-      {
-        p_callable(i);
-      });
+    //Keep PluginTestCallable within a scope to ensure
+    //destruction, consistent with other test
+    {
+      PluginTestCallable p_callable{data};
+
+      RAJA::launch<LaunchPolicy>
+        (RAJA::LaunchParams(RAJA::Teams(1), RAJA::Threads(1)),
+         [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx)
+         {
+           p_callable(i);
+         });
+    }
 
     CounterData loop_data;
     plugin_test_resource->memcpy(&loop_data, &data[i], sizeof(CounterData));

From daa0c23bb14b7c1ec69bfb65a2ff80d7f22e8254 Mon Sep 17 00:00:00 2001
From: Arturo Vargas <vargas45@llnl.gov>
Date: Thu, 17 Nov 2022 10:04:08 -0800
Subject: [PATCH 59/94] rename DEVICE -> RAJA_DEVICE_BACKEND

---
 include/RAJA/pattern/params/reducer.hpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/RAJA/pattern/params/reducer.hpp b/include/RAJA/pattern/params/reducer.hpp
index 5c4858a14a..a738e67af0 100644
--- a/include/RAJA/pattern/params/reducer.hpp
+++ b/include/RAJA/pattern/params/reducer.hpp
@@ -5,10 +5,10 @@
 #include "RAJA/util/SoAPtr.hpp"
 
 #if defined(RAJA_CUDA_ACTIVE)
-#define DEVICE cuda
+#define RAJA_DEVICE_BACKEND cuda
 #include "RAJA/policy/cuda/MemUtils_CUDA.hpp"
 #elif defined(RAJA_HIP_ACTIVE)
-#define DEVICE hip
+#define RAJA_DEVICE_BACKEND hip
 #include "RAJA/policy/hip/MemUtils_HIP.hpp"
 #endif
 
@@ -91,7 +91,7 @@ namespace detail
 #if defined(RAJA_CUDA_ACTIVE) || defined(RAJA_HIP_ACTIVE)
     // Device related attributes.
     value_type * devicetarget = nullptr;
-    RAJA::detail::SoAPtr<value_type, RAJA::DEVICE::device_mempool_type> device_mem;
+    RAJA::detail::SoAPtr<value_type, RAJA::RAJA_DEVICE_BACKEND::device_mempool_type> device_mem;
     unsigned int * device_count = nullptr;
 #endif
 

From c01f02cb73dc934438a488793b005b855dd50fdb Mon Sep 17 00:00:00 2001
From: Arturo Vargas <vargas45@llnl.gov>
Date: Thu, 17 Nov 2022 10:58:25 -0800
Subject: [PATCH 60/94] use using instead of macro

---
 include/RAJA/pattern/params/reducer.hpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/include/RAJA/pattern/params/reducer.hpp b/include/RAJA/pattern/params/reducer.hpp
index a738e67af0..5317420d5b 100644
--- a/include/RAJA/pattern/params/reducer.hpp
+++ b/include/RAJA/pattern/params/reducer.hpp
@@ -5,11 +5,11 @@
 #include "RAJA/util/SoAPtr.hpp"
 
 #if defined(RAJA_CUDA_ACTIVE)
-#define RAJA_DEVICE_BACKEND cuda
 #include "RAJA/policy/cuda/MemUtils_CUDA.hpp"
+using device_mem_pool_t = RAJA::cuda::device_mempool_type;
 #elif defined(RAJA_HIP_ACTIVE)
-#define RAJA_DEVICE_BACKEND hip
 #include "RAJA/policy/hip/MemUtils_HIP.hpp"
+using device_mem_pool_t = RAJA::hip::device_mempool_type;
 #endif
 
 namespace RAJA
@@ -91,11 +91,11 @@ namespace detail
 #if defined(RAJA_CUDA_ACTIVE) || defined(RAJA_HIP_ACTIVE)
     // Device related attributes.
     value_type * devicetarget = nullptr;
-    RAJA::detail::SoAPtr<value_type, RAJA::RAJA_DEVICE_BACKEND::device_mempool_type> device_mem;
+    RAJA::detail::SoAPtr<value_type, device_mem_pool_t> device_mem;
     unsigned int * device_count = nullptr;
 #endif
 
-    using ARG_TUP_T = camp::tuple<value_type*>; 
+    using ARG_TUP_T = camp::tuple<value_type*>;
     RAJA_HOST_DEVICE ARG_TUP_T get_lambda_arg_tup() { return camp::make_tuple(&val); }
 
     using ARG_LIST_T = typename ARG_TUP_T::TList;

From c9444e323100d50da9c25863009738528fd9d710 Mon Sep 17 00:00:00 2001
From: mdavis36 <davis291@llnl.gov>
Date: Thu, 17 Nov 2022 15:29:14 -0800
Subject: [PATCH 61/94] Only export desul targets when enabled.

---
 share/raja/cmake/RAJA-config.cmake.in | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/share/raja/cmake/RAJA-config.cmake.in b/share/raja/cmake/RAJA-config.cmake.in
index 5763554f6b..9560b784d7 100644
--- a/share/raja/cmake/RAJA-config.cmake.in
+++ b/share/raja/cmake/RAJA-config.cmake.in
@@ -53,18 +53,20 @@ if (NOT TARGET camp)
     @PACKAGE_CMAKE_INSTALL_PREFIX@/lib/cmake/camp)
 endif ()
 
-if (NOT TARGET desul_atomics)
-  set(RAJA_DESUL_DIR "@desul_DIR@")
-  if(NOT desul_DIR) 
-    set(desul_DIR ${RAJA_DESUL_DIR}) 
-  endif()
+if (@RAJA_ENABLE_DESUL_ATOMICS@)
+  if (NOT TARGET desul_atomics)
+    set(RAJA_DESUL_DIR "@desul_DIR@")
+    if(NOT desul_DIR) 
+      set(desul_DIR ${RAJA_DESUL_DIR}) 
+    endif()
 
-  find_dependency(desul_atomics CONFIG NO_DEFAULT_PATH PATHS 
-    ${desul_DIR}
-    ${desul_DIR}/lib/cmake/desul
-    @PACKAGE_CMAKE_INSTALL_PREFIX@
-    @PACKAGE_CMAKE_INSTALL_PREFIX@/lib/cmake/desul)
-endif ()
+    find_dependency(desul_atomics CONFIG NO_DEFAULT_PATH PATHS 
+      ${desul_DIR}
+      ${desul_DIR}/lib/cmake/desul
+      @PACKAGE_CMAKE_INSTALL_PREFIX@
+      @PACKAGE_CMAKE_INSTALL_PREFIX@/lib/cmake/desul)
+  endif ()
+endif()
 
 set(BLT_TGTS "${CMAKE_CURRENT_LIST_DIR}/bltTargets.cmake")
 if(EXISTS "${BLT_TGTS}")

From 9db00e6bb7748f7f18eb3491916093adddde8b9d Mon Sep 17 00:00:00 2001
From: Rich Hornung <hornung1@llnl.gov>
Date: Fri, 18 Nov 2022 12:08:01 -0800
Subject: [PATCH 62/94] Update user docs for new compiler flags.

---
 docs/sphinx/user_guide/getting_started.rst | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/docs/sphinx/user_guide/getting_started.rst b/docs/sphinx/user_guide/getting_started.rst
index e4483d25c3..1c525d2dfa 100644
--- a/docs/sphinx/user_guide/getting_started.rst
+++ b/docs/sphinx/user_guide/getting_started.rst
@@ -323,16 +323,19 @@ OpenMP
 ^^^^^^^
 
 To use OpenMP target offload GPU execution, additional options may need to be
-passed to the compiler. The variable ``OpenMP_CXX_FLAGS`` is used for this.
-Option syntax follows the CMake *list* pattern. For example, to specify OpenMP 
-target options for NVIDIA GPUs using a clang-based compiler, one may do
-something like::
+passed to the compiler. BLT variables are used for this. Option syntax follows 
+the CMake *list* pattern. For example, to specify OpenMP target options for 
+NVIDIA GPUs using a clang-based compiler, one may do something like::
 
    cmake \
      ... \
-     -DOpenMP_CXX_FLAGS="-fopenmp;-fopenmp-targets=nvptx64-nvidia-cuda" \
+     -DBLT_OPENMP_COMPILE_FLAGS="-fopenmp;-fopenmp-targets=nvptx64-nvidia-cuda" \
+     -DBLT_OPENMP_LINK_FLAGS="-fopenmp;-fopenmp-targets=nvptx64-nvidia-cuda" \
      ...
 
+Compiler flags are passed to other compilers similarly, using flags specific to
+the compiler. Typically, the compile and link flags are the same as shown here.
+
 ----------------------------------------
 RAJA Example Build Configuration Files
 ----------------------------------------

From bdb72fed43b412dd67b3d373b80d85e526eda8b1 Mon Sep 17 00:00:00 2001
From: Arturo Vargas <vargas45@llnl.gov>
Date: Tue, 22 Nov 2022 13:09:33 -0800
Subject: [PATCH 63/94] build blt target before camp

---
 share/raja/cmake/RAJA-config.cmake.in | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/share/raja/cmake/RAJA-config.cmake.in b/share/raja/cmake/RAJA-config.cmake.in
index 3baa8b68a2..af522db812 100644
--- a/share/raja/cmake/RAJA-config.cmake.in
+++ b/share/raja/cmake/RAJA-config.cmake.in
@@ -40,6 +40,14 @@ if (@RAJA_NEEDS_BLT_TPLS@)
   endif ()
 endif()
 
+#Needs to come before camp 
+#Camp performs a check on the BLT target
+set(BLT_TGTS "${CMAKE_CURRENT_LIST_DIR}/bltTargets.cmake")
+if(EXISTS "${BLT_TGTS}")
+include("${BLT_TGTS}")
+endif()
+unset(BLT_TGTS)
+
 if (NOT TARGET camp)
   set(RAJA_CAMP_DIR "@camp_DIR@")
   if(NOT camp_DIR) 
@@ -53,10 +61,5 @@ if (NOT TARGET camp)
     @PACKAGE_CMAKE_INSTALL_PREFIX@/lib/cmake/camp)
 endif ()
 
-set(BLT_TGTS "${CMAKE_CURRENT_LIST_DIR}/bltTargets.cmake")
-if(EXISTS "${BLT_TGTS}")
-include("${BLT_TGTS}")
-endif()
-unset(BLT_TGTS)
 include("${CMAKE_CURRENT_LIST_DIR}/RAJATargets.cmake")
 check_required_components("@PROJECT_NAME@")

From 75845dfe3cebc8ea9a3b1120d2f411a90a205c79 Mon Sep 17 00:00:00 2001
From: Michael Davis <davis291@llnl.gov>
Date: Wed, 23 Nov 2022 07:40:18 -0800
Subject: [PATCH 64/94] build_and_test removing desul condition

---
 scripts/gitlab/build_and_test.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scripts/gitlab/build_and_test.sh b/scripts/gitlab/build_and_test.sh
index 2c0202ae9f..2c7041fefb 100755
--- a/scripts/gitlab/build_and_test.sh
+++ b/scripts/gitlab/build_and_test.sh
@@ -221,9 +221,9 @@ then
         echo "ERROR: failure(s) while running CTest" && exit 1
     fi
 
-    if grep -q -i "ENABLE_HIP.*ON" ${hostconfig_path} || grep -q -i "RAJA_ENABLE_DESUL_ATOMICS.*ON" ${hostconfig_path}
+    if grep -q -i "ENABLE_HIP.*ON" ${hostconfig_path}
     then
-        echo "WARNING: not testing install with HIP or desul"
+        echo "WARNING: not testing install with HIP"
     else
         if [[ ! -d ${install_dir} ]]
         then

From b5c872c6470a49b6d37516834c453b64eb3a3b03 Mon Sep 17 00:00:00 2001
From: Michael Davis <davis291@llnl.gov>
Date: Wed, 23 Nov 2022 12:33:34 -0800
Subject: [PATCH 65/94] Update RAJA-config.cmake.in

desul_atomics -> desul in find_dependency
---
 share/raja/cmake/RAJA-config.cmake.in | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/share/raja/cmake/RAJA-config.cmake.in b/share/raja/cmake/RAJA-config.cmake.in
index 9560b784d7..2f4969104e 100644
--- a/share/raja/cmake/RAJA-config.cmake.in
+++ b/share/raja/cmake/RAJA-config.cmake.in
@@ -60,7 +60,7 @@ if (@RAJA_ENABLE_DESUL_ATOMICS@)
       set(desul_DIR ${RAJA_DESUL_DIR}) 
     endif()
 
-    find_dependency(desul_atomics CONFIG NO_DEFAULT_PATH PATHS 
+    find_dependency(desul CONFIG NO_DEFAULT_PATH PATHS 
       ${desul_DIR}
       ${desul_DIR}/lib/cmake/desul
       @PACKAGE_CMAKE_INSTALL_PREFIX@

From b58bbc7f0cf69acea19dfddff880276c0c67aa64 Mon Sep 17 00:00:00 2001
From: Robert Chen <chen59@llnl.gov>
Date: Mon, 28 Nov 2022 18:20:12 -0800
Subject: [PATCH 66/94] Remove OpenMPTarget tests for WorkGroup Dispatcher.

---
 test/unit/workgroup/CMakeLists.txt | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/test/unit/workgroup/CMakeLists.txt b/test/unit/workgroup/CMakeLists.txt
index 0de12ba336..3cd18b7ef5 100644
--- a/test/unit/workgroup/CMakeLists.txt
+++ b/test/unit/workgroup/CMakeLists.txt
@@ -74,6 +74,10 @@ if(RAJA_TEST_EXHAUSTIVE OR NOT RAJA_COMPILER MATCHES "RAJA_COMPILER_Intel")
 endif()
 
 set(Dispatcher_SUBTESTS Single)
+if(RAJA_ENABLE_TARGET_OPENMP)
+  # WorkGroup dispatcher for OpenMPTarget not implemented yet
+  list(REMOVE_ITEM BACKENDS OpenMPTarget)
+endif()
 buildunitworkgrouptest(Dispatcher    "${Dispatcher_SUBTESTS}"  "${DISPATCHERS}" "${BACKENDS}")
 
 set(WorkStorage_SUBTESTS Constructor Iterator InsertCall Multiple)

From 13f02551610989be34d421e1e283e018965e7ef8 Mon Sep 17 00:00:00 2001
From: Rich Hornung <hornung1@llnl.gov>
Date: Tue, 29 Nov 2022 11:55:32 -0800
Subject: [PATCH 67/94] Update release notes and version number to prepare for
 patch release.

---
 CMakeLists.txt   |  2 +-
 RELEASE_NOTES.md | 27 +++++++++++++++++++++++++++
 docs/conf.py     |  2 +-
 3 files changed, 29 insertions(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index b8ebfb77e1..ff6600cd13 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -16,7 +16,7 @@ include(CMakeDependentOption)
 # Set version number
 set(RAJA_VERSION_MAJOR 2022)
 set(RAJA_VERSION_MINOR 10)
-set(RAJA_VERSION_PATCHLEVEL 2)
+set(RAJA_VERSION_PATCHLEVEL 3)
 
 if (RAJA_LOADED AND (NOT RAJA_LOADED STREQUAL "${RAJA_VERSION_MAJOR}.${RAJA_VERSION_MINOR}.${RAJA_VERSION_PATCHLEVEL}"))
   message(FATAL_ERROR "You are mixing RAJA versions. Loaded is ${RAJA_LOADED}, expected ${RAJA_VERSION_MAJOR}.${RAJA_VERSION_MINOR}.${RAJA_VERSION_PATCHLEVEL}")
diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md
index 0f6b40cdf4..05bc1d6e35 100644
--- a/RELEASE_NOTES.md
+++ b/RELEASE_NOTES.md
@@ -19,6 +19,33 @@ Notable changes include:
 
   * Bug fixes/improvements:
 
+Version 2022.10.3 -- Release date 2022-12-xx
+============================================
+
+This release fixes a few issues that were found after the v2022.10.2 release.
+
+Notable changes include:
+
+  * Update camp submodule to ....
+  * Update BLT submodule to ....
+
+  * Properly export 'roctx' target when CMake variable RAJA_ENABLE_ROCTX is on. 
+  * Add missing template parameter pack argument in RAJA::statement::For
+    execution policy construct used in RAJA::kernel implementation for OpenMP 
+    target back-end.
+  * Change to use compile-time GPU thread block size in RAJA::forall 
+    implementation. This improves performance of GPU kernels, especially 
+    those using the RAJA HIP back-end.
+  * Added RAJA plugin support, including CHAI support, for RAJA::launch.
+  * Renamed 'DEVICE' macro to 'RAJA_DEVICE_BACKEND' to prevent name conflicts
+    with other libraries.
+  * Updated User Guide documentation about CMake variable used to pass 
+    compiler flags for OpenMP target back-end. This changed with CMake
+    minimum required version bump in v2022.10.0.
+  * Adjust ordering of BLT and camp target inclusion in RAJA CMake usage to 
+    fix an issue with projects using external camp vs. RAJA submodule.
+    
+
 
 Version 2022.10.2 -- Release date 2022-11-08
 ============================================
diff --git a/docs/conf.py b/docs/conf.py
index 20c5ef60e0..51e0336b1a 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -88,7 +88,7 @@
 # The short X.Y version.
 version = u'2022.10'
 # The full version, including alpha/beta/rc tags.
-release = u'2022.10.2'
+release = u'2022.10.3'
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.

From 07af608f9671ae0bd2521c42f840dc16c3485809 Mon Sep 17 00:00:00 2001
From: David Beckingsale <davidbeckingsale@gmail.com>
Date: Tue, 29 Nov 2022 15:39:16 -0800
Subject: [PATCH 68/94] Cache PACKAGE_PREFIX_DIR

---
 share/raja/cmake/RAJA-config.cmake.in | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/share/raja/cmake/RAJA-config.cmake.in b/share/raja/cmake/RAJA-config.cmake.in
index 507b882e2f..87864bc17f 100644
--- a/share/raja/cmake/RAJA-config.cmake.in
+++ b/share/raja/cmake/RAJA-config.cmake.in
@@ -32,6 +32,9 @@
 #
 @PACKAGE_INIT@
 
+# caceh the prefix dir (could be overriden by find_dependency)
+set(RAJA_PACKAGE_PREFIX_DIR ${PACKAGE_PREFIX_DIR})
+
 include(CMakeFindDependencyMacro)
 
 if (@RAJA_NEEDS_BLT_TPLS@)
@@ -57,8 +60,8 @@ if (NOT TARGET camp)
   find_dependency(camp CONFIG NO_DEFAULT_PATH PATHS 
     ${camp_DIR}
     ${camp_DIR}/lib/cmake/camp
-    @PACKAGE_CMAKE_INSTALL_PREFIX@
-    @PACKAGE_CMAKE_INSTALL_PREFIX@/lib/cmake/camp)
+    ${RAJA_PACKAGE_PREFIX_DIR}
+    ${RAJA_PACKAGE_PREFIX_DIR}/lib/cmake/camp)
 endif ()
 
 if (@RAJA_ENABLE_DESUL_ATOMICS@)
@@ -71,8 +74,8 @@ if (@RAJA_ENABLE_DESUL_ATOMICS@)
     find_dependency(desul CONFIG NO_DEFAULT_PATH PATHS 
       ${desul_DIR}
       ${desul_DIR}/lib/cmake/desul
-      @PACKAGE_CMAKE_INSTALL_PREFIX@
-      @PACKAGE_CMAKE_INSTALL_PREFIX@/lib/cmake/desul)
+      ${RAJA_PACKAGE_PREFIX_DIR}
+      ${RAJA_PACKAGE_PREFIX_DIR}/lib/cmake/desul)
   endif ()
 endif()
 

From 1ab2134a4dc4bac1d45c6ada9f577c2bdb19f82d Mon Sep 17 00:00:00 2001
From: David Beckingsale <davidbeckingsale@gmail.com>
Date: Tue, 29 Nov 2022 15:46:35 -0800
Subject: [PATCH 69/94] Update share/raja/cmake/RAJA-config.cmake.in

Co-authored-by: Rich Hornung <hornung1@llnl.gov>
---
 share/raja/cmake/RAJA-config.cmake.in | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/share/raja/cmake/RAJA-config.cmake.in b/share/raja/cmake/RAJA-config.cmake.in
index 87864bc17f..629e404b56 100644
--- a/share/raja/cmake/RAJA-config.cmake.in
+++ b/share/raja/cmake/RAJA-config.cmake.in
@@ -32,7 +32,7 @@
 #
 @PACKAGE_INIT@
 
-# caceh the prefix dir (could be overriden by find_dependency)
+# cache the prefix dir (could be overriden by find_dependency)
 set(RAJA_PACKAGE_PREFIX_DIR ${PACKAGE_PREFIX_DIR})
 
 include(CMakeFindDependencyMacro)

From 889f0c131cbad9eae6b8c74fecea4cfaca6e6ee4 Mon Sep 17 00:00:00 2001
From: Jason Burmark <burmark1@llnl.gov>
Date: Wed, 30 Nov 2022 09:02:15 -0800
Subject: [PATCH 70/94] Use craycc instead of loading module

---
 scripts/lc-builds/toss4_cce_hip.sh | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/scripts/lc-builds/toss4_cce_hip.sh b/scripts/lc-builds/toss4_cce_hip.sh
index 187e797a5f..503ec252c5 100755
--- a/scripts/lc-builds/toss4_cce_hip.sh
+++ b/scripts/lc-builds/toss4_cce_hip.sh
@@ -41,12 +41,10 @@ mkdir build_${BUILD_SUFFIX} && cd build_${BUILD_SUFFIX}
 
 module load cmake/3.24.2
 
-module load cce/${COMP_VER}
-
 cmake \
   -DCMAKE_BUILD_TYPE=Release \
-  -DCMAKE_C_COMPILER=$(which cc) \
-  -DCMAKE_CXX_COMPILER=$(which CC) \
+  -DCMAKE_C_COMPILER="/usr/tce/packages/cce-tce/cce-${COMP_VER}/bin/craycc" \
+  -DCMAKE_CXX_COMPILER="/usr/tce/packages/cce-tce/cce-${COMP_VER}/bin/crayCC" \
   -DHIP_PATH=/opt/rocm-${HIP_VER}/hip \
   -DCMAKE_HIP_ARCHITECTURES=${HIP_ARCH} \
   -DGPU_TARGETS=${HIP_ARCH} \
@@ -69,7 +67,7 @@ echo "  Please note that you have to have a consistent build environment"
 echo "  when you make RAJA as cmake may reconfigure; load the appropriate"
 echo "  cce module (${COMP_VER}) when building."
 echo
-echo "    module load cce/${COMP_VER}"
+echo "    module load cce-tce/${COMP_VER}"
 echo "    srun -n1 make"
 echo
 echo "***********************************************************************"

From fb6e0c45f66161cde8b69fc1fefc785cc7470f92 Mon Sep 17 00:00:00 2001
From: Jason Burmark <burmark1@llnl.gov>
Date: Wed, 30 Nov 2022 09:06:27 -0800
Subject: [PATCH 71/94] Add note about fp64 atomics with gfx90a

---
 scripts/lc-builds/toss4_amdclang.sh      | 3 +++
 scripts/lc-builds/toss4_amdclang_asan.sh | 3 +++
 scripts/lc-builds/toss4_cce_hip.sh       | 3 +++
 3 files changed, 9 insertions(+)

diff --git a/scripts/lc-builds/toss4_amdclang.sh b/scripts/lc-builds/toss4_amdclang.sh
index 615060490e..4aaa36b207 100755
--- a/scripts/lc-builds/toss4_amdclang.sh
+++ b/scripts/lc-builds/toss4_amdclang.sh
@@ -43,6 +43,9 @@ echo "Creating build directory build_${BUILD_SUFFIX} and generating configuratio
 echo "Configuration extra arguments:"
 echo "   $@"
 echo
+echo "To use fp64 HW atomics you must configure with these options when using gfx90a and hip >= 5.2"
+echo "   -DCMAKE_CXX_FLAGS=\"-munsafe-fp-atomics\""
+echo
 
 rm -rf build_${BUILD_SUFFIX} >/dev/null
 mkdir build_${BUILD_SUFFIX} && cd build_${BUILD_SUFFIX}
diff --git a/scripts/lc-builds/toss4_amdclang_asan.sh b/scripts/lc-builds/toss4_amdclang_asan.sh
index a886d1c0b6..02925093ff 100755
--- a/scripts/lc-builds/toss4_amdclang_asan.sh
+++ b/scripts/lc-builds/toss4_amdclang_asan.sh
@@ -43,6 +43,9 @@ echo "Creating build directory ${BUILD_SUFFIX} and generating configuration in i
 echo "Configuration extra arguments:"
 echo "   $@"
 echo
+echo "To use fp64 HW atomics you must configure with these options when using gfx90a and hip >= 5.2"
+echo "   -DCMAKE_CXX_FLAGS=\"-munsafe-fp-atomics\""
+echo
 
 rm -rf build_${BUILD_SUFFIX} >/dev/null
 mkdir build_${BUILD_SUFFIX} && cd build_${BUILD_SUFFIX}
diff --git a/scripts/lc-builds/toss4_cce_hip.sh b/scripts/lc-builds/toss4_cce_hip.sh
index 503ec252c5..fc325e0948 100755
--- a/scripts/lc-builds/toss4_cce_hip.sh
+++ b/scripts/lc-builds/toss4_cce_hip.sh
@@ -34,6 +34,9 @@ echo "Creating build directory build_${BUILD_SUFFIX} and generating configuratio
 echo "Configuration extra arguments:"
 echo "   $@"
 echo
+echo "To use fp64 HW atomics you must configure with these options when using gfx90a and hip >= 5.2"
+echo "   -DCMAKE_CXX_FLAGS=\"-munsafe-fp-atomics\""
+echo
 
 rm -rf build_${BUILD_SUFFIX} >/dev/null
 mkdir build_${BUILD_SUFFIX} && cd build_${BUILD_SUFFIX}

From 9d593e3e28f3f15467ac0a3895b66490628438da Mon Sep 17 00:00:00 2001
From: David Beckingsale <davidbeckingsale@gmail.com>
Date: Wed, 30 Nov 2022 09:44:33 -0800
Subject: [PATCH 72/94] Add `debug-find` to cmake invocation

---
 scripts/gitlab/build_and_test.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/gitlab/build_and_test.sh b/scripts/gitlab/build_and_test.sh
index 2c7041fefb..0d6fff8ef2 100755
--- a/scripts/gitlab/build_and_test.sh
+++ b/scripts/gitlab/build_and_test.sh
@@ -232,7 +232,7 @@ then
 
         cd ${install_dir}/examples/RAJA/using-with-cmake
         mkdir build && cd build
-        if ! $cmake_exe -C ../host-config.cmake ..; then
+        if ! $cmake_exe --debug-find -C ../host-config.cmake ..; then
         echo "ERROR: running $cmake_exe for using-with-cmake test" && exit 1
         fi
 

From f8b3bfa5b60366a56ac895d169c64c3ec921de63 Mon Sep 17 00:00:00 2001
From: Rich Hornung <hornung1@llnl.gov>
Date: Wed, 30 Nov 2022 10:23:11 -0800
Subject: [PATCH 73/94] Bump cmake min version req for install test to match
 raja build.

---
 test/install/using-with-cmake/CMakeLists.txt | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/test/install/using-with-cmake/CMakeLists.txt b/test/install/using-with-cmake/CMakeLists.txt
index 32f8baa1da..27b397bbaf 100644
--- a/test/install/using-with-cmake/CMakeLists.txt
+++ b/test/install/using-with-cmake/CMakeLists.txt
@@ -5,7 +5,11 @@
 # SPDX-License-Identifier: (BSD-3-Clause)
 ###############################################################################
 
-cmake_minimum_required(VERSION 3.14)
+if (ENABLE_HIP)
+  cmake_minimum_required(VERSION 3.23)
+else()
+  cmake_minimum_required(VERSION 3.20)
+endif()
 
  project(using_with_cmake)
 
@@ -19,4 +23,4 @@ cmake_minimum_required(VERSION 3.14)
 
  add_executable(using-with-cmake using-with-cmake.cpp)
  target_link_libraries(using-with-cmake RAJA)
- 
\ No newline at end of file
+ 

From f1f45a5a26fb3a9779181ec5ced1c331bf80086a Mon Sep 17 00:00:00 2001
From: David Beckingsale <davidbeckingsale@gmail.com>
Date: Wed, 30 Nov 2022 10:26:46 -0800
Subject: [PATCH 74/94] Search for desul_atomics directly

---
 share/raja/cmake/RAJA-config.cmake.in | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/share/raja/cmake/RAJA-config.cmake.in b/share/raja/cmake/RAJA-config.cmake.in
index 629e404b56..a9271781cd 100644
--- a/share/raja/cmake/RAJA-config.cmake.in
+++ b/share/raja/cmake/RAJA-config.cmake.in
@@ -71,7 +71,7 @@ if (@RAJA_ENABLE_DESUL_ATOMICS@)
       set(desul_DIR ${RAJA_DESUL_DIR}) 
     endif()
 
-    find_dependency(desul CONFIG NO_DEFAULT_PATH PATHS 
+    find_dependency(desul_atomics CONFIG NO_DEFAULT_PATH PATHS 
       ${desul_DIR}
       ${desul_DIR}/lib/cmake/desul
       ${RAJA_PACKAGE_PREFIX_DIR}
@@ -79,11 +79,5 @@ if (@RAJA_ENABLE_DESUL_ATOMICS@)
   endif ()
 endif()
 
-set(BLT_TGTS "${CMAKE_CURRENT_LIST_DIR}/bltTargets.cmake")
-if(EXISTS "${BLT_TGTS}")
-include("${BLT_TGTS}")
-endif()
-unset(BLT_TGTS)
-
 include("${CMAKE_CURRENT_LIST_DIR}/RAJATargets.cmake")
 check_required_components("@PROJECT_NAME@")

From 5810ac5ce4ec5ce2a9e2ae886474cd1d1752c34c Mon Sep 17 00:00:00 2001
From: David Beckingsale <davidbeckingsale@gmail.com>
Date: Wed, 30 Nov 2022 11:13:34 -0800
Subject: [PATCH 75/94] Remove --debug-find flag

---
 scripts/gitlab/build_and_test.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/gitlab/build_and_test.sh b/scripts/gitlab/build_and_test.sh
index 0d6fff8ef2..2c7041fefb 100755
--- a/scripts/gitlab/build_and_test.sh
+++ b/scripts/gitlab/build_and_test.sh
@@ -232,7 +232,7 @@ then
 
         cd ${install_dir}/examples/RAJA/using-with-cmake
         mkdir build && cd build
-        if ! $cmake_exe --debug-find -C ../host-config.cmake ..; then
+        if ! $cmake_exe -C ../host-config.cmake ..; then
         echo "ERROR: running $cmake_exe for using-with-cmake test" && exit 1
         fi
 

From 9943ae45ac490d8ba78b317965b8637e31710e40 Mon Sep 17 00:00:00 2001
From: Rich Hornung <hornung1@llnl.gov>
Date: Wed, 30 Nov 2022 13:30:07 -0800
Subject: [PATCH 76/94] Bump camp version and BLT to match camp

---
 RELEASE_NOTES.md | 4 ++--
 blt              | 2 +-
 tpl/camp         | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md
index 05bc1d6e35..107abdc759 100644
--- a/RELEASE_NOTES.md
+++ b/RELEASE_NOTES.md
@@ -26,8 +26,8 @@ This release fixes a few issues that were found after the v2022.10.2 release.
 
 Notable changes include:
 
-  * Update camp submodule to ....
-  * Update BLT submodule to ....
+  * Update camp submodule to v2022.10.1
+  * Update BLT submodule to commit 8c229991 (includes fixes for crayftn + hip)
 
   * Properly export 'roctx' target when CMake variable RAJA_ENABLE_ROCTX is on. 
   * Add missing template parameter pack argument in RAJA::statement::For
diff --git a/blt b/blt
index e35f490a8a..8c229991e6 160000
--- a/blt
+++ b/blt
@@ -1 +1 @@
-Subproject commit e35f490a8a8b1689e99b5f4308b5251f97eb36cf
+Subproject commit 8c229991e65e7a9603c621b47cb3ba158bb7468c
diff --git a/tpl/camp b/tpl/camp
index 3a7486edb8..a1c74aade4 160000
--- a/tpl/camp
+++ b/tpl/camp
@@ -1 +1 @@
-Subproject commit 3a7486edb8b1c50ce36ecace56384d32a1009e4f
+Subproject commit a1c74aade443c6332e953e1ca5ad5e9e5b5baf21

From f0f75fb3dae41ff0d31ec262bb6ac040b547ae95 Mon Sep 17 00:00:00 2001
From: Rich Hornung <hornung1@llnl.gov>
Date: Wed, 30 Nov 2022 13:55:09 -0800
Subject: [PATCH 77/94] Squash compiler warnings

---
 .../tests/test-dynamic-forall-resource-RangeSegment.hpp         | 2 ++
 test/integration/plugin/tests/test-plugin-launch.hpp            | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/test/functional/dynamic_forall/resource-segment/tests/test-dynamic-forall-resource-RangeSegment.hpp b/test/functional/dynamic_forall/resource-segment/tests/test-dynamic-forall-resource-RangeSegment.hpp
index 0b09079093..fccdf6880b 100644
--- a/test/functional/dynamic_forall/resource-segment/tests/test-dynamic-forall-resource-RangeSegment.hpp
+++ b/test/functional/dynamic_forall/resource-segment/tests/test-dynamic-forall-resource-RangeSegment.hpp
@@ -66,7 +66,9 @@ TYPED_TEST_P(DynamicForallResourceRangeSegmentTest, RangeSegmentForallResource)
   using POLICY_LIST = typename camp::at<TypeParam, camp::num<2>>::type;
 
 
+#if defined(RAJA_DEVICE_ACTIVE)
   constexpr int N = camp::size<POLICY_LIST>::value;
+#endif
 
   //If N == 2 host, no openmp is available
   //If N == 3 host, openmp is available
diff --git a/test/integration/plugin/tests/test-plugin-launch.hpp b/test/integration/plugin/tests/test-plugin-launch.hpp
index 44b3191265..6aa1744b6c 100644
--- a/test/integration/plugin/tests/test-plugin-launch.hpp
+++ b/test/integration/plugin/tests/test-plugin-launch.hpp
@@ -39,7 +39,7 @@ void PluginLaunchTestImpl()
 
       RAJA::launch<LaunchPolicy>
         (RAJA::LaunchParams(RAJA::Teams(1), RAJA::Threads(1)),
-         [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx)
+         [=] RAJA_HOST_DEVICE(RAJA::LaunchContext RAJA_UNUSED_ARG(ctx))
          {
            p_callable(i);
          });

From 172eb7f0988265700dfd4f1aff16e17b36fcbf8a Mon Sep 17 00:00:00 2001
From: Rich Hornung <hornung1@llnl.gov>
Date: Wed, 30 Nov 2022 15:48:19 -0800
Subject: [PATCH 78/94] Finalize release notes for patch release.

---
 RELEASE_NOTES.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md
index 107abdc759..71ca9d8b5e 100644
--- a/RELEASE_NOTES.md
+++ b/RELEASE_NOTES.md
@@ -30,6 +30,7 @@ Notable changes include:
   * Update BLT submodule to commit 8c229991 (includes fixes for crayftn + hip)
 
   * Properly export 'roctx' target when CMake variable RAJA_ENABLE_ROCTX is on. 
+  * Fix CMake logic for exporting desul targets when desul atomics are enabled.
   * Add missing template parameter pack argument in RAJA::statement::For
     execution policy construct used in RAJA::kernel implementation for OpenMP 
     target back-end.

From f6c366c131e2d311c659e4cf19b0fa4d6f0bfdfa Mon Sep 17 00:00:00 2001
From: Rich Hornung <hornung1@llnl.gov>
Date: Wed, 30 Nov 2022 15:50:52 -0800
Subject: [PATCH 79/94] Update release date

---
 RELEASE_NOTES.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md
index 71ca9d8b5e..60c58ac4d7 100644
--- a/RELEASE_NOTES.md
+++ b/RELEASE_NOTES.md
@@ -19,7 +19,7 @@ Notable changes include:
 
   * Bug fixes/improvements:
 
-Version 2022.10.3 -- Release date 2022-12-xx
+Version 2022.10.3 -- Release date 2022-12-01
 ============================================
 
 This release fixes a few issues that were found after the v2022.10.2 release.

From 837aae0668813c76b869b03bf3a86abb29fbb1a4 Mon Sep 17 00:00:00 2001
From: Arturo Vargas <vargas45@llnl.gov>
Date: Thu, 1 Dec 2022 09:06:59 -0800
Subject: [PATCH 80/94] fix post launch call

---
 include/RAJA/pattern/launch/launch_core.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/RAJA/pattern/launch/launch_core.hpp b/include/RAJA/pattern/launch/launch_core.hpp
index 12b850e370..3c40fce0d5 100644
--- a/include/RAJA/pattern/launch/launch_core.hpp
+++ b/include/RAJA/pattern/launch/launch_core.hpp
@@ -355,12 +355,12 @@ launch(RAJA::resources::Resource res, LaunchParams const &params, const char *ke
   switch (place) {
     case ExecPlace::HOST: {
       using launch_t = LaunchExecute<typename POLICY_LIST::host_policy_t>;
-      return launch_t::exec(res, params, kernel_name, p_body); break;
+      launch_t::exec(res, params, kernel_name, p_body); break;
     }
 #ifdef RAJA_DEVICE_ACTIVE
     case ExecPlace::DEVICE: {
       using launch_t = LaunchExecute<typename POLICY_LIST::device_policy_t>;
-      return launch_t::exec(res, params, kernel_name, p_body); break;
+      launch_t::exec(res, params, kernel_name, p_body); break;
     }
 #endif
     default: {

From bf26690fe602a390dca993bb92089ed7af8712b7 Mon Sep 17 00:00:00 2001
From: Rich Hornung <hornung1@llnl.gov>
Date: Thu, 1 Dec 2022 09:23:24 -0800
Subject: [PATCH 81/94] Modifications based on reviewer comments.

---
 RELEASE_NOTES.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md
index 60c58ac4d7..60259de6ae 100644
--- a/RELEASE_NOTES.md
+++ b/RELEASE_NOTES.md
@@ -38,8 +38,8 @@ Notable changes include:
     implementation. This improves performance of GPU kernels, especially 
     those using the RAJA HIP back-end.
   * Added RAJA plugin support, including CHAI support, for RAJA::launch.
-  * Renamed 'DEVICE' macro to 'RAJA_DEVICE_BACKEND' to prevent name conflicts
-    with other libraries.
+  * Replaced 'DEVICE' macro with alias to 'device_mem_pool_t' to prevent name 
+    conflicts with other libraries.
   * Updated User Guide documentation about CMake variable used to pass 
     compiler flags for OpenMP target back-end. This changed with CMake
     minimum required version bump in v2022.10.0.

From c75a011d47a617119668dee1803f4cf2eea3d7bb Mon Sep 17 00:00:00 2001
From: Arturo Vargas <vargas45@llnl.gov>
Date: Thu, 1 Dec 2022 09:31:23 -0800
Subject: [PATCH 82/94] avoid error

---
 include/RAJA/pattern/launch/launch_core.hpp | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/include/RAJA/pattern/launch/launch_core.hpp b/include/RAJA/pattern/launch/launch_core.hpp
index 3c40fce0d5..233350aefc 100644
--- a/include/RAJA/pattern/launch/launch_core.hpp
+++ b/include/RAJA/pattern/launch/launch_core.hpp
@@ -355,12 +355,16 @@ launch(RAJA::resources::Resource res, LaunchParams const &params, const char *ke
   switch (place) {
     case ExecPlace::HOST: {
       using launch_t = LaunchExecute<typename POLICY_LIST::host_policy_t>;
-      launch_t::exec(res, params, kernel_name, p_body); break;
+      resources::EventProxy<resources::Resource> e_proxy = launch_t::exec(res, params, kernel_name, p_body); break;
+      util::callPostLaunchPlugins(context);
+      return e_proxy;
     }
 #ifdef RAJA_DEVICE_ACTIVE
     case ExecPlace::DEVICE: {
       using launch_t = LaunchExecute<typename POLICY_LIST::device_policy_t>;
-      launch_t::exec(res, params, kernel_name, p_body); break;
+      resources::EventProxy<resources::Resource> e_proxy = launch_t::exec(res, params, kernel_name, p_body); break;
+      util::callPostLaunchPlugins(context);
+      return e_proxy;
     }
 #endif
     default: {
@@ -368,8 +372,6 @@ launch(RAJA::resources::Resource res, LaunchParams const &params, const char *ke
     }
   }
 
-  util::callPostLaunchPlugins(context);
-
   RAJA_ABORT_OR_THROW("Unknown launch place");
 
   //^^ RAJA will abort before getting here

From b81e03dd78b44f5a3f97aa8dd167952376e32bd6 Mon Sep 17 00:00:00 2001
From: Arturo Vargas <vargas45@llnl.gov>
Date: Thu, 1 Dec 2022 10:52:19 -0800
Subject: [PATCH 83/94] Update include/RAJA/pattern/launch/launch_core.hpp

---
 include/RAJA/pattern/launch/launch_core.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/RAJA/pattern/launch/launch_core.hpp b/include/RAJA/pattern/launch/launch_core.hpp
index 233350aefc..a1ba1c3e38 100644
--- a/include/RAJA/pattern/launch/launch_core.hpp
+++ b/include/RAJA/pattern/launch/launch_core.hpp
@@ -355,7 +355,7 @@ launch(RAJA::resources::Resource res, LaunchParams const &params, const char *ke
   switch (place) {
     case ExecPlace::HOST: {
       using launch_t = LaunchExecute<typename POLICY_LIST::host_policy_t>;
-      resources::EventProxy<resources::Resource> e_proxy = launch_t::exec(res, params, kernel_name, p_body); break;
+      resources::EventProxy<resources::Resource> e_proxy = launch_t::exec(res, params, kernel_name, p_body);
       util::callPostLaunchPlugins(context);
       return e_proxy;
     }

From 7d29fe9cbcc95722a5b8c3e1600c4de62790bac2 Mon Sep 17 00:00:00 2001
From: Arturo Vargas <vargas45@llnl.gov>
Date: Thu, 1 Dec 2022 10:52:27 -0800
Subject: [PATCH 84/94] Update include/RAJA/pattern/launch/launch_core.hpp

---
 include/RAJA/pattern/launch/launch_core.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/RAJA/pattern/launch/launch_core.hpp b/include/RAJA/pattern/launch/launch_core.hpp
index a1ba1c3e38..b450436923 100644
--- a/include/RAJA/pattern/launch/launch_core.hpp
+++ b/include/RAJA/pattern/launch/launch_core.hpp
@@ -362,7 +362,7 @@ launch(RAJA::resources::Resource res, LaunchParams const &params, const char *ke
 #ifdef RAJA_DEVICE_ACTIVE
     case ExecPlace::DEVICE: {
       using launch_t = LaunchExecute<typename POLICY_LIST::device_policy_t>;
-      resources::EventProxy<resources::Resource> e_proxy = launch_t::exec(res, params, kernel_name, p_body); break;
+      resources::EventProxy<resources::Resource> e_proxy = launch_t::exec(res, params, kernel_name, p_body);
       util::callPostLaunchPlugins(context);
       return e_proxy;
     }

From 66947572fd567166c4a0d7e85a9e5a039f7fdde6 Mon Sep 17 00:00:00 2001
From: Arturo Vargas <vargas45@llnl.gov>
Date: Thu, 1 Dec 2022 12:29:06 -0800
Subject: [PATCH 85/94] add resource + plugin test + reduce launch API
 complexity

---
 include/RAJA/pattern/launch/launch_core.hpp   | 28 +-----
 test/include/RAJA_test-plugin-launchpol.hpp   |  4 +-
 .../RAJA_test-plugin-resource-launchpol.hpp   | 35 ++++++++
 test/integration/plugin/CMakeLists.txt        | 11 +++
 .../plugin/test-plugin-resource-launch.cpp.in | 38 ++++++++
 .../tests/test-plugin-resource-launch.hpp     | 89 +++++++++++++++++++
 6 files changed, 178 insertions(+), 27 deletions(-)
 create mode 100644 test/include/RAJA_test-plugin-resource-launchpol.hpp
 create mode 100644 test/integration/plugin/test-plugin-resource-launch.cpp.in
 create mode 100644 test/integration/plugin/tests/test-plugin-resource-launch.hpp

diff --git a/include/RAJA/pattern/launch/launch_core.hpp b/include/RAJA/pattern/launch/launch_core.hpp
index b450436923..28dca62ff0 100644
--- a/include/RAJA/pattern/launch/launch_core.hpp
+++ b/include/RAJA/pattern/launch/launch_core.hpp
@@ -255,36 +255,15 @@ template <typename POLICY_LIST, typename BODY>
 void launch(ExecPlace place, const LaunchParams &params, const char *kernel_name, BODY const &body)
 {
 
-  //
-  //Configure plugins
-  //
-#ifdef RAJA_DEVICE_ACTIVE
-  util::PluginContext context{place == ExecPlace::HOST ?
-      util::make_context<typename POLICY_LIST::host_policy_t>()
-      : util::make_context<typename POLICY_LIST::device_policy_t>()};
-#else
-  util::PluginContext context{util::make_context<typename POLICY_LIST::host_policy_t>()};
-#endif
-
-  util::callPreCapturePlugins(context);
-
-  using RAJA::util::trigger_updates_before;
-  auto p_body = trigger_updates_before(body);
-
-  util::callPostCapturePlugins(context);
-
-  util::callPreLaunchPlugins(context);
-
+  //Forward to single policy launch API - simplifies testing of plugins
   switch (place) {
     case ExecPlace::HOST: {
-      using launch_t = LaunchExecute<typename POLICY_LIST::host_policy_t>;
-      launch_t::exec(params, kernel_name, p_body);
+      launch<LaunchPolicy<typename POLICY_LIST::host_policy_t>>(params, kernel_name, body);
       break;
     }
 #ifdef RAJA_DEVICE_ACTIVE
   case ExecPlace::DEVICE: {
-      using launch_t = LaunchExecute<typename POLICY_LIST::device_policy_t>;
-      launch_t::exec(params, kernel_name, p_body);
+      launch<LaunchPolicy<typename POLICY_LIST::device_policy_t>>(params, kernel_name, body);
       break;
     }
 #endif
@@ -292,7 +271,6 @@ void launch(ExecPlace place, const LaunchParams &params, const char *kernel_name
       RAJA_ABORT_OR_THROW("Unknown launch place or device is not enabled");
   }
 
-  util::callPostLaunchPlugins(context);
 }
 
 // Helper function to retrieve a resource based on the run-time policy - if a device is active
diff --git a/test/include/RAJA_test-plugin-launchpol.hpp b/test/include/RAJA_test-plugin-launchpol.hpp
index 862f2018dd..b3677144dc 100644
--- a/test/include/RAJA_test-plugin-launchpol.hpp
+++ b/test/include/RAJA_test-plugin-launchpol.hpp
@@ -9,8 +9,8 @@
 // Kernel execution policy lists used throughout plugin tests
 //
 
-#ifndef __RAJA_test_plugin_kernelpol_HPP__
-#define __RAJA_test_plugin_kernelpol_HPP__
+#ifndef __RAJA_test_plugin_launchpol_HPP__
+#define __RAJA_test_plugin_launchpol_HPP__
 
 #include "RAJA/RAJA.hpp"
 
diff --git a/test/include/RAJA_test-plugin-resource-launchpol.hpp b/test/include/RAJA_test-plugin-resource-launchpol.hpp
new file mode 100644
index 0000000000..c2c2dccb9a
--- /dev/null
+++ b/test/include/RAJA_test-plugin-resource-launchpol.hpp
@@ -0,0 +1,35 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2016-22, Lawrence Livermore National Security, LLC
+// and RAJA project contributors. See the RAJA/LICENSE file for details.
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+//
+// Kernel execution policy lists used throughout plugin tests
+//
+
+#ifndef __RAJA_test_plugin_resource_launchpol_HPP__
+#define __RAJA_test_plugin_resource_launchpol_HPP__
+
+#include "RAJA/RAJA.hpp"
+
+#include "camp/list.hpp"
+
+// Sequential execution policy types
+using SequentialPluginResourceLaunchExecPols = camp::list<RAJA::LaunchPolicy<RAJA::seq_launch_t>>;
+
+#if defined(RAJA_ENABLE_OPENMP)
+using OpenMPPluginResourceLaunchExecPols = camp::list<RAJA::LaunchPolicy<RAJA::omp_launch_t>>;
+#endif
+
+#if defined(RAJA_ENABLE_CUDA)
+using CudaPluginResourceLaunchExecPols = camp::list<RAJA::LaunchPolicy<RAJA::seq_launch_t, RAJA::cuda_launch_t<false>>>;
+#endif
+
+#if defined(RAJA_ENABLE_HIP)
+using HipPluginResourceLaunchExecPols = camp::list<RAJA::LaunchPolicy<RAJA::seq_launch_t, RAJA::hip_launch_t<false>>>;
+
+#endif
+
+#endif  // __RAJA_test_plugin_kernelpol_HPP__
diff --git a/test/integration/plugin/CMakeLists.txt b/test/integration/plugin/CMakeLists.txt
index 0d8d56643b..360d63164c 100644
--- a/test/integration/plugin/CMakeLists.txt
+++ b/test/integration/plugin/CMakeLists.txt
@@ -43,6 +43,17 @@ foreach( BACKEND ${PLUGIN_BACKENDS} )
                                PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/tests)
 endforeach()
 
+foreach( BACKEND ${PLUGIN_BACKENDS} )
+  configure_file( test-plugin-resource-launch.cpp.in
+                  test-plugin-resource-launch-${BACKEND}.cpp )
+  raja_add_test( NAME test-plugin-resource-launch-${BACKEND}
+                 SOURCES ${CMAKE_CURRENT_BINARY_DIR}/test-plugin-resource-launch-${BACKEND}.cpp
+                         plugin_to_test.cpp )
+
+  target_include_directories(test-plugin-resource-launch-${BACKEND}.exe
+                               PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/tests)
+endforeach()
+
 set(DISPATCHERS Direct IndirectFunction IndirectVirtual)
 
 foreach( BACKEND ${PLUGIN_BACKENDS} )
diff --git a/test/integration/plugin/test-plugin-resource-launch.cpp.in b/test/integration/plugin/test-plugin-resource-launch.cpp.in
new file mode 100644
index 0000000000..573982ba38
--- /dev/null
+++ b/test/integration/plugin/test-plugin-resource-launch.cpp.in
@@ -0,0 +1,38 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2016-22, Lawrence Livermore National Security, LLC
+// and RAJA project contributors. See the RAJA/LICENSE file for details.
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+//
+// test/include headers
+//
+#include "RAJA_test-base.hpp"
+#include "RAJA_test-camp.hpp"
+#include "RAJA_test-platform.hpp"
+
+#include "RAJA_test-plugin-resource-launchpol.hpp"
+
+//
+// Header for tests in ./tests directory
+//
+// Note: CMake adds ./tests as an include dir for these tests.
+//
+#include "test-plugin-resource-launch.hpp"
+
+
+//
+// Cartesian product of types used in parameterized tests
+//
+using @BACKEND@PluginResourceLaunchTypes =
+  Test< camp::cartesian_product<@BACKEND@PluginResourceLaunchExecPols,
+                                @BACKEND@ResourceList,
+                                @BACKEND@PlatformList > >::Types;
+
+//
+// Instantiate parameterized test
+//
+INSTANTIATE_TYPED_TEST_SUITE_P(@BACKEND@,
+                               PluginResourceLaunchTest,
+                               @BACKEND@PluginResourceLaunchTypes);
diff --git a/test/integration/plugin/tests/test-plugin-resource-launch.hpp b/test/integration/plugin/tests/test-plugin-resource-launch.hpp
new file mode 100644
index 0000000000..8895210f73
--- /dev/null
+++ b/test/integration/plugin/tests/test-plugin-resource-launch.hpp
@@ -0,0 +1,89 @@
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+// Copyright (c) 2016-22, Lawrence Livermore National Security, LLC
+// and RAJA project contributors. See the RAJA/LICENSE file for details.
+//
+// SPDX-License-Identifier: (BSD-3-Clause)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
+
+///
+/// Header file containing basic integration tests for plugins with launch.
+///
+
+#ifndef __TEST_PLUGIN_RESOURCE_LAUNCH_HPP__
+#define __TEST_PLUGIN_RESOURCE_LAUNCH_HPP__
+
+#include "test-plugin.hpp"
+
+
+// Check that the plugin is called with the right Platform.
+// Check that the plugin is called the correct number of times,
+// once before and after each launch capture for the capture counter,
+// once before and after each launch invocation for the launch counter.
+
+// test with basic launch
+template <typename LaunchPolicy,
+          typename WORKING_RES,
+          RAJA::Platform PLATFORM>
+void PluginResourceLaunchTestImpl()
+{
+  SetupPluginVars spv(WORKING_RES::get_default());
+
+  CounterData* data = plugin_test_resource->allocate<CounterData>(10);
+
+  for (int i = 0; i < 10; i++) {
+
+    //Keep PluginTestCallable within a scope to ensure
+    //destruction, consistent with other test
+    {
+      PluginTestCallable p_callable{data};
+
+      RAJA::launch<LaunchPolicy>
+        (WORKING_RES::get_default(), RAJA::LaunchParams(RAJA::Teams(1), RAJA::Threads(1)),
+         [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx)
+         {
+           p_callable(i);
+         });
+    }
+
+    CounterData loop_data;
+    plugin_test_resource->memcpy(&loop_data, &data[i], sizeof(CounterData));
+    ASSERT_EQ(loop_data.capture_platform_active, PLATFORM);
+    ASSERT_EQ(loop_data.capture_counter_pre,     i+1);
+    ASSERT_EQ(loop_data.capture_counter_post,    i);
+    ASSERT_EQ(loop_data.launch_platform_active, PLATFORM);
+    ASSERT_EQ(loop_data.launch_counter_pre,     i+1);
+    ASSERT_EQ(loop_data.launch_counter_post,    i);
+  }
+
+  CounterData plugin_data;
+  plugin_test_resource->memcpy(&plugin_data, plugin_test_data, sizeof(CounterData));
+  ASSERT_EQ(plugin_data.capture_platform_active, RAJA::Platform::undefined);
+  ASSERT_EQ(plugin_data.capture_counter_pre,     10);
+  ASSERT_EQ(plugin_data.capture_counter_post,    10);
+  ASSERT_EQ(plugin_data.launch_platform_active, RAJA::Platform::undefined);
+  ASSERT_EQ(plugin_data.launch_counter_pre,     10);
+  ASSERT_EQ(plugin_data.launch_counter_post,    10);
+
+  plugin_test_resource->deallocate(data);
+}
+
+
+TYPED_TEST_SUITE_P(PluginResourceLaunchTest);
+template <typename T>
+class PluginResourceLaunchTest : public ::testing::Test
+{
+};
+
+TYPED_TEST_P(PluginResourceLaunchTest, PluginResourceLaunch)
+{
+  using LaunchPolicy = typename camp::at<TypeParam, camp::num<0>>::type;
+  using ResType = typename camp::at<TypeParam, camp::num<1>>::type;
+  using PlatformHolder = typename camp::at<TypeParam, camp::num<2>>::type;
+
+  PluginResourceLaunchTestImpl<LaunchPolicy, ResType, PlatformHolder::platform>( );
+}
+
+REGISTER_TYPED_TEST_SUITE_P(PluginResourceLaunchTest,
+                            PluginResourceLaunch);
+
+#endif  //__TEST_PLUGIN_LAUNCH_HPP__

From 2b0aced1e3279f96ae55bf5893058dd1dcbb8610 Mon Sep 17 00:00:00 2001
From: Arturo Vargas <vargas45@llnl.gov>
Date: Thu, 1 Dec 2022 13:15:02 -0800
Subject: [PATCH 86/94] use non-default stream

---
 .../plugin/tests/test-plugin-resource-launch.hpp            | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/test/integration/plugin/tests/test-plugin-resource-launch.hpp b/test/integration/plugin/tests/test-plugin-resource-launch.hpp
index 8895210f73..76e10a7118 100644
--- a/test/integration/plugin/tests/test-plugin-resource-launch.hpp
+++ b/test/integration/plugin/tests/test-plugin-resource-launch.hpp
@@ -26,7 +26,9 @@ template <typename LaunchPolicy,
           RAJA::Platform PLATFORM>
 void PluginResourceLaunchTestImpl()
 {
-  SetupPluginVars spv(WORKING_RES::get_default());
+  WORKING_RES res;
+
+  SetupPluginVars spv(res);
 
   CounterData* data = plugin_test_resource->allocate<CounterData>(10);
 
@@ -38,7 +40,7 @@ void PluginResourceLaunchTestImpl()
       PluginTestCallable p_callable{data};
 
       RAJA::launch<LaunchPolicy>
-        (WORKING_RES::get_default(), RAJA::LaunchParams(RAJA::Teams(1), RAJA::Threads(1)),
+        (res, RAJA::LaunchParams(RAJA::Teams(1), RAJA::Threads(1)),
          [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx)
          {
            p_callable(i);

From 057e90faee2ef90abda03a2c665603852445cfa8 Mon Sep 17 00:00:00 2001
From: Arturo Vargas <vargas45@llnl.gov>
Date: Thu, 1 Dec 2022 14:59:02 -0800
Subject: [PATCH 87/94] move device_mem_pool_t to raja detail namespace

---
 include/RAJA/pattern/params/reducer.hpp | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/include/RAJA/pattern/params/reducer.hpp b/include/RAJA/pattern/params/reducer.hpp
index 5317420d5b..72a4417697 100644
--- a/include/RAJA/pattern/params/reducer.hpp
+++ b/include/RAJA/pattern/params/reducer.hpp
@@ -6,10 +6,8 @@
 
 #if defined(RAJA_CUDA_ACTIVE)
 #include "RAJA/policy/cuda/MemUtils_CUDA.hpp"
-using device_mem_pool_t = RAJA::cuda::device_mempool_type;
 #elif defined(RAJA_HIP_ACTIVE)
 #include "RAJA/policy/hip/MemUtils_HIP.hpp"
-using device_mem_pool_t = RAJA::hip::device_mempool_type;
 #endif
 
 namespace RAJA
@@ -66,6 +64,14 @@ struct limits<RAJA::expt::ValLoc<T>> {
 
 namespace RAJA
 {
+namespace detail
+{
+#if defined(RAJA_CUDA_ACTIVE)
+using device_mem_pool_t = RAJA::cuda::device_mempool_type;
+#elif defined(RAJA_HIP_ACTIVE)
+using device_mem_pool_t = RAJA::hip::device_mempool_type;
+#endif
+} //namespace detail
 
 namespace expt
 {
@@ -91,7 +97,7 @@ namespace detail
 #if defined(RAJA_CUDA_ACTIVE) || defined(RAJA_HIP_ACTIVE)
     // Device related attributes.
     value_type * devicetarget = nullptr;
-    RAJA::detail::SoAPtr<value_type, device_mem_pool_t> device_mem;
+    RAJA::detail::SoAPtr<value_type, RAJA::detail::device_mem_pool_t> device_mem;
     unsigned int * device_count = nullptr;
 #endif
 

From 4597a788bae9fad7b3c597a0611351bbed91e8ab Mon Sep 17 00:00:00 2001
From: Jason Burmark <burmark1@llnl.gov>
Date: Thu, 1 Dec 2022 16:44:22 -0800
Subject: [PATCH 88/94] Move device_mem_pool_t into expt::Device namespace

---
 include/RAJA/pattern/params/reducer.hpp | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/include/RAJA/pattern/params/reducer.hpp b/include/RAJA/pattern/params/reducer.hpp
index 72a4417697..d094a729c1 100644
--- a/include/RAJA/pattern/params/reducer.hpp
+++ b/include/RAJA/pattern/params/reducer.hpp
@@ -64,20 +64,18 @@ struct limits<RAJA::expt::ValLoc<T>> {
 
 namespace RAJA
 {
-namespace detail
-{
-#if defined(RAJA_CUDA_ACTIVE)
-using device_mem_pool_t = RAJA::cuda::device_mempool_type;
-#elif defined(RAJA_HIP_ACTIVE)
-using device_mem_pool_t = RAJA::hip::device_mempool_type;
-#endif
-} //namespace detail
 
 namespace expt
 {
 namespace detail
 {
 
+#if defined(RAJA_CUDA_ACTIVE)
+  using device_mem_pool_t = RAJA::cuda::device_mempool_type;
+#elif defined(RAJA_HIP_ACTIVE)
+  using device_mem_pool_t = RAJA::hip::device_mempool_type;
+#endif
+
   //
   //
   // Basic Reducer
@@ -97,7 +95,7 @@ namespace detail
 #if defined(RAJA_CUDA_ACTIVE) || defined(RAJA_HIP_ACTIVE)
     // Device related attributes.
     value_type * devicetarget = nullptr;
-    RAJA::detail::SoAPtr<value_type, RAJA::detail::device_mem_pool_t> device_mem;
+    RAJA::detail::SoAPtr<value_type, device_mem_pool_t> device_mem;
     unsigned int * device_count = nullptr;
 #endif
 

From 655ab91538848c2ec53bc5c6c002aebb38c7426f Mon Sep 17 00:00:00 2001
From: Alan Dayton <dayton8@llnl.gov>
Date: Fri, 2 Dec 2022 10:16:10 -0800
Subject: [PATCH 89/94] Fix CMake warning for inconsistent naming of rocPRIM in
 find module

---
 cmake/SetupPackages.cmake                                 | 2 +-
 cmake/thirdparty/{FindRocPRIM.cmake => FindROCPRIM.cmake} | 0
 2 files changed, 1 insertion(+), 1 deletion(-)
 rename cmake/thirdparty/{FindRocPRIM.cmake => FindROCPRIM.cmake} (100%)

diff --git a/cmake/SetupPackages.cmake b/cmake/SetupPackages.cmake
index 9e057f3a50..451fc62ae2 100644
--- a/cmake/SetupPackages.cmake
+++ b/cmake/SetupPackages.cmake
@@ -82,7 +82,7 @@ if (RAJA_ENABLE_HIP)
   endif()
 
   if (RAJA_ENABLE_EXTERNAL_ROCPRIM)
-    include(cmake/thirdparty/FindRocPRIM.cmake)
+    include(cmake/thirdparty/FindROCPRIM.cmake)
     if (ROCPRIM_FOUND)
       blt_import_library(
         NAME rocPRIM
diff --git a/cmake/thirdparty/FindRocPRIM.cmake b/cmake/thirdparty/FindROCPRIM.cmake
similarity index 100%
rename from cmake/thirdparty/FindRocPRIM.cmake
rename to cmake/thirdparty/FindROCPRIM.cmake

From 08f0cdd0c8cc6cc6fe23b97d151dc79e25f12ba0 Mon Sep 17 00:00:00 2001
From: Alan Dayton <dayton8@llnl.gov>
Date: Fri, 2 Dec 2022 13:03:14 -0800
Subject: [PATCH 90/94] Attempt to use best practices for rocPRIM find module

---
 cmake/SetupPackages.cmake          |  6 ++--
 cmake/thirdparty/FindROCPRIM.cmake | 29 ----------------
 cmake/thirdparty/FindrocPRIM.cmake | 54 ++++++++++++++++++++++++++++++
 3 files changed, 57 insertions(+), 32 deletions(-)
 delete mode 100644 cmake/thirdparty/FindROCPRIM.cmake
 create mode 100644 cmake/thirdparty/FindrocPRIM.cmake

diff --git a/cmake/SetupPackages.cmake b/cmake/SetupPackages.cmake
index 451fc62ae2..dc107e1fe3 100644
--- a/cmake/SetupPackages.cmake
+++ b/cmake/SetupPackages.cmake
@@ -82,11 +82,11 @@ if (RAJA_ENABLE_HIP)
   endif()
 
   if (RAJA_ENABLE_EXTERNAL_ROCPRIM)
-    include(cmake/thirdparty/FindROCPRIM.cmake)
-    if (ROCPRIM_FOUND)
+    find_package(rocPRIM)
+    if (rocPRIM_FOUND)
       blt_import_library(
         NAME rocPRIM
-        INCLUDES ${ROCPRIM_INCLUDE_DIRS}
+        INCLUDES ${rocPRIM_INCLUDE_DIRS}
         TREAT_INCLUDES_AS_SYSTEM ON
         EXPORTABLE ON)
     else()
diff --git a/cmake/thirdparty/FindROCPRIM.cmake b/cmake/thirdparty/FindROCPRIM.cmake
deleted file mode 100644
index 4279f48d1b..0000000000
--- a/cmake/thirdparty/FindROCPRIM.cmake
+++ /dev/null
@@ -1,29 +0,0 @@
-###############################################################################
-# Copyright (c) 2016-22, Lawrence Livermore National Security, LLC
-# and other RAJA project contributors. See the RAJA/LICENSE file for details.
-#
-# SPDX-License-Identifier: (BSD-3-Clause)
-###############################################################################
-
-include (FindPackageHandleStandardArgs)
-
-find_path(ROCPRIM_INCLUDE_DIRS
-  NAMES rocprim/rocprim.hpp
-  HINTS
-    ${ROCPRIM_DIR}/
-    ${ROCPRIM_DIR}/include
-    ${ROCPRIM_DIR}/rocprim/include
-    ${HIP_ROOT_DIR}/../rocprim
-    ${HIP_ROOT_DIR}/../rocprim/include
-    ${HIP_ROOT_DIR}/../include)
-
-find_package_handle_standard_args(
-  ROCPRIM
-  DEFAULT_MSG
-  ROCPRIM_INCLUDE_DIRS)
-
-if (ROCPRIM_INCLUDE_DIRS)
-  set(ROCPRIM_FOUND True)
-else ()
-  set(ROCPRIM_FOUND False)
-endif()
diff --git a/cmake/thirdparty/FindrocPRIM.cmake b/cmake/thirdparty/FindrocPRIM.cmake
new file mode 100644
index 0000000000..702ef52013
--- /dev/null
+++ b/cmake/thirdparty/FindrocPRIM.cmake
@@ -0,0 +1,54 @@
+###############################################################################
+# Copyright (c) 2016-22, Lawrence Livermore National Security, LLC
+# and other RAJA project contributors. See the RAJA/LICENSE file for details.
+#
+# SPDX-License-Identifier: (BSD-3-Clause)
+###############################################################################
+
+#[=======================================================================[.rst:
+
+FindrocPRIM
+-------
+
+Finds the rocPRIM package.
+
+Result Variables
+^^^^^^^^^^^^^^^^
+
+This will define the following variables:
+
+``rocPRIM_FOUND``
+True if the system has the rocPRIM library.
+``rocPRIM_INCLUDE_DIRS``
+Include directories needed to use rocPRIM.
+
+Cache Variables
+^^^^^^^^^^^^^^^
+
+The following cache variables may also be set:
+
+``rocPRIM_INCLUDE_DIR``
+The directory containing ``rocprim.hpp``.
+
+#]=======================================================================]
+
+include (FindPackageHandleStandardArgs)
+
+find_path(rocPRIM_INCLUDE_DIR
+  NAMES rocprim/rocprim.hpp
+  HINTS
+    ${ROCPRIM_DIR}/
+    ${HIP_ROOT_DIR}/../
+  PATH_SUFFIXES
+    include
+    rocprim
+    rocprim/include)
+
+find_package_handle_standard_args(
+  rocPRIM
+  DEFAULT_MSG
+  rocPRIM_INCLUDE_DIR)
+
+if (rocPRIM_FOUND)
+  set(rocPRIM_INCLUDE_DIRS ${rocPRIM_INCLUDE_DIR})
+endif()

From c80f1a5f6cc7a789cee23e4b87535d67efcbe788 Mon Sep 17 00:00:00 2001
From: Rich Hornung <hornung1@llnl.gov>
Date: Mon, 5 Dec 2022 13:38:24 -0800
Subject: [PATCH 91/94] Final update of release notes.

---
 RELEASE_NOTES.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md
index 60259de6ae..f34f191608 100644
--- a/RELEASE_NOTES.md
+++ b/RELEASE_NOTES.md
@@ -19,6 +19,7 @@ Notable changes include:
 
   * Bug fixes/improvements:
 
+
 Version 2022.10.3 -- Release date 2022-12-01
 ============================================
 
@@ -31,6 +32,8 @@ Notable changes include:
 
   * Properly export 'roctx' target when CMake variable RAJA_ENABLE_ROCTX is on. 
   * Fix CMake logic for exporting desul targets when desul atomics are enabled.
+  * Fix the way we use CMake to find the rocPRIM module to follow CMake
+    best practices.
   * Add missing template parameter pack argument in RAJA::statement::For
     execution policy construct used in RAJA::kernel implementation for OpenMP 
     target back-end.

From 828388e0935ab1d9cb051c61d0fef1d99d31a4cd Mon Sep 17 00:00:00 2001
From: Corbin Robeck <13821049+CRobeck@users.noreply.github.com>
Date: Mon, 5 Dec 2022 18:05:59 -0500
Subject: [PATCH 92/94] adding building script for hipcc

---
 scripts/lc-builds/toss4_hipcc.sh | 92 ++++++++++++++++++++++++++++++++
 1 file changed, 92 insertions(+)
 create mode 100644 scripts/lc-builds/toss4_hipcc.sh

diff --git a/scripts/lc-builds/toss4_hipcc.sh b/scripts/lc-builds/toss4_hipcc.sh
new file mode 100644
index 0000000000..2ec9b9be46
--- /dev/null
+++ b/scripts/lc-builds/toss4_hipcc.sh
@@ -0,0 +1,92 @@
+#!/usr/bin/env bash
+
+###############################################################################
+# Copyright (c) 2016-22, Lawrence Livermore National Security, LLC
+# and RAJA project contributors. See the RAJA/LICENSE file for details.
+#
+# SPDX-License-Identifier: (BSD-3-Clause)
+###############################################################################
+
+if [[ $# -lt 2 ]]; then
+  echo
+  echo "You must pass 2 or more arguments to the script (in this order): "
+  echo "   1) compiler version number"
+  echo "   2) HIP compute architecture"
+  echo "   3...) optional arguments to cmake"
+  echo
+  echo "For example: "
+  echo "    toss4_amdclang.sh 4.1.0 gfx906"
+  exit
+fi
+
+COMP_VER=$1
+COMP_ARCH=$2
+shift 2
+
+HOSTCONFIG="hip_3_X"
+
+if [[ ${COMP_VER} == 4.* ]]
+then
+##HIP_CLANG_FLAGS="-mllvm -amdgpu-fixed-function-abi=1"
+  HOSTCONFIG="hip_4_link_X"
+elif [[ ${COMP_VER} == 3.* ]]
+then
+  HOSTCONFIG="hip_3_X"
+else
+  echo "Unknown hip version, using ${HOSTCONFIG} host-config"
+fi
+
+BUILD_SUFFIX=lc_toss4-hipcc-${COMP_VER}-${COMP_ARCH}
+
+echo
+echo "Creating build directory build_${BUILD_SUFFIX} and generating configuration in it"
+echo "Configuration extra arguments:"
+echo "   $@"
+echo
+echo "To use fp64 HW atomics you must configure with these options when using gfx90a and hip >= 5.2"
+echo "   -DCMAKE_CXX_FLAGS=\"-munsafe-fp-atomics\""
+echo
+
+rm -rf build_${BUILD_SUFFIX} >/dev/null
+mkdir build_${BUILD_SUFFIX} && cd build_${BUILD_SUFFIX}
+
+
+module load cmake/3.23.1
+
+# unload rocm to avoid configuration problems where the loaded rocm and COMP_VER
+# are inconsistent causing the rocprim from the module to be used unexpectedly
+module unload rocm
+
+
+cmake \
+  -DCMAKE_BUILD_TYPE=Release \
+  -DROCM_ROOT_DIR="/opt/rocm-${COMP_VER}" \
+  -DHIP_ROOT_DIR="/opt/rocm-${COMP_VER}/hip" \
+  -DHIP_PATH=/opt/rocm-${COMP_VER}/bin \
+  -DCMAKE_C_COMPILER=/opt/rocm-${COMP_VER}/bin/hipcc \
+  -DCMAKE_CXX_COMPILER=/opt/rocm-${COMP_VER}/bin/hipcc \
+  -DCMAKE_HIP_ARCHITECTURES="${COMP_ARCH}" \
+  -DGPU_TARGETS="${COMP_ARCH}" \
+  -DAMDGPU_TARGETS="${COMP_ARCH}" \
+  -DBLT_CXX_STD=c++14 \
+  -C "../host-configs/lc-builds/toss4/${HOSTCONFIG}.cmake" \
+  -DENABLE_HIP=ON \
+  -DENABLE_OPENMP=ON \
+  -DENABLE_CUDA=OFF \
+  -DCMAKE_INSTALL_PREFIX=../install_${BUILD_SUFFIX} \
+  "$@" \
+  ..
+
+echo
+echo "***********************************************************************"
+echo
+echo "cd into directory build_${BUILD_SUFFIX} and run make to build RAJA"
+echo
+echo "  Please note that you have to have a consistent build environment"
+echo "  when you make RAJA as cmake may reconfigure; unload the rocm module"
+echo "  or load the appropriate rocm module (${COMP_VER}) when building."
+echo
+echo "    module unload rocm"
+echo "    srun -n1 make"
+echo
+echo "***********************************************************************"

From 3ca7535ccbf0299f8921ebddfd0fd90ed2cfd9ab Mon Sep 17 00:00:00 2001
From: Corbin Robeck <13821049+CRobeck@users.noreply.github.com>
Date: Mon, 5 Dec 2022 23:08:07 -0500
Subject: [PATCH 93/94] make comments consistent

Co-authored-by: Jason Burmark <MrBurmark@users.noreply.github.com>
---
 scripts/lc-builds/toss4_hipcc.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/lc-builds/toss4_hipcc.sh b/scripts/lc-builds/toss4_hipcc.sh
index 2ec9b9be46..0e3d8bdd6d 100644
--- a/scripts/lc-builds/toss4_hipcc.sh
+++ b/scripts/lc-builds/toss4_hipcc.sh
@@ -15,7 +15,7 @@ if [[ $# -lt 2 ]]; then
   echo "   3...) optional arguments to cmake"
   echo
   echo "For example: "
-  echo "    toss4_amdclang.sh 4.1.0 gfx906"
+  echo "    toss4_hipcc.sh 4.1.0 gfx906"
   exit
 fi
 

From 589685d756f40a78b5409641295bbd6882478c54 Mon Sep 17 00:00:00 2001
From: Jason Burmark <burmark1@llnl.gov>
Date: Tue, 6 Dec 2022 10:30:29 -0800
Subject: [PATCH 94/94] Fix exec permissions on hipcc script

---
 scripts/lc-builds/toss4_hipcc.sh | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 mode change 100644 => 100755 scripts/lc-builds/toss4_hipcc.sh

diff --git a/scripts/lc-builds/toss4_hipcc.sh b/scripts/lc-builds/toss4_hipcc.sh
old mode 100644
new mode 100755