diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000000..94143827ed --- /dev/null +++ b/.dockerignore @@ -0,0 +1 @@ +Dockerfile diff --git a/.gitignore b/.gitignore index f4f1cd0dc1..10b3b40f79 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,6 @@ +.cache +.idea +*.sync-conflict-* *.pyc *.o *.a diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 32d794b644..81c128f0b9 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,6 +1,6 @@ ############################################################################### -# Copyright (c) 2016-2020, Lawrence Livermore National Security, LLC -# and RAJA project contributors. See the RAJA/COPYRIGHT file for details. +# Copyright (c) 2016-22, Lawrence Livermore National Security, LLC +# and RAJA project contributors. See the RAJA/LICENSE file for details. # # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################### @@ -10,93 +10,56 @@ # at Lawrence Livermore National Laboratory (LLNL). # # This entire pipeline is LLNL-specific -# ############################################################################# - -# We define the following GitLab pipeline variables: -# -# GIT_SUBMODULE_STRATEGY: -# Tells Gitlab to recursively update the submodules when cloning umpire # -# ALLOC_NAME: -# On LLNL's ruby, this pipeline creates only one allocation shared among jobs -# in order to save time and resources. This allocation has to be uniquely named -# so that we are sure to retrieve it. +# Important note: This file is a template provided by +# llnl/radiuss-shared-ci. It should not require any change from the project to +# get started but could feature project-specific stages. # -# BUILD_ROOT: -# The path to the shared resources between all jobs. The BUILD_ROOT is unique to -# the pipeline, preventing any form of concurrency with other pipelines. This -# also means that the BUILD_ROOT directory will never be cleaned. -# -# DEFAULT_TIME: -# Default time to let the Lassen jobs run will be 30 minutes. However, if it is -# a job that requires more time, it will be overwritten in the lassen template -# file. -# TODO: add a clean-up mechanism +# However, each project should provide: +# - .gitlab/custom-jobs-and-variables.yml +# - .gitlab/subscribed-pipelines.yml +# - .gitlab/${MACHINE}-build-and-test-extra.yml +############################################################################### +# We define the following GitLab pipeline variables: variables: + MP_BRANCH: "develop" +# Use a service user to run CI. This prevents from running pipelines as an +# actual user. + LLNL_SERVICE_USER: "" +# Use a service user workspace. Solves permission issues, stores everything +# at the same location whoever triggers a pipeline. +# CUSTOM_CI_BUILDS_DIR: "" +# Tells Gitlab to recursively update the submodules when cloning the project. GIT_SUBMODULE_STRATEGY: recursive - ALLOC_NAME: ${CI_PROJECT_NAME}_ci_${CI_PIPELINE_ID} +# We build the projects in the CI clone directory. +# TODO: add a clean-up mechanism BUILD_ROOT: ${CI_PROJECT_DIR} - DEFAULT_TIME: 30 - MP_BRANCH: "develop" -# Normally, stages are blocking in Gitlab. However, using the keyword "needs" we -# can express dependencies between job that break the ordering of stages, in -# favor of a DAG. -# In practice r_*, l_* and b_* stages are independently run and start immediately. +# We organize the build-and-test stage in sub-pipelines. Each sub-pipeline +# corresponds to a test batch on a given machine. 
+# High level stages stages: - - r_allocate_resources - - r_build_and_test - - r_release_resources - - l_build_and_test - - b_build_and_test - - c_build_and_test + - build-and-test - multi_project -# This is the rules that drives the activation of "advanced" jobs. All advanced -# jobs will share this through a template mechanism. -.advanced_pipeline: - rules: - - if: '$CI_COMMIT_BRANCH == "main" || $CI_COMMIT_BRANCH == "develop" || $ALL_TARGETS == "ON"' #run only if ... - -# These are also templates (.name) that define project specific build commands. -# If an allocation exist with the name defined in this pipeline, the job will -# use it (slurm specific). -.build_toss_3_x86_64_ib_script: - script: - - echo ${ALLOC_NAME} - - export JOBID=$(squeue -h --name=${ALLOC_NAME} --format=%A) - - echo ${JOBID} - - srun $( [[ -n "${JOBID}" ]] && echo "--jobid=${JOBID}" ) -t ${DEFAULT_TIME} -N 1 scripts/gitlab/build_and_test.sh - artifacts: - reports: - junit: junit.xml - -.build_toss_4_x86_64_ib_corona_script: - script: - - srun -p pbatch -t 30 -N 1 scripts/gitlab/build_and_test.sh - -# Lassen and Butte use a different job scheduler (spectrum lsf) that does not -# allow pre-allocation the same way slurm does. -.build_blueos_3_ppc64le_ib_script: - script: - - lalloc 1 -W ${DEFAULT_TIME} scripts/gitlab/build_and_test.sh - artifacts: - reports: - junit: junit.xml - -.build_blueos_3_ppc64le_ib_ats_disabled_script: - script: - - lalloc 1 --atsdisable -W ${DEFAULT_TIME} scripts/gitlab/build_and_test.sh - artifacts: - reports: - junit: junit.xml - -.build_blueos_3_ppc64le_ib_p9_script: - extends: .build_blueos_3_ppc64le_ib_script +# Template for jobs triggering a build-and-test sub-pipelines: +.build-and-test: + stage: build-and-test + trigger: + include: + - local: '.gitlab/custom-jobs-and-variables.yml' + - project: 'radiuss/radiuss-shared-ci' + ref: v2022.09.0 + file: '${CI_MACHINE}-build-and-test.yml' + - local: '.gitlab/${CI_MACHINE}-build-and-test-extra.yml' + strategy: depend + forward: + pipeline_variables: true -# If testing develop branch, trigger CHAI pipeline with this version of RAJA. +# If testing develop branch, trigger RAJAPerf pipeline with this version of +# RAJA. # TODO: Once spack allows to clone a specific commit on demand, then point to the exact commit. # This will prevent from sticking to a branch (here develop). # MP_BRANCH is short for "Multi-Project Branch" and will usually be develop. @@ -111,11 +74,6 @@ trigger-rajaperf: branch: develop strategy: depend -# This is where jobs are included. +# pipelines subscribed by the project include: - - local: .gitlab/ruby-templates.yml - - local: .gitlab/ruby-jobs.yml - - local: .gitlab/lassen-templates.yml - - local: .gitlab/lassen-jobs.yml - - local: .gitlab/corona-templates.yml - - local: .gitlab/corona-jobs.yml + - local: .gitlab/subscribed-pipelines.yml diff --git a/.gitlab/corona-build-and-test-extra.yml b/.gitlab/corona-build-and-test-extra.yml new file mode 100644 index 0000000000..a94300f85b --- /dev/null +++ b/.gitlab/corona-build-and-test-extra.yml @@ -0,0 +1,28 @@ +############################################################################# +# Copyright (c) 2016-22, Lawrence Livermore National Security, LLC +# and RAJA project contributors. See the RAJA/LICENSE file for details. 
+# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################# + +######################## +# Overridden shared jobs +######################## +# We duplicate the shared jobs description and add necessary changes for RAJA. +# We keep ${PROJECT__VARIANTS} and ${PROJECT__DEPS} So that +# the comparison with the original job is easier. + +# No overridden jobs so far. + +############ +# Extra jobs +############ +# We do not recommend using ${PROJECT__VARIANTS} and +# ${PROJECT__DEPS} in the extra jobs. There is no reason not to fully +# describe the spec here. + +rocm_5_1_1_clang_13_0_0_desul_atomics: + variables: + SPEC: " +rocm ~openmp +desul amdgpu_target=gfx906 %clang@13.0.0 ^hip@5.1.1 ^blt@develop" + extends: .build_and_test_on_corona + diff --git a/.gitlab/corona-jobs.yml b/.gitlab/corona-jobs.yml deleted file mode 100644 index d5e72f6fea..0000000000 --- a/.gitlab/corona-jobs.yml +++ /dev/null @@ -1,16 +0,0 @@ -############################################################################# -# Copyright (c) 2016-22, Lawrence Livermore National Security, LLC -# and RAJA project contributors. See the RAJA/LICENSE file for details. -# -# SPDX-License-Identifier: (BSD-3-Clause) -############################################################################# - -hip_4_5_2_clang_13_0_0 (build and test on corona): - variables: - SPEC: "+rocm~openmp amdgpu_target=gfx906 %clang@13.0.0 cxxflags=--offload-arch=gfx906 ^blt@develop ^hip@4.5.2" - extends: .build_and_test_on_corona - -hip_4_5_2_clang_13_0_0_desul_atomics (build and test on corona): - variables: - SPEC: "+rocm~openmp +desul amdgpu_target=gfx906 %clang@13.0.0 cxxflags=--offload-arch=gfx906 ^blt@develop ^hip@4.5.2" - extends: .build_and_test_on_corona diff --git a/.gitlab/corona-templates.yml b/.gitlab/corona-templates.yml deleted file mode 100644 index 4e1a5cb744..0000000000 --- a/.gitlab/corona-templates.yml +++ /dev/null @@ -1,33 +0,0 @@ -############################################################################# -# Copyright (c) 2016-22, Lawrence Livermore National Security, LLC -# and RAJA project contributors. See the RAJA/LICENSE file for details. -# -# SPDX-License-Identifier: (BSD-3-Clause) -############################################################################# - -#### -# This is the share configuration of jobs for corona - -#### -# In pre-build phase, allocate a node for builds -.on_corona: - tags: - - shell - - corona - rules: - - if: '$ON_CORONA == "OFF"' #run except if ... - when: never - - if: '$CI_JOB_NAME =~ /release_resources/' - when: always - - when: on_success - -#### -# Generic corona build job, extending build script -.build_and_test_on_corona: - stage: c_build_and_test - extends: [.build_toss_4_x86_64_ib_corona_script, .on_corona] - needs: [] - -.build_and_test_on_corona_advanced: - extends: [.build_and_test_on_corona, .advanced_pipeline] - diff --git a/.gitlab/custom-jobs-and-variables.yml b/.gitlab/custom-jobs-and-variables.yml new file mode 100644 index 0000000000..53f36c56cd --- /dev/null +++ b/.gitlab/custom-jobs-and-variables.yml @@ -0,0 +1,52 @@ +############################################################################### +# Copyright (c) 2016-22, Lawrence Livermore National Security, LLC and RADIUSS +# project contributors. See the COPYRIGHT file for details. 
+# +# SPDX-License-Identifier: (MIT) +############################################################################### + +# We define the following GitLab pipeline variables: +variables: +# On LLNL's ruby, this pipeline creates only one allocation shared among jobs +# in order to save time and resources. This allocation has to be uniquely named +# so that we are sure to retrieve it and avoid collisions. + ALLOC_NAME: ${CI_PROJECT_NAME}_ci_${CI_PIPELINE_ID} + +# Ruby +# Arguments for top level allocation + RUBY_BUILD_AND_TEST_SHARED_ALLOC: "--exclusive --partition=pdebug --time=60 --nodes=1" +# Arguments for job level allocation + RUBY_BUILD_AND_TEST_JOB_ALLOC: "--time=45 --nodes=1" +# Project specific variants for ruby + PROJECT_RUBY_VARIANTS: "+openmp " +# Project specific deps for ruby + PROJECT_RUBY_DEPS: "" + +# Corona +# Arguments for top level allocation + CORONA_BUILD_AND_TEST_SHARED_ALLOC: "--time-limit=60m --nodes=1" +# Arguments for job level allocation + CORONA_BUILD_AND_TEST_JOB_ALLOC: "--time-limit=45m --nodes=1" +# Project specific variants for corona + PROJECT_CORONA_VARIANTS: "~openmp " +# Project specific deps for corona + PROJECT_CORONA_DEPS: "^blt@develop " + +# Lassen and Butte use a different job scheduler (spectrum lsf) that does not +# allow pre-allocation the same way slurm does. +# Arguments for job level allocation + LASSEN_BUILD_AND_TEST_JOB_ALLOC: "1 -W 60" +# Project specific variants for lassen + PROJECT_LASSEN_VARIANTS: "+openmp " +# Project specific deps for lassen + PROJECT_LASSEN_DEPS: "" + + +# Configuration shared by build and test jobs specific to this project. +# Not all configuration can be shared. Here projects can fine tune the +# CI behavior. +# See Umpire for an example (export junit test reports). +.custom_build_and_test: + artifacts: + reports: + junit: junit.xml diff --git a/.gitlab/lassen-build-and-test-extra.yml b/.gitlab/lassen-build-and-test-extra.yml new file mode 100644 index 0000000000..0442a602bd --- /dev/null +++ b/.gitlab/lassen-build-and-test-extra.yml @@ -0,0 +1,146 @@ +############################################################################## +# Copyright (c) 2016-22, Lawrence Livermore National Security, LLC +# and RAJA project contributors. See the RAJA/LICENSE file for details. +# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################## + +######################## +# Overridden shared jobs +######################## +# We duplicate the shared jobs description and add necessary changes for RAJA. +# We keep ${PROJECT__VARIANTS} and ${PROJECT__DEPS} So that +# the comparison with the original job is easier. 
+ +# Overriding shared spec: Allow failures +ibm_clang_9_0_0: + variables: + SPEC: " ${PROJECT_LASSEN_VARIANTS} %clang@ibm.9.0.0 ${PROJECT_LASSEN_DEPS}" + extends: .build_and_test_on_lassen + allow_failure: true + +# Overriding shared spec: Allow failures +ibm_clang_9_0_0_gcc_8_3_1: + variables: + SPEC: " ${PROJECT_LASSEN_VARIANTS} %clang@ibm.9.0.0 cxxflags=\"--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1\" cflags=\"--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1\" ${PROJECT_LASSEN_DEPS}" + extends: .build_and_test_on_lassen + allow_failure: true + +# Overriding shared spec: Extra flags +gcc_8_3_1: + variables: + SPEC: " ${PROJECT_LASSEN_VARIANTS} %gcc@8.3.1 cxxflags=\"-finline-functions -finline-limit=20000\" cflags=\"-finline-functions -finline-limit=20000\" ${PROJECT_LASSEN_DEPS}" + extends: .build_and_test_on_lassen + +# Overriding shared spec: Longer allocation + Allow failures +pgi_20_4_gcc_8_3_1: + extends: .build_and_test_on_lassen + variables: + SPEC: " ${PROJECT_LASSEN_VARIANTS} %pgi@20.4 cxxflags=\"--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1\" cflags=\"--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1\" ${PROJECT_LASSEN_DEPS}" + LASSEN_BUILD_AND_TEST_JOB_ALLOC: "1 -W 80" + allow_failure: true + +# Overriding shared spec: Extra flags +xl_16_1_1_12: + variables: + SPEC: " ${PROJECT_LASSEN_VARIANTS} %xl@16.1.1.12 cxxflags=\"-qthreaded -std=c++14 -O3 -qstrict -qxlcompatmacros -qlanglvl=extended0x -qalias=noansi -qhot -qpic -qsmp=omp -qsuppress=1500-029 -qsuppress=1500-036\" ${PROJECT_LASSEN_DEPS}" + extends: .build_and_test_on_lassen + +# Overriding shared spec: Extra flags +xl_16_1_1_12_gcc_8_3_1: + variables: + SPEC: " ${PROJECT_LASSEN_VARIANTS} %xl@16.1.1.12 cxxflags=\"--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 -qthreaded -std=c++14 -O3 -qstrict -qxlcompatmacros -qlanglvl=extended0x -qalias=noansi -qhot -qpic -qsmp=omp -qsuppress=1500-029 -qsuppress=1500-036\" cflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 ${PROJECT_LASSEN_DEPS}" + extends: .build_and_test_on_lassen + +# Overriding shared spec: Allow failures +ibm_clang_9_0_0_gcc_8_3_1_cuda_10_1_168: + extends: .build_and_test_on_lassen + variables: + SPEC: " ${PROJECT_LASSEN_VARIANTS} +cuda %clang@ibm.9.0.0 cxxflags=\"--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1\" cflags=\"--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1\" ^cuda@10.1.168 ${PROJECT_LASSEN_DEPS}" + allow_failure: true + +# Overriding shared spec: Longer allocation + Extra flags + Allow failure + Updated cuda +xl_16_1_1_12_cuda_11_1_1: + variables: + SPEC: " ${PROJECT_LASSEN_VARIANTS} +cuda %xl@16.1.1.12 cxxflags=\"-qthreaded -std=c++14 -O3 -qstrict -qxlcompatmacros -qlanglvl=extended0x -qalias=noansi -qhot -qpic -qsmp=omp -qsuppress=1500-029 -qsuppress=1500-036\" cuda_arch=70 ^cuda@11.1.0 ^cmake@3.14.5 ${PROJECT_LASSEN_DEPS}" + LASSEN_BUILD_AND_TEST_JOB_ALLOC: "1 -W 120" + allow_failure: true + extends: .build_and_test_on_lassen + +# Overriding shared spec: Deactivated spec. This spec will be removed soon. 
+xl_16_1_1_12_gcc_8_3_1_cuda_11_0_2: + variables: + SPEC: "${PROJECT_LASSEN_VARIANTS} +cuda %xl@16.1.1.12 cxxflags=\"--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1\" cflags=\"--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1\" ^cuda@11.0.2 ^cmake@3.14.5 ${PROJECT_LASSEN_DEPS}" + extends: .build_and_test_on_lassen + script: + - | + echo -e "\e[31mDeactivated spec !\e[0m" + echo -e "\e[31m${SPEC}\e[0m" + echo -e "\e[31mRAJA won’t build with Cuda 11.0.2 due to a known issue.\e[0m" + - exit 1 + allow_failure: true + +# Overriding shared spec: Longer allocation + Extra flags + Allow failure + Updated cuda +xl_16_1_1_12_gcc_8_3_1_cuda_11_1_0: + variables: + SPEC: " ${PROJECT_LASSEN_VARIANTS} +cuda %xl@16.1.1.12 cxxflags=\"--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 -qthreaded -std=c++14 -O3 -qstrict -qxlcompatmacros -qlanglvl=extended0x -qalias=noansi -qhot -qpic -qsmp=omp -qsuppress=1500-029 -qsuppress=1500-036\" cflags=\"--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1\" cuda_arch=70 ^cuda@11.1.0 ^cmake@3.14.5 ${PROJECT_LASSEN_DEPS}" + LASSEN_BUILD_AND_TEST_JOB_ALLOC: "1 -W 120" + allow_failure: true + extends: .build_and_test_on_lassen + + +############ +# Extra jobs +############ +# We do not recommend using ${PROJECT__VARIANTS} and +# ${PROJECT__DEPS} in the extra jobs. There is no reason not to fully +# describe the spec here. + +########## +# CPU ONLY +########## + +clang_14_0_5: + variables: + SPEC: " +openmp %clang@14.0.5" + extends: .build_and_test_on_lassen + +########## +# CUDA +########## + +clang_12_0_1_cuda_11_5_0: + variables: + SPEC: " +openmp +cuda cuda_arch=70 %clang@12.0.1 ^cuda@11.5.0" + extends: .build_and_test_on_lassen + +gcc_8_3_1_cuda_11_1_0: + variables: + SPEC: " +openmp +cuda %gcc@8.3.1 cuda_arch=70 ^cuda@11.1.0" + extends: .build_and_test_on_lassen + +gcc_8_3_1_cuda_11_5_0_ats_disabled: + extends: .build_and_test_on_lassen + variables: + SPEC: " +openmp +cuda %gcc@8.3.1 cuda_arch=70 ^cuda@11.5.0" + LASSEN_BUILD_AND_TEST_JOB_ALLOC: "1 --atsdisable -W 60" + +########## +# OTHERS +########## + +clang_13_0_1_libcpp: + variables: + SPEC: " +openmp %clang@13.0.1+libcpp" + extends: .build_and_test_on_lassen + +clang_14_0_5_asan: + variables: + SPEC: " +openmp %clang@14.0.5 cxxflags=-fsanitize=address" + ASAN_OPTIONS: "detect_leaks=1" + extends: .build_and_test_on_lassen + +gcc_8_3_1_cuda_10_1_168_desul_atomics: + variables: + SPEC: " +openmp +cuda +desul %gcc@8.3.1 cuda_arch=70 ^cuda@10.1.168" + extends: .build_and_test_on_lassen diff --git a/.gitlab/lassen-jobs.yml b/.gitlab/lassen-jobs.yml deleted file mode 100644 index 8b5d070993..0000000000 --- a/.gitlab/lassen-jobs.yml +++ /dev/null @@ -1,95 +0,0 @@ -############################################################################## -# Copyright (c) 2016-22, Lawrence Livermore National Security, LLC -# and RAJA project contributors. See the RAJA/LICENSE file for details. 
-# -# SPDX-License-Identifier: (BSD-3-Clause) -############################################################################## - -########## -# CPU ONLY -########## - -ibm_clang_9: - variables: - SPEC: "%clang@ibm.9.0.0" - extends: .build_and_test_on_lassen - -ibm_clang_9_gcc_8: - variables: - SPEC: "%clang@ibm.9.0.0 cxxflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 cflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1" - extends: .build_and_test_on_lassen - -gcc_8_3_1: - variables: - SPEC: "%gcc@8.3.1 cxxflags='-finline-functions -finline-limit=20000' cflags='-finline-functions -finline-limit=20000'" - extends: .build_and_test_on_lassen - -xl_16_1_1_11: - variables: - SPEC: "%xl@16.1.1.11 cxxflags='-qthreaded -std=c++14 -O3 -qxlcompatmacros -qlanglvl=extended0x -qalias=noansi -qhot -qpic -qsmp=omp -qsuppress=1500-029 -qsuppress=1500-036'" - DEFAULT_TIME: 50 - extends: .build_and_test_on_lassen - -xl_16_1_1_11_gcc_8_3_1: - variables: - SPEC: "%xl@16.1.1.11 cxxflags='--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 -qthreaded -std=c++14 -O3 -qxlcompatmacros -qlanglvl=extended0x -qalias=noansi -qhot -qpic -qsmp=omp -qsuppress=1500-029 -qsuppress=1500-036' cflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1" - DEFAULT_TIME: 50 - extends: .build_and_test_on_lassen - -########## -# CUDA -########## - -ibm_clang_9_cuda: - variables: - SPEC: "+cuda cuda_arch=70 %clang@ibm.9.0.0 ^cuda@10.1.168" - extends: .build_and_test_on_lassen - -ibm_clang_10_cuda: - variables: - SPEC: "+cuda cuda_arch=70 %clang@ibm.10.0.1 ^cuda@10.1.168" - extends: .build_and_test_on_lassen - -gcc_8_3_1_cuda: - variables: - SPEC: "+cuda %gcc@8.3.1 cuda_arch=70 ^cuda@10.1.168" - extends: .build_and_test_on_lassen - -gcc_8_3_1_cuda_ats_disabled: - variables: - SPEC: "+cuda %gcc@8.3.1 cuda_arch=70 ^cuda@10.1.168" - extends: .build_and_test_on_lassen_ats_disabled - -xl_16_1_1_7_cuda: - variables: - SPEC: "+cuda %xl@16.1.1.7 cuda_arch=70 ^cuda@10.1.168 ^cmake@3.14.5" - DEFAULT_TIME: 60 - allow_failure: true - extends: .build_and_test_on_lassen - -xl_16_1_1_7_gcc_8_3_1_cuda_11: - variables: - SPEC: "+cuda %xl@16.1.1.7 cuda_arch=70 cxxflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 cflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 ^cuda@11.0.2 ^cmake@3.14.5" - DEFAULT_TIME: 60 - allow_failure: true - extends: .build_and_test_on_lassen - -########## -# EXTRAS -########## - -clang_9_0_0_libcpp (build and test on lassen): - variables: - SPEC: "%clang@9.0.0+libcpp" - extends: .build_and_test_on_lassen - -clang_9_0_0_memleak (build and test on lassen): - variables: - SPEC: "%clang@9.0.0 cxxflags=-fsanitize=address" - ASAN_OPTIONS: "detect_leaks=1" - extends: .build_and_test_on_lassen - -gcc_8_3_1_cuda_desul_atomics: - variables: - SPEC: "+cuda +desul %gcc@8.3.1 cuda_arch=70 ^cuda@10.1.168" - extends: .build_and_test_on_lassen diff --git a/.gitlab/lassen-templates.yml b/.gitlab/lassen-templates.yml deleted file mode 100644 index dbc340f22a..0000000000 --- a/.gitlab/lassen-templates.yml +++ /dev/null @@ -1,34 +0,0 @@ -############################################################################## -# Copyright (c) 2016-22, Lawrence Livermore National Security, LLC -# and RAJA project contributors. See the RAJA/LICENSE file for details. 
-# -# SPDX-License-Identifier: (BSD-3-Clause) -############################################################################## - -#### -# Shared configuration of jobs for lassen -.on_lassen: - variables: - tags: - - shell - - lassen - rules: - - if: '$CI_COMMIT_BRANCH =~ /_lnone/ || $ON_LASSEN == "OFF"' #run except if ... - when: never - - when: on_success - -.build_and_test_on_lassen: - stage: l_build_and_test - extends: [.build_blueos_3_ppc64le_ib_p9_script, .on_lassen] - needs: [] - -.build_and_test_on_lassen_ats_disabled: - stage: l_build_and_test - extends: [.build_blueos_3_ppc64le_ib_ats_disabled_script, .on_lassen] - needs: [] - -# Note: .build_and_test_on_lassen_advanced inherits from -# .build_and_test_on_lassen and .advanced_pileline. -# In particular, the rules section will be merged. Careful when changing rules. -.build_and_test_on_lassen_advanced: - extends: [.build_and_test_on_lassen, .advanced_pipeline] diff --git a/.gitlab/ruby-build-and-test-extra.yml b/.gitlab/ruby-build-and-test-extra.yml new file mode 100644 index 0000000000..9bebc62530 --- /dev/null +++ b/.gitlab/ruby-build-and-test-extra.yml @@ -0,0 +1,58 @@ +############################################################################## +# Copyright (c) 2016-22, Lawrence Livermore National Security, LLC +# and RAJA project contributors. See the RAJA/LICENSE file for details. +# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################## + +######################## +# Overridden shared jobs +######################## +# We duplicate the shared jobs description and add necessary changes for RAJA. +# We keep ${PROJECT__VARIANTS} and ${PROJECT__DEPS} So that +# the comparison with the original job is easier. + +# Overriding shared config for longer run +gcc_8_1_0: + variables: + SPEC: " ${PROJECT_RUBY_VARIANTS} %gcc@8.1.0 ${PROJECT_RUBY_DEPS}" + RUBY_BUILD_AND_TEST_JOB_ALLOC: "--time=60 --nodes=1" + extends: .build_and_test_on_ruby + +# Overriding shared spec: Allow failures +pgi_20_1_gcc_local_8_3_1: + variables: + SPEC: " ${PROJECT_RUBY_VARIANTS} %pgi@20.1 cxxflags\"=-rc=/usr/workspace/umpire/pgi/x86_64/local-gcc-8.3.1-rc\" cflags\"=-rc=/usr/workspace/umpire/pgi/x86_64/local-gcc-8.3.1-rc\" fflags=\"-rc=/usr/workspace/umpire/pgi/x86_64/local-gcc-8.3.1-rc\" ${PROJECT_RUBY_DEPS}" + extends: .build_and_test_on_ruby + allow_failure: true + +############ +# Extra jobs +############ +# We do not recommend using ${PROJECT__VARIANTS} and +# ${PROJECT__DEPS} in the extra jobs. There is no reason not to fully +# describe the spec here. 
+ +clang_9_0_0_openmp_off: + variables: + SPEC: " ~openmp %clang@9.0.0" + extends: .build_and_test_on_ruby + +gcc_8_1_0_openmp_default: + variables: + SPEC: " %gcc@8.1.0" + RUBY_BUILD_AND_TEST_JOB_ALLOC: "--time=60 --nodes=1" + extends: .build_and_test_on_ruby + +icpc_19_1_0: + variables: + SPEC: " +openmp %intel@19.1.0" + RUBY_BUILD_AND_TEST_JOB_ALLOC: "--time=40 --nodes=1" + extends: .build_and_test_on_ruby + +# OTHERS +clang_10_0_1_gcc_8_3_1_desul_atomics: + variables: + SPEC: " +openmp +desul %clang@10.0.1 cxxflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 cflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1" + extends: .build_and_test_on_ruby + diff --git a/.gitlab/ruby-jobs.yml b/.gitlab/ruby-jobs.yml deleted file mode 100644 index 2b6cceb5c7..0000000000 --- a/.gitlab/ruby-jobs.yml +++ /dev/null @@ -1,53 +0,0 @@ -############################################################################## -# Copyright (c) 2016-22, Lawrence Livermore National Security, LLC -# and RAJA project contributors. See the RAJA/LICENSE file for details. -# -# SPDX-License-Identifier: (BSD-3-Clause) -############################################################################## - -clang_10: - variables: - SPEC: "%clang@10.0.1" - extends: .build_and_test_on_ruby - -clang_9: - variables: - SPEC: "%clang@9.0.0" - extends: .build_and_test_on_ruby - -gcc_8_1_0: - variables: - SPEC: "%gcc@8.1.0" - DEFAULT_TIME: 60 - extends: .build_and_test_on_ruby - -#icpc_17_0_2: -# variables: -# SPEC: "%intel@17.0.2" -# DEFAULT_TIME: 40 -# extends: .build_and_test_on_ruby - -icpc_18_0_2: - variables: - SPEC: " tests=none %intel@18.0.2" - DEFAULT_TIME: 40 - extends: .build_and_test_on_ruby - -icpc_19_1_0: - variables: - SPEC: "%intel@19.1.0" - DEFAULT_TIME: 40 - extends: .build_and_test_on_ruby - -# EXTRAS - -#gcc_4_9_3: -# variables: -# SPEC: "%gcc@4.9.3" -# DEFAULT_TIME: 60 -# extends: .build_and_test_on_ruby - -clang_10_desul_atomics: - variables: - SPEC: "+openmp +desul %clang@10.0.1 cxxflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 cflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1" - extends: .build_and_test_on_ruby diff --git a/.gitlab/ruby-templates.yml b/.gitlab/ruby-templates.yml deleted file mode 100644 index b1314534b3..0000000000 --- a/.gitlab/ruby-templates.yml +++ /dev/null @@ -1,54 +0,0 @@ -############################################################################## -# Copyright (c) 2016-22, Lawrence Livermore National Security, LLC -# and RAJA project contributors. See the RAJA/LICENSE file for details. -# -# SPDX-License-Identifier: (BSD-3-Clause) -############################################################################## - -#### -# This is the shared configuration of jobs for ruby - -#### -# In pre-build phase, allocate a node for builds -.on_ruby: - tags: - - shell - - ruby - rules: - - if: '$CI_COMMIT_BRANCH =~ /_qnone/ || $ON_RUBY == "OFF"' #run except if ... 
- when: never - - if: '$CI_JOB_NAME =~ /release_resources/' - when: always - - when: on_success - -#### -# In pre-build phase, allocate a node for builds -# NOTE: Not specifying 'salloc -c 56' should allocate the max number of CPU cores -allocate_resources (on ruby): - variables: - GIT_STRATEGY: none - extends: .on_ruby - stage: r_allocate_resources - script: - - salloc -N 1 -p pdebug -t 45 --no-shell --job-name=${ALLOC_NAME} - -#### -# In post-build phase, deallocate resources -# Note : make sure this is run even on build phase failure -release_resources (on ruby): - variables: - GIT_STRATEGY: none - extends: .on_ruby - stage: r_release_resources - script: - - export JOBID=$(squeue -h --name=${ALLOC_NAME} --format=%A) - - ([[ -n "${JOBID}" ]] && scancel ${JOBID}) - -#### -# Generic ruby build job, extending build script -.build_and_test_on_ruby: - extends: [.build_toss_3_x86_64_ib_script, .on_ruby] - stage: r_build_and_test - -.build_and_test_on_ruby_advanced: - extends: [.build_and_test_on_ruby, .advanced_pipeline] diff --git a/.gitlab/subscribed-pipelines.yml b/.gitlab/subscribed-pipelines.yml new file mode 100644 index 0000000000..c424d3e1e4 --- /dev/null +++ b/.gitlab/subscribed-pipelines.yml @@ -0,0 +1,24 @@ +############################################################################### +# Copyright (c) 2016-22, Lawrence Livermore National Security, LLC and RADIUSS +# project contributors. See the COPYRIGHT file for details. +# +# SPDX-License-Identifier: (MIT) +############################################################################### + +# Uncomment pipelines to subscribe to a shared pipeline. + +# Trigger a build-and-test pipeline for ruby, corona and lassen +ruby-build-and-test: + variables: + CI_MACHINE: "ruby" + extends: [.build-and-test] + +corona-build-and-test: + variables: + CI_MACHINE: "corona" + extends: [.build-and-test] + +lassen-build-and-test: + variables: + CI_MACHINE: "lassen" + extends: [.build-and-test] diff --git a/.uberenv_config.json b/.uberenv_config.json index 335f4c91eb..2fc700f855 100644 --- a/.uberenv_config.json +++ b/.uberenv_config.json @@ -3,10 +3,11 @@ "package_version" : "develop", "package_final_phase" : "hostconfig", "package_source_dir" : "../..", -"spack_url": "https://github.com/spack/spack", -"spack_branch": "develop", -"spack_commit": "be1c4bc563722d0774436cc905fd938c88c61a72", +"spack_url": "https://github.com/spack/spack.git", +"spack_branch": "v0.18.1", "spack_activate" : {}, "spack_configs_path": "scripts/radiuss-spack-configs", -"spack_packages_path": "scripts/spack_packages" +"spack_packages_path": "scripts/spack_packages", +"spack_concretizer": "clingo", +"spack_setup_clingo": false } diff --git a/CMakeLists.txt b/CMakeLists.txt index 0fc49959ac..77d31fe778 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -44,14 +44,21 @@ set(CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake/thirdparty" ${CMAKE_MODULE_PA include(cmake/SetupRajaOptions.cmake) -cmake_minimum_required(VERSION 3.14.5) +if (ENABLE_HIP) + cmake_minimum_required(VERSION 3.23) +else() + cmake_minimum_required(VERSION 3.20) +endif() # Detect C++ standard and add appropriate flag _before_ loading BLT set(COMPILERS_KNOWN_TO_CMAKE33 AppleClang Clang GNU MSVC) include(CheckCXXCompilerFlag) if(NOT DEFINED BLT_CXX_STD) - if("cxx_std_17" IN_LIST CMAKE_CXX_KNOWN_FEATURES) + if("cxx_std_20" IN_LIST CMAKE_CXX_KNOWN_FEATURES) + set(BLT_CXX_STD c++20 CACHE STRING "Version of C++ standard") + message("Using C++ standard: ${BLT_CXX_STD}") + elseif("cxx_std_17" IN_LIST 
CMAKE_CXX_KNOWN_FEATURES) set(BLT_CXX_STD c++17 CACHE STRING "Version of C++ standard") message("Using C++ standard: ${BLT_CXX_STD}") elseif("cxx_std_14" IN_LIST CMAKE_CXX_KNOWN_FEATURES) @@ -216,6 +223,7 @@ if (RAJA_ENABLE_EXTERNAL_ROCPRIM) endif () if (RAJA_ENABLE_SYCL) + set (RAJA_ENABLE_DESUL_ATOMICS "On") set (raja_depends ${raja_depends} sycl) @@ -283,13 +291,13 @@ blt_add_library( install(TARGETS RAJA - EXPORT RAJA + EXPORT RAJATargets ARCHIVE DESTINATION lib LIBRARY DESTINATION lib RUNTIME DESTINATION lib ) -install(EXPORT RAJA DESTINATION lib/cmake/raja) +install(EXPORT RAJATargets DESTINATION lib/cmake/raja) target_include_directories(RAJA PUBLIC diff --git a/Dockerfile b/Dockerfile index b4d6dfc585..3b7f3c4804 100644 --- a/Dockerfile +++ b/Dockerfile @@ -41,7 +41,7 @@ FROM ghcr.io/rse-ops/clang-ubuntu-20.04:llvm-11.0.0 AS clang11 ENV GTEST_COLOR=1 COPY . /home/raja/workspace WORKDIR /home/raja/workspace/build -RUN . /opt/spack/share/spack/setup-env.sh && spack load llvm && \ +RUN . /opt/spack/share/spack/setup-env.sh && export LD_LIBRARY_PATH=/opt/view/lib:$LD_LIBRARY_PATH && \ cmake -DCMAKE_CXX_COMPILER=clang++ -DRAJA_ENABLE_TBB=On -DENABLE_OPENMP=On .. && \ make -j 6 &&\ ctest -T test --output-on-failure @@ -50,7 +50,7 @@ FROM ghcr.io/rse-ops/clang-ubuntu-20.04:llvm-11.0.0 AS clang11-debug ENV GTEST_COLOR=1 COPY . /home/raja/workspace WORKDIR /home/raja/workspace/build -RUN . /opt/spack/share/spack/setup-env.sh && spack load llvm && \ +RUN . /opt/spack/share/spack/setup-env.sh && export LD_LIBRARY_PATH=/opt/view/lib:$LD_LIBRARY_PATH && \ cmake -DCMAKE_CXX_COMPILER=clang++ -DENABLE_OPENMP=On -DCMAKE_BUILD_TYPE=Debug .. && \ make -j 6 &&\ ctest -T test --output-on-failure @@ -59,7 +59,7 @@ FROM ghcr.io/rse-ops/clang-ubuntu-22.04:llvm-13.0.0 AS clang13 ENV GTEST_COLOR=1 COPY . /home/raja/workspace WORKDIR /home/raja/workspace/build -RUN . /opt/spack/share/spack/setup-env.sh && spack load llvm && \ +RUN . /opt/spack/share/spack/setup-env.sh && export LD_LIBRARY_PATH=/opt/view/lib:$LD_LIBRARY_PATH && \ cmake -DCMAKE_CXX_COMPILER=clang++ -DENABLE_OPENMP=On -DCMAKE_BUILD_TYPE=Release .. && \ make -j 6 &&\ ctest -T test --output-on-failure @@ -88,16 +88,16 @@ RUN . /opt/spack/share/spack/setup-env.sh && spack load cuda && \ cmake -DCMAKE_BUILD_TYPE=Debug -DCMAKE_CXX_COMPILER=g++ -DENABLE_CUDA=On -DCMAKE_CUDA_STANDARD=14 -DCMAKE_CUDA_ARCHITECTURES=70 -DENABLE_OPENMP=On .. && \ make -j 4 -FROM ghcr.io/rse-ops/hip-ubuntu-20.04:hip-4.3.1 AS hip +FROM ghcr.io/rse-ops/hip-ubuntu-20.04:hip-5.1.3 AS hip ENV GTEST_COLOR=1 ENV HCC_AMDGPU_TARGET=gfx900 COPY . /home/raja/workspace WORKDIR /home/raja/workspace/build RUN . /opt/spack/share/spack/setup-env.sh && spack load hip llvm-amdgpu && \ - cmake -DCMAKE_CXX_COMPILER=amdclang++ -DRAJA_ENABLE_EXTERNAL_ROCPRIM=Off -DHIP_PATH=/opt -DENABLE_HIP=On -DENABLE_CUDA=Off -DRAJA_ENABLE_WARNINGS_AS_ERRORS=Off .. && \ + cmake -DCMAKE_CXX_COMPILER=clang++ -DHIP_PATH=/opt -DENABLE_HIP=On -DENABLE_CUDA=Off -DRAJA_ENABLE_WARNINGS_AS_ERRORS=Off .. && \ make -j 6 -FROM ghcr.io/rse-ops/intel-ubuntu-22.04:intel-2022.0.1 AS sycl +FROM ghcr.io/rse-ops/intel-ubuntu-22.04:intel-2022.1.0 AS sycl ENV GTEST_COLOR=1 COPY . 
/home/raja/workspace WORKDIR /home/raja/workspace/build diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md index 30f94b6e50..062c604d6a 100644 --- a/RELEASE_NOTES.md +++ b/RELEASE_NOTES.md @@ -20,6 +20,90 @@ Notable changes include: * Bug fixes/improvements: +Version 2022.10.0 -- Release date 2022-10-28 +============================================ + +This release contains new features, bug fixes, and build improvements. Please +see the RAJA user guide for more information about items in this release. + +Notable changes include: + + * New features / API changes: + * Introduced a new RAJA::forall and reduction interface that extend + the execution behavior of reduction operations with RAJA::forall. + The main difference with the pre-existing reduction interface in RAJA + is that reduction variables and operations are passed into the + RAJA::forall method and lambda expression instead of using the lambda + capture mechanism for reduction objects. This offers flexibility and + potential performance advantages when using RAJA reductions as the + new interface enables the ability to integrate with programming model + back-end reduction machinery directly, for OpenMP and SYCL for example. + The interface also enables user-chosen kernel names to be passed to + RAJA::forall for performance analysis annotations that are easier to + understand. Example codes are included as well as a description of + the new interface and comparison with the pre-existing interface in + the RAJA User Guide. + * Added support for run time execution policy selection for RAJA::forall + kernels. Users can specify any number of execution policies in their + code and then select which to use at run time. There is no discussion + of this in the RAJA User Guide yet. However, there are a couple of + example codes in files RAJA/examples/*dynamic-forall*.cpp. + * The RAJA::launch framework has been moved out of the experimental namespace, into the RAJA:: namespace, which introduces an API change. + * Add support for all RAJA segment types in the RAJA::launch framework. + * Add SYCL back-end support for RAJA::launch and dynamic shared memory + for all back-ends in RAJA::launch. These changes introduce API changes. + * Add additional policies to WorkGroup construct that allow for different + methods of dispatching work. + * Add special case implementations to CUDA atomicInc and atomicDec + functions to use special hardware support when available. This can + result in a significant performance boost. + * Rework HIP atomic implementations to support more native data types. + * Added RAJA_UNROLL_COUNT macro which enables users to unroll loops for + a fix unroll count. + * Major User Guide rework: + * New RAJA tutorial sections, including new exercise source files + to work through. Material used in recent RADIUSS/AWS RAJA Tutorial. + * Cleaned up and expanded RAJA feature sections to be more like a + reference guide with links to associated tutorial sections for + implementation examples. + * Improved presentation of build configuration sections. + + * Build changes / improvements: + * Submodule updates: + * BLT updated to v0.5.2 release. + * Camp updated to v2022.10.0 release. + * The minimum CMake version required has changed. For a HIP build, + CMake 3.23 or newer is required. For all other builds CMake 3.20 + or newer is required. + * OpenMP back-end support is now off by default to match behavior of + all other RAJA parallel back-end support. To enable OpenMP, users + must now run CMake with the -DENABLE_OPENMP=On option. 
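(Illustrative sketch, not part of the release notes above: the snippet below shows roughly how the new RAJA::forall reduction interface reads. The `RAJA::expt::Reduce` and `RAJA::expt::KernelName` spellings and the lambda signature are assumptions based on the interface description in the RAJA User Guide, not code taken from this diff; consult the examples shipped with the release for authoritative usage.)

```cpp
// Sketch of the params-style reduction interface: reduction targets are
// passed to RAJA::forall as arguments and received by the lambda as
// references, instead of being captured as RAJA::ReduceSum/ReduceMin objects.
#include "RAJA/RAJA.hpp"
#include <cstdio>
#include <vector>

int main()
{
  const int N = 1000;
  std::vector<double> a(N, 1.0);
  const double* a_ptr = a.data();

  double sum  = 0.0;     // reduction results land back in these variables
  double vmin = 1.0e9;

  RAJA::forall<RAJA::seq_exec>(
      RAJA::RangeSegment(0, N),
      RAJA::expt::Reduce<RAJA::operators::plus>(&sum),
      RAJA::expt::Reduce<RAJA::operators::minimum>(&vmin),
      RAJA::expt::KernelName("vector-sum-min"),  // name used for profiling annotations
      [=](int i, double& partial_sum, double& partial_min) {
        partial_sum += a_ptr[i];
        partial_min = (a_ptr[i] < partial_min) ? a_ptr[i] : partial_min;
      });

  std::printf("sum = %f, min = %f\n", sum, vmin);
  return 0;
}
```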
+ * Support OpenMP back-end enablement in a HIP build configuration. + * RAJA_ENABLE_VECTORIZATION CMake option added to enable/disable + new SIMD/SIMT vectorization support. The default is 'On'. The option + allows users to disable it if they wish. + * Improvements to build target export mechanics coordinated with camp, + BLT, and Spack projects. + * Improve HIP builds to better support the evolving ROCm software stack. + * Add CMake variable RAJA_ALLOW_INCONSISTENT_OPTIONS and CMake messages + to allow users more control when using CMake dependent options. When + CMake is run, the code now checks for cases when RAJA_ENABLE_X=On + but ENABLE_X=Off. Previously, this was confusing because X would not + be enabled despite the value of the RAJA-specific option. + * Build system refactoring to make CMake configurations more robust; added + test to check for installed CMake config. + * Added basic support to compile with the C++20 standard. + * Add missing compilation macro guards for HIP and CUDA policies in + vectorization support when not running on a GPU device. + * Various compiler warnings squashed. + + * Bug fixes / improvements: + * Expanded test coverage to catch more cases that users have run into. + * Various fixes in SIMD/SIMT support for different compilers and versions + users have hit recently. Also, changes to internal implementations to + improve run time performance for those features. + + Version 2022.03.1 -- Release date 2022-08-10 ============================================ diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 6b40fa89ac..c84a71eb18 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -1,11 +1,5 @@ -variables: - DO_BUILD: 'yes' - DO_TEST: 'yes' - DO_INSTALL: 'yes' - COMPILER: 'g++' - jobs: -- job: Windows #temporarily commenting out until cmake/azure version issue resolved +- job: Windows strategy: matrix: shared: @@ -28,15 +22,14 @@ jobs: inputs: workingDir: 'build' cmakeArgs: '--build . 
--config Release --verbose -j 4' -# - task: CmdLine@2 -# inputs: -# script: 'ctest.exe -T test -C Release' -# workingDirectory: 'build' -# condition: eq( variables['Agent.OS'], 'Windows_NT') -# - task: PublishTestResults@2 -# inputs: -# testResultsFormat: 'cTest' -# testResultsFiles: '**/Test.xml' + - task: CmdLine@2 + inputs: + script: 'ctest.exe -T test -C Release' + workingDirectory: 'build' + - task: PublishTestResults@2 + inputs: + testResultsFormat: 'cTest' + testResultsFiles: '**/Test.xml' - job: Docker timeoutInMinutes: 360 strategy: @@ -79,17 +72,6 @@ jobs: command: build dockerFile: 'Dockerfile' arguments: '--target $(docker_target)' - - script: | - CID=$(docker create llnl/raja:$(Build.BuildId)) - echo ${CID} - docker cp ${CID}:/home/axom/workspace/build local-build - docker rm ${CID} - displayName: 'Copy test artifacts' - condition: ne( variables['docker_target'], 'nvcc') - - script: | - bash <(curl -s https://raw.githubusercontent.com/codecov/codecov-bash/0b376529f626b50b7d4a9fb734e0e50d28b9b91e/codecov) >& /dev/null - displayName: 'Upload code coverage' - condition: eq( variables['docker_target'], 'gcc') - task: PublishTestResults@2 inputs: testResultsFormat: 'cTest' @@ -112,12 +94,11 @@ jobs: make -j 4 displayName: 'OSX Build' condition: eq( variables['Agent.OS'], 'Darwin') -# - script: | -# cd build -# ctest -T test --output-on-failure -# displayName: 'OSX Test' -# condition: eq( variables['Agent.OS'], 'Darwin') -# - task: PublishTestResults@2 -# inputs: -# testResultsFormat: 'cTest' -# testResultsFiles: '**/Test.xml' + - script: | + cd build + ctest -T test --output-on-failure + displayName: 'OSX Test' + - task: PublishTestResults@2 + inputs: + testResultsFormat: 'cTest' + testResultsFiles: '**/Test.xml' diff --git a/benchmark/ltimes.cpp b/benchmark/ltimes.cpp index 2f059265d9..6720cda059 100644 --- a/benchmark/ltimes.cpp +++ b/benchmark/ltimes.cpp @@ -473,7 +473,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) timer.start(); for (int iter = 0;iter < num_iter;++ iter){ - RAJA::launch(RAJA::HOST, RAJA::Grid(), [=](RAJA::LaunchContext ctx){ + RAJA::launch(RAJA::ExecPlace::HOST, RAJA::LaunchParams(), [=](RAJA::LaunchContext ctx){ RAJA::loop(ctx, RAJA::TypedRangeSegment(0, num_g), [&](IG g){ RAJA::loop(ctx, RAJA::TypedRangeSegment(0, num_z), [&](IZ z){ @@ -1239,8 +1239,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) for (int iter = 0;iter < num_iter;++ iter){ RAJA::launch( - RAJA::DEVICE, - RAJA::Grid(RAJA::Teams(160, 1, 1), + RAJA::ExecPlace::DEVICE, + RAJA::LaunchParams(RAJA::Teams(160, 1, 1), RAJA::Threads(8, 64, 1)), [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { @@ -1380,8 +1380,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) printf("num_iter=%d\n", (int)num_iter); for (int iter = 0;iter < num_iter;++ iter){ RAJA::launch( - RAJA::DEVICE, - RAJA::Grid(RAJA::Teams(num_g, 1, 1), + RAJA::ExecPlace::DEVICE, + RAJA::LaunchParams(RAJA::Teams(num_g, 1, 1), RAJA::Threads(32, 32, 1)), [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { diff --git a/blt b/blt index 296bf64e64..97ea54d892 160000 --- a/blt +++ b/blt @@ -1 +1 @@ -Subproject commit 296bf64e64edfcfcce6a53e3b396d6529e76b986 +Subproject commit 97ea54d892b4b1d56736830575c3db62e3d7674d diff --git a/cmake/SetupDependentOptions.cmake b/cmake/SetupDependentOptions.cmake index 163608071a..f5a3966bf9 100644 --- a/cmake/SetupDependentOptions.cmake +++ b/cmake/SetupDependentOptions.cmake @@ -9,6 +9,18 @@ ## Here are the CMake dependent options in RAJA. 
## +set(RAJA_DEPENDENT_OPTIONS ENABLE_OPENMP ENABLE_CUDA ENABLE_HIP ENABLE_CLANG_CUDA ENABLE_COVERAGE ENABLE_TESTS ENABLE_EXAMPLES ENABLE_BENCHMARKS) +foreach (option ${RAJA_DEPENDENT_OPTIONS}) + if (${RAJA_${option}}) + if (NOT ${option}) + if (RAJA_ALLOW_INCONSISTENT_OPTIONS) + message(WARNING "RAJA_${option} set to On, but ${option} is Off. Please set ${option} to On to enable this feature.") + else () + message(FATAL_ERROR "RAJA_${option} set to On, but ${option} is Off. Please set ${option} to On enable this feature.") + endif () + endif () + endif () +endforeach () cmake_dependent_option(RAJA_ENABLE_OPENMP "Build with OpenMP support" On "ENABLE_OPENMP" Off) cmake_dependent_option(RAJA_ENABLE_CUDA "Build with CUDA support" On "ENABLE_CUDA" Off) diff --git a/cmake/SetupPackages.cmake b/cmake/SetupPackages.cmake index 8c76eb3b74..45dadbee4b 100644 --- a/cmake/SetupPackages.cmake +++ b/cmake/SetupPackages.cmake @@ -82,7 +82,7 @@ if (RAJA_ENABLE_HIP) endif() if (RAJA_ENABLE_EXTERNAL_ROCPRIM) - find_package(RocPRIM) + include(cmake/thirdparty/FindRocPRIM.cmake) if (ROCPRIM_FOUND) blt_import_library( NAME rocPRIM @@ -105,21 +105,35 @@ if (RAJA_ENABLE_HIP AND RAJA_ENABLE_ROCTX) endif () set(TPL_DEPS) -blt_list_append(TO TPL_DEPS ELEMENTS cuda cuda_runtime IF RAJA_ENABLE_CUDA) blt_list_append(TO TPL_DEPS ELEMENTS nvtoolsext IF RAJA_ENABLE_NV_TOOLS_EXT) blt_list_append(TO TPL_DEPS ELEMENTS cub IF RAJA_ENABLE_EXTERNAL_CUB) -blt_list_append(TO TPL_DEPS ELEMENTS blt_hip blt_hip_runtime IF RAJA_ENABLE_HIP) blt_list_append(TO TPL_DEPS ELEMENTS rocPRIM IF RAJA_ENABLE_EXTERNAL_ROCPRIM) -blt_list_append(TO TPL_DEPS ELEMENTS openmp IF RAJA_ENABLE_OPENMP) -blt_list_append(TO TPL_DEPS ELEMENTS mpi IF RAJA_ENABLE_MPI) + +set(RAJA_NEEDS_BLT_TPLS False) +if (RAJA_ENABLE_CUDA OR RAJA_ENABLE_HIP OR RAJA_ENABLE_OPENMP OR RAJA_ENABLE_MPI) + set(RAJA_NEEDS_BLT_TPLS True) +endif () + +if (RAJA_NEEDS_BLT_TPLS) + if (NOT BLT_EXPORTED) + set(BLT_EXPORTED On CACHE BOOL "" FORCE) + blt_import_library(NAME blt_stub EXPORTABLE On) + set_target_properties(blt_stub PROPERTIES EXPORT_NAME blt::blt_stub) + install(TARGETS blt_stub + EXPORT bltTargets) + blt_export_tpl_targets(EXPORT bltTargets NAMESPACE blt) + install(EXPORT bltTargets + DESTINATION lib/cmake/raja) + endif() +endif () foreach(dep ${TPL_DEPS}) # If the target is EXPORTABLE, add it to the export set get_target_property(_is_imported ${dep} IMPORTED) if(NOT ${_is_imported}) install(TARGETS ${dep} - EXPORT RAJA - DESTINATION lib) + EXPORT RAJATargets + DESTINATION lib/cmake/raja) # Namespace target to avoid conflicts set_target_properties(${dep} PROPERTIES EXPORT_NAME RAJA::${dep}) endif() diff --git a/cmake/SetupRajaOptions.cmake b/cmake/SetupRajaOptions.cmake index 50e2f18c5d..934e720c41 100644 --- a/cmake/SetupRajaOptions.cmake +++ b/cmake/SetupRajaOptions.cmake @@ -5,9 +5,6 @@ # SPDX-License-Identifier: (BSD-3-Clause) ################################################################################ -# Enable OpenMP by by default -set(RAJA_ENABLE_OPENMP On CACHE BOOL "Build OpenMP support") - set(RAJA_ENABLE_WARNINGS_AS_ERRORS Off CACHE BOOL "") set(ENABLE_GTEST_DEATH_TESTS On CACHE BOOL "Enable tests asserting failure.") @@ -18,6 +15,8 @@ option(RAJA_ENABLE_TBB "Build TBB support" Off) option(RAJA_ENABLE_TARGET_OPENMP "Build OpenMP on target device support" Off) option(RAJA_ENABLE_SYCL "Build SYCL support" Off) +option(RAJA_ENABLE_VECTORIZATION "Build experimental vectorization support" On) + option(RAJA_ENABLE_REPRODUCERS "Build issue reproducers" Off) 
option(RAJA_ENABLE_EXERCISES "Build exercises " On) @@ -30,7 +29,7 @@ option(RAJA_ENABLE_BOUNDS_CHECK "Enable bounds checking in RAJA::Views/Layouts" option(RAJA_TEST_EXHAUSTIVE "Build RAJA exhaustive tests" Off) option(RAJA_TEST_OPENMP_TARGET_SUBSET "Build subset of RAJA OpenMP target tests when it is enabled" On) option(RAJA_ENABLE_RUNTIME_PLUGINS "Enable support for loading plugins at runtime" Off) -option(RAJA_ENABLE_HIP_INDIRECT_FUNCTION_CALL "Enable use of device function pointers in hip backend" OFF) +option(RAJA_ALLOW_INCONSISTENT_OPTIONS "Enable inconsistent values for ENABLE_X and RAJA_ENABLE_X options" Off) option(RAJA_ENABLE_DESUL_ATOMICS "Enable support of desul atomics" Off) set(DESUL_ENABLE_TESTS Off CACHE BOOL "") diff --git a/docs/sphinx/dev_guide/branch_development.rst b/docs/sphinx/dev_guide/branch_development.rst index e4092a26c3..16723645fe 100644 --- a/docs/sphinx/dev_guide/branch_development.rst +++ b/docs/sphinx/dev_guide/branch_development.rst @@ -29,14 +29,14 @@ Persistent, Protected Branches --------------------------------- The **main** and **develop** branches are the two primary branches we use. -They always exist and are protected in the RAJA GitHub project in that -changes to them only occur as a result of approved pull requests. The +They always exist and are protected in the RAJA GitHub project, meaning that +changes to them can only occur as a result of approved pull requests. The distinction between the main and develop branches is an important part of Gitflow. * The *main* branch records the release history of the project. Each time the main branch is changed, a new tag for a new code version is made. - See :ref:`semver-label` for a description of the version numbering scheme + See :ref:`version-label` for a description of the version labeling scheme we use. * The *develop* branch is used to integrate and test new features and most @@ -45,15 +45,15 @@ Gitflow. .. important:: **Development never occurs directly on the main branch or develop branch.** -All other branches in the RAJA repo are temporary and are used to perform -specific development tasks. When such a branch is no longer needed (e.g., -after it is merged), the branch is deleted typically. +All other branches are temporary and are used to perform specific development +tasks. When such a branch is no longer needed (e.g., after it is merged), the +branch is deleted typically. ---------------- Feature Branches ---------------- -*Feature* branches are created off of other branches (usually develop) and are +A *feature* branch is created from another branch (usually develop) and is used to develop new features, bug fixes, etc. before they are merged to develop and eventually main. *Feature branches are temporary*, living only as long as they are needed to complete development tasks they contain. @@ -78,8 +78,9 @@ When all issues and comments arising in PR review discussion have been addressed, the PR has been approved, and all continuous integration checks have passed, the pull request can be merged. -.. important:: **Feature branches never interact directly with the main - branch.** +.. important:: **Feature branches almost never interact directly with the main + branch.** One exception is when a bug fix is needed in + the main branch to tag a patch release. --------------------------- Other Important Branches @@ -95,7 +96,7 @@ Gitflow Illustrated The figure below shows the basics of how branches interact in Gitflow. -.. figure:: git-workflow-gitflow2.png +.. 
figure:: ./figures/git-workflow-gitflow2.png This figure shows typical interactions between key branches in the Gitflow workflow. Here, development is shown following the v0.1.0 release. While diff --git a/docs/sphinx/dev_guide/build_configurations.rst b/docs/sphinx/dev_guide/build_configurations.rst index a554b977de..fc43632188 100644 --- a/docs/sphinx/dev_guide/build_configurations.rst +++ b/docs/sphinx/dev_guide/build_configurations.rst @@ -6,40 +6,43 @@ .. ## SPDX-License-Identifier: (BSD-3-Clause) .. ## -.. host_config: +.. _build_config-label: ************************** RAJA Build Configurations ************************** -RAJA must be built and tested with a wide range of compilers and with -all of its supported back-ends. The project currently maintains two -ways to build and test important configurations in a reproducible manner: +To meet user needs, RAJA is built and tested with a wide range of compilers for +all of its supported back-ends. Automated continuous integration (CI) testing +employed by the project is described in :ref:`ci-label`. During day-to-day +development, the project currently maintains two ways to build and test +configurations in a reproducible manner: * **Build scripts.** The RAJA source repository contains a collection of simple build scripts that are used to generate build configurations - for platforms in the Livermore Computing Center primarily. + for a variety of platforms, such as Livermore Computing (LC) systems, + MacOS, and Linux environments. * **Generated host-config files.** The RAJA repository includes a mechanism to generate *host-config* files (i.e., CMake cache files) using `Spack `_. -Each of these specifies compiler versions and options, a build target -(Release, Debug, etc.), RAJA features to enable (OpenMP, CUDA, etc.), +The configurations specify compiler versions and options, build targets +(Release, Debug, etc.), RAJA features to enable (OpenMP, CUDA, HIP, etc.), and paths to required tool chains, such as CUDA, ROCm, etc. They are described briefly in the following sections. +.. _build_scripts-label: + =================== RAJA Build Scripts =================== -The build scripts in the RAJA ``scripts`` directory are used mostly by RAJA -developers to quickly create a build environment to compile and run tests -during code development. - -Each script is executed from the top-level RAJA directory. THe scripts for -CPU-only platforms require an argument that indicate the compiler version. +Build scripts mentioned above live in the +`RAJA/scripts `_ directory. +Each script is executed from the top-level RAJA directory. The scripts for +CPU-only platforms require an argument that indicates the compiler version. For example, .. code-block:: bash @@ -47,18 +50,18 @@ For example, $ ./scripts/lc-builds/toss3_clang.sh 10.0.1 Scripts for GPU-enabled platforms require three arguments: the device -compiler version, followed by the compute architecture, followed by the host +compiler version, the target compute architecture, and the host compiler version. For example, .. code-block:: bash $ ./scripts/lc-builds/blueos_nvcc_gcc.sh 10.2.89 sm_70 8.3.1 -When a script is run, it creates a uniquely-named build directory in the -top-level RAJA directory and runs CMake with arguments contained in the script -to create a build environment in the new directory. One then goes into that -directory and runs make to build RAJA, its tests, example codes, etc. 
-For example, +When a script is run, it creates a build directory named for the configuration +in the top-level RAJA directory and runs CMake with arguments contained in the +script to create a build environment in the new directory. One then goes into +that directory and runs 'make' to build RAJA, and depending on options +passed to CMake RAJA tests, example codes, etc. For example, .. code-block:: bash @@ -67,29 +70,26 @@ For example, $ make -j $ make test -Eventually, these scripts may go away and be superceded by the Spack-based -host-config file generation process when that achieves the level of -compiler coverage that the scripts have. +.. _spack_host_config-label: - -============================ -Generated Host-Config Files -============================ +================================== +Spack-Generated Host-Config Files +================================== The RAJA repository contains two submodules `uberenv `_ and `radiuss-spack-configs `_ that work together to generate host-config files. These are projects in the -GitHub LLNL organization and contain utilities shared by various projects. -The main uberenv script can be used to drive Spack to generate a *host-config* -file that contains all the information required to define a RAJA build -environment. The host-config file can then be passed to CMake using the '-C' -option to create a build configuration. *Spack specs* defining compiler -configurations are maintained in files in the radiuss-spack-configs -repository. +GitHub LLNL organization and contain utilities shared and maintained by +various projects. The main uberenv script is used to drive Spack to generate +a *host-config* file (i.e., a CMake *cache* file) that contains all the +information required to define a RAJA build environment. The generated file +can then be passed to CMake using the '-C' option to create a build +configuration. *Spack specs* defining compiler configurations are maintained +in files in the radiuss-spack-configs repository. -RAJA shares its uberenv workflow with other projects. The documentation -for this is available in `RADIUSS Uberenv Guide `_. +Additional documentation for this process is available in the +`RADIUSS Uberenv Guide `_. Generating a RAJA host-config file @@ -97,34 +97,38 @@ Generating a RAJA host-config file This section describes the host-config file generation process for RAJA. -Machine specific configurations +Platform configurations ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Compiler configurations for Livermore computer platforms are contained in -in sub-directories in the RAJA ``scripts/uberenv/spack_configs`` directory: +Compiler configurations for Livermore computer platforms are contained +in sub-directories of the ``RAJA/scripts/radiuss-spack-configs`` submodule +directory: .. code-block:: bash - $ ls -c1 ./scripts/uberenv/spack_configs + $ ls -c1 ./scripts/radiuss-spack-configs + toss_4_x86_64_ib_cray + toss_4_x86_64_ib + toss_3_x86_64_ib blueos_3_ppc64le_ib darwin - toss_3_x86_64_ib - blueos_3_ppc64le_ib_p9 config.yaml + blueos_3_ppc64le_ib_p9 + ... -To see currently supported configurations, please see the contents of the -``compilers.yaml`` file in each of these sub-directories. +To see available configurations, please see the contents of the +``compilers.yaml`` and ``packages.yaml`` files in each sub-directory. 
Generating a host-config file ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -The main uberenv python script can be invoked from the top-level RAJA directory +The ``uberenv.py`` python script can be run from the top-level RAJA directory to generate a host-config file for a desired configuration. For example, .. code-block:: bash - $ python ./scripts/uberenv/uberenv.py --spec="%gcc@8.1.0" - $ python ./scripts/uberenv/uberenv.py --spec="%gcc@8.1.0~shared+openmp tests=benchmarks" + $ python3 ./scripts/uberenv/uberenv.py --spec="%gcc@8.1.0" + $ python3 ./scripts/uberenv/uberenv.py --spec="%gcc@8.1.0~shared+openmp tests=benchmarks" Each command generates a corresponding host-config file in the top-level RAJA directory. The file name contains the platform and OS to which it applies, and @@ -134,17 +138,16 @@ the compiler and version. For example, hc-quartz-toss_3_x86_64_ib-gcc@8.1.0-fjcjwd6ec3uen5rh6msdqujydsj74ubf.cmake -Specs that are exercised during the Gitlab CI process are found YAML files in -the ``RAJA/.gitlab`` directory. See :ref:`vettedspecs-label` for more -information. +This process is also used by our Gitlab CI testing effort. +See :ref:`ci-label` for more information. Building RAJA with a generated host-config file ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ To build RAJA with one of these host-config files, create a build directory and -run CMake in it by passing the host-config file to CMake using the '-C' option. -Then, run make and RAJA tests, if desired, to make sure the build was done -properly: +run CMake in it by passing a host-config file to CMake using the '-C' option. +Then, run 'make' to build RAJA. To ensure the build was successful, you may +want to run the RAJA tests. For example, .. code-block:: bash @@ -153,8 +156,16 @@ properly: $ cmake --build -j . $ ctest --output-on-failure -T test -It is also possible to use the configuration with a RAJA CI script outside -of the normal CI process: +You may also run the RAJA tests with the command + +.. code-block:: bash + + $ make test + +as an alternative to the 'ctest' command used above. + +It is also possible to use the configuration with the RAJA Gitlab CI script +outside of the Gitlab environment: .. code-block:: bash @@ -164,20 +175,25 @@ MacOS ^^^^^ In RAJA, the Spack configuration for MacOS contains the default compiler -corresponding to the OS version (`compilers.yaml`), and a commented section to -illustrate how to add `CMake` as an external package. You may install CMake +corresponding to the OS version in the ``compilers.yaml`` file in the +``RAJA/scripts/radiuss-spack-configs/darwin/`` directory, and a commented +section to illustrate how to add `CMake` as an external package in the +``packages.yaml`` in the same directory. You may also install CMake with `Homebrew `_, for example, and follow the process outlined above after it is installed. -============================ -Reproducing Docker Builds -============================ +.. _docker_local-label: -RAJA uses docker container images that it shares with other LLNL GitHub projects -for CI testing on GitHub. Currently, we use Azure for Linux, Windows, and MacOS -builds and also have Appveyor builds for Windows. +================================== +Reproducing Docker Builds Locally +================================== -You can reproduce these builds locally for testing with the following steps: +RAJA uses Docker container images that it shares with other LLNL GitHub projects +for Azure CI testing (see :ref:`azure_ci-label` for more information). 
+We use Azure Pipelines for Linux, Windows, and MacOS builds. + +You can reproduce these builds locally for testing with the following steps if +you have Docker installed. + +#. Run the command to build a local Docker image: + @@ -185,8 +201,8 @@ You can reproduce these builds locally for testing with the following steps: $ DOCKER_BUILDKIT=1 docker build --target ${TARGET} --no-cache - Here, ${TARGET} is replaced with one of the names following "AS" in the - `RAJA Dockerfile `_ + Here, ``${TARGET}`` is replaced with one of the names following ``AS`` in + the `RAJA Dockerfile `_. #. To get dropped into a terminal in the Docker image, run the following: @@ -195,10 +211,15 @@ You can reproduce these builds locally for testing with the following steps: $ docker run -it axom/compilers:${COMPILER} /bin/bash - Here, ${COMPILER} is replaced with the compiler you want (see the + Here, ``${COMPILER}`` is replaced with the compiler you want (see the aforementioned Dockerfile). Then, you can build, run tests, edit files, etc. in the Docker image. Note that -the docker command has a '-v' argument that you can use to mount your local -directory in the image; e.g., -v `pwd`:/opt/RAJA would mount the pwd as -/opt/RAJA in the image. +the docker command has a ``-v`` argument that you can use to mount a local +directory in the image. For example + + .. code-block:: bash + + $ docker run -v $(pwd):/opt/RAJA -it axom/compilers:${COMPILER} /bin/bash + +will mount your current local directory as ``/opt/RAJA`` in the image. diff --git a/docs/sphinx/dev_guide/ci.rst b/docs/sphinx/dev_guide/ci.rst index 2ac2876ccb..38634a1301 100644 --- a/docs/sphinx/dev_guide/ci.rst +++ b/docs/sphinx/dev_guide/ci.rst @@ -12,78 +12,348 @@ Continuous Integration (CI) Testing ************************************ -The RAJA project employs multiple tools to run its tests for each GitHub -*pull request*, all of which must pass before the pull request can be merged. -These tools include: +.. important:: * All CI test checks must pass before a pull request can be + merged. + * The status (pass/fail and run) for all checks can be viewed by + clicking the appropriate link in the **checks** section of a + GitHub pull request. - * **Azure.** This runs builds for Linux, Windows, and MacOS environments - using a variety of compilers. While we do GPU builds for CUDA, HIP, and - SYCL on Azure, RAJA tests are run for each non-GPU build. +The CI tools used by the RAJA project, which integrate with GitHub, are: - * **Appveyor.** This runs builds and tests for a Windows environment for two - versions of the Visual Studio compiler. + * **Azure Pipelines** runs builds and tests for Linux, Windows, and MacOS + environments using recent versions of various compilers. While we do GPU + builds for CUDA, HIP, and SYCL on Azure, RAJA tests are only run for + CPU-only pipelines. See the + `RAJA Azure DevOps `_ project to learn + more about our testing there. - * **Gitlab CI.** This runs builds and tests on platforms in the Livermore - Computing *Collaboration Zone*. This is a recent addition for RAJA and - is a work-in-progress to get full coverage of compilers and tests we - need to exercise. + * **Gitlab** instances in the Livermore Computing (LC) Center + run builds and tests in LC resource and compiler environments + important to many RAJA user applications. Execution of RAJA CI + pipelines on LC Gitlab resources has restrictions described below. If + you have access to LC resources, you can access additional information about + `LC GitLab CI `_ -These tools integrate seamlessly with GitHub. 
They will automatically -(re)run RAJA builds and tests as changes are pushed to each PR branch. Gitlab -CI execution on Livermore Computing resources has some restrictions which are -described below. +The tools automatically run RAJA builds and tests when a PR is created and +when changes are pushed to a PR branch. -Gitlab CI support is still being developed to make it more easy to use with -GitHub projects. The current state is described below. +The following sections describe basic elements of the operation of the CI tools. -.. note:: The status of checks (pass/fail, running status) for each of these - tools can be viewed by clicking the appropriate link in the check - section of a pull request. +.. _gitlab_ci-label: +========= Gitlab CI ========= -If all memmbers of a GitHub project are members of the LLNL GitHub organization -and have two-factor authentication enabled on their GitHub accounts, -auto-mirroring on the Livermore Computing Collaboration Zone Gitlab server is -enabled. Thus, Gitlab CI will run automatically for those projects on pull -requests that are made by project members. Otherwise, due to Livermore -Computing security policies, Gitlab CI must be launched manually by a *blessed* -GitHub user satisfying the constraints described above. To manually initiate -Gitlab CI on a pull request, add a comment with 'LGTM' in it. +The Gitlab CI instance used by the RAJA project lives in the Livermore +Computing (LC) Collaboration Zone (CZ). It runs builds and tests in LC +resource and compiler environments important to RAJA user applications at LLNL. + +Constraints +----------- + +Running Gitlab CI on Livermore Computing (LC) resources is constrained by LC +security policies. The policies require that all members of a GitHub project +be members of the LLNL GitHub organization and have two-factor authentication +enabled on their GitHub accounts to automatically mirror a GitHub repo and +trigger Gitlab CI functionality from GitHub. For compliant LLNL GitHub projects, +auto-mirroring of the GitHub repo on LC Gitlab is done when changes are pushed +to PRs for branches in the RAJA repo, but not for PRs for a branch on a fork of +the repo. An alternative procedure we use to handle this is described in +:ref:`contributing-label`. If you have access to LC resources, you can learn +more about `LC Gitlab mirroring `_. + +Gitlab CI (LC) Testing Workflow +-------------------------------------- + +The figure below shows the high-level steps in the RAJA Gitlab CI testing +process. The main steps, which we will discuss in more detail later, are: + + #. A *mirror* of the RAJA GitHub repo in the RAJA Gitlab project is updated + whenever the RAJA ``develop`` or ``main`` branches are changed as well + as when any PR branch in the RAJA GitHub project is changed. + #. Gitlab launches CI test pipelines. While running, the execution and + pass/fail status may be viewed and monitored in the Gitlab CI GUI. + #. For each resource and compiler combination, + `Spack `_ is used to generate a build + configuration in the form of a CMake cache file, or *host-config* file. + #. A host-config file is passed to CMake, which configures a RAJA build + space. Then, RAJA and its tests are compiled. + #. Next, the RAJA tests are run. + #. When a test pipeline completes, final results are reported in Gitlab. + +In the next section, we will describe the roles that specific files in the +RAJA repo play in defining these steps. + +.. 
figure:: ./figures/RAJA-Gitlab-Workflow2.png + + The main steps in the RAJA Gitlab CI testing workflow are shown in the + figure. This process is triggered when a developer makes a PR on the + GitHub project or whenever changes are pushed to the source branch of a PR. + +Gitlab CI (LC) Testing Files +-------------------------------------- + +The following figure shows directories and files in the RAJA repo that +support LC Gitlab CI testing. Files with names in blue are specific to RAJA +and are maintained by the RAJA team. Directories and files with names in red are +in Git submodules, shared and maintained with other projects. + +.. figure:: ./figures/RAJA-Gitlab-Files.png + + The figure shows directories and files in the RAJA repo that support Gitlab + CI testing. Files in blue are specific to RAJA and owned by the RAJA team. + Red directories and files are part of Git submodules shared with other + projects. + +In the following sections, we discuss how these files are used in the +steps in the RAJA Gitlab CI testing process summarized above. + +Launching CI pipelines (step 2) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +In **step 2** of the diagram above, Gitlab launches RAJA test pipelines. +The `RAJA/.gitlab-ci.yml `_ file contains high-level testing information, +such as stages (resource allocation, build-and-test, and resource +deallocation) and locations of files that define which jobs will run +in each pipeline. For example, these items appear in the file as:: + + stages: + - r_allocate_resources + - r_build_and_test + - r_release_resources + - l_build_and_test + - c_build_and_test + - multi_project + +and:: + + include: + - local: .gitlab/ruby-templates.yml + - local: .gitlab/ruby-jobs.yml + - local: .gitlab/lassen-templates.yml + - local: .gitlab/lassen-jobs.yml + - local: .gitlab/corona-templates.yml + - local: .gitlab/corona-jobs.yml + +In the ``stages`` section above, prefixes 'r_', 'l_', and 'c_' refer to +resources in the LC on which tests are run. Specifically, the machines 'ruby', +'lassen', and 'corona', respectively. Jobs that will run in pipeline(s) on each +resource are defined in the files listed in the ``include`` section above. +Note that the stage labels above appear on each Gitlab CI run web page as the +title of a column containing other information about what is run in that stage, +such as build and test jobs. + +The `RAJA/.gitlab `_ +directory contains a *templates* and *jobs* file for each LC resource on which +test pipelines will be run. The ``-templates.yml`` files contain +information that is common across jobs that run on the corresponding resource, +such as commands and scripts that are run for stages identified in the +``RAJA/.gitlab-ci.yml`` file. For example, the +``RAJA/.gitlab/ruby-templates.yml`` file contains a section:: + + allocate_resources (on ruby): + variables: + GIT_STRATEGY: none + extends: .on_ruby + stage: r_allocate_resources + script: + - salloc -N 1 -p pdebug -t 45 --no-shell --job-name=${ALLOC_NAME} + +which contains the resource allocation command associated with the +``r_allocate_resources`` stage identifier on 'ruby'. Analogous stages are +defined similarly in other ``RAJA/.gitlab/-templates.yml`` files. + +The ``-jobs.yml`` files are described in the following sections. 
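For a quick inventory of what a templates file sets up on a particular machine, it is usually enough to search it for the keys discussed above. This is a sketch; it assumes the ``ruby-templates.yml`` file shown above and simply lists the stage each job belongs to, the template it extends, and the ``script`` keyword that introduces its commands.

.. code-block:: bash

   $ grep -E "stage:|extends:|script:" .gitlab/ruby-templates.yml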
+ +Running a CI build/test pipeline (steps 3, 4, 5, 6) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The `RAJA/scripts/gitlab/build_and_test.sh `_ file defines the steps executed +for each build and test run as well as information that will appear in the +log output for each step. First, the script invokes the +``RAJA/scripts/uberenv/uberenv.py`` Python script located in the +`uberenv `_ submodule:: + + ... + + python3 scripts/uberenv/uberenv.py --spec="${spec}" ${prefix_opt} + + ... + +Project specific settings related to which Spack version to use, where +Spack packages live, etc. are located in the +`RAJA/.uberenv_config.json `_ file. + +The uberenv python script invokes Spack to generate a CMake *host-config* +file containing a RAJA build specification **(step 3)**. To generate +a *host-config* file, Spack uses the +`RAJA Spack package `_, plus *Spack spec* information. +The ``RAJA/.gitlab/-jobs.yml`` file defines a build specification +(*Spack spec*) for each job that will be run on the corresponding resource. +For example, in the ``lassen-jobs.yml`` file, you will see an entry such as:: + + gcc_8_3_1_cuda_10_1_168: + variables: + SPEC: "+cuda %gcc@8.3.1 cuda_arch=70 ^cuda@10.1.168" + extends: .build_and_test_on_lassen + +This defines the *Spack spec* for the test job in which CUDA device code will +be built with the nvcc 10.1.168 compiler and non-device code will be compiled +with the GNU 8.3.1 compiler. In the Gitlab CI GUI, this pipeline will be +labeled ``gcc_8_3_1_cuda_10_1_168``. Details for compilers, such as file +system paths, target architecture, etc. are located in the +``RAJA/scripts/radiuss-spack-configs//compilers.yaml`` file for the +system type associated with the resource. Analogous information for packages +like CUDA and ROCm (HIP) are located in the corresponding +``RAJA/scripts/radiuss-spack-configs//packages.yaml`` file. + +.. note:: Please see :ref:`spack_host_config-label` for more information about + Spack-generated host-config files and how to use them for local + debugging. + +After the host-config file is generated, the +``scripts/gitlab/build_and_test.sh`` script creates a build space directory +and runs CMake in it, passing the host-config (cache) file. Then, it builds +the RAJA code and tests **(step 4)**:: + + ... + + build_dir="${build_root}/build_${hostconfig//.cmake/}" + + ... + + date + echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + echo "~ Host-config: ${hostconfig_path}" + echo "~ Build Dir: ${build_dir}" + echo "~ Project Dir: ${project_dir}" + echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + echo "" + + echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + echo "~~~~~ Building RAJA" + echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + + rm -rf ${build_dir} 2>/dev/null + mkdir -p ${build_dir} && cd ${build_dir} + + ... + + cmake \ + -C ${hostconfig_path} \ + ${project_dir} + + cmake --build . -j ${core_counts[$truehostname]} + +Next, it runs the tests **(step 5)**:: + + echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + echo "~~~~~ Testing RAJA" + echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + + ... + + cd ${build_dir} + + ... 
+ + ctest --output-on-failure -T test 2>&1 | tee tests_output.txt + +Lastly, the script packages the test results into a JUnit XML file that +Gitlab uses for reporting the results in its GUI **(step 6)**:: + + echo "Copying Testing xml reports for export" + tree Testing + xsltproc -o junit.xml ${project_dir}/blt/tests/ctest-to-junit.xsl Testing/*/Test.xml + mv junit.xml ${project_dir}/junit.xml + +The commands shown here intermingle with other commands that emit messages, +timing information for various operations, etc. which appear in a log +file that can be viewed in the Gitlab GUI. + +.. _azure_ci-label: + +================== +Azure Pipelines CI +================== + +The Azure Pipelines tool builds and tests for Linux, Windows, and MacOS +environments. While we do builds for CUDA, HIP, and SYCL RAJA back-ends +in the Azure Linux environment, RAJA tests are only run for CPU-only pipelines. + +Azure Pipelines Testing Workflow +-------------------------------- + +The Azure Pipelines testing workflow for RAJA is much simpler than the Gitlab +testing process described above. + +The test jobs we run for each OS environment are specified in the +`RAJA/azure-pipelines.yml `_ file. This file defines the job steps, commands, +compilers, etc. for each OS environment in the associated ``- job:`` section. +A summary of the configurations we build are: + + * **Windows.** The ``- job: Windows`` Windows section contains information + for the Windows test builds. For example, we build and test RAJA as + a static and shared library. This is indicated in the Windows ``strategy`` + section:: + + strategy: + matrix: + shared: + ... + static: + ... + + We use the Windows/compiler image provided by the Azure application + indicated the ``pool`` section; for example:: -It is important to note that RAJA shares its Gitlab CI workflow with -other projects. See `Shared Gitlab CI Workflow `_ for more information. + pool: + vmImage: 'windows-2019' + **MacOS.** The ``- job: Mac`` section contains information for Mac test + builds. For example, we build RAJA using the the MacOS/compiler + image provided by the Azure application indicated in the ``pool`` section; + for example:: -.. _vettedspecs-label: + pool: + vmImage: 'macOS-latest' -Vetted Specs ------------- + **Linux.** The ``- job: Docker`` section contains information for Linux + test builds. We build and test RAJA using Docker container images generated + with recent versions of various compilers. The RAJA project shares these + images with other open-source LLNL RADIUSS projects and they are maintained + in the `RES-ops Docker `_ + project on GitHub. The builds we do at any point in time are located in + the ``strategy`` block:: -The *vetted* compiler specs are those which we use during the RAJA Gitlab CI -testing process. These can be viewed by looking at files in the RAJA -``.gitlab`` directory. For example, + strategy: + matrix: + gccX: + docker_target: ... + ... + clangY: + docker_target: ... + ... + nvccZ: + docker_target: ... -.. code-block:: bash + ... - $ ls -c1 .gitlab/*jobs.yml - .gitlab/lassen-jobs.yml - .gitlab/ruby-jobs.yml + The Linux OS the docker images are run on is indicated in the ``pool`` section; + for example:: -lists the yaml files containing the Gitlab CI jobs for the lassen and ruby -machines. + pool: + vmImage: 'ubuntu-latest' -Then, executing a command such as: +Docker Builds +------------- -.. 
code-block:: bash +For each Linux/Docker pipeline, the base container images, CMake, build, and +test commands are located in `RAJA/Dockerfile `_. - $ git grep -h "SPEC" .gitlab/ruby-jobs.yml | grep "gcc" - SPEC: "%gcc@4.9.3" - SPEC: "%gcc@6.1.0" - SPEC: "%gcc@7.3.0" - SPEC: "%gcc@8.1.0" +The base container images are built and maintained through the `RSE-Ops `_ RADIUSS project. A table of the most up to date containers can be found `here `_. These images are rebuilt regularly ensuring that we have the most up to date builds of each container / compiler. -will list the specs vetted on the ruby platform. +.. note:: Please see :ref:`docker_local-label` for more information about + reproducing Docker builds locally for debugging purposes. -More details to come... diff --git a/docs/sphinx/dev_guide/ci_tasks.rst b/docs/sphinx/dev_guide/ci_tasks.rst new file mode 100644 index 0000000000..ebf813cc97 --- /dev/null +++ b/docs/sphinx/dev_guide/ci_tasks.rst @@ -0,0 +1,233 @@ +.. ## +.. ## Copyright (c) 2016-22, Lawrence Livermore National Security, LLC +.. ## and RAJA project contributors. See the RAJA/LICENSE file +.. ## for details. +.. ## +.. ## SPDX-License-Identifier: (BSD-3-Clause) +.. ## + +.. _ci_tasks-label: + +****************************************************** +Continuous Integration (CI) Testing Maintenance Tasks +****************************************************** + +In :ref:`ci-label`, we described RAJA CI workflows. This section +describes common CI testing maintenance tasks for RAJA and how to +perform them. + +.. _gitlab_ci_tasks-label: + +================= +Gitlab CI Tasks +================= + +The tasks in this section apply to GitLab CI running on Livermore +Computing (LC) resources. + +Changing Build Specs +--------------------- + +The builds for each LC platform on which we run Gitlab CI pipelines are +defined in ``-jobs.yml`` files in the `RAJA/.gitlab `_ directory. The key items +that change when a new build is added are: + + * the unique **label** that identifies the build on a web page for + a Gitlab CI pipeline, and + * the build **Spack spec**, which identifies the compiler and version, + compiler flags, etc. + +For example, an entry for a build using a clang compiler with CUDA is: + +.. code-block:: bash + + ibm_clang_10_0_1_cuda_10_1_168: + variables: + SPEC: "+cuda cuda_arch=70 %clang@ibm.10.0.1 ^cuda@10.1.168" + extends: .build_and_test_on_lassen + +To update, change the corresponding spec item, such as clang compiler +or version, or cuda version. Then, update the label accordingly. + +It is important to note that the build spec information must reside in +the ``compilers.yaml`` and/or ``packages.yaml`` file for the system type +in the `radiuss-spack-configs `_ submodule. If the desired information is not there, +try updating the submodule to a newer version. If the information +is still not available, create a branch in the +`RADIUSS Spack Configs `_ repo, add the needed spec info, and create a pull request. + +.. important:: Build spec information used in RAJA Gitlab CI pipelines + must exist in the ``compilers.yaml`` file and/or + ``packages.yaml`` file for the appropriate system type in + the `RADIUSS Spack Configs `_ repo. + +Changing Build/Run Parameters +------------------------------ + +The commands executed to acquire resources on each +system/system-type on which we run Gitlab CI are defined in the +`RAJA/.gitlab-ci.yml `_ file. The default execution time for each test pipeline is +also defined in the file using the variable ``DEFAULT_TIME``. 
These +commands and settings can remain as is for the most part. + +However, sometimes a particular pipeline will take longer to build and +run than the default allotted time. In this case, the default time can +be adjusted in the build spec information in the associated +``-jobs.yml`` file discussed in the previous section. +For example: + +.. code-block:: bash + + xl_16_1_1_7_cuda: + variables: + SPEC: "+cuda %xl@16.1.1.7 cuda_arch=70 ^cuda@10.1.168 ^cmake@3.14.5" + DEFAULT_TIME: 60 + allow_failure: true + extends: .build_and_test_on_lassen + +This example explicitly sets the build and test allocation time to 60 minutes: +``DEFAULT_TIME: 60``. Note that it also allows the pipeline to fail: +``allow_failure: true``. We do this in some cases where certain tests are known +to fail regularly. This allows the overall check status to report as passing, +even though the test pipeline annotated this way may fail. + + +Adding Test Pipelines +--------------------- + +Adding a test pipeline involves adding a new entry in the +``RAJA/.gitlab-ci.yml`` file. + +.. important:: Build spec information used in RAJA Gitlab CI pipelines + must exist in the ``compilers.yaml`` file and/or + ``packages.yaml`` file for the appropriate system type in + the `RADIUSS Spack Configs `_ repo. + + +.. _azure_ci_tasks-label: + +================= +Azure CI Tasks +================= + +The tasks in this section apply to RAJA Azure Pipelines CI. + +Changing Builds/Container Images +--------------------------------------- + +The builds we run in Azure are defined in the `RAJA/azure-pipelines.yml `_ file. + +Linux/Docker +............ + +To update or add a new compiler / job to Azure CI we need to edit both ``azure-pipelines.yml`` and ``Dockerfile``. + +If we want to add a new Azure pipeline to build with ``compilerX``, then in ``azure-pipelines.yml`` we can add the job like so:: + + -job: Docker + ... + strategy: + matrix: + ... + compilerX: + docker_target: compilerX + +Here, ``compilerX:`` defines the name of a job in Azure. ``docker_target: compilerX`` defines a variable ``docker_target``, which is used to determine what part of the ``Dockerfile`` to run. + +In the ``Dockerfile`` we will want to add our section that defines the commands for the ``compilerX`` job.:: + + FROM ghcr.io/rse-ops/compilerX-ubuntu-20.04:compilerX-XXX AS compilerX + ENV GTEST_COLOR=1 + COPY . /home/raja/workspace + WORKDIR /home/raja/workspace/build + RUN cmake -DCMAKE_CXX_COMPILER=compilerX ... && \ + make -j 6 &&\ + ctest -T test --output-on-failure + +Each of our docker builds is built up on a base image maintained by RSE-Ops, a table of available base containers can be found `here `_. We are also able to add target names to each build with ``AS ...``. This target name correlates to the ``docker_target: ...`` defined in ``azure-pipelines.yml``. + +The base containers are shared across multiple projects and are regularly rebuilt. If bugs are fixed in the base containers the changes will be automatically propagated to all projects using them in their Docker builds. + +Check `here `_ for a list of all currently available RSE-Ops containers. Please see the `RSE-Ops Containers Project `_ on Github to get new containers built that aren't yet available. + +Windows / MacOs +............... + +We run our Windows / MacOS builds directly on the Azure virtual machine instances. In order to update the Windows / MacOS instance we can change the ``pool`` under ``-job: Windows`` or ``-job: Mac``:: + + -job: Windows + ... 
+ pool: + vmImage: 'windows-2019' + ... + -job: Mac + ... + pool: + vmImage: 'macOS-latest' + +Changing Build/Run Parameters +----------------------------- + +Linux/Docker +............ + +We can edit the build and run configurations of each docker build, in the ``RUN`` command. Such as adding CMake options or changing the parallel build value of ``make -j N`` for adjusting throughput. + +Each base image is built using `spack `_. For the most part the container environments are set up to run our CMake and build commands out of the box. However, there are a few exceptions where we need to ``spack load`` specific modules into the path. + + * **Clang** requires us to load LLVM for OpenMP runtime libraries.:: + + . /opt/spack/share/spack/setup-env.sh && spack load llvm + + **CUDA** for the cuda runtime.:: + + . /opt/spack/share/spack/setup-env.sh && spack load cuda + + **HIP** for the hip runtime and llvm-amdgpu runtime libraries.:: + + . /opt/spack/share/spack/setup-env.sh && spack load hip llvm-amdgpu + + **SYCL** requires us to run setupvars.sh:: + + source /opt/view/setvars.sh + +Windows / MacOS +............... + +Windows and MacOS build / run parameters can be configured directly in ``azure-pipelines.yml``. CMake options can be configured with ``CMAKE_EXTRA_FLAGS`` for each job. The ``-j`` value can also be edited directly in the Azure ``script`` definitions for each job. + +The commands executed to configure, build, and test RAJA for each +pipeline in Azure are located in the `RAJA/Dockerfile `_ file. +Each pipeline section begins with a line that ends with ``AS ...`` +where the ellipses in the name of a build-test pipeline. The name label +matches an entry in the Docker test matrix in the +``RAJA/azure-pipelines.yml`` file mentioned above. + + +.. _rajaperf_ci_tasks-label: + +================================ +RAJA Performance Suite CI Tasks +================================ + +The `RAJA Performance Suite `_ project CI +testing processes, directory/file structure, and dependencies are nearly +identical to that for RAJA, which is described in :ref:`ci-label`. Specifically, + + * The RAJA Performance Suite Gitlab CI process is driven by the + `RAJAPerf/.gitlab-ci.yml `_ file. + * The ``-jobs.yml`` and ``-templates.yml`` files reside + in the + `RAJAPerf/.gitlab `_ + directory. + * The ``build_and_test.sh`` script resides in the `RAJAPerf/scripts/gitlab `_ directory. + * The `RAJAPerf/Dockerfile `_ drives the Azure testing pipelines. + +The main difference is that for Gitlab CI, is that the Performance Suite uses +the RAJA submodules for ``uberenv`` and ``radiuss-spack-configs`` located in +the RAJA submodule to avoid redundant submodules. This is reflected in the +`RAJAPerf/.uberenv_config.json `_ +file which point at the relevant RAJA submodule locations. + +Apart from this minor difference, all CI maintenance and development tasks for +the RAJA Performance Suite follow the guidance in :ref:`ci_tasks-label`. diff --git a/docs/sphinx/dev_guide/contributing.rst b/docs/sphinx/dev_guide/contributing.rst index 8ac0dddd35..281da30f69 100644 --- a/docs/sphinx/dev_guide/contributing.rst +++ b/docs/sphinx/dev_guide/contributing.rst @@ -12,165 +12,209 @@ Contributing to RAJA ********************* -Since RAJA is a collaborative open source software project, we embrace -contributions from anyone who wants to add features or improve its existing -capabilities. 
This section describes basic processes to follow -for individuals outside of the core RAJA team to contribute new features or -bugfixes to RAJA. It assumes you are familiar with +RAJA is a collaborative open source software project and we encourage +contributions from anyone who wants to add features or improve its +capabilities. This section describes the following: + + * GitHub project access + * How to develop a RAJA *pull request* (PR) contribution. + * Requirements that must be met for a PR to be merged. + +We assume contributors are familiar with `Git `_, which we use for source code version control, and `GitHub `_, which is where our project is hosted. -This section describes development processes, such as: - - * Making a fork of the RAJA repository - * Creating a branch for development - * Creating a pull request (PR) - * Tests that your PR must pass before it can be merged into RAJA +.. important:: * Before a PR can be merged into RAJA, all test checks must pass + and the PR must be approved by at least one member of the + core RAJA team. + * Each RAJA contribution (feature, bugfix, etc.) must include + adequate tests, documentation, and code examples. The + *adequacy* of PR content, in this respect, is determined by + PR reviewers applying their professional judgment considering + the perspective of RAJA users and developers. + +======================= +GitHub Project Access +======================= + +RAJA maintains three levels of project access on it GitHub project: + + * **Core team members.** Individuals on the core RAJA team are frequent + RAJA contributors and participate regularly in project meetings, + discussions, and other project activities. They are members of + the LLNL GitHub organization and the ``RAJA-core`` GitHub team. Their + project privileges include the ability to create branches in the repository, + push code changes to the RAJA repo, make PRs, and merge them when they are + approved and all checks have passed. + * **Regular contributors.** Individuals, who are not on the core RAJA team, + but are members of the LLNL GitHub organization and are involved in some + aspects of RAJA development are considered regular contributors. They are + members of the ``RAJA-contrib`` GitHub team. Their project privileges + include the ability to create branches in the repository, push code changes + to the RAJA repo, and make PRs. However, they may not merge PRs and must + coordinate with the core team to have their work included in the develop + branch. This is mainly due to the way GitHub structures its project + access levels. + * **Everyone else.** Anyone with a GitHub account is welcome to contribute + to RAJA. Individuals outside of the two groups described above can make PRs + in the RAJA project, but must do so from a branch on a *fork* of + the RAJA repo. This is described below. + +======================= +Pull Request Process +======================= + +The following figure shows the basic elements of the RAJA PR contribution +workflow. Some details vary depending on RAJA GitHub project access level +of the contributor. The process involves four main steps: + + #. A RAJA contributor makes a PR on the RAJA GitHub project to merge a + branch on which she has developed a contribution into another RAJA branch, + typically, the develop branch. + #. When a PR is created, GitHub triggers Azure CI testing checks and + possibly Gitlab CI checks if the branch is part of the RAJA GItHub repo. 
+ Running and pass/fail status is reported back to GitHub where it can be + viewed and monitored. + #. Meanwhile, RAJA team members and other contributors review the PR, + suggesting changes and/or approving when they think it is ready to merge. + #. When all checks pass and the PR is approved, the PR may be merged. + +.. figure:: ./figures/PR-Workflow.png + + The four main steps in the RAJA pull request (PR) process, which are + common practices for many software projects. + +This PR process should be familiar to nearly everyone who contributes to +s software project. If you would like more information about pull requests, +GitHub has a good +`PR guide `_ +on PR basics. + +.. important:: When you create a RAJA PR, you should enter a description of + its contents in the *PR template* form the team maintains for + this purpose. A good PR summary includes a descriptive title + of the the bug you fixed or the feature you have added. Other + relevant details that will assist others in reviewing your + contribution should also be included. ============ Forking RAJA ============ -If you are not a member of the LLNL organization on GitHub and of -the core RAJA team of developers, then you do not have permission to create -a branch in the RAJA repository. This is due to the policy adopted by the LLNL -organization on GitHub in which the RAJA project resides. Fortunately, you may -still contribute to RAJA by `forking the RAJA repo +As noted earlier, if you are not a member of the core RAJA development team, +or a recognized RAJA contributor, then you do not have permission to create a +branch in the RAJA GitHub repository. This choice is due to policies enforced +by the LLNL organization on GitHub (in which the RAJA project resides) and the +Livermore Computing (LC) organization (in which we run our Gitlab CI testing). +Fortunately, you may still contribute to RAJA by `forking the RAJA repo `_. Forking creates a copy of the RAJA -repository that you own. You can push code changes on that copy to GitHub and -create a pull request in the RAJA project. +repository that you own. You can make changes on your local copy and push them +your fork on GitHub. When you are ready to have your RAJA contribution reviewed +ad added to the RAJA project, you may create a pull request in the RAJA project. -.. note:: A contributor who is not a member of the LLNL GitHub organization - and the core team of RAJA developers cannot create a - branch in the RAJA repo. However, anyone can create a fork of the - RAJA project and create a pull request in the RAJA project. +.. note:: A contributor who is not a member of the core RAJA development team, + or a recognized RAJA contributor, cannot create a branch in the RAJA + GitHub repo. However, anyone can create a fork of the + RAJA project and create a pull request based on the fork in the + RAJA project. -========================= -Developing RAJA Code -========================= +=============================== +Developing A RAJA Contribution +=============================== New features, bugfixes, and other changes are developed on a **feature branch.** -Each such branch should be based on the RAJA ``develop`` branch. For more -information on the branch development model used in RAJA, please see +Each such branch should be based on the most current RAJA ``develop`` branch. +For more information on the branch development model used in RAJA, please see :ref:`branching-label`. 
When you want to make a contribution, first ensure -you have an up-to-date copy of the ``develop`` branch locally: +you have a local, up-to-date copy of the ``develop`` branch by running the +following commands: .. code-block:: bash $ git checkout develop $ git pull origin develop + $ git submodule update --init --recursive ----------------------- -Developing a Feature ----------------------- - -Assuming you are on the develop branch in your local copy of the RAJA repo, -and the branch is up-to-date, the first step toward developing a RAJA feature -is to create a new branch on which to perform your development. For example: - -.. code-block:: bash - - $ git checkout -b feature/ - -Proceed to modify your branch by committing changes with reasonably-sized -work portions (i.e., *atomic commits*), and add tests that will exercise your -new code. If you are creating new functionality, please add documentation to -the appropriate section of the `RAJA User Guide `_. The source files for the RAJA documentation are maintained in -the ``RAJA/docs`` directory. - -After your new code is complete, you've tested it, and developed appropriate -documentation, you can push your branch to GitHub and create a PR in the RAJA -project. It will be reviewed by members of the RAJA team, who will provide -comments, suggestions, etc. After it is approved and all CI checks pass, your -contribution will be merged into the RAJA repository. - -.. important:: When creating a branch that you intend to be merged into the - RAJA repo, please give it a succinct name that clearly describes - the contribution. For example, **feature/** - for a new feature, **bugfix/** for a bugfix, etc. +Then, in your local copy, you will be on the current version of develop branch +with all RAJA submodules synchronized with that. --------------------- -Developing a Bug Fix --------------------- +----------------------------------- +Feature and Bugfix Contributions +----------------------------------- -Contributing a bugfix follows the same process as described above. Be sure to -indicate in the name of your branch that it is for a bugfix; for example: +Assuming you are on an up-to-date develop branch in your local copy of RAJA, +the first step toward developing a RAJA contribution is to create a new branch +on which to do your development and push it to the remote origin of your local +copy. For example: .. code-block:: bash - $ git checkout -b bugfix/ - -We recommend that you add a test that reproduces the issue you have found -and demonstrates that the issue is resolved. To verify that you have done -this properly, build the code for your branch and then run ``make test`` to -ensure that your new test passes. - -When you are done, push your branch to GitHub, then create a PR in the RAJA -project. - ------------------------ -Creating a Pull Request ------------------------ - -You can create a pull request (PR) -`here `_. GitHub has a good -`PR guide `_ on -PR basics if you want more information. Ensure that the base branch for your -PR is the ``develop`` branch of RAJA. + $ git checkout -b /feature/ + $ git push -When you create a RAJA PR, you must enter a description of the contents of the -PR. We have a *PR template* for this purpose for you to fill in. Be sure to add -a descriptive title explaining the bug you fixed or the feature you have added -and any other relevant details that will assist the RAJA team in reviewing your -contribution. +where ``/feature/`` is the name of your feature +branch. 
Or, -When a PR is created in RAJA, it will be run through our automated testing -processes and be reviewed by RAJA team members. When the PR passes all -tests and it is approved, a member of the RAJA team will merge it. - -.. note:: Before a PR can be merged into RAJA, all CI checks must pass and - the PR must be approved by a member of the core team. +.. code-block:: bash ------ -Tests ------ + $ git checkout -b /bugfix/ + $ git push -RAJA uses multiple continuous integration (CI) tools to test every pull -request. See :ref:`ci-label` for more information. +where ``/bugfix/`` is the name of your bugfix branch. -All RAJA tests are in the ``RAJA/test`` directory and are split into -*unit tests* and *functional tests*. Unit tests are intended to test basic -interfaces and features of individual classes, methods, etc. Functional tests -are used to test combinations of RAJA features. We have organized our -tests to make it easy to see what is being tested and easy to add new tests. -For example, tests for each programming model back-end are exercised using -the same common, parameterized test code to ensure back-end support is -consistent. +Proceed to modify your branch by committing changes with reasonably-sized +work portions (i.e., *atomic commits*), and add tests that will exercise your +new code, and examples and documentation, as needed. If you are creating new +functionality, please add documentation to the appropriate section of the +`RAJA Documentation `_. The source +files for the RAJA documentation are maintained in the ``RAJA/docs`` directory +of the source repository. Consider adding example code(s) that illustrate +usage of the new features you develop to help users and other developers +understand your addition. These should be placed in the ``RAJA/examples`` +directory and referenced in the RAJA User Guide as appropriate. + +After your work is complete, you've tested it, and developed appropriate +documentation, you can push your local branch to GitHub and create a PR in the +RAJA project to merge your work into the RAJA develop branch. It will be +reviewed by members of the RAJA team, who will provide comments, suggestions, +etc. + +As we stated earlier, not all required :ref:`ci-label` checks can be run on a +PR made from a branch in a fork of RAJA. When the RAJA team has agreed to +accept your work, it will be pulled into the RAJA GitHub repo +(see :ref:`prfromfork-label`). Then, it will run through all required testing +and receive final reviews and approvals. When it is approved and all CI test +checks pass, your contribution will be merged into the RAJA repository, most +likely the develop branch. -.. important:: Please follow the sub-directory structure and code implementation - pattern for existing tests in the ``RAJA/test`` directory when - adding or modifying tests. +.. important:: When creating a branch that you intend to be merged into the + RAJA repo, please give it a succinct name that clearly describes + the contribution. For example, + **username/feature/** for a new feature, + **username/bugfix/** for a bugfix, etc. .. 
_prfromfork-label: ------------------------------------------------------------ -Testing Pull Requests from Branches in Forked Repositories ------------------------------------------------------------ +=========================================================== +Accepting A Pull Request From A Forked Repository +=========================================================== -Due to LLNL security policies and RAJA project policies, only a PR created -by someone on the RAJA core development team will be run automatically -through all RAJA CI tools. In particular, a PR made from branch on a forked -repository will not trigger Gitlab CI checks. Gitlab CI on internal LLNL -platforms will only be run on PRs that are made from branches in the GitHub -RAJA repository. This may change in the future to make it easier to work with -PRs from contributors that are not members of the LLNL organization on GitHub. +Due to LLNL security policies, some RAJA pull requests will not be able to +be run through all RAJA CI tools. The Livermore Computing (LC) Center +Gitlab systems restrict which GitHub PRs may automatically run through its +CI test pipelines. For example, a PR made from branch on a forked repository +will not trigger Gitlab CI checks. Gitlab CI on LC platforms will be run only +on PRs that are made from branches in the GitHub RAJA repository. +See :ref:`ci-label` for more information about RAJA PR testing. -.. note:: **RAJA core team members:** +.. note:: **The following process for accepting PR contributions from a fork + of the RAJA repo must be executed by a member of the RAJA team:** To facilitate testing contributions in PRs from forked repositories, we maintain a script to pull a PR branch from a forked repo into the - RAJA repo. First, identify the number of the PR. Then, run the - script from the top-level RAJA directory:: + RAJA repo. First, identify the number of the PR, which appears at + the top of your PR. Then, run a script from the top-level RAJA + directory:: $ ./scripts/make_local_branch_from_fork_pr -b diff --git a/docs/sphinx/dev_guide/figures/PR-Workflow.png b/docs/sphinx/dev_guide/figures/PR-Workflow.png new file mode 100644 index 0000000000..13d5853e9f Binary files /dev/null and b/docs/sphinx/dev_guide/figures/PR-Workflow.png differ diff --git a/docs/sphinx/dev_guide/figures/RAJA-Gitlab-Files.png b/docs/sphinx/dev_guide/figures/RAJA-Gitlab-Files.png new file mode 100644 index 0000000000..1ee658bdaa Binary files /dev/null and b/docs/sphinx/dev_guide/figures/RAJA-Gitlab-Files.png differ diff --git a/docs/sphinx/dev_guide/figures/RAJA-Gitlab-Workflow2.png b/docs/sphinx/dev_guide/figures/RAJA-Gitlab-Workflow2.png new file mode 100644 index 0000000000..2985739683 Binary files /dev/null and b/docs/sphinx/dev_guide/figures/RAJA-Gitlab-Workflow2.png differ diff --git a/docs/sphinx/dev_guide/git-workflow-gitflow2.png b/docs/sphinx/dev_guide/figures/git-workflow-gitflow2.png similarity index 100% rename from docs/sphinx/dev_guide/git-workflow-gitflow2.png rename to docs/sphinx/dev_guide/figures/git-workflow-gitflow2.png diff --git a/docs/sphinx/dev_guide/index.rst b/docs/sphinx/dev_guide/index.rst index 664fe31e47..bac04460e6 100644 --- a/docs/sphinx/dev_guide/index.rst +++ b/docs/sphinx/dev_guide/index.rst @@ -12,17 +12,19 @@ RAJA Developer Guide #################### -The RAJA Developer Guide is a work-in-progress.... - -This guide documents key software development processes used by the RAJA -project so that they are understood and uniformly applied by contributors. 
+The RAJA Developer Guide documents software development processes +followed by the RAJA project. The main goal of the guide is to ensure +all project contributors understand the key elements of the processes so +that they are consistently applied. .. toctree:: :maxdepth: 1 contributing - ci - build_configurations branch_development + build_configurations + ci + ci_tasks + tests release_process - semantic_versioning + versioning diff --git a/docs/sphinx/dev_guide/release_process.rst b/docs/sphinx/dev_guide/release_process.rst index 2886cd06c7..21196e2f76 100644 --- a/docs/sphinx/dev_guide/release_process.rst +++ b/docs/sphinx/dev_guide/release_process.rst @@ -12,7 +12,14 @@ RAJA Release Process ******************************************* -The RAJA release process typically involves the following sequence of steps: +RAJA is considered part of the **RAJA Portability Suite** set of projects. +Currently, the Suite includes `Umpire `_, `CHAI `_, and `camp `_, in addition to RAJA. + +.. important:: Releases for the Suite are coordinated, meaning that when a + non-patch release is done for one, a new version release is + done for all Suite projects. + +The RAJA release process includes the following sequence of steps: #. Identify all work (features in development, outstanding PRs, etc.) to be to be included in the release. @@ -24,19 +31,19 @@ The RAJA release process typically involves the following sequence of steps: into the **main branch.** When it is approved and all CI checks pass, merge the release candidate branch into the RAJA main branch. #. On GitHub, make a new release with a tag for the release. Following our - convention, the tag label should have the format ``vMM.mm.pp``. See - :ref:`semver-label` for a description of the version numbering scheme we + convention, the tag label should have the format ``YYYY.mm.pp``. See + :ref:`version-label` for a description of the version numbering scheme we use. In the GitHub release description, please note key features, bugfixes, etc. in the release. These should be a high-level summary of the contents of the ``RELEASE_NOTES.md`` file in the RAJA repo, which may contain more detailed information. Also, add a note to the release description to remind users to download the gzipped tarfile for the release instead of the assets GitHub creates for the release. - The GitHub-created assets do not contain the RAJA submodules and will + The GitHub-created assets do not contain the RAJA submodules and may cause issues for users as a result. - .. important:: For consistency, please follow a similar description - pattern for all RAJA releases. + .. important:: For consistency, please follow a similar release + description pattern for all RAJA releases. #. Check out the main branch locally and make sure it is up-to-date. Then, generate the release tarfile by running the script @@ -47,8 +54,12 @@ The RAJA release process typically involves the following sequence of steps: #. Edit the release in GitHub and upload the tarfile to the release. #. Make a PR to merge the main branch into the develop branch. After it passes all CI checks and is approved, merge the PR. This will ensure that - all changes done to finalize the release will not be lost in future - changes to the develop branch. + all changes done to finalize the release will be included in the develop + branch and future work on that branch. + +After a RAJA release is done, there a other tasks that typically need to be +performed to update content in other projects. 
These task are described in +:ref:`post_release-label`. .. _rcbranch-label: @@ -100,9 +111,8 @@ Hotfix Branch =========================== *Hotfix* branches are used in the (hopefully!) rare event that a bug is found -shortly after a release and which has the potential to negatively impact RAJA -users. A hotfix branch is used to address the issue and make a new release -containing only the fixed code. +shortly after a release that may negatively impact RAJA users. A hotfix branch +will address the issue be merged into both develop and main branches. A hotfix branch is *made from main* with the name **hotfix/**. The issue is fixed (hopefully quickly!) and the release notes file is updated on @@ -114,17 +124,19 @@ similar to the process described in :ref:`release-label`. For completeness, the key steps for performing a hotfix release are: #. Make a **hotfix** branch from main for a release (hotfix/), fix the - issue on the branch and verify, testing against user code if necessary, - and update the release notes file as needed. + issue on the branch and verify, testing against user code if necessary. + Update the release notes and RAJA patch version number as described + in :ref:`rcbranch-label`. #. When the hotfix branch is ready, make a PR for it to be merged into the **main branch.** When that is approved and all CI checks pass, merge it into the RAJA main branch. #. On GitHub, make a new release with a tag for the release. Following our - convention, the tag label should have the format ``vMM.mm.ppp``. In the - GitHub release description, note that the release is a bugfix release - and describe the issue that is resolved. Also, add a note to the release - description to download the gzipped tarfile for the release rather than - one of the assets GitHub creates as part of the release. + convention, the tag label should have the format ``YYYY.mm.pp``, where + only the **patch** portion of the release tag should differ from the + last release. In the GitHub release description, note that the release + is a bugfix release and describe the issue that is resolved. Also, add + a note to the release description to download the gzipped tarfile for the + release rather than the assets GitHub creates as part of the release. #. Check out the main branch locally and make sure it is up-to-date. Then, generate the tarfile for the release by running the script ``./scripts/make_release_tarball.sh`` from the top-level RAJA directory. @@ -134,3 +146,32 @@ the key steps for performing a hotfix release are: #. Make a PR to merge the main branch into the develop branch. After it passes all CI checks and is approved, merge the PR. This will ensure that changes for the bugfix will be included in future development. + +.. _post_release-label: + +========================= +Post-release Activities +========================= + +After a RAJA release is complete, other tasks are performed to update content +in other repositories, typically. These tasks include: + + * Update the `RAJAProxies `_ project + to the newly RAJA Portability Suite projects. This typically consists of + updating the submodules to the new RAJA Portability Suite project + versions, making sure the proxy-apps build and run correctly. When this + is done, tag a release for proxy-app project. + * Update the RAJA Spack package in the + `Spack repository `_. This requires some + knowledge of Spack and attention to details and Spack conventions. Please + see :ref:`spack_package-label` for details. + +.. 
_spack_package-label: + +========================= +Spack Package Update +========================= + +Describe how to update the RAJA Spack package.... + + diff --git a/docs/sphinx/dev_guide/tests.rst b/docs/sphinx/dev_guide/tests.rst new file mode 100644 index 0000000000..5d401bbb43 --- /dev/null +++ b/docs/sphinx/dev_guide/tests.rst @@ -0,0 +1,299 @@ +.. ## +.. ## Copyright (c) 2016-22, Lawrence Livermore National Security, LLC +.. ## and RAJA project contributors. See the RAJA/LICENSE file +.. ## for details. +.. ## +.. ## SPDX-License-Identifier: (BSD-3-Clause) +.. ## + +.. _tests-label: + +*************************** +RAJA Tests +*************************** + +As noted in :ref:`ci-label`, all RAJA test checks must pass before any PR +contribution will be merged. Additionally, we recommend that contributors +include new tests in their code contributions when adding new features +and bug fixes. + +.. note:: If RAJA team members think adequate testing is not included in a + PR branch, they will ask for additional testing to be added during + the review process. + +.. _tests_organization-label: + +========================= +Test Organization +========================= + +The goals of the RAJA test organization are to: + + * Make it easy to see what is tested and where tests live. We want + developers and users to be able to find tests easily and know where + to put new tests when they add them. + * Parameterize tests as much as reasonable to ensure that features work with + all supported RAJA back-ends and we are testing them consistently. We want + the source files for each test case to allow testing of each RAJA back-end. + Specifically, tests for each back-end are generated by instantiating the + same source routines with different type information. + * Have test source code generated for compilation by CMake when the code is + configured. This significantly reduces code redundancy and enables our + test parameterization goals. + +.. important: RAJA uses the `GoogleTest `_ framework, which is included in the `BLT `_ build system that RAJA uses. + +All RAJA tests reside in the +`RAJA/test `_ directory. +The test directory structure looks like this:: + + RAJA/test/functional/forall + kernel + scan + ... + include/... + integration/... + unit/algorithm + atomic + index + ... + +RAJA tests are partitioned into three main categories: + + * **Unit tests** exercise basic interfaces and features of individual RAJA + classes and methods in standalone fashion; i.e., integrated with other + parts of RAJA as minimally as is reasonable. RAJA unit tests reside + in sub-directories of the `RAJA/test/unit `_ directory. + * **Functional tests** integrate multiple RAJA features in common ways to + test how RAJA is used in practice. RAJA functional tests reside + in sub-directories of the `RAJA/test/functional `_ directory. + * **Integration tests** exercise features that integrate RAJA with other + libraries, such as Kokkos performance tools as plug-ins. RAJA integration + tests reside in sub-directories of the `RAJA/test/integration `_ directory. + +The `RAJA/test/include `_ directory contains header files that define types and other items that are +commonly used in various tests. + +.. important:: Please follow the existing sub-directory structure and code + implementation patterns for RAJA tests when adding or modifying + tests. + +.. 
_tests_anatomy-label: + +========================= +Anatomy Of A Test Case +========================= + +This section discusses in some detail the structure of files for a single +RAJA test case and how the work together. In particular, we describe the set +of basic tests that exercise ``RAJA::forall`` execution with various RAJA +segment types. + +.. note:: The implementation pattern described in the following sections is + similarly used by all other RAJA tests. + +Since these tests integrate multiple RAJA features, it is considered a +*functional* test. The files for this test are located in the +`RAJA/test/functional/forall/segment `_ directory. The contents of the directory are:: + + $ ls -c1 -R ./test/functional/forall/segment + ./test/functional/forall/segment: + tests + test-forall-segment.cpp.in + CMakeLists.txt + + ./test/functional/forall/segment/tests: + test-forall-RangeStrideSegment.hpp + test-forall-RangeSegment.hpp + test-forall-ListSegment.hpp + +Next, we describe these and their relationships. + +.. _tests_source-label: + +Test Source File +----------------- + +The `test-forall-segment.cpp.in `_ file is the +parameterized test source file. It contains header file include statements:: + + // + // test/include headers + // + #include "RAJA_test-base.hpp" + #include "RAJA_test-camp.hpp" + #include "RAJA_test-index-types.hpp" + + #include "RAJA_test-forall-data.hpp" + #include "RAJA_test-forall-execpol.hpp" + + // + // Header for tests in ./tests directory + // + // Note: CMake adds ./tests as an include dir for these tests. + // + #include "test-forall-@SEGTYPE@.hpp" + +The first set of header files live in the ``RAJA/test/include`` directory +mentioned earlier. The headers are centrally located since their contents +are shared with other test files. The last include statement pulls in the +header file containing the parameterized tests for the corresponding RAJA +segment type. + +Next, a ``camp::cartesian_product`` type is defined to assemble sets of types +used in the parameterized tests:: + + // + // Cartesian product of types used in parameterized tests + // + using @BACKEND@ForallSegmentTypes = + Test< camp::cartesian_product>::Types; + +The first template argument defining the ``camp::cartesian_product object`` +type refers to a list of segment index types defined in the +`RAJA_test-index-types.hpp `_ header file. +The second argument refers to a list +of RAJA/camp resource types appropriate for the RAJA execution back-end defined +in the `RAJA_test-camp.hpp `_ header file (see :ref:`tests_header-label` for +where this is used). The third argument refers to a list of RAJA +execution policy types defined in the +`RAJA_test-forall-execpol.hpp `_ +header file. This results in the generation of a combinatorial collection of +typed tests being run. Each test is defined by a unique tuple of types, +described in :ref:`tests_header-label`. + +Lastly, the parameterized set of tests is instantiated:: + + // + // Instantiate parameterized test + // + INSTANTIATE_TYPED_TEST_SUITE_P(@BACKEND@, + Forall@SEGTYPE@Test, + @BACKEND@ForallSegmentTypes); + +``INSTANTIATE_TYPED_TEST_SUITE_P`` is a GoogleTest macro. The first +argument is a label noting the RAJA back-end used for the generated tests. +This can be used to filter the tests when they are manually run. +The second argument is a label identifying the test set, and the +third argument matches the CMake generated name for the +``camp::cartesian_product`` type described above. + +.. 
important:: The second argument passed to the + ``INSTANTIATE_TYPED_TEST_SUITE_P`` macro must match the name of + the test suite class discussed in :ref:`tests_header-label`. + +.. _tests_cmakelists-label: + +CMakeLists.txt File +-------------------- + +The concrete version of each of the items described above is generated by +CMake when a RAJA build is configured. CMake fills in the segment type and +back-end identifiers, ``@SEGTYPE@`` and ``@BACKEND@``, respectively. These +identifiers and the test file and executable generation process is defined in +the +`CMakeLists.txt `_ file in the test directory. If you look in the file, +you will see nested loops over RAJA back-ends and segment types which +process the test source file ``test-forall-segment.cpp.in`` multiple times +to create a uniquely named source file for each back-end/segment type +combination in the RAJA build space. Each source file will be compiled into +a similarly named, unique test executable when the code is compiled. + +.. _tests_header-label: + +Test Header files +-------------------- + +Recall the line in the test source file:: + + #include "test-forall-@SEGTYPE@.hpp" + +This identifies the header file containing the actual test code used to +generate the tests. The test header files are located in the +`RAJA/test/functional/forall/segment/tests `_ directory. The main elements of +each test header file are described next. We use the +`test-forall-RangeSegment.hpp `_ file to +illustrate the essential test implementation elements. + +The file contains the following important items: + + * test implementation method + * typed test suite class + * typed test invocation + * type test suite registration + +The test implementation is contained in a parameterized template method:: + + template + void ForallRangeSegmentTestImpl(INDEX_TYPE first, INDEX_TYPE last) + { + ... + } + +Here, the template parameters identify the index type of the RAJA +segment ``INDEX_TYPE``, the resource type for allocating test memory in the +proper execution environment ``WORKING_RES``, and the execution policy +``EXEC_POLICY`` for the ``RAJA::forall`` method used to run the tests. + +The test suite class plugs into the GoogleTest framework:: + + TYPED_TEST_SUITE_P(ForallRangeSegmentTest); + template + class ForallRangeSegmentTest : public ::testing::Test + { + }; + +using the ``TYPED_TEST_SUITE_P`` GoogleTest macro. + +.. important:: The name of the test class must be identical to the label passed + to the GoogleTest ``TYPED_TEST_SUITE_P`` macro. + +The specific tests that are run are defined by calls to the test implementation +template method ``ForallRangeSegmentTestImpl`` described above:: + + TYPED_TEST_P(ForallRangeSegmentTest, RangeSegmentForall) + { + using INDEX_TYPE = typename camp::at>::type; + using WORKING_RES = typename camp::at>::type; + using EXEC_POLICY = typename camp::at>::type; + + // test zero-length range segment + ForallRangeSegmentTestImpl(INDEX_TYPE(3), INDEX_TYPE(3)); + + ForallRangeSegmentTestImpl(INDEX_TYPE(0), INDEX_TYPE(27)); + ForallRangeSegmentTestImpl(INDEX_TYPE(1), INDEX_TYPE(2047)); + ForallRangeSegmentTestImpl(INDEX_TYPE(1), INDEX_TYPE(32000)); + + runNegativeTests(); + } + +Here, ``TYPED_TEST_P`` is a GoogleTest macro defining the method for +executing the tests. Note that the first three lines +in the method extract the template parameter types from the ``camp::tuple`` +produced by the ``camp::cartesian_product`` described earlier in +:ref:`tests_source-label`. 
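For orientation, the following is a minimal, self-contained GoogleTest sketch
of the same implement/register/instantiate pattern outside of the RAJA build
system. It is not RAJA code: the ``SimpleRangeTest`` names and the hand-written
type list are hypothetical stand-ins for the pieces that CMake and
``camp::cartesian_product`` generate in RAJA::

  #include "gtest/gtest.h"
  #include <cstddef>

  // Stand-in for a parameterized test implementation method.
  template <typename INDEX_TYPE>
  void SimpleRangeTestImpl(INDEX_TYPE first, INDEX_TYPE last)
  {
    INDEX_TYPE count = 0;
    for (INDEX_TYPE i = first; i < last; ++i) { ++count; }
    ASSERT_EQ(count, last - first);
  }

  // Typed test suite class; its name must match the labels used below.
  template <typename T>
  class SimpleRangeTest : public ::testing::Test {};

  TYPED_TEST_SUITE_P(SimpleRangeTest);

  // Test body; TypeParam is the current type from the instantiated type list.
  TYPED_TEST_P(SimpleRangeTest, RangeCount)
  {
    SimpleRangeTestImpl<TypeParam>(TypeParam(0), TypeParam(10));
  }

  REGISTER_TYPED_TEST_SUITE_P(SimpleRangeTest, RangeCount);

  // In RAJA, this type list is produced by camp::cartesian_product;
  // here it is written out by hand.
  using SimpleIndexTypes = ::testing::Types<int, long, std::size_t>;
  INSTANTIATE_TYPED_TEST_SUITE_P(Host, SimpleRangeTest, SimpleIndexTypes);

The correspondence between the test suite class name and the labels passed to
the macros is the same one called out in the notes above.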
If you look in the file, you will see an example of +how we use C++ SFINAE to exclude running tests with negative index values +for index types that are unsigned. + +.. important:: * The label passed as the first argument to the GoogleTest + ``TYPED_TEST_P`` macro must match the name of the test suite + class. The second argument is discussed below. + * It is critical to use the same type ordering when extracting + the types that was used when the ``camp::cartesian_product`` + type was defined in the test source file, described in + :ref:`tests_source-label`. + +Lastly, the test suite is registered with GoogleTest using the +``REGISTER_TYPED_TEST_SUITE_P`` macro:: + + REGISTER_TYPED_TEST_SUITE_P(ForallRangeSegmentTest, + RangeSegmentForall); + +.. important:: * The label passed as the first argument to the GoogleTest + ``REGISTER_TYPED_TEST_SUITE_P`` macro must match the name of + the test suite class. + * The label passed as the second argument to the GoogleTest + ``REGISTER_TYPED_TEST_SUITE_P`` macro must match the label + passed as the second argument to the ``TYPED_TEST_P`` macro. diff --git a/docs/sphinx/dev_guide/semantic_versioning.rst b/docs/sphinx/dev_guide/versioning.rst similarity index 68% rename from docs/sphinx/dev_guide/semantic_versioning.rst rename to docs/sphinx/dev_guide/versioning.rst index 3e77d2e340..62df10c2b9 100644 --- a/docs/sphinx/dev_guide/semantic_versioning.rst +++ b/docs/sphinx/dev_guide/versioning.rst @@ -6,21 +6,39 @@ .. ## SPDX-License-Identifier: (BSD-3-Clause) .. ## -.. _semver-label: +.. _version-label: -*********************** +**************************** +RAJA Release Version Naming +**************************** + +Prior to the RAJA release in March 2022, the RAJA project used the *Semantic +Versioning* scheme for assigning release tag names. At the March 2022 release, +we changed the release naming scheme to use ``YYYY.mm.pp``, for year, month, +and patch number. So, for example, the March 2022 release is labeled v2022.03.0.The main motivation for the release naming scheme is to do coordinated releases +with the `Umpire `_, +`CHAI `_, and +`camp `_ projects, which are considered parts +of the **RAJA Portability Suite**. In a coordinated release, all the projects +will have the same release name. If a project requires a patch release between +coordinated releases, it will indicate that by incrementing the patch number; +for example, v2022.03.1. + +The following sections describe the Semantic Versioning scheme for reference +and posterity. + +==================== Semantic Versioning -*********************** +==================== -The RAJA project uses the *semantic* versioning scheme for assigning -release numbers. Semantic versioning is a methodology for assigning a version -number to a software release in a way that conveys specific meaning about -code modifications from version to version. +Semantic versioning is a +methodology for assigning a version number to a software release in a way that +conveys specific meaning about code modifications from version to version. See `Semantic Versioning `_ for a more detailed description. 
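For application code that needs to react to the RAJA version it is built
against, the version number is also available programmatically. The sketch
below assumes the ``RAJA_VERSION_MAJOR``, ``RAJA_VERSION_MINOR``, and
``RAJA_VERSION_PATCHLEVEL`` macros from ``RAJA/config.hpp``; check the header
in your RAJA install for the exact names::

  #include "RAJA/config.hpp"
  #include <iostream>

  int main()
  {
  #if defined(RAJA_VERSION_MAJOR)
    // Under the calendar scheme, "major" is the release year and
    // "minor" is the release month (e.g., 2022 and 3 for v2022.03.0).
    std::cout << "Built against RAJA "
              << RAJA_VERSION_MAJOR << "."
              << RAJA_VERSION_MINOR << "."
              << RAJA_VERSION_PATCHLEVEL << std::endl;
  #endif
    return 0;
  }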
-============================ -Version Numbers and Meaning -============================ +------------------------------------- +Semantic Version Numbers and Meaning +------------------------------------- Semantic versioning is based on a three part version number `MM.mm.pp`: @@ -41,9 +59,9 @@ Semantic versioning is based on a three part version number `MM.mm.pp`: number is always changed when a hotfix branch is merged into main, or when changes are made to main that only contain bug fixes. -=========================================== -What Does a Change in Version Number Mean? -=========================================== +----------------------------------------------------- +What Does a Change in Semantic Version Number Mean? +----------------------------------------------------- A key consideration in meaning for these three version numbers is that the software has a public API. Changes to the API or code functionality diff --git a/docs/sphinx/user_guide/config_options.rst b/docs/sphinx/user_guide/config_options.rst index c120e51761..89487f7874 100644 --- a/docs/sphinx/user_guide/config_options.rst +++ b/docs/sphinx/user_guide/config_options.rst @@ -22,9 +22,7 @@ their defaults. RAJA Option Types ============================= -Nearly all Cmake options used in RAJA contain the prefix ``RAJA_`` to give -users flexibility to enable/disable individual compilation features for RAJA, -specifically. RAJA contains two types of options, those that exist in +RAJA contains two types of options, those that exist in RAJA only and those that are similar to standard CMake options or options provided by BLT; i.e., *dependent options* in CMake terminology. RAJA dependent option names are the same as the associated CMake and BLT option @@ -34,16 +32,18 @@ names, but with the ``RAJA_`` prefix added. options that can be controlled with CMake or BLT variants. * Dependent options are typically used for *disabling* features. - For example, providing the option ``-DRAJA_ENABLE_TESTS=Off`` - to CMake will disable compilation of RAJA tests, even if the - option ``-DENABLE_TESTS=On`` is also provided. + For example, when the CMake option ``-DENABLE_TESTS=On`` is + used to enable tests in the build of an application that includes + multiple CMake-based package builds, providing the CMake option + ``-DRAJA_ENABLE_TESTS=Off`` will disable compilation of RAJA + tests, while compiling them for other packages. * We recommend using the option names without the ``RAJA_`` prefix, when available, to enable features at compile time to avoid potential undesired behavior. For example, passing the option ``-DRAJA_ENABLE_CUDA=On`` to CMake will not enable CUDA because ``ENABLE_CUDA`` is off by default. So to enable CUDA, you need - to pass the ``-DENABLE_CUDA=On`` option to Cmake. + to pass the ``-DENABLE_CUDA=On`` option to CMake. ======================= Setting Options @@ -74,26 +74,34 @@ need to do that using appropriate CMake variables. All RAJA options are set like regular CMake variables. RAJA settings for default options, compilers, flags for optimization, etc. can be found in files in the ``RAJA/cmake`` directory and top-level ``CMakeLists.txt`` file. -Configuration variables can be set by passing -arguments to CMake on the command line when CMake is called, or by setting -options in a CMake *cache file* and passing that file to CMake using the -CMake ``-C`` options. 
For example, to enable RAJA OpenMP functionality,
-pass the following argument to CMake::
+Configuration variables can be set by passing arguments to CMake on the
+command line when calling CMake. For example, to enable RAJA OpenMP
+functionality, pass the following argument to CMake::
-  -DENABLE_OPENMP=On
+  cmake ... \
+  -DENABLE_OPENMP=On \
+  ...
-The RAJA repository contains a collection of CMake cache files
-(we call them *host-config* files) that may be used as a guide for users trying
-to set their own options. See :ref:`configopt-raja-hostconfig-label`.
+Alternatively, CMake options may be set in a CMake *cache file*, which is
+then passed to CMake using the CMake ``-C`` option; for example::
-Next, we summarize RAJA options and their defaults.
+  cmake ... \
+  -C my_cache_file.cmake \
+  ...
+
+The directories ``RAJA/scripts/*-builds`` contain scripts that run CMake for
+various build configurations. These contain CMake invocations that use CMake
+cache files (we call them *host-config* files) and may be used as a guide for
+users trying to set their own options.
+
+Next, we summarize RAJA CMake options and their defaults.
.. _configopt-raja-features-label:
====================================
-Available RAJA Options and Defaults
-====================================
+==========================================
+Available RAJA CMake Options and Defaults
+==========================================
RAJA uses a variety of custom variables to control how it is compiled. Many
of these are used internally to control RAJA compilation and do
@@ -113,7 +121,8 @@ build process for all of the code.
The following tables describe which variables set RAJA options and their
default settings:
-* **Examples, tests, warnings, etc.**
+Examples, tests, warnings, etc.
+--------------------------------
CMake variables can be used to control whether RAJA tests, examples,
tutorial exercises, etc. are built when RAJA is compiled.
========================= ========================================= (RAJA_)ENABLE_TESTS On (RAJA_)ENABLE_EXAMPLES On - (RAJA_)ENABLE_BENCHMARKS Off - (RAJA_)ENABLE_COVERAGE Off (supported for GNU compilers only) RAJA_ENABLE_EXERCISES On + (RAJA_)ENABLE_BENCHMARKS Off RAJA_ENABLE_REPRODUCERS Off + (RAJA_)ENABLE_COVERAGE Off (supported for GNU compilers only) ========================= ========================================= -RAJA can also be configured to build with compiler warnings reported as -errors, which may be useful to make sure your application builds cleanly: +Other configuration options are available to specialize how RAJA is compiled: - ================================ ====================== - Variable Default - ================================ ====================== - (RAJA_)ENABLE_WARNINGS_AS_ERRORS Off - ================================ ====================== + ================================== ========================= + Variable Default + ================================== ========================= + (RAJA_)ENABLE_WARNINGS_AS_ERRORS Off + RAJA_ENABLE_FORCEINLINE_RECURSIVE On (Intel compilers only) + RAJA_ALLOW_INCONSISTENT_OPTIONS Off + ================================== ========================= RAJA Views/Layouts may be configured to check for out of bounds -indexing at runtime: +indexing at run time: ========================= ====================== Variable Default @@ -147,11 +157,30 @@ indexing at runtime: RAJA_ENABLE_BOUNDS_CHECK Off ========================= ====================== -Note that RAJA bounds checking is a runtime check and will add -considerable execution time overhead. Thus, this feature should only be -used for correctness checking and should be disabled for production builds. - -* **Programming model back-end support** +.. note:: RAJA bounds checking is a run time check and will add considerable + execution time overhead. Thus, this feature should only be used for + debugging and correctness checking and should be disabled for + production builds. + +RAJA Features +------------------- + +Some RAJA features are enabled by RAJA-specific CMake variables. + + =========================== ======================================= + Variable Meaning + =========================== ======================================= + RAJA_ENABLE_RUNTIME_PLUGINS Enable support for dynamically loaded + RAJA plugins. Default is off. + RAJA_ENABLE_DESUL_ATOMICS Replace RAJA atomic implementations + with Desul variants at compile-time. + Default is off. + RAJA_ENABLE_VECTORIZATION Enable SIMD/SIMT intrinsics support. + Default is on. + =========================== ======================================= + +Programming model back-end support +------------------------------------- Variables that control which RAJA programming model back-ends are enabled are as follows (names are descriptive of what they enable): @@ -159,11 +188,11 @@ are as follows (names are descriptive of what they enable): ========================== ============================================ Variable Default ========================== ============================================ - (RAJA_)ENABLE_OPENMP On + (RAJA_)ENABLE_OPENMP Off (RAJA_)ENABLE_CUDA Off + RAJA_ENABLE_CLANG_CUDA Off (RAJA_)ENABLE_HIP Off - RAJA_ENABLE_TARGET_OPENMP Off (when on, (RAJA_)ENABLE_OPENMP must - also be on!) 
+ RAJA_ENABLE_TARGET_OPENMP Off (when on, ENABLE_OPENMP must also be on) RAJA_ENABLE_TBB Off RAJA_ENABLE_SYCL Off ========================== ============================================ @@ -180,37 +209,50 @@ Other programming model specific compilation options are also available: CUDA_ARCH sm_35 (based on hardware support) RAJA_ENABLE_EXTERNAL_ROCPRIM Off RAJA_ENABLE_ROCTX Off - RAJA_ENABLE_HIP_INDIRECT_FUNCTION_CALL Off (enables device function - pointers in HIP back-end) ====================================== ================================= Turning the ``(RAJA_)ENABLE_CLANG_CUDA`` variable on will build CUDA code with the native support in the Clang compiler. -The ``RAJA_ENABLE_EXTERNAL_CUB`` variable is used to enable use of an -external install of the NVIDIA CUB support library. When Off, the CUB -library included in the CUDA toolkit will still be used, if available. -Starting with CUDA 11, CUB is installed as part of the CUDA toolkit and -the NVIDIA Thrust library requires that install of CUB. We recommended -projects use the CUB included with the CUDA toolkit for compatibility with -Thrust and applications using Thrust. Users should take note of the CUB -install used by RAJA to ensure they use the same include directories when -configuring their applications. - -The ``RAJA_ENABLE_EXTERNAL_ROCPRIM`` variable is used to enable use of an -external install of the AMD rocPRIM support library. When Off, the -rocPRIM library included in the ROCm install will be used, when available. -We recommend projects use the rocPRIM included with the ROCm install when -available. Users should take note of the rocPRIM install used by RAJA to -ensure they use the same include directories when configuring their -applications. - .. note:: See :ref:`getting_started-label` for more information about - setting other options for RAJA back-ends. + using the ``RAJA_ENABLE_EXTERNAL_CUB`` and + ``RAJA_ENABLE_EXTERNAL_ROCPRIM`` variables, as well other + RAJA back-ends. + +Timer Options +-------------- + +RAJA provides a simple portable timer class that is used in RAJA +example codes to determine execution timing and can be used in other apps +as well. This timer can use any of three internal timers depending on +your preferences, and one should be selected by setting the 'RAJA_TIMER' +variable. -* **Data types, sizes, alignment, etc.** + ====================== ====================== + Variable Values + ====================== ====================== + RAJA_TIMER chrono (default), + gettime, + clock + ====================== ====================== + +What these variables mean: -RAJA provides type aliases that can be used to parameterize floating + ============================= ======================================== + Value Meaning + ============================= ======================================== + chrono Use the std::chrono library from the + C++ standard library + gettime Use `timespec` from the C standard + library time.h file + clock Use `clock_t` from time.h + ============================= ======================================== + +Data types, sizes, alignment, etc. +------------------------------------- + +The options discussed in this section are typically not needed by users. +They are provided for special cases when users want to parameterize floating point types in applications, which makes it easier to switch between types. .. note:: RAJA data types in this section are provided as a convenience to @@ -297,35 +339,8 @@ in units of **bytes**. 
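As a small illustration of how the aliases in this section are typically used,
the sketch below assumes the ``RAJA::Real_type``, ``RAJA::Real_ptr``, and
``RAJA::Index_type`` aliases declared in ``RAJA/include/RAJA/util/types.hpp``;
a routine written against them picks up whichever floating point type was
selected when RAJA was configured::

  #include "RAJA/RAJA.hpp"

  // Real_type resolves to float or double depending on the
  // floating point type chosen at RAJA configure time.
  void axpy(RAJA::Real_type a, RAJA::Real_ptr x, RAJA::Real_ptr y,
            RAJA::Index_type N)
  {
    RAJA::forall<RAJA::seq_exec>(
      RAJA::TypedRangeSegment<RAJA::Index_type>(0, N),
      [=] (RAJA::Index_type i) {
        y[i] += a * x[i];
      });
  }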
For details on how the options in this section are used, please see the
header file ``RAJA/include/RAJA/util/types.hpp``.
-* **Timer Options**
-
-RAJA provides a simple portable timer class that is used in RAJA
-example codes to determine execution timing and can be used in other apps
-as well. This timer can use any of three internal timers depending on
-your preferences, and one should be selected by setting the 'RAJA_TIMER'
-variable.
-
-  ====================== ======================
-  Variable               Values
-  ====================== ======================
-  RAJA_TIMER             chrono (default)
-                         gettime
-                         clock
-  ====================== ======================
-
-What these variables mean:
-
-  ============================= ========================================
-  Value                         Meaning
-  ============================= ========================================
-  chrono                        Use the std::chrono library from the
-                                C++ standard library
-  gettime                       Use `timespec` from the C standard
-                                library time.h file
-  clock                         Use `clock_t` from time.h
-  ============================= ========================================
-
-* **Other RAJA Features**
+Other RAJA Features
+-------------------
RAJA contains some features that are used mainly for development or may
not be of general interest to RAJA users. These are turned off by default.
They are described here for reference and completeness.
@@ -340,10 +355,6 @@ They are described here for reference and completeness.
                               tolerance enabled run (e.g., number of faults
                               detected, recovered from, recovery overhead,
                               etc.)
-  RAJA_ENABLE_RUNTIME_PLUGINS  Enable support for dynamically loaded
-                               RAJA plugins.
-  RAJA_ENABLE_DESUL_ATOMICS    Replace RAJA atomic implementations
-                               with desul variants at compile-time.
  ===========================  =======================================
diff --git a/docs/sphinx/user_guide/feature/atomic.rst b/docs/sphinx/user_guide/feature/atomic.rst
index 20799be569..64f3241ad5 100644
--- a/docs/sphinx/user_guide/feature/atomic.rst
+++ b/docs/sphinx/user_guide/feature/atomic.rst
@@ -6,20 +6,45 @@
.. ##
.. ## SPDX-License-Identifier: (BSD-3-Clause)
.. ##
-.. _atomics-label:
+.. _feat-atomics-label:
-========
-Atomics
-========
+===================
+Atomic Operations
+===================
RAJA provides portable atomic operations that can be used to update values
at arbitrary memory locations while avoiding data races. They are described
in this section.
-A complete working example code that shows RAJA atomic usage can be found in
-:ref:`atomichist-label`.
+.. note:: All RAJA atomic operations are in the namespace ``RAJA``.
+
+.. note:: Each RAJA atomic operation is templated on an *atomic policy*
+          type, which **must be compatible with the execution policy used by
+          the kernel in which it is used.** For example, in
+          a CUDA kernel, a CUDA atomic policy type must be used.
+
+For more information about available RAJA atomic policies, please see
+:ref:`atomicpolicy-label`.
+
+.. note:: RAJA support for CUDA atomic operations may be specific to
+          the compute architecture for which the code is compiled. Please
+          see :ref:`cudaatomics-label` for more information.
+
+RAJA currently supports two different implementations of atomic operations
+via the same basic interface. The default implementation is the original one
+developed in RAJA, which has been available for several years. Alternatively,
+one can choose an implementation based on
+`DESUL `_ at compile time. Please see
+:ref:`desul-atomics-label` for more information. Eventually, we plan to
+deprecate the original RAJA implementation and provide only the DESUL
+implementation.
The RAJA atomic interface is expected to change when we switch +over to DESUL atomic support. Specifically, the atomic policy noted above will +no longer be used. + +Please see the following tutorial sections for detailed examples that use +RAJA atomic operations: -.. note:: * All RAJA atomic operations are in the namespace ``RAJA``. + * :ref:`tut-atomichist-label`. .. _atomic-ops: @@ -27,59 +52,57 @@ A complete working example code that shows RAJA atomic usage can be found in Atomic Operations ----------------- -RAJA atomic support includes a variety of the most common atomic operations. +RAJA atomic support the most common atomic operations. -.. note:: * Each RAJA atomic operation is templated on an *atomic policy*. - * Each method described in the table below returns the value of - the potentially modified argument (i.e., \*acc) immediately before - the atomic operation is applied, in case it is needed by a user. - * See :ref:`atomics-label` for details about CUDA atomic operations. +.. note:: Each atomic method described below returns the value of + the potentially modified argument (i.e., \*acc) immediately before + the atomic operation is applied, in case a user requires it. ^^^^^^^^^^^ Arithmetic ^^^^^^^^^^^ -* ``atomicAdd< atomic_policy >(T* acc, T value)`` - Add value to \*acc. +* ``atomicAdd< atomic_policy >(T* acc, T value)`` - Add ``value`` to ``\*acc``. -* ``atomicSub< atomic_policy >(T* acc, T value)`` - Subtract value from \*acc. +* ``atomicSub< atomic_policy >(T* acc, T value)`` - Subtract ``value`` from ``\*acc``. ^^^^^^^^^^^ Min/max ^^^^^^^^^^^ -* ``atomicMin< atomic_policy >(T* acc, T value)`` - Set \*acc to min of \*acc and value. +* ``atomicMin< atomic_policy >(T* acc, T value)`` - Set ``\*acc`` to min of ``\*acc`` and ``value``. -* ``atomicMax< atomic_policy >(T* acc, T value)`` - Set \*acc to max of \*acc and value. +* ``atomicMax< atomic_policy >(T* acc, T value)`` - Set ``\*acc`` to max of ``\*acc`` and ``value``. ^^^^^^^^^^^^^^^^^^^^ Increment/decrement ^^^^^^^^^^^^^^^^^^^^ -* ``atomicInc< atomic_policy >(T* acc)`` - Add 1 to \*acc. +* ``atomicInc< atomic_policy >(T* acc)`` - Add 1 to ``\*acc``. -* ``atomicDec< atomic_policy >(T* acc)`` - Subtract 1 from \*acc. +* ``atomicDec< atomic_policy >(T* acc)`` - Subtract 1 from ``\*acc``. -* ``atomicInc< atomic_policy >(T* acc, T compare)`` - Add 1 to \*acc if \*acc < compare, else set \*acc to zero. +* ``atomicInc< atomic_policy >(T* acc, T compare)`` - Add 1 to ``\*acc`` if ``\*acc`` < ``compare``, else set ``\*acc`` to zero. -* ``atomicDec< atomic_policy >(T* acc, T compare)`` - Subtract 1 from \*acc if \*acc != 0 and \*acc <= compare, else set \*acc to compare. +* ``atomicDec< atomic_policy >(T* acc, T compare)`` - Subtract 1 from ``\*acc`` if ``\*acc`` != 0 and ``\*acc`` <= ``compare``, else set ``\*acc`` to ``compare``. ^^^^^^^^^^^^^^^^^^^^ Bitwise operations ^^^^^^^^^^^^^^^^^^^^ -* ``atomicAnd< atomic_policy >(T* acc, T value)`` - Bitwise 'and' equivalent: Set \*acc to \*acc & value. Only works with integral data types. +* ``atomicAnd< atomic_policy >(T* acc, T value)`` - Bitwise 'and' equivalent: Set ``\*acc`` to ``\*acc`` & ``value``. Only works with integral data types. -* ``atomicOr< atomic_policy >(T* acc, T value)`` - Bitwise 'or' equivalent: Set \*acc to \*acc | value. Only works with integral data types. +* ``atomicOr< atomic_policy >(T* acc, T value)`` - Bitwise 'or' equivalent: Set ``\*acc`` to ``\*acc`` | ``value``. Only works with integral data types. 
-* ``atomicXor< atomic_policy >(T* acc, T value)`` - Bitwise 'xor' equivalent: Set \*acc to \*acc ^ value. Only works with integral data types. +* ``atomicXor< atomic_policy >(T* acc, T value)`` - Bitwise 'xor' equivalent: Set ``\*acc`` to ``\*acc`` ^ ``value``. Only works with integral data types. ^^^^^^^^^^^^^^^^^^^^ Replace ^^^^^^^^^^^^^^^^^^^^ -* ``atomicExchange< atomic_policy >(T* acc, T value)`` - Replace \*acc with value. +* ``atomicExchange< atomic_policy >(T* acc, T value)`` - Replace ``\*acc`` with ``value``. -* ``atomicCAS< atomic_policy >(T* acc, Tcompare, T value)`` - Compare and swap: Replace \*acc with value if and only if \*acc is equal to compare. +* ``atomicCAS< atomic_policy >(T* acc, Tcompare, T value)`` - Compare and swap: Replace ``\*acc`` with ``value`` if and only if ``\*acc`` is equal to ``compare``. Here is a simple example that shows how to use an atomic operation to compute an integral sum on a CUDA GPU device:: @@ -95,8 +118,8 @@ an integral sum on a CUDA GPU device:: cudaDeviceSynchronize(); *sum = 0; - RAJA::forall< RAJA::cuda_exec >(RAJA::RangeSegment(0, N), - [=] RAJA_DEVICE (RAJA::Index_type i) { + RAJA::forall< RAJA::cuda_exec >(RAJA::TypedRangeSegment(0, N), + [=] RAJA_DEVICE (int i) { RAJA::atomicAdd< RAJA::cuda_atomic >(sum, 1); @@ -108,7 +131,7 @@ After this kernel executes, the value reference by 'sum' will be 'N'. AtomicRef ^^^^^^^^^^^^^^^^^^^^ -RAJA also provides an atomic interface similar to the C++20 'std::atomic_ref', +RAJA also provides an interface similar to the C++20 ``std::atomic_ref``, but which works for arbitrary memory locations. The class ``RAJA::AtomicRef`` provides an object-oriented interface to the atomic methods described above. For example, after the following operations:: @@ -122,22 +145,14 @@ atomic methods described above. For example, after the following operations:: the value of 'val' will be 5. ------------------ -Atomic Policies ------------------ - -For more information about available RAJA atomic policies, please see -:ref:`atomicpolicy-label`. - - .. _cudaatomics-label: --------------------------------------- CUDA Atomics Architecture Dependencies --------------------------------------- -The internal implementations for RAJA atomic operations may vary depending -on which CUDA architecture is available and/or specified when the RAJA +The implementations for RAJA atomic operations may vary depending +on which CUDA architecture is available and/or specified when RAJA is configured for compilation. The following rules apply when the following CUDA architecture level is chosen: @@ -155,28 +170,36 @@ CUDA architecture level is chosen: * CUDA native 64-bit double `atomicAdd` is used. +.. _desul-atomics-label: + --------------------- DESUL Atomics Support --------------------- -RAJA provides support for the use of `DESUL Atomics `_ as -an alternative backend to the default implementation of RAJA atomics. DESUL atomics are considered an **experimental** feature in RAJA at this point. DESUL atomics -may impact the performance of some atomic functions. While switching -to DESUL atomics typically yields positive or neutral performance results, some atomic +RAJA provides the ability to use +`DESUL Atomics `_ as +an alternative to the default implementation of RAJA atomics. DESUL atomics +are considered an **experimental** feature in RAJA at this point and may +impact the performance of some atomic functions. 
While DESUL atomics typically +yields better or similar performance to RAJA default atomics, some atomic operations may perform worse when using DESUL. -To enable DESUL Atomics: - -#. Ensure that RAJA and its dependencies are configured to use C++14. -#. Set ``RAJA_ENABLE_DESUL_ATOMICS=On``. - -Enabling DESUL Atomics alters RAJA atomic functions to be wrapper-functions for their -DESUL counterparts. This removes the need for user code changes to switch between -DESUL and RAJA implementations. The exception to this is when RAJA atomic helper functions -are used instead of the backwards-compatible API functions specified by :ref:`atomic-ops`. By *helper functions*, we mean the RAJA atomic methods which take a reduction policy object as the first argument, instead of specifying the reduction policy type as a template parameter. - -DESUL atomic functions are compiled with the proper back-end implementation based on the scope in which they are -called, which removes the need to specify atomic policies for -target back-ends. As a result, atomic policies such as ``cuda_atomic`` or ``omp_atomic`` -are ignored when DESUL is enabled, but are still necessary to pass in as parameters -to the RAJA API. This will likely change in the future and RAJA atomic policies will be removed. +To enable DESUL atomics, pass the option to CMake when configuring a RAJA +build: ``-DRAJA_ENABLE_DESUL_ATOMICS=On``. + +Enabling DESUL atomics alters RAJA atomic functions to be wrapper-functions +for their DESUL counterparts. This removes the need for user code changes to +switch between DESUL and RAJA implementations for the most part. The exception +to this is when RAJA atomic helper functions are used instead of the +backward-compatible API functions specified by :ref:`atomic-ops`. By +*helper functions*, we mean the RAJA atomic methods which take an atomic +policy object as the first argument, instead of specifying the atomic policy +type as a template parameter. + +DESUL atomic functions are compiled with the proper back-end implementation +based on the scope in which they are called, which removes the need to specify +atomic policies for target back-ends. As a result, atomic policies such as +``RAJA::cuda_atomic`` or ``RAJA::omp_atomic`` are ignored when DESUL is +enabled, but are still necessary to pass in as parameters to the RAJA API. +This will likely change in the future when we switch to use DESUL atomics +exclusively and remove the default RAJA atomic operations. diff --git a/docs/sphinx/user_guide/feature/iteration_spaces.rst b/docs/sphinx/user_guide/feature/iteration_spaces.rst index ef3ab9c7e9..fafb35a90e 100644 --- a/docs/sphinx/user_guide/feature/iteration_spaces.rst +++ b/docs/sphinx/user_guide/feature/iteration_spaces.rst @@ -6,7 +6,7 @@ .. ## SPDX-License-Identifier: (BSD-3-Clause) .. ## -.. _index-label: +.. _feat-index-label: ================================ Indices, Segments, and IndexSets @@ -20,12 +20,15 @@ order for loop iterates, aggregate and partition iterates, as well as other configurations. In this section, we introduce RAJA index and iteration space concepts and types. -More examples of RAJA iteration space usage can be found in the -:ref:`indexset-label` and :ref:`vertexsum-label` sections of the tutorial. - .. note:: All RAJA iteration space types described here are located in the namespace ``RAJA``. +Please see the following tutorial sections for detailed examples that use +RAJA iteration space concepts: + + * :ref:`tut-indexset-label` + * :ref:`tut-vertexsum-label` + .. 
_indices-label: ------- @@ -37,146 +40,43 @@ identify loop iterates. Any lambda expression that represents all or part of a loop body passed to a ``RAJA::forall`` or ``RAJA::kernel`` method will take at least one loop index variable argument. RAJA iteration space types are templates that allow users to use any integral type for an -index variable. The index variable type may be explicitly specified by a user. -RAJA also provides the ``RAJA::Index_type`` type, which is used as a default -in some circumstances for convenience by allowing use of a common type -alias to typed constructs without explicitly specifying the type. -The ``RAJA::Index_type`` type is an alias to the C++ type ``std::ptrdiff_t``, -which is appropriate for most compilers to generate useful loop-level -optimizations. +index variable. .. _segments-label: -------------- -Segments -------------- - -A RAJA **Segment** represents a set of loop indices that one wants to -execute as a unit. RAJA provides Segment types for contiguous index ranges, -constant (non-unit) stride ranges, and arbitrary lists of indices. - -Stride-1 Segments -^^^^^^^^^^^^^^^^^^^ - -A ``RAJA::TypedRangeSegment`` is the fundamental type for representing a -stride-1 (i.e., contiguous) range of indices. - -.. figure:: ../figures/RangeSegment.png - - A range segment defines a stride-1 index range [beg, end). - -One can create an explicitly-typed range segment or one with the default -``RAJA::Index_type`` index type. For example,:: - - // A stride-1 index range [beg, end) using type int. - RAJA::TypedRangeSegment int_range(beg, end); - - // A stride-1 index range [beg, end) using the RAJA::Index_type default type - RAJA::RangeSegment default_range(beg, end); - -.. note:: When using a RAJA range segment, no loop iterations will be run when - begin is greater-than-or-equal-to end similar to a C-style for-loop. - -Strided Segments -^^^^^^^^^^^^^^^^^^^ - -A ``RAJA::TypedRangeStrideSegment`` defines a range with a constant stride -that is given explicitly stride, including negative stride. - -.. figure:: ../figures/RangeStrideSegment.png - - A range-stride segment defines an index range with arbitrary stride [beg, end, stride). - -One can create an explicitly-typed strided range segment or one with the -default ``RAJA::Index_type`` index type. For example,:: +----------------------- +Segments and IndexSets +----------------------- - // A stride-2 index range [beg, end, 2) using type int. - RAJA::TypedRangeStrideSegment stride2_range(beg, end, 2); +A RAJA **Segment** represents a set of indices that one wants to +execute as a unit for a kernel. RAJA provides the following Segment types: - // A index range with -1 stride [0, N-1, -1) using the RAJA::Index_type default type - RAJA::RangeStrideSegment neg1_range( N-1, -1, -1); + * ``RAJA::TypedRangeSegment`` represents a stride-1 range + * ``RAJA::TypedRangeStrideSegment`` represents a (non-unit) stride range + * ``RAJA::TypedListSegment`` represents an arbitrary set of indices -Using a range with a stride of '-1' as above in a RAJA loop traversal template -will run the loop indices in reverse order. That is, using 'neg1_range' -from above:: - - RAJA::forall< RAJA::seq_exec >( neg1_range, [=] (RAJA::Index_type i) { - printf("%ld ", i); - } ); - -will print the values:: - - N-1 N-2 N-3 .... 1 0 - -RAJA strided ranges support both positive and negative stride values. The -following items are worth noting: - -.. 
note:: When using a RAJA strided range, no loop iterations will be run - under the following conditions: - * Stride > 0 and begin > end - * Stride < 0 and begin < end - * Stride == 0 - -List Segments -^^^^^^^^^^^^^^ - -A ``RAJA::TypedListSegment`` is used to define an arbitrary set of loop -indices, akin to an indirection array. - -.. figure:: ../figures/ListSegment.png - - A list segment defines an arbitrary collection of indices. Here, we have a list segment with 5 irregularly-spaced indices. - -A list segment is created by passing an array of integral values to a list -segment constructor. For example:: - - // Create a vector holding some integer index values - std::vector idx = {0, 2, 3, 4, 7, 8, 9, 53}; - - // Create list segment with these loop indices where the indices are - // stored in the host memory space - camp::resources::Resource host_res{camp::resources::Host()}; - RAJA::TypedListSegment idx_list( &idx[0], idx.size(), - host_res ); - -Using a list segment in a RAJA loop traversal template will run the loop -indices specified in the array passed to the list segment constructor. That -is, using 'idx_list' from above:: - - RAJA::forall< RAJA::seq_exec >( idx_list, [=] (RAJA::Index_type i) { - printf("%ld ", i); - } ); - -will print the values:: - - 0 2 3 4 7 8 9 53 +A ``RAJA::TypedIndexSet`` is a container that can hold an arbitrary collection +of segments to compose iteration patterns in a single kernel invocation. -Note that a ``RAJA::TypedListSegment`` constructor can take a pointer to -an array of indices and an array length, as shown above. If the indices are -in a container, such as ``std::vector`` that provides ``begin()``, ``end()``, -and ``size()`` methods, the length argument is not required. For example:: +Segment and IndexSet types are used in ``RAJA::forall`` and other RAJA kernel +execution mechanisms to define the iteration space for a kernel. - std::vector idx = {0, 2, 3, 4, 7, 8, 9, 53}; +.. note:: Iterating over the indices of all segments in a RAJA index set + requires a two-level execution policy, with two template parameters, + as shown above. The first parameter specifies how to iterate over + the segments. The second parameter specifies how each segment will + execute. See :ref:`indexsetpolicy-label` for more information about + RAJA index set execution policies. - camp::resources::Resource host_res{camp::resources::Host()}; - RAJA::TypedListSegment idx_list( idx, host_res ); +.. note:: It is the responsibility of the user to ensure that segments are + defined properly when using RAJA index sets. For example, if the + same index appears in multiple segments, the corresponding loop + iteration will be run multiple times. -Similar to range segment types, RAJA provides ``RAJA::ListSegment``, which is -a type alias to ``RAJA::TypedListSegment`` using ``RAJA::Index_type`` as the -template type parameter. - -By default, the list segment constructor copies the indices in the array -passed to it to the memory space specified by the resource argument. -The resource argument is required so that the segment index values are in the -proper memory space for the kernel to run. Since the kernel is run on -the CPU host in this example (indicated by the ``RAJA::seq_exec`` execution -policy), we pass a host resource object to the list segment constructor. 
-If, for example, the kernel was to run on a GPU using a CUDA or HIP -execution policy, then the resource type passed to the camp resource -constructor would be ``camp::resources::Cuda()`` or -``camp::resources::Hip()``, respectively. +Please see :ref:`tut-indexset-label` for a detailed discussion of how to create +and use these segment types. -Segment Types and Iteration +Segment Types and Iteration ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ It is worth noting that RAJA segment types model **C++ iterable interfaces**. @@ -192,56 +92,4 @@ and two types: * value_type Thus, any iterable type that defines these methods and types appropriately -can be used as a segment with RAJA traversal templates. - -.. _indexsets-label: - --------------- -IndexSets --------------- - -A ``RAJA::TypedIndexSet`` is a container that can hold an arbitrary collection -of segment objects of arbitrary type as illustrated in the following figure, -where we have two contiguous ranges and an irregularly-spaced list of indices. - -.. figure:: ../figures/IndexSet.png - - An index set with 2 range segments and one list segment. - -We can create an index set that describes such an iteration space:: - - // Create an index set that can hold range and list segments with the - // default index type - RAJA::TypedIndexSet< RAJA::RangeSegment, RAJA::ListSegment > iset; - - // Add two range segments and one list segment to the index set - iset.push_back( RAJA::RangeSegment( ... ) ); - iset.push_back( RAJA::ListSegment(...) ); - iset.push_back( RAJA::RangeSegment( ... ) ); - -Now that we've created this index set object, we can pass it to any RAJA -loop execution template to execute the indices defined by its segments:: - - // Define an index set execution policy type that will iterate over - // its segments in parallel (OpenMP) and execute each segment sequentially - using ISET_EXECPOL = RAJA::ExecPolicy< RAJA::omp_parallel_segit, - RAJA::seq_exec >; - - // Run a kernel with iterates defined by the index set - RAJA::forall(iset, [=] (int i) { ... }); - -In this example, the loop iterations will execute in three chunks defined by -the two range segments and one list segment. The segments will be iterated -over in parallel using OpenMP, and each segment will execute sequentially. - -.. note:: Iterating over the indices of all segments in a RAJA index set - requires a two-level execution policy, with two template parameters, - as shown above. The first parameter specifies how to iterate over - the seqments. The second parameter specifies how each segment will - execute. See :ref:`indexsetpolicy-label` for more information about - RAJA index set execution policies. - -.. note:: It is the responsibility of the user to ensure that segments are - defined properly when using RAJA index sets. For example, if the - same index appears in multiple segments, the corresponding loop - iteration will be run multiple times. +can be used as a segment with RAJA kernel execution templates. diff --git a/docs/sphinx/user_guide/feature/local_array.rst b/docs/sphinx/user_guide/feature/local_array.rst index 3dc81d3856..9716708af1 100644 --- a/docs/sphinx/user_guide/feature/local_array.rst +++ b/docs/sphinx/user_guide/feature/local_array.rst @@ -6,7 +6,7 @@ .. ## SPDX-License-Identifier: (BSD-3-Clause) .. ## -.. _local_array-label: +.. _feat-local_array-label: =========== Local Array @@ -15,7 +15,7 @@ Local Array This section introduces RAJA *local arrays*. 
A ``RAJA::LocalArray`` is an array object with one or more dimensions whose memory is allocated when a RAJA kernel is executed and only lives within the scope of the kernel -execution. To motivate the concept and usage, consider a simple C++ example +execution. To motivate the concept and usage, consider a simple C example in which we construct and use two arrays in nested loops:: for(int k = 0; k < 7; ++k) { //k loop @@ -35,9 +35,10 @@ in which we construct and use two arrays in nested loops:: } Here, two stack-allocated arrays are defined inside the outer 'k' loop and -used in both inner 'j' loops. This loop pattern may be also be expressed -using RAJA local arrays in a ``RAJA::kernel_param`` kernel. We show a -RAJA variant below, which matches the implementation above, and then discuss +used in both inner 'j' loops. + +This loop pattern may be also be written using RAJA local arrays in a +``RAJA::kernel_param`` kernel. We show this next, and then discuss its constituent parts:: // @@ -73,8 +74,8 @@ its constituent parts:: // Define the kernel // - RAJA::kernel_param ( RAJA::make_tuple(RAJA::RangeSegment(0,5), - RAJA::RangeSegment(0,7)), + RAJA::kernel_param ( RAJA::make_tuple(RAJA::TypedRangeSegment(0,5), + RAJA::TypedRangeSegment``. The local array initialization is done in the first +lambda expression, and the local array values are printed in the second lambda +expression. + +.. note:: ``RAJA::LocalArray`` types support arbitrary dimensions and extents + in each dimension. ------------------- Memory Policies ------------------- -``RAJA::LocalArray`` supports CPU stack-allocated memory and CUDA GPU shared -memory and thread private memory. See :ref:`localarraypolicy-label` for a -discussion of available memory policies. +``RAJA::LocalArray`` supports CPU stack-allocated memory and CUDA or HIP GPU +shared memory and thread private memory. See :ref:`localarraypolicy-label` +for a discussion of available memory policies. diff --git a/docs/sphinx/user_guide/feature/loop_basic.rst b/docs/sphinx/user_guide/feature/loop_basic.rst index d425a31ff5..80ef7896e4 100644 --- a/docs/sphinx/user_guide/feature/loop_basic.rst +++ b/docs/sphinx/user_guide/feature/loop_basic.rst @@ -12,52 +12,25 @@ Elements of Loop Execution ============================================== -In this section, we describe the basic elements of RAJA loop kernel -execution. ``RAJA::forall``, ``RAJA::kernel``, and ``RAJA::expt::launch`` -(aka *RAJA Teams*) template methods comprise the RAJA interface for loop -execution. ``RAJA::forall`` methods execute simple, non-nested loops, -``RAJA::kernel`` methods support nested loops and other complex loop -kernels and transformations, and ``RAJA::expt::launch`` creates an execution -space in which algorithms are expressed in terms of nested loops using -the ``RAJA::expt::loop`` method. - -.. note:: * The ``forall`` , and ``kernel`` methods are in the - namespace ``RAJA``, while ``launch`` is found under - the RAJA namespace for experimental features ``RAJA::expt``. - - * A ``RAJA::forall`` loop execution method is a template on an - *execution policy* type. A ``RAJA::forall`` method takes two - arguments: - - * an iteration space object, such as a contiguous range of loop - indices, and - * a single lambda expression representing the loop body. - - * Each ``RAJA::kernel`` method is a template on a policy that - contains statements with *execution policy* types appropriate for - the kernel structure; e.g., an execution policy for each level in a - loop nest. 
A ``RAJA::kernel`` method takes multiple arguments: - - * a *tuple* of iteration space objects, and - * one or more lambda expressions representing portions of - the loop kernel body. - - * The ``RAJA::expt::launch`` method is a template on both host and - device policies to create an execution space for kernels. - Since both host and device poilices are specified, the launch - method can be used to select at run-time whether to run a kernel - on the host or device. Algorithms are expressed inside the - execution space as nested loops using ``RAJA::loop`` methods. - - * Hierarchical parallelism can be expressed using the thread and - thread-team model with ``RAJA::expt::loop`` methods as found in - programming models such as CUDA/HIP. - -Various examples showing how to use ``RAJA::forall``, ``RAJA::kernel``, ``RAJA::launch`` -methods may be found in the :ref:`tutorial-label`. - -For more information on RAJA execution policies and iteration space constructs, -see :ref:`policies-label` and :ref:`index-label`, respectively. +The ``RAJA::forall``, ``RAJA::expt::dynamic_forall``, ``RAJA::kernel``, and ``RAJA::launch`` +template methods comprise the RAJA interface for kernel +execution. ``forall`` methods execute simple, non-nested loops, +``RAJA::kernel`` methods support nested loops and other complex loop +kernels and transformations, and ``RAJA::launch`` creates an execution +space in which kernels are written in terms of nested loops using +the ``RAJA::loop`` method. + +.. note:: The ``forall`` , ``kernel``, and ``launch`` methods are in the ``RAJA`` + namespace, while ``dynamic_forall`` is in the RAJA namespace for + experimental features ``RAJA::expt``. ``RAJA::expt::dynamic_forall`` + will be moved to the ``RAJA`` namespace in a future RAJA release. + +For more information on RAJA execution policies and iteration space constructs, +see :ref:`feat-policies-label` and :ref:`feat-index-label`, respectively. + +The following sections describe the basic aspects of these methods. +Detailed examples showing how to use ``RAJA::forall``, ``RAJA::kernel``, ``RAJA::launch`` methods may be found in the :ref:`tutorial-label`. Links to specific +RAJA tutorial sections are provided in the sections below. .. _loop_elements-forall-label: @@ -65,53 +38,93 @@ see :ref:`policies-label` and :ref:`index-label`, respectively. Simple Loops (RAJA::forall) --------------------------- -As noted earlier, a ``RAJA::forall`` template executes simple -(i.e., non-nested) loops. For example, a C-style loop that adds two vectors, -like this:: +Consider a C-style loop that adds two vectors:: for (int i = 0; i < N; ++i) { c[i] = a[i] + b[i]; } -may be written using RAJA as:: +This may be written using ``RAJA::forall`` as:: - RAJA::forall(RAJA::RangeSegment(0, N), [=] (int i) { + RAJA::forall(RAJA::TypesRangeSegment(0, N), [=] (int i) { c[i] = a[i] + b[i]; }); -A ``RAJA::forall`` method is a template on an execution policy type and takes -two arguments: an object describing the loop iteration space, such as a RAJA -range segment (shown here), and a lambda expression for the loop body. Applying -different loop execution policies enables the loop to run in different ways; -e.g., using different programming model back-ends. Different iteration space -objects enable the loop iterates to be partitioned, reordered, run in -different threads, etc. +A ``RAJA::forall`` loop execution method is a template that takes an +*execution policy* type template parameter. 
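For reference, a complete sequential version of this vector addition, with the
execution policy and index type spelled out, might look like the following
sketch (the array setup here is illustrative only)::

  #include "RAJA/RAJA.hpp"
  #include <vector>

  int main()
  {
    constexpr int N = 100;

    // Host data; raw pointers are captured by the lambda (see the note below
    // about avoiding std::vector inside lambda expressions).
    std::vector<double> a(N, 1.0), b(N, 2.0), c(N, 0.0);
    double* pa = a.data();
    double* pb = b.data();
    double* pc = c.data();

    // Swapping RAJA::seq_exec for, e.g., RAJA::omp_parallel_for_exec changes
    // how the loop runs without touching the loop body.
    RAJA::forall<RAJA::seq_exec>(RAJA::TypedRangeSegment<int>(0, N),
      [=] (int i) {
        pc[i] = pa[i] + pb[i];
      });

    return 0;
  }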
A ``RAJA::forall`` method takes +two arguments: an iteration space object, such as a contiguous range of loop +indices as shown here, and a single lambda expression representing the loop +kernel body. + +Applying different loop execution policies enables the loop to run in +different ways; e.g., using different programming model back-ends. Different +iteration space objects enable the loop iterates to be partitioned, reordered, +run in different threads, etc. Please see :ref:`feat-index-label` for details +about RAJA iteration spaces. .. note:: Changing loop execution policy types and iteration space constructs - enables loops to run in different ways by recompiling the code and + enables loops to run in different ways by recompiling the code and without modifying the loop kernel code. -While loop execution using ``RAJA::forall`` methods is a subset of -``RAJA::kernel`` functionality, described next, we maintain the -``RAJA::forall`` interface for simple loop execution because the syntax is +As an extension of ``RAJA::forall``, the ``RAJA::expt::dynamic_forall`` method enables users +to compile using a list of execution policies and choose the execution policy at run-time. +For example, a user may want to have N policies available and at run-time choose which policy to use:: + + using exec_pol_list = camp::list; + int pol = i; //run-time value + + RAJA::expt::dynamic_forall(pol, RAJA::TypedRangeSegment(0, N), [=] (int i) { + c[i] = a[i] + b[i]; + }); + + +While static loop execution using ``forall`` methods is a subset of +``RAJA::kernel`` functionality, described next, +we maintain the ``forall`` interfaces for simple loop execution because the syntax is simpler and less verbose for that use case. -.. note:: Data arrays in lambda expressions used with RAJA are typically - RAJA Views (see :ref:`view-label`) or bare pointers as shown in +.. note:: Data arrays in lambda expressions used with RAJA are typically + RAJA Views (see :ref:`feat-view-label`) or bare pointers as shown in the code snippets above. Using something like 'std::vector' is - non-portable (won't work in GPU kernels, generally) and would add + non-portable (won't work in GPU kernels, generally) and would add excessive overhead for copying data into the lambda data environment when captured by value. +Please see the following tutorial sections for detailed examples that use +``RAJA::forall``: + + * :ref:`tut-addvectors-label` + * :ref:`tut-dotproduct-label` + * :ref:`tut-reduction-label` + * :ref:`tut-atomichist-label` + * :ref:`tut-indexset-label` + * :ref:`tut-vertexsum-label` + * :ref:`tut-permutedlayout-label` + + .. _loop_elements-kernel-label: ---------------------------- Complex Loops (RAJA::kernel) ---------------------------- -A ``RAJA::kernel`` template provides ways to compose and execute arbitrary -loop nests and other complex kernels. To introduce the RAJA *kernel* interface, -consider a (N+1)-level C-style loop nest:: +A ``RAJA::kernel`` template provides ways to compose and execute arbitrary +loop nests and other complex kernels. +The ``RAJA::kernel`` interface employs similar concepts to ``RAJA::forall`` +but extends it to support much more complex kernel structures. +Each ``RAJA::kernel`` method is a template that takes an *execution policy* +type template parameter. The execution policy can be an arbitrarily complex +sequence of nested templates that define a kernel execution pattern. 
+In its simplest form, ``RAJA::kernel`` takes two arguments: +a *tuple* of iteration space objects, and a lambda expression representing +the kernel inner loop body. In more complex usage, ``RAJA::kernel`` can take +multiple lambda expressions representing different portions of the loop +kernel body. + +To introduce the RAJA *kernel* interface, consider a (N+1)-level C-style loop +nest:: for (int iN = 0; iN < NN; ++iN) { ... @@ -120,8 +133,8 @@ consider a (N+1)-level C-style loop nest:: } } -Note that we could write this by nesting ``RAJA::forall`` statements and -it would work for some execution policy choices:: +It is important to note that we do not recommend writing a RAJA version of +this by nesting ``RAJA::forall`` statements. For example:: RAJA::forall(IN, [=] (int iN) { ... @@ -131,37 +144,40 @@ it would work for some execution policy choices:: ... } -However, this approach treats each loop level as an independent entity. This +This would work for some execution policy choices, but not in general. +Also, this approach treats each loop level as an independent entity, which makes it difficult to parallelize the levels in the loop nest together. So it -may limit the amount of parallelism that can be exposed and the types of +may limit the amount of parallelism that can be exposed and the types of parallelism that may be used. For example, if an OpenMP or CUDA parallel execution policy is used on the outermost loop, then all inner loops -would be run sequentially in each thread. It also makes it difficult to perform -transformations like loop interchange and loop collapse without changing the +would be run sequentially in each thread. It also makes it difficult to perform +transformations like loop interchange and loop collapse without changing the source code, which breaks RAJA encapsulation. -.. note:: **We do not recommend nesting ``RAJA::forall`` statements.** +.. note:: **We do not recommend using nested ``RAJA::forall`` statements.** -The RAJA *kernel* interface facilitates parallel execution and compile-time -transformation of arbitrary loop nests and other complex loop structures. -It can treat a complex loop structure as a single entity, which simplifies -the ability to transform and apply different parallel execution patterns by -changing the execution policy type and *not the kernel code*. +The ``RAJA::kernel`` interface facilitates parallel execution and compile-time +transformation of arbitrary loop nests and other complex loop structures. +It can treat a complex loop structure as a single entity, which enables +the ability to transform and apply different parallel execution patterns by +changing the execution policy type and **not the kernel code**, in many cases. -The loop above nest may be written using the RAJA kernel interface as:: +The C-style loop above nest may be written using ``RAJA::kernel`` as:: - using KERNEL_POL = - RAJA::KernelPolicy< RAJA::statement::For > ... - > + > >; - + RAJA::kernel< KERNEL_POL >( - RAJA::make_tuple(RAJA::RangeSegment(0, NN), ..., RAJA::RangeSegment(0, N0), + RAJA::make_tuple(RAJA::TypedRangeSegment(0, NN), + ..., + RAJA::TypedRangeSegment(0, N0), [=] (int iN, ... , int i0) { // inner loop body @@ -169,130 +185,141 @@ The loop above nest may be written using the RAJA kernel interface as:: ); -A ``RAJA::kernel`` method takes a ``RAJA::KernelPolicy`` type template -parameter, and a tuple of iteration spaces and a sequence of lambda -expressions as arguments. 
- In the case we discuss here, the execution policy contains a nested sequence -of ``RAJA::statement::For`` statements, one for each level in the loop nest. -Each ``For`` statement takes three template parameters: +of ``RAJA::statement::For`` types, indicating an iteration over each level in +the loop nest. Each of these statement types takes three template parameters: - * an integral index parameter that binds the ``For`` statement to the item - in the iteration space tuple corresponding to that index, - * an execution policy type for the associated loop nest level, and + * an integral index parameter that binds the statement to the item + in the iteration space tuple corresponding to that index + * an execution policy type for the associated loop nest level * an *enclosed statement list* (described in :ref:`loop_elements-kernelpol-label`). .. note:: The nesting of ``RAJA::statement::For`` types is analogous to the nesting of for-statements in the C-style version of the loop nest. - One can think of the '<, >' symbols enclosing the template parameter + One can think of the '<, >' symbols enclosing the template parameter lists as being similar to the curly braces in C-style code. -Here, the innermost type in the kernel policy is a +Here, the innermost type in the kernel policy is a ``RAJA::statement::Lambda<0>`` type indicating that the first lambda expression -(argument zero of the sequence of lambdas passed to the ``RAJA::kernel`` method) -will comprise the inner loop body. We only have one lambda in this example -but, in general, we can have any number of lambdas and we can use any subset +(argument zero of a sequence of lambdas passed to the ``RAJA::kernel`` method) +will comprise the inner loop body. We only have one lambda in this example +but, in general, we can have any number of lambdas and we can use any subset of them, with ``RAJA::statement::Lambda`` types placed appropriately in the -execution policy, to construct a loop kernel. For example, placing -``RAJA::statement::Lambda`` types between ``RAJA::statement::For`` statements +execution policy, to construct a loop kernel. For example, placing +``RAJA::statement::Lambda`` types between ``RAJA::statement::For`` statements enables non-perfectly nested loops. -RAJA offers two types of lambda statements. The first as illustratated -above, requires that each lambda expression passed to a ``RAJA::kernel`` method -**must take an index argument for each iteration space in the tuple**. -With this type of lambda statement, the entire iteration space must be active -in a containing ``For`` construct. A compile time ``static_assert`` will be -triggered if any of the arguments are undefined, indicating that something -is not correct. +RAJA offers two types of ``RAJA::statement::Lambda`` statements. The simplest +form, shown above, requires that each lambda expression passed to a +``RAJA::kernel`` method **must take an index argument for each iteration +space.** With this type of lambda statement, the entire iteration space must +be active in a surrounding ``For`` construct. A compile time ``static_assert`` +will be triggered if any of the arguments are undefined, indicating that +something is not correct. -The second type of lambda statement, an extension of the first, takes additional -template parameters which specify which iteration space indices are passed -as lambda arguments. The result is that a kernel lambda only needs to accept -iteration space index arguments that are used in the lambda body. 
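To make this concrete, here is a minimal sketch of a two-level nest using the
basic lambda statement form. The extents ``Ni``/``Nj`` and the arrays ``A``,
``B``, ``C`` are hypothetical, and sequential policies are assumed on both
levels::

  using EXEC_POL =
    RAJA::KernelPolicy<
      RAJA::statement::For<1, RAJA::seq_exec,    // outer loop over segment 1 (j)
        RAJA::statement::For<0, RAJA::seq_exec,  // inner loop over segment 0 (i)
          RAJA::statement::Lambda<0>
        >
      >
    >;

  RAJA::kernel<EXEC_POL>(
    RAJA::make_tuple(RAJA::TypedRangeSegment<int>(0, Ni),
                     RAJA::TypedRangeSegment<int>(0, Nj)),
    [=] (int i, int j) {   // one index argument per iteration space tuple entry
      C[j*Ni + i] = A[j*Ni + i] + B[j*Ni + i];
    });

Reordering the two ``For`` statements in ``EXEC_POL`` interchanges the loop
nest without changing the lambda expression.
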
+A second ``RAJA::statement::Lambda`` type, which is an extension of the first, +takes additional template parameters which specify which iteration spaces +are passed as lambda arguments. The result is that a kernel lambda only needs +to accept iteration space index arguments that are used in the lambda body. The kernel policy list with lambda arguments may be written as:: - using KERNEL_POL = - RAJA::KernelPolicy< RAJA::statement::For> > ... - > + > >; -The template parameter ``RAJA::Segs`` is used to specify which elements in the -segment tuple are used to pass arguments to a lambda. RAJA offers other -types such as ``RAJA::Offsets``, and ``RAJA::Params`` to identify offsets and -parameters in segments and param tuples respectively to be used as lambda -argumentsx. See :ref:`matrixmultiply-label` and -:ref:`matrixtransposelocalarray-label` for detailed examples. +The template parameter ``RAJA::Segs`` is used to specify indices from which +elements in the segment tuple are passed as arguments to the lambda, and in +which argument order. Here, we pass all segment indices so the lambda kernel +body definition could be identical to on passed to the previous RAJA version. +RAJA offers other types such as ``RAJA::Offsets``, and ``RAJA::Params`` to +identify offsets and parameters in segments and parameter tuples that could be +passed to ``RAJA::kernel`` methods. See :ref:`tut-matrixmultiply-label` +for an example. .. note:: Unless lambda arguments are specified in RAJA lambda statements, the loop index arguments for each lambda expression used in a RAJA - kernel loop body **must match** the contents of the - *iteration space tuple* in number, order, and type. Not all index - arguments must be used in a lambda, but they **all must appear** - in the lambda argument list and **all must be in active loops** to be - well-formed. In particular, your code will not compile if this is - not done correctly. If an argument is unused in a lambda expression, - you may include its type and omit its name in the argument list to - avoid compiler warnings just as one would do for a regular C++ + kernel loop body **must match** the contents of the + *iteration space tuple* in number, order, and type. Not all index + arguments must be used in a lambda, but they **all must appear** + in the lambda argument list and **all must be in active loops** to be + well-formed. In particular, your code will not compile if this is + not done correctly. If an argument is unused in a lambda expression, + you may include its type and omit its name in the argument list to + avoid compiler warnings just as one would do for a regular C++ method with unused arguments. -For RAJA nested loops implemented with ``RAJA::kernel``, as shown here, the -loop nest ordering is determined by the order of the nested policies, starting -with the outermost loop and ending with the innermost loop. +For RAJA nested loops implemented with ``RAJA::kernel``, as shown here, the +loop nest ordering is determined by the order of the nested policies, starting +with the outermost loop and ending with the innermost loop. -.. note:: The integer value that appears as the first parameter in each - ``RAJA::statement::For`` template indicates which iteration space - tuple entry or lambda index argument it corresponds to. **This - allows loop nesting order to be changed simply by changing the - ordering of the nested policy statements**. This is analogous to - changing the order of 'for-loop' statements in C-style nested loop +.. 
note:: The integer value that appears as the first parameter in each + ``RAJA::statement::For`` template indicates which iteration space + tuple entry or lambda index argument it corresponds to. **This + allows loop nesting order to be changed simply by changing the + ordering of the nested policy statements**. This is analogous to + changing the order of 'for-loop' statements in C-style nested loop code. -See :ref:`matmultkernel-label` for a complete example showing RAJA nested -loop functionality and :ref:`nestedreorder-label` for a detailed example -describing nested loop reordering. +.. note:: In general, RAJA execution policies for ``RAJA::forall`` and + ``RAJA::kernel`` are different. A summary of all RAJA execution + policies that may be used with ``RAJA::forall`` or ``RAJA::kernel`` + may be found in :ref:`feat-policies-label`. -.. note:: In general, RAJA execution policies for ``RAJA::forall`` and - ``RAJA::kernel`` are different. A summary of all RAJA execution - policies that may be used with ``RAJA::forall`` or ``RAJA::kernel`` - may be found in :ref:`policies-label`. - -Finally, a discussion of how to construct ``RAJA::KernelPolicy`` types and -available ``RAJA::statement`` types can be found in +A discussion of how to construct ``RAJA::KernelPolicy`` types and +available ``RAJA::statement`` types can be found in :ref:`loop_elements-kernelpol-label`. --------------------------------- -Team based loops (RAJA::launch) --------------------------------- +Please see the following tutorial sections for detailed examples that use +``RAJA::kernel``: + + * :ref:`tut-kernelnestedreorder-label` + * :ref:`tut-kernelexecpols-label` + * :ref:`tut-matrixtranspose-label` + * :ref:`tut-offsetlayout-label` + * :ref:`tut-matrixmultiply-label` -The *RAJA Teams* framework aims to unify thread/block based +------------------------------------------ +Hierarchical loops (RAJA::launch) +------------------------------------------ + +The ``RAJA::launch`` template is an alternative interface to +``RAJA::kernel`` that may be preferred for certain types of complex kernels +or based on coding style preferences. + +``RAJA::launch`` optionally allows either host or device execution +to be chosen at run time. The method takes an execution policy type that +will define the execution environment inside a lambda expression for a kernel +to be run on a host, device, or either. Kernel algorithms are written inside +main lambda expression using ``RAJA::loop`` methods. + +The ``RAJA::launch`` framework aims to unify thread/block based programming models such as CUDA/HIP/SYCL while maintaining portability on -host backends (OpenMP, sequential). When using the ``RAJA::kernel`` -interface, developers express all aspects of nested loop execution in the -execution policy type on which the ``RAJA::kernel`` method is templated. -In contrast, the ``RAJA::launch`` interface allows users to express +host back-ends (OpenMP, sequential). As we showed earlier, when using the +``RAJA::kernel`` interface, developers express all aspects of nested loop +execution in an execution policy type on which the ``RAJA::kernel`` method +is templated. +In contrast, the ``RAJA::launch`` interface allows users to express nested loop execution in a manner that more closely reflects how one would -write conventional nested C-style for-loop code. Additionally, *RAJA Teams* -introduces run-time host or device selectable kernel execution. The main -application of *RAJA Teams* is imperfectly nested loops. 
Using the -``RAJA::expt::launch method`` developers are provided with an execution -space enabling them to express algorithms in terms of nested -``RAJA::expt::loop`` statements:: +write conventional nested C-style for-loop code. For example, here is an +example of a ``RAJA::launch`` kernel that copies values from an array in +into a *shared memory* array:: - RAJA::expt::launch(select_CPU_or_GPU) - RAJA::expt::Grid(RAJA::expt::Teams(NE), RAJA::expt::Threads(Q1D)), - [=] RAJA_HOST_DEVICE (RAJA::expt::Launch ctx) { + RAJA::launch(select_CPU_or_GPU) + RAJA::LaunchParams(RAJA::Teams(NE), RAJA::Threads(Q1D)), + [=] RAJA_HOST_DEVICE (RAJA::Launch ctx) { - RAJA::expt::loop (ctx, RAJA::RangeSegment(0, teamRange), [&] (int bx) { + RAJA::loop (ctx, RAJA::RAJA::TypedRangeSegment(0, teamRange), [&] (int bx) { RAJA_TEAM_SHARED double s_A[SHARE_MEM_SIZE]; - RAJA::expt::loop (ctx, RAJA::RangeSegment(0, threadRange), [&] (int tx) { + RAJA::loop (ctx, RAJA::RAJA::TypedRangeSegment(0, threadRange), [&] (int tx) { s_A[tx] = tx; }); @@ -301,41 +328,55 @@ space enabling them to express algorithms in terms of nested )}; }); - -The underlying idea of *RAJA Teams* is to enable developers to express nested -parallelism in terms of teams and threads. Similar to the CUDA programming model, -development is done using a collection of threads, threads are grouped into teams. -Using the ``RAJA::expt::loop`` methods iterations of the loop may be executed by threads -or teams (depending on the execution policy). The launch context serves to synchronize -threads within the same team. The *RAJA Teams* abstraction consist of three main concepts. - - * *Launch Method*: creates an execution space in which developers may express - their algorithm in terms of nested ``RAJA::expt::loop`` statements. The loops are then - executed by threads or thread-teams. The method is templated on both a host - and device execution space and enables run-time selection of the execution environment. - - * *Resources*: holds a number of teams and threads (akin to CUDA blocks/threads). - - * *Loops*: are used to express hierarchical parallelism. Work within a loop is mapped to either teams or threads. Team shared memory - is available by using the ``RAJA_TEAM_SHARED`` macro. Team shared memory enables - threads in a given team to share data. In practice, team policies are typically - aliases for RAJA GPU block policies in the x,y,z dimensions (for example cuda_block_direct), - while thread policies are aliases for RAJA GPU thread policies (for example cuda_thread_direct) - x,y,z dimensions. On the host, teams and threads may be mapped to sequential - loop execution or OpenMP threaded regions. - -The team loop interface combines concepts from ``RAJA::forall`` and ``RAJA::kernel``. -Various policies from ``RAJA::kernel`` are compatible with the ``RAJA Teams`` -framework. + +The idea underlying ``RAJA::launch`` is to enable developers to express +hierarchical parallelism in terms of teams and threads. Similar to the CUDA +programming model, development is done using a collection of threads, and +threads are grouped into teams. Using the ``RAJA::loop`` methods +iterations of the loop may be executed by threads or teams depending on the +execution policy type. The launch context serves to synchronize threads within +the same team. The ``RAJA::launch`` interface has three main concepts: + + * ``RAJA::launch`` template. This creates an execution environment in + which a kernel implementation is written using nested ``RAJA::loop`` + statements. 
The launch policy template parameter used with the + ``RAJA::launch`` method enables specification of both a host and + device execution environment, which enables run time selection of + kernel execution. + + * ``RAJA::LaunchParams`` type. This type takes a number of teams and and a + number of threads as arguments. + + * ``RAJA::loop`` template. These are used to define hierarchical + parallel execution of a kernel. Operations within a loop are mapped to + either teams or threads based on the execution policy template parameter + provided. + +Team shared memory is available by using the ``RAJA_TEAM_SHARED`` macro. Team +shared memory enables threads in a given team to share data. In practice, +team policies are typically aliases for RAJA GPU block policies in the +x,y,z dimensions, while thread policies are aliases for RAJA GPU thread +policies in the x,y,z dimensions. In a host execution environment, teams and +threads may be mapped to sequential loop execution or OpenMP threaded regions. +Often, the ``RAJA::LaunchParams`` method can take an empty argument list for +host execution. + +Please see the following tutorial sections for detailed examples that use +``RAJA::launch``: + + * :ref:`tut-launchintro-label` + * :ref:`tut-launchexecpols-label` + * :ref:`tut-matrixtranspose-label` .. _loop_elements-CombiningAdapter-label: --------------------------------- -MultiDimensional loops using Simple loop APIs (RAJA::CombiningAdapter) --------------------------------- +------------------------------------------------------------------------ +Multi-dimensional loops using simple loop APIs (RAJA::CombiningAdapter) +------------------------------------------------------------------------ A ``RAJA::CombiningAdapter`` object provides ways to run perfectly nested loops -with simple loop APIs like ``RAJA::forall`` and ``RAJA::WorkGroup`` :ref:`workgroup-label`. +with simple loop APIs like ``RAJA::forall`` and those described in +:ref:`workgroup-label`. To introduce the ``RAJA ::CombiningAdapter`` interface, consider a (N+1)-level C-style loop nest:: @@ -357,13 +398,13 @@ loops and pass the adapter to a ``RAJA::forall`` statement to execute them:: RAJA::forall(adapter.getRange(), adapter); A ``RAJA::CombiningAdapter`` object is a template combining a loop body and -iteration spaces. The maker function template takes a lambda expression for the -loop body and an arbitrary number of segment arguments. It provides a flattened -index space via the ``getRange`` method that can be passed as the iteration space -to the simple loop API. The object itself can be passed into the loop API as the -loop body. The object's call operator does the conversion of the flat single -dimensional index into the multi-dimensional index space, calling the provided -lambda with the appropriate indices. - -.. note:: CombiningAdapter currently only supports ``RAJA::RangeSegment`` and +iteration spaces. The ``RAJA::make_CombingingAdapter`` template method takes +a lambda expression for the loop body and an arbitrary number of index +arguments. It provides a *flattened* iteration space via the ``getRange`` +method that can be passed as the iteration space to the ``RAJA::forall`` +method, for example. The object's call operator does the conversion of the +flat single dimensional index into the multi-dimensional index space, calling +the provided lambda with the appropriate indices. + +.. note:: CombiningAdapter currently only supports ``RAJA::TypedRangeSegment`` segments. 
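As a hedged sketch, flattening a two-level nest could look like the following.
The array and extent names are hypothetical, and the maker function is assumed
to be spelled ``RAJA::make_CombiningAdapter`` and to take the loop body lambda
followed by the segments, as described above::

  // Build an adapter that flattens the (i, j) iteration space.
  auto adapter = RAJA::make_CombiningAdapter(
    [=] (int i, int j) {
      C[i*Nj + j] = A[i*Nj + j] + B[i*Nj + j];
    },
    RAJA::TypedRangeSegment<int>(0, Ni),
    RAJA::TypedRangeSegment<int>(0, Nj));

  // Run the flattened loop with a simple loop API; the adapter recovers (i, j).
  RAJA::forall<RAJA::omp_parallel_for_exec>(adapter.getRange(), adapter);
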
diff --git a/docs/sphinx/user_guide/feature/plugins.rst b/docs/sphinx/user_guide/feature/plugins.rst index 4773c2bf17..c7cd3c63b0 100644 --- a/docs/sphinx/user_guide/feature/plugins.rst +++ b/docs/sphinx/user_guide/feature/plugins.rst @@ -6,7 +6,7 @@ .. ## SPDX-License-Identifier: (BSD-3-Clause) .. ## -.. _plugins-label: +.. _feat-plugins-label: ======== Plugins @@ -36,50 +36,41 @@ can be added to a project as easily as making a shared object file and setting ``RAJA_PLUGINS`` to the appropriate path. ^^^^^^^^^^^^^^^^^^^ -Quick Start Guide +Plugins Quick Start ^^^^^^^^^^^^^^^^^^^ **Static Plugins** -1. Build RAJA normally. - -2. Either use an ``#include`` statement within the code or compiler flags to load your plugin file with your project at compile time. A brief example of this would be something like ``g++ project.cpp plugin.cpp -lRAJA -fopenmp -ldl -o project``. - -3. When you run your project, your plugin should work. +#. Build RAJA normally. +#. Use an ``#include`` statement in your code or pass options to the compiler to load your plugin file with your project at compile time. For example: ``g++ project.cpp plugin.cpp -lRAJA -ldl -o project``. +#. When you run your project, your plugin should work. **Dynamic Plugins** -1. Build RAJA normally. - -2. Compile your plugin to be a shared object file with a .so extension. A brief -example of this would be something like ``g++ plugin.cpp -lRAJA -fopenmp -fPIC -shared -o plugin.so``. - -3. Set the environment variable ``RAJA_PLUGINS`` to be the path of your .so file. -This can either be the path to its directory or to the shared object file itself. -If the path is to a directory, it will attempt to load all .so files in that -directory. - -4. When you run your project, your plugins should work. +#. Build RAJA normally. +#. Compile your plugin to be a shared object file with ``.so`` extension. For example: ``g++ plugin.cpp -lRAJA -fPIC -shared -o plugin.so``. +#. Set the environment variable ``RAJA_PLUGINS`` to the path of your ``.so`` file. This can either be the path to its directory or to the shared object file itself. If the path is a directory, all ``.so`` files in that directory will be loaded. +#. When you run your project, your plugins should work. ^^^^^^^^^^^^^^^^^^^^^^^^^^^ Interfacing with Plugins ^^^^^^^^^^^^^^^^^^^^^^^^^^^ The RAJA plugin API allows for limited interfacing between a project and a -plugin. There are a couple of functions that allow for this to take place, +plugin. There are a couple of methods to call in your code: ``init_plugins`` and ``finalize_plugins``. These will call the corresponding -``init`` and ``finalize`` functions, respectively, of *every* currently loaded +``init`` and ``finalize`` methods, respectively, of *every* currently loaded plugin. It's worth noting that plugins don't require either an init or finalize -function by default. +method by default. -* ``RAJA::util::init_plugins();`` - Will call the ``init`` function of every +* ``RAJA::util::init_plugins();`` will call the ``init`` method of every currently loaded plugin. -* ``RAJA::util::init_plugins("path/to/plugins");`` - Does the same as the above - call to ``init_plugins``, but will also dynamically load plugins located at - the path specified. +* ``RAJA::util::init_plugins("path/to/plugins");`` will call the ``init`` + method of every currently loaded plugin and, in addition, will also + dynamically load plugins located at the given path. 
-* ``RAJA::util::finalize_plugins();`` - Will call the ``finalize`` function of +* ``RAJA::util::finalize_plugins();`` will call the ``finalize`` method of every currently loaded plugin. @@ -88,51 +79,56 @@ Creating Plugins For RAJA -------------------------- Plugins are classes derived from the ``RAJA::util::PluginStrategy`` base class -and implement the required functions for the API. An example implementation -can be found at the bottom of this page. +and implement the required virtual methods for the API. An example +implementation can be found at the bottom of this page. -^^^^^^^^^^^ -Functions -^^^^^^^^^^^ +^^^^^^^^^^^^^^^^^^^^ +Plugin API methods +^^^^^^^^^^^^^^^^^^^^ -The ``preLaunch`` and ``postLaunch`` functions are automatically called by -RAJA before and after executing a kernel that uses ``RAJA::forall`` or -``RAJA::kernel`` methods. +The following list summarizes the virtual methods in the +``RAJA::util::PluginStrategy`` base class. -* ``void init(const PluginOptions& p) override {}`` - runs on all plugins when - a user calls ``init_plugins`` +* ``void init(const PluginOptions& p) override {}`` is called on all plugins + when a user calls ``init_plugins()`` -* ``void preCapture(const PluginContext& p) override {}`` - is called before - lambda capture in ``RAJA::forall`` or ``RAJA::kernel``. +* ``void preCapture(const PluginContext& p) override {}`` is called before + lambda capture in RAJA kernel execution methods. -* ``void postCapture(const PluginContext& p) override {}`` - is called after - lambda capture in ``RAJA::forall`` or ``RAJA::kernel``. +* ``void postCapture(const PluginContext& p) override {}`` is called after + lambda capture in RAJA kernel execution methods. -* ``void preLaunch(const PluginContext& p) override {}`` - is called before - ``RAJA::forall`` or ``RAJA::kernel`` runs a kernel. +* ``void preLaunch(const PluginContext& p) override {}`` is called before + a RAJA kernel execution method runs a kernel. -* ``void postLaunch(const PluginContext& p) override {}`` - is called after - ``RAJA::forall`` or ``RAJA::kernel`` runs a kernel. +* ``void postLaunch(const PluginContext& p) override {}`` is called after + a RAJA kernel execution method runs a kernel. -* ``void finalize() override {}`` - Runs on all plugins when a user calls +* ``void finalize() override {}`` is called on all plugins when a user calls ``finalize_plugins``. This will also unload all currently loaded plugins. -``init`` and ``finalize`` are never called by RAJA by default and are only -called when a user calls ``RAJA::util::init_plugins()`` or -``RAJA::util::finalize_plugin()``, respectively. +.. note:: The pre/post methods above are automatically called + before and after executing a kernel with ``RAJA::forall`` or + ``RAJA::kernel`` kernel execution methods. + +.. note:: The ``init`` and ``finalize`` methods are never called by + default and are only called when a user calls + ``RAJA::util::init_plugins()`` or ``RAJA::util::finalize_plugin()``, + respectively. ^^^^^^^^^^^^^^^^^ Static Loading ^^^^^^^^^^^^^^^^^ -If a plugin is to be loaded into a project at compile time, adding the -following method call will add the plugin to the RAJA ``PluginRegistry`` and will -be loaded every time the compiled executable is run. 
This requires the plugin -to be loaded with either an ``#include`` statement within a project or with -source code line such as:: +If a plugin is to be loaded into a project at compile time, it must be +loaded with either an ``#include`` statement in the project source code or +by calling the following method in the project source code, which adds the +plugin to the RAJA ``PluginRegistry`::` static RAJA::util::PluginRegistry::add P("Name", "Description"); +In either case, the plugin will be loaded every time the compiled +project executable is run. ^^^^^^^^^^^^^^^^^ Dynamic Loading @@ -142,39 +138,40 @@ If a plugin is to be dynamically loaded in a project at run time, the RAJA plugin API requires a few conditions to be met. The following must be true about the plugin, not necessarily of the project using it. -1. **The plugin must have the following factory function.** This will return - a pointer to an instance of your plugin. Note that using ``extern "C"`` is - required to search for the ``getPlugin()`` method call for the dynamically - loaded plugin correctly:: +#. The plugin must have the following factory method that returns + a pointer to an instance of your plugin:: - extern "C" RAJA::util::PluginStrategy *getPlugin () + extern "C" RAJA::util::PluginStrategy* getPlugin() { return new MyPluginName; } + Note that using ``extern "C"`` is required to search for the ``getPlugin()`` + method call for the dynamically loaded plugin correctly. -2. **The plugin must be compiled to be a shared object with a .so extension.** - A simple example containing required flags would be: ``g++ plugin.cpp -lRAJA -fopenmp -fPIC -shared -o plugin.so``. +#. The plugin must be compiled to be a shared object with a ``.so`` extension. + For example: ``g++ plugin.cpp -lRAJA -fPIC -shared -o plugin.so``. - At the moment, RAJA will only attempt to load files with .so extensions. + At the moment, RAJA will only attempt to load files with ``.so`` extensions. It's worth noting why these flags (or their equivalents) are important. - * ``-lRAJA -fopenmp`` are standard flags for compiling the RAJA library. + * ``-lRAJA`` is a standard flag for linking the RAJA library. * ``-fPIC`` tells the compiler to produce *position independent code*, which prevents conflicts in the address space of the executable. * ``-shared`` will let the compiler know that you want the resulting object file to be shared, removing the need for a *main* as well as - giving dynamically loaded executables access to functions flagged + giving dynamically loaded executables access to methods flagged with ``extern "C"``. -3. **The** ``RAJA_PLUGINS`` **environment variable has been set**, or a user - has made a call to ``RAJA::util::init_plugins("path");`` with a path - specified to either a directory or a .so file. It's worth noting that these - are not mutually exclusive. RAJA will look for plugins based on the - environment variable on program startup and new plugins may be loaded after - that by calling the ``init_plugins()`` method. +#. The ``RAJA_PLUGINS`` environment variable must be set, or the project code + must call ``RAJA::util::init_plugins("path");``. Either of these approaches + is required to supply the path to either a directory containing the plugin + or its ``.so`` file. It's worth noting that these are not mutually + exclusive. RAJA will look for plugins based on the environment variable on + program startup and new plugins may be loaded after that by calling the + ``init_plugins()`` method. 
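To tie these pieces together, here is a minimal sketch of a plugin that can be
registered statically or compiled into a shared object for dynamic loading.
The class name, counter, and include path are hypothetical/assumed; only the
``PluginStrategy`` methods and registration calls summarized above are taken
from the API::

  #include "RAJA/util/PluginStrategy.hpp"   // header path assumed
  #include <iostream>

  class CounterPlugin : public RAJA::util::PluginStrategy
  {
  public:
    void preLaunch(const RAJA::util::PluginContext&) override {
      ++num_launches;                       // called before each RAJA kernel runs
    }
    void postLaunch(const RAJA::util::PluginContext&) override {
      std::cout << "RAJA kernels launched: " << num_launches << std::endl;
    }
  private:
    int num_launches = 0;
  };

  // Static loading: add the plugin to the RAJA PluginRegistry.
  static RAJA::util::PluginRegistry::add<CounterPlugin>
    P("Counter", "Counts RAJA kernel launches");

  // Dynamic loading: factory method looked up in the .so file.
  extern "C" RAJA::util::PluginStrategy* getPlugin()
  {
    return new CounterPlugin;
  }
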
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -195,16 +192,16 @@ CHAI Plugin ^^^^^^^^^^^^^^^^^^^^^ RAJA provides abstractions for parallel execution, but does not support -a memory model for managing data in heterogeneous memory spaces. -The `CHAI library `_ provides an array abstraction -that integrates with RAJA to enable automatic copying of data at runtime to the -proper execution memory space for a RAJA-based kernel based on the -RAJA exection policy used to execute the kernel. Then, the data can be accessed -inside the kernel as needed. +a memory model for managing data in heterogeneous memory spaces. One +option for managing such data is to use `CHAI `_, +which provides an array abstraction that integrates with RAJA to enable +automatic copying of data at runtime to the proper execution memory space for +a RAJA-based kernel determined by the RAJA execution policy used to execute the +kernel. Then, the data can be accessed inside the kernel as needed. To build CHAI with RAJA integration, you need to download and install CHAI with -the ``ENABLE_RAJA_PLUGIN`` option turned on. Please see the `CHAI project -`_ for details. +the ``ENABLE_RAJA_PLUGIN`` option turned on. Please see +`CHAI `_ for details. After CHAI has been built with RAJA support enabled, applications can use CHAI ``ManangedArray`` objects to access data inside a RAJA kernel. For example:: @@ -215,7 +212,7 @@ After CHAI has been built with RAJA support enabled, applications can use CHAI array[i] = i * 2.0f; }); - RAJA::forall(0, 1000, [=] (int i) { + RAJA::forall(0, 1000, [=] (int i) { std::cout << "array[" << i << "] is " << array[i] << std::endl; }); @@ -223,8 +220,8 @@ Here, the data held by ``array`` is allocated on the host CPU. Then, it is initialized on a CUDA GPU device. CHAI sees that the data lives on the CPU and is needed in a GPU device data environment since it is used in a kernel that will run with a RAJA CUDA execution policy. So it copies the data from -CPU to GPU, making it available for access in the RAJA kernel. Next, -it is printed in the second kernel which runs on the CPU (indicated by the +CPU memory to GPU memory, making it available for access in the RAJA kernel. +The data is printed in the second kernel which runs on the CPU (indicated by the RAJA sequential execution policy). So CHAI copies the data back to the host CPU. All necessary data copies are done transparently on demand for each kernel. diff --git a/docs/sphinx/user_guide/feature/policies.rst b/docs/sphinx/user_guide/feature/policies.rst index 37bd07f769..0588c5a900 100644 --- a/docs/sphinx/user_guide/feature/policies.rst +++ b/docs/sphinx/user_guide/feature/policies.rst @@ -6,21 +6,21 @@ .. ## SPDX-License-Identifier: (BSD-3-Clause) .. ## -.. _policies-label: +.. _feat-policies-label: ================== Policies ================== -This section describes RAJA policies for loop kernel execution, -scans, sorts, reductions, atomics, etc. Each policy is a type that is passed to -a RAJA template method or class to specialize its behavior. Typically, the -policy indicates which programming model back-end to use and sometimes -specifies additional information about the execution pattern, such as -number of CUDA threads per thread block, whether execution is synchronous -or asynchronous, etc. +RAJA kernel execution methods take an execution policy type template parameter +to specialize execution behavior. 
Typically, the policy indicates which +programming model back-end to use and other information about the execution +pattern, such as number of CUDA threads per thread block, whether execution is +synchronous or asynchronous, etc. This section describes RAJA policies for +loop kernel execution, scans, sorts, reductions, atomics, etc. Please +detailed examples in :ref:`tutorial-label` for a variety of use cases. -As RAJA functionality evolves, new policies will be added and some may +As RAJA functionality evolves, new policies are added and some may be redefined and to work in new ways. .. note:: * All RAJA policies are in the namespace ``RAJA``. @@ -81,11 +81,11 @@ policies. Typically, they work by providing an *outer policy* and an flexibility to create more complex execution patterns. -.. note:: To control the number of threads used by OpenMP policies +.. note:: To control the number of threads used by OpenMP policies, set the value of the environment variable 'OMP_NUM_THREADS' (which is fixed for duration of run), or call the OpenMP routine 'omp_set_num_threads(nthreads)' in your application, which allows - one to change the number of threads at runtime. + one to change the number of threads at run time. The full policies are described in the following table. Partial policies are described in other tables below. @@ -167,26 +167,24 @@ a template argument as described above. omp_for_runtime_exec forall, Same as applying kernel (For) 'omp for schedule(runtime)' + omp_parallel_collapse_exec kernel Use in Collapse statement + (Collapse + to parallelize multiple + ArgList) loop levels in loop nest + indicated using ArgList ====================================== ============= ========================== -.. important:: **RAJA only provides a nowait policy option for static schedule** - since that is the only schedule case that can be used with - nowait and be correct in general when chaining multiple loops - in a single parallel region. Paraphrasing the OpenMP standard: +.. important:: **RAJA only provides a nowait policy option for static + scheduling** since that is the only schedule case that can be + used with nowait and be correct in general when executing + multiple loops in a single parallel region. Paraphrasing the + OpenMP standard: *programs that depend on which thread executes a particular loop iteration under any circumstance other than static schedule are non-conforming.* .. note:: As in the RAJA full policies for OpenMP scheduling, the ``ChunkSize`` is optional. If not provided, the default chunk size that the OpenMP - implementation applies will be used. For this case, - the RAJA policy syntax is - ``omp_for_{static|dynamic|guided}_exec< >``, which will result - in the OpenMP pragma - ``omp for schedule({static|dynamic|guided})`` being applied. - Similarly, for ``nowait`` static policy, the RAJA policy syntax is - ``omp_for_nowait_static_exec< >``, which will result in the OpenMP - pragma ``omp for schedule(static) nowait`` being applied. + implementation applies will be used. .. note:: As noted above, RAJA inner OpenMP policies must only be used within an **existing** parallel region to work properly. Embedding an inner @@ -230,8 +228,8 @@ a template argument as described above. Threading Building Block (TBB) Parallel CPU Policies ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RAJA provides a basic set of TBB execution policies for users who would like -to try it. 
+RAJA provides a basic set of TBB execution policies for use with the +RAJA TBB back-end, which supports a subset of RAJA features. ====================================== ============= ========================== Threading Building Blocks Policies Works with Brief description @@ -261,7 +259,7 @@ to try it. // do some more parallel work - This allows changing number of workers at runtime. + This allows changing number of workers at run time. GPU Policies for CUDA and HIP @@ -301,7 +299,7 @@ policies have the prefix ``hip_``. threads in y-dimension cuda/hip_thread_z_loop kernel (For) Same as above, but for threads in z-dimension - cuda/hip_flatten_block_threads_{xyz} Teams (Loop) Reshapes threads in a + cuda/hip_flatten_block_threads_{xyz} Launch (Loop) Reshapes threads in a multi-dimensional thread team into one-dimension, accepts any permutation @@ -323,14 +321,14 @@ policies have the prefix ``hip_``. blocks in y-dimension cuda/hip_block_z_loop kernel (For) Same as above, but use blocks in z-dimension - cuda/hip_global_thread_x Teams (Loop) Creates a unique thread - id for each thread on the - x dimension of the grid + cuda/hip_global_thread_x Launch (Loop) Creates a unique thread + id for each thread on + x-dimension of the grid (expt namespace) - cuda/hip_global_thread_y Teams (Loop) Same as above, but uses + cuda/hip_global_thread_y Launch (Loop) Same as above, but uses threads in y-dimension (expt namespace) - cuda/hip_global_thread_z Teams (Loop) Same as above, but uses + cuda/hip_global_thread_z Launch (Loop) Same as above, but uses threads in z-dimension (expt namespace) cuda/hip_warp_direct kernel (For) Map work to threads @@ -391,8 +389,8 @@ Several notable constraints apply to RAJA CUDA/HIP *thread-direct* policies. different thread dimensions), the product of sizes of the corresponding iteration spaces cannot be greater than the maximum allowable threads per block. Typically, this is - equ:math:`\leq` 1024; e.g., attempting to launch a CUDA kernel - with more than 1024 threads per block will cause the CUDA runtime + 1024 threads per block. Attempting to execute a kernel with more + than the maximum allowed the CUDA runtime to complain about *illegal launch parameters.* * **Thread-direct policies are recommended only for certain loop patterns, such as tiling.** @@ -521,16 +519,14 @@ device, for example. They are summarized in the following table. RAJA IndexSet Execution Policies ----------------------------------------------------- -When an IndexSet iteration space is used in RAJA, such as passing an IndexSet -to a ``RAJA::forall`` method, an index set execution policy is required. An -index set execution policy is a **two-level policy**: an 'outer' policy for -iterating over segments in the index set, and an 'inner' policy used to -execute the iterations defined by each segment. An index set execution policy -type has the form:: - - RAJA::ExecPolicy< segment_iteration_policy, segment_execution_policy> +When an IndexSet iteration space is used in RAJA by passing an IndexSet +to a ``RAJA::forall`` method, for example, an index set execution policy is +required. An index set execution policy is a **two-level policy**: an 'outer' +policy for iterating over segments in the index set, and an 'inner' policy +used to execute the iterations defined by each segment. An index set execution +policy type has the form:: -See :ref:`indexsets-label` for more information. 
+ RAJA::ExecPolicy< segment_iteration_policy, segment_execution_policy > In general, any policy that can be used with a ``RAJA::forall`` method can be used as the segment execution policy. The following policies are @@ -559,7 +555,7 @@ tbb_segit Iterate over index set segments in Parallel Region Policies ------------------------- -Earlier, we discussed an example using the ``RAJA::region`` construct to +Earlier, we discussed using the ``RAJA::region`` construct to execute multiple kernels in an OpenMP parallel region. To support source code portability, RAJA provides a sequential region concept that can be used to surround code that uses execution back-ends other than OpenMP. For example:: @@ -625,7 +621,8 @@ sycl_reduce any SYCL Reduction in a SYCL kernel (device ======================= ============= ========================================== .. note:: RAJA reductions used with SIMD execution policies are not - guaranteed to generate correct results at present. + guaranteed to generate correct results. So they should not be used + for kernels containing reductions. .. _atomicpolicy-label: @@ -640,34 +637,37 @@ type. Atomic policy types are distinct from loop execution policy types. policy for the kernel in which the atomic operation is used. The following table summarizes RAJA atomic policies and usage. -========================= ============= ======================================== -Atomic Policy Loop Policies Brief description - to Use With -========================= ============= ======================================== -seq_atomic seq_exec, Atomic operation performed in a - loop_exec non-parallel (sequential) kernel. -omp_atomic any OpenMP Atomic operation performed in an OpenMP. - policy multithreading or target kernel; i.e., - apply ``omp atomic`` pragma. -cuda/hip_atomic any CUDA/HIP Atomic operation performed in a CUDA/HIP - policy kernel. -cuda/hip_atomic_explicit any CUDA/HIP Atomic operation performed in a CUDA/HIP - policy kernel that may also be used in a host - execution context. The atomic policy - takes a host atomic policy template - argument. See additional explanation - and example below. -builtin_atomic seq_exec, Compiler *builtin* atomic operation. - loop_exec, - any OpenMP - policy -auto_atomic seq_exec, Atomic operation *compatible* with loop - loop_exec, execution policy. See example below. - any OpenMP Can not be used inside cuda/hip - policy, explicit atomic policies. - any CUDA/HIP - policy -========================= ============= ======================================== +============================= ============= ======================================== +Atomic Policy Loop Policies Brief description + to Use With +============================= ============= ======================================== +seq_atomic seq_exec, Atomic operation performed in a + loop_exec non-parallel (sequential) kernel. +omp_atomic any OpenMP Atomic operation performed in an OpenMP. + policy multithreading or target kernel; i.e., + apply ``omp atomic`` pragma. +cuda/hip/sycl_atomic any Atomic operation performed in a + CUDA/HIP/SYCL CUDA/HIP/SYCL kernel. + policy + +cuda/hip_atomic_explicit any CUDA/HIP Atomic operation performed in a CUDA/HIP + policy kernel that may also be used in a host + execution context. The atomic policy + takes a host atomic policy template + argument. See additional explanation + and example below. +builtin_atomic seq_exec, Compiler *builtin* atomic operation. 
+ loop_exec, + any OpenMP + policy +auto_atomic seq_exec, Atomic operation *compatible* with loop + loop_exec, execution policy. See example below. + any OpenMP Can not be used inside cuda/hip + policy, explicit atomic policies. + any + CUDA/HIP/SYCL + policy +============================= ============= ======================================== .. note:: The ``cuda_atomic_explicit`` and ``hip_atomic_explicit`` policies take a host atomic policy template parameter. They are intended to @@ -680,10 +680,9 @@ Here is an example illustrating use of the ``cuda_atomic_explicit`` policy:: RAJA::atomicAdd< RAJA::cuda_atomic_explicit >(&sum, 1); }; - RAJA::forall< RAJA::cuda_exec >(RAJA::RangeSegment seg(0, N), kernel); + RAJA::forall< RAJA::cuda_exec >(RAJA::TypedRangeSegment seg(0, N), kernel); - RAJA::forall< RAJA::omp_parallel_for_exec >(RAJA::RangeSegment seg(0, N), - kernel); + RAJA::forall< RAJA::omp_parallel_for_exec >(RAJA::TypedRangeSegment seg(0, N), kernel); In this case, the atomic operation knows when it is compiled for the device in a CUDA kernel context and the CUDA atomic operation is applied. Similarly @@ -692,7 +691,7 @@ used and the OpenMP version of the atomic operation is applied. Here is an example illustrating use of the ``auto_atomic`` policy:: - RAJA::forall< RAJA::cuda_exec >(RAJA::RangeSegment seg(0, N), + RAJA::forall< RAJA::cuda_exec >(RAJA::TypedRangeSegment seg(0, N), [=] RAJA_DEVICE (RAJA::Index_type i) { RAJA::atomicAdd< RAJA::auto_atomic >(&sum, 1); @@ -705,7 +704,8 @@ execution policy was used, the OpenMP version of the atomic operation would be used. .. note:: * There are no RAJA atomic policies for TBB (Intel Threading Building - Blocks) execution contexts at present. + Blocks) execution contexts since reductions are not supported + for the RAJA TBB back-end. * The ``builtin_atomic`` policy may be preferable to the ``omp_atomic`` policy in terms of performance. @@ -717,7 +717,7 @@ Local Array Memory Policies ``RAJA::LocalArray`` types must use a memory policy indicating where the memory for the local array will live. These policies are described -in :ref:`local_array-label`. +in :ref:`feat-local_array-label`. The following memory policies are available to specify memory allocation for ``RAJA::LocalArray`` objects: @@ -743,18 +743,20 @@ of Statements that are composed in the order that they appear in the kernel policy to construct a kernel. A Statement may contain an enclosed StatmentList. Thus, a ``RAJA::KernelPolicy`` type is really just a StatementList. The main Statement types provided by RAJA are ``RAJA::statement::For`` and -``RAJA::statement::Lambda``, that we have shown above. A 'For' Statement -indicates a for-loop structure and takes three template arguments: -'ArgId', 'ExecPolicy', and 'EnclosedStatements'. The ArgID identifies the -position of the item it applies to in the iteration space tuple argument to the -``RAJA::kernel`` method. The ExecPolicy is the RAJA execution policy to -use on that loop/iteration space (similar to ``RAJA::forall``). -EnclosedStatements contain whatever is nested within the template parameter -list to form a StatementList, which will be executed for each iteration of -the loop. The ``RAJA::statement::Lambda`` invokes the lambda -corresponding to its position (LambdaID) in the sequence of lambda expressions -in the ``RAJA::kernel`` argument list. For example, a simple sequential -for-loop:: +``RAJA::statement::Lambda``, that we discussed in +:ref:`loop_elements-kernel-label`. 
+A ``RAJA::statement::For`` type +indicates a for-loop structure. The ``ArgID`` parameter is an integral constant +that identifies the position of the iteration space in the iteration space +tuple passed to the ``RAJA::kernel`` method to be used for the loop. The +``ExecPolicy`` is the RAJA execution policy to use on the loop, which is +similar to ``RAJA::forall`` usage. The ``EnclosedStatements`` type is a +nested template parameter that contains whatever is needed to execute the +kernel and which forms a valid StatementList. The +``RAJA::statement::Lambda`` +type invokes the lambda expression corresponding to its position 'LambdaID' +in the sequence of lambda expressions in the ``RAJA::kernel`` argument list. +For example, a simple sequential for-loop:: for (int i = 0; i < N; ++i) { // loop body @@ -770,7 +772,7 @@ can be represented using the RAJA kernel interface as:: >; RAJA::kernel( - RAJA::make_tuple(N_range), + RAJA::make_tuple(range), [=](int i) { // loop body } @@ -787,15 +789,16 @@ RAJA::kernel Statement Types The list below summarizes the current collection of statement types that can be used with ``RAJA::kernel`` and ``RAJA::kernel_param``. More detailed explanation along with examples of how they are used can be found in -:ref:`tutorial-label`. +the ``RAJA::kernel`` examples in :ref:`tutorial-label`. -.. note:: * ``RAJA::kernel_param`` functions similar to ``RAJA::kernel`` - except that the second argument is a *tuple of parameters* used - in a kernel for local arrays, thread local variables, tiling - information, etc. +.. note:: All of the statement types described below are in the namespace + ``RAJA::statement``. For brevity, we omit the namespaces in + the discussion in this section. -.. note:: * All of the statement types described below are in the namespace - ``RAJA::statement``. For breavity, we omit the namespaces. +.. note:: ``RAJA::kernel_param`` functions similarly to ``RAJA::kernel`` + except that the second argument is a *tuple of parameters* used + in a kernel for local arrays, thread local variables, tiling + information, etc. Several RAJA statements can be specialized with auxilliary types, which are described in :ref:`auxilliarypolicy_label`. @@ -814,11 +817,11 @@ There is one statement specific to OpenMP kernels. * ``OmpSyncThreads`` applies the OpenMP ``#pragma omp barrier`` directive. -Statement types that lauch CUDA or HIP GPU kernels are listed next. They work +Statement types that launch CUDA or HIP GPU kernels are listed next. They work similarly for each back-end and their names are distinguished by the prefix ``Cuda`` or ``Hip``. For example, ``CudaKernel`` or ``HipKernel``. -* ``Cuda/HipKernel< EnclosedStatements>`` launches ``EnclosedStatements' as a GPU kernel; e.g., a loop nest where the iteration spaces of each loop level are associated with threads and/or thread blocks as described by the execution policies applied to them. This kernel launch is synchronous. +* ``Cuda/HipKernel< EnclosedStatements>`` launches ``EnclosedStatements`` as a GPU kernel; e.g., a loop nest where the iteration spaces of each loop level are associated with threads and/or thread blocks as described by the execution policies applied to them. This kernel launch is synchronous. * ``Cuda/HipKernelAsync< EnclosedStatements>`` asynchronous version of Cuda/HipKernel. @@ -838,11 +841,11 @@ similarly for each back-end and their names are distinguished by the prefix * ``Cuda/HipKernelExpAsync`` asynchronous version of Cuda/HipKernelExp. 
-* ``Cuda/HipSyncThreads`` invokes CUDA or HIP '__syncthreads()' barrier. +* ``Cuda/HipSyncThreads`` invokes CUDA or HIP ``__syncthreads()`` barrier. -* ``Cuda/HipSyncWarp`` invokes CUDA '__syncwarp()' barrier. **Note: warp sync is not supported, so the HIP variant is a no-op. +* ``Cuda/HipSyncWarp`` invokes CUDA ``__syncwarp()`` barrier. Warp sync is not supported in HIP, so the HIP variant is a no-op. -Statement types that lauch SYCL kernels are listed next. +Statement types that launch SYCL kernels are listed next. * ``SyclKernel`` launches ``EnclosedStatements`` as a SYCL kernel. This kernel launch is synchronous. @@ -858,14 +861,14 @@ e.g., by allowing CPU cache blocking or use of GPU shared memory. * ``ForICount< ArgId, ParamId, ExecPolicy, EnclosedStatements >`` abstracts an inner for-loop within an outer tiling loop **where it is necessary to obtain the local iteration index in each tile**. The ``ArgId`` indicates which entry in the iteration space tuple to which the loop applies and the ``ParamId`` indicates the position of the tile index parameter in the parameter tuple. The ``ExecPolicy`` and ``EnclosedStatements`` are similar to what they represent in a ``statement::For`` type. It is often advantageous to use local arrays for data accessed in tiled loops. -RAJA provides a statement for allocating data in a :ref:`local_array-label` +RAJA provides a statement for allocating data in a :ref:`feat-local_array-label` object according to a memory policy. See :ref:`localarraypolicy-label` for more information about such policies. * ``InitLocalMem< MemPolicy, ParamList<...>, EnclosedStatements >`` allocates memory for a ``RAJA::LocalArray`` object used in kernel. The ``ParamList`` entries indicate which local array objects in a tuple will be initialized. The ``EnclosedStatements`` contain the code in which the local array will be accessed; e.g., initialization operations. RAJA provides some statement types that apply in specific kernel scenarios. -* ``Reduce< ReducePolicy, Operator, ParamId, EnclosedStatements >`` reduces a value across threads in a multi-threaded code region to a single thread. The ``ReducePolicy`` is similar to what it represents for RAJA reduction types. ``ParamId`` specifies the position of the reduction value in the parameter tuple passed to the ``RAJA::kernel_param`` method. ``Operator`` is the binary operator used in the reduction; typically, this will be one of the operators that can be used with RAJA scans (see :ref:`scanops-label`). After the reduction is complete, the ``EnclosedStatements`` execute on the thread that received the final reduced value. +* ``Reduce< ReducePolicy, Operator, ParamId, EnclosedStatements >`` reduces a value across threads in a multithreaded code region to a single thread. The ``ReducePolicy`` is similar to what it represents for RAJA reduction types. ``ParamId`` specifies the position of the reduction value in the parameter tuple passed to the ``RAJA::kernel_param`` method. ``Operator`` is the binary operator used in the reduction; typically, this will be one of the operators that can be used with RAJA scans (see :ref:`feat-scanops-label`). After the reduction is complete, the ``EnclosedStatements`` execute on the thread that received the final reduced value. * ``If< Conditional >`` chooses which portions of a policy to run based on run-time evaluation of conditional statement; e.g., true or false, equal to some value, etc. @@ -877,10 +880,10 @@ RAJA provides some statement types that apply in specific kernel scenarios. 
Auxilliary Types ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -The following list summarizes auxillary types used in the above statments. These +The following list summarizes auxilliary types used in the above statements. These types live in the ``RAJA`` namespace. - * ``tile_fixed`` tile policy argument to a ``Tile`` or ``TileTCount`` statement; partitions loop iterations into tiles of a fixed size specified by ``TileSize``. This statement type can be used as the ``TilePolicy`` template paramter in the ``Tile`` statements above. + * ``tile_fixed`` tile policy argument to a ``Tile`` or ``TileTCount`` statement; partitions loop iterations into tiles of a fixed size specified by ``TileSize``. This statement type can be used as the ``TilePolicy`` template parameter in the ``Tile`` statements above. * ``tile_dynamic`` TilePolicy argument to a Tile or TileTCount statement; partitions loop iterations into tiles of a size specified by a ``TileSize{}`` positional parameter argument. This statement type can be used as the ``TilePolicy`` template paramter in the ``Tile`` statements above. @@ -892,6 +895,5 @@ types live in the ``RAJA`` namespace. * ``ValuesT`` argument to a Lambda statement; used to specify compile time constants, of type T, that will be used as lambda arguments. - Examples that show how to use a variety of these statement types can be found -in :ref:`tutorialcomplex-label`. +in :ref:`loop_elements-kernel-label`. diff --git a/docs/sphinx/user_guide/feature/reduction.rst b/docs/sphinx/user_guide/feature/reduction.rst index 4e69d087d0..808669f03f 100644 --- a/docs/sphinx/user_guide/feature/reduction.rst +++ b/docs/sphinx/user_guide/feature/reduction.rst @@ -6,7 +6,7 @@ .. ## SPDX-License-Identifier: (BSD-3-Clause) .. ## -.. _reductions-label: +.. _feat-reductions-label: ==================== Reduction Operations @@ -15,12 +15,10 @@ Reduction Operations RAJA does not provide separate loop execution methods for loops containing reduction operations like some other C++ loop programming abstraction models. Instead, RAJA provides reduction types that allow users to perform reduction -operations in ``RAJA::forall`` and ``RAJA::kernel`` kernels in a portable, -thread-safe manner. Users may use as many reduction objects in a loop kernel -as they need. Available RAJA reduction types are described in this section. - -A detailed example of RAJA reduction usage can be found in -:ref:`reductions-label`. +operations in kernels launched using ``RAJA::forall``, ``RAJA::kernel``, +and ``RAJA::launch`` methods in a portable, thread-safe manner. Users may +use as many reduction objects in a loop kernel as they need. Available RAJA +reduction types are described in this section. .. note:: All RAJA reduction types are located in the namespace ``RAJA``. @@ -28,14 +26,19 @@ Also .. note:: * Each RAJA reduction type is templated on a **reduction policy** and a **reduction value type** for the reduction variable. The - **reduction policy type must be compatibe with the execution - policy used by the kernel.** For example, in a CUDA kernel, - a CUDA reduction policy must be used. + **reduction policy type must be compatible with the execution + policy used by the kernel in which it is used.** For example, in + a CUDA kernel, a CUDA reduction policy must be used. * Each RAJA reduction type accepts an **initial reduction value or values** at construction (see below). * Each RAJA reduction type has a 'get' method to access reduced values after kernel execution completes. 
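For example, a minimal sum reduction sketch, assuming sequential execution and
reduction policies and a bare pointer ``a`` to the data being summed, might
look like::

  RAJA::ReduceSum<RAJA::seq_reduce, double> vsum(0.0);   // initial value given at construction

  RAJA::forall<RAJA::seq_exec>(RAJA::TypedRangeSegment<int>(0, N),
    [=] (int i) {
      vsum += a[i];
    });

  double total = vsum.get();   // access the reduced value after the kernel completes
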
+Please see the following tutorial sections for detailed examples that use +RAJA reductions: + + * :ref:`tut-reduction-label`. + ---------------- Reduction Types @@ -156,3 +159,153 @@ Reduction Policies For more information about available RAJA reduction policies and guidance on which to use with RAJA execution policies, please see :ref:`reducepolicy-label`. + +-------------------------------- +Experimental Reduction Interface +-------------------------------- + +An experimental reduction interface is now available that offers several +usability and performance advantages over the current reduction model in RAJA. +The new interface allows ``RAJA::forall`` to take optional "plugin-like" +objects to extend the execution behavior of a ``RAJA::forall`` execution +context. + +The new interface passes ``RAJA::expt::Reduce`` objects as function +arguments to ``RAJA::forall`` and provides users with thread-local variables +of the reduction data type to be updated inside the lambda. This differs +from the current reduction model in which ``RAJA::ReduceOP`` +objects are captured by the user-supplied kernel body lambda expression. + + +RAJA::expt::Reduce +.................. +:: + + double* a = ...; + + double rs = 0.0; + double rm = 1e100; + + RAJA::forall<EXEC_POL> ( Res, Seg, + RAJA::expt::Reduce<RAJA::operators::plus>(&rs), + RAJA::expt::Reduce<RAJA::operators::minimum>(&rm), + [=] (int i, double& _rs, double& _rm) { + _rs += a[i]; + _rm = RAJA_MIN(a[i], _rm); + } + ); + + std::cout << rs ... + std::cout << rm ... + +* Each ``RAJA::expt::Reduce`` argument to ``RAJA::forall`` is templated on + a reduction operator, and takes a pointer to a target variable to write + the final reduction result to, ``&rs`` and ``&rm`` in the example code + above. The reduction operation will include the existing value of + the given target variable. +* The kernel body lambda expression passed to ``RAJA::forall`` must have a + parameter corresponding to each ``RAJA::expt::Reduce`` argument, ``_rs`` and + ``_rm`` in the example code. These parameters refer to a local target for each + reduction operation. It is important to note that the parameters follow the + kernel iteration variable, ``i`` in this case, and appear in the same order as the + corresponding ``RAJA::expt::Reduce`` arguments to ``RAJA::forall``. The + parameters' types must be references to the types used in the + ``RAJA::expt::Reduce`` arguments. +* The local variables referred to by ``_rs`` and ``_rm`` are initialized with the + *identity* of the reduction operation to be performed. +* The local variables are updated in the user-supplied lambda. +* The local variables are reduced to a single value, combining their values across all + threads participating in the ``RAJA::forall`` execution. +* Finally, the target variable is updated with the result of the ``RAJA::forall`` reduction + by performing the reduction operation to combine the existing value of the target + variable and the result of the ``RAJA::forall`` reduction. +* The final reduction value is accessed by referencing the target variable + passed to ``RAJA::expt::Reduce`` in the ``RAJA::forall`` method. + +.. note:: In the above example ``Res`` is a resource object that must be + compatible with the ``EXEC_POL``. ``Seg`` is the iteration space + object for ``RAJA::forall``. + +.. important:: The order and types of the local reduction variables in the + kernel body lambda expression must match exactly with the + corresponding ``RAJA::expt::Reduce`` arguments to the + ``RAJA::forall`` to ensure that the correct result is obtained.
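To make the fragment above concrete, here is a minimal, self-contained sketch that fills in the elided pieces with a sequential execution policy, a ``RAJA::resources::Host`` resource, and a simple array; these particular choices are illustrative assumptions and not part of the documented example::

   const int N = 100;
   double a[N];
   for (int i = 0; i < N; ++i) { a[i] = 0.5 * i; }   // example data

   double rs = 0.0;     // target variable for the sum reduction
   double rm = 1e100;   // target variable for the min reduction

   RAJA::resources::Host res;                // assumed resource, compatible with seq_exec
   RAJA::TypedRangeSegment<int> seg(0, N);   // assumed iteration space

   RAJA::forall<RAJA::seq_exec>( res, seg,
     RAJA::expt::Reduce<RAJA::operators::plus>(&rs),
     RAJA::expt::Reduce<RAJA::operators::minimum>(&rm),
     [=] (int i, double& _rs, double& _rm) {
       _rs += a[i];
       _rm = RAJA_MIN(a[i], _rm);
     }
   );

   // rs now holds the sum of a[], and rm holds its minimum value.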
+ +RAJA::expt::ValLoc +.................. + +As with the current RAJA reduction interface, the new interface supports *loc* +reductions, which provide the ability to get a kernel/loop index at which the +final reduction value was found. With this new interface, *loc* reductions +are performed using ``ValLoc`` types. Since they are strongly typed, they +provide ``min()`` and ``max()`` operations that are equivalent to using +``RAJA_MIN()`` or ``RAJA_MAX`` macros as demonstrated in the code example below. +Users must use the ``getVal()`` and ``getLoc()`` methods to access the reduction +results:: + + double* a = ...; + + using VL_DOUBLE = RAJA::expt::ValLoc; + VL_DOUBLE rm_loc; + + RAJA::forall ( Res, Seg, + RAJA::expt::Reduce(&rm_loc), + [=] (int i, VL_DOUBLE& _rm_loc) { + _rm_loc = RAJA_MIN(VL_DOUBLE(a[i], i), _rm_loc); + //_rm_loc.min(VL_DOUBLE(a[i], i)); // Alternative to RAJA_MIN + } + ); + + std::cout << rm_loc.getVal() ... + std::cout << rm_loc.getLoc() ... + +Lambda Arguments +................ + +This interface takes advantage of C++ parameter packs to allow users to pass +any number of ``RAJA::expt::Reduce`` objects to the ``RAJA::forall`` method:: + + double* a = ...; + + using VL_DOUBLE = RAJA::expt::ValLoc; + VL_DOUBLE rm_loc; + double rs; + double rm; + + RAJA::forall ( Res, Seg, + RAJA::expt::Reduce(&rs), // --> 1 double added + RAJA::expt::Reduce(&rm), // --> 1 double added + RAJA::expt::Reduce(&rm_loc), // --> 1 VL_DOUBLE added + RAJA::expt::KernelName("MyFirstRAJAKernel"), // --> NO args added + [=] (int i, double& _rs, double& _rm, VL_DOUBLE& _rm_loc) { + _rs += a[i]; + _rm = RAJA_MIN(a[i], _rm); + _rm_loc.min(VL_DOUBLE(a[i], i)); + } + ); + + std::cout << rs ... + std::cout << rm ... + std::cout << rm_loc.getVal() ... + std::cout << rm_loc.getLoc() ... + +Again, the lambda expression parameters are in the same order as +the ``RAJA::expt::Reduce`` arguments to ``RAJA::forall``. Both the types and +order of the parameters must match to get correct results and to compile +successfully. Otherwise, a static assertion will be triggered:: + + LAMBDA Not invocable w/ EXPECTED_ARGS. + +.. note:: This static assert is only enabled when passing an undecorated C++ + lambda. Meaning, this check will not happen when passing + extended-lambdas (i.e. DEVICE tagged lambdas) or other functor like + objects. + +.. note:: The experimental ``RAJA::forall`` interface is more flexible than the + current implementation, other optional arguments besides + ``RAJA::expt::Reduce`` can be passed to a ``RAJA::forall`` to extend + its behavior. In the above example we demonstrate using + ``RAJA::expt::KernelName``, which wraps a ``RAJA::forall`` executing + under a ``HIP`` or ``CUDA`` policy in a named region. Use of + ``RAJA::expt::KernelName`` does not require an additional + parameter in the lambda expression. diff --git a/docs/sphinx/user_guide/feature/resource.rst b/docs/sphinx/user_guide/feature/resource.rst index 1a28720553..284fdd560f 100644 --- a/docs/sphinx/user_guide/feature/resource.rst +++ b/docs/sphinx/user_guide/feature/resource.rst @@ -6,175 +6,232 @@ .. ## SPDX-License-Identifier: (BSD-3-Clause) .. ## -.. _resource-label: +.. _feat-resource-label: ========= Resources ========= -This section describes the basic concepts of Resource types and their -functionality in ``RAJA::forall``. Resources are used as an interface to -various backend constructs and their respective hardware. Currently there -exists Resource types for ``Cuda``, ``Hip``, ``Omp`` (target) and ``Host``. 
-Resource objects allow the user to execute ``RAJA::forall`` calls -asynchronously on a respective thread/stream. The underlying concept of each -individual Resource is still under development and it should be considered -that functionality / behaviour may change. +This section describes the basic concepts of resource types and how to use +them with RAJA-based kernels using ``RAJA::forall``, ``RAJA::kernel``, `` +RAJA::launch``, etc. Resources are used as an interface to various RAJA +back-end constructs and their respective hardware. Currently there +exist resource types for ``Cuda``, ``Hip``, ``Omp`` (target) and ``Host``. +Resource objects allow one to allocate and deallocate storage in memory spaces +associated with RAJA back-ends and copy data between memory spaces. They also +allow one to execute RAJA kernels asynchronously on a respective thread/stream. +Resource support in RAJA is rudimentary at this point and its functionality / +behavior may change as it is developed. .. note:: * Currently feature complete asynchronous behavior and streamed/threaded support is available only for ``Cuda`` and - ``Hip`` resources. - * The ``RAJA::resources`` namespace aliases the ``camp::resources`` + ``Hip`` resources. + * RAJA resource support is based on camp resource support. The + ``RAJA::resources`` namespace aliases the ``camp::resources`` namespace. Each resource has a set of underlying functionality that is synonymous across -all resource types. +each resource type. ===================== =============================================== Methods Brief description ===================== =============================================== get_platform Returns the underlying camp platform - the resource is associated with. - get_event Return an Event object for the resource from + associated with the resource. + get_event Return an event object for the resource from the last resource call. - allocate Allocate data per the resource's given - backend. - deallocate Deallocate data per the resource's given - backend. - memcpy Perform a memory copy from a src location - to a destination location from the - resource's backend. - memset Set memory value per the resourse's - given backend. - wait_for Enqueue a wait on the resource's stream/thread - for a user passed event to occur. + allocate Allocate data on a resource back-end. + deallocate Deallocate data on a resource back-end. + memcpy Perform a memory copy from a source location + to a destination location on a resource back-end. + memset Set memory value in an allocation on a resource + back-end. + wait Wait for all operations enqueued on a resource to + complete before proceeding. + wait_for Enqueue a wait on a resource stream/thread + for a user passed event to complete. ===================== =============================================== -.. note:: ``deallocate``, ``memcpy`` and ``memset`` will only work with +.. note:: ``deallocate``, ``memcpy`` and ``memset`` operations only work with pointers that correspond to memory locations that have been allocated on the resource's respective device. -Each resource type also defines specific backend information/functionality. +Each resource type also defines specific back-end information/functionality. For example, each CUDA resource contains a ``cudaStream_t`` value with an -associated get method. See the individual functionality for each resource -in ``raja/tpl/camp/include/resource/``. +associated get method. The basic interface for each resource type is +summarized in `Camp resource `_. .. 
note:: Stream IDs are assigned to resources in a round robin fashion. The - number of independent streams for a given backend is limited to the + number of independent streams for a given back-end is limited to the maximum number of concurrent streams that the back-end supports. ------------ Type-Erasure ------------ -Resources can be declared in two formats: An erased resource, and a concrete -resource. The underlying runtime functionality is the same for both formats. -An erased resource allows a user the ability to change the resource backend -at runtime. +Resources can be declared in two ways, as a type-erased resource or as a +concrete resource. The underlying run time functionality is the same for both. -Concrete CUDA resource:: +Here is one way to construct a concrete CUDA resource type:: - RAJA::resources::Cuda my_cuda_res; + RAJA::resources::Cuda my_cuda_res; -Erased resource:: +A type-erased resource allows a user the ability to change the resource +back-end at run time. For example, to choose a CUDA GPU device resource or +host resource at run time, one could do the following:: - if (use_gpu) - RAJA::resources::Resource my_res{RAJA::resources::Cuda()}; - else - RAJA::resources::Resource my_res{RAJA::resources::Host()}; + RAJA::resources::Resource* my_res = nullptr; + if (use_gpu) + my_res = new RAJA::resources::Resource{RAJA::resources::Cuda()}; + else + my_res = new RAJA::resources::Resource{RAJA::resources::Host()}; -Memory allocation on resources:: +When ``use_gpu`` is true, ``my_res`` will be a CUDA GPU device resource. +Otherwise, it will be a host CPU resource. - int* a1 = my_cuda_res.allocate(ARRAY_SIZE); - int* a2 = my_res.allocate(ARRAY_SIZE); +------------------- +Memory Operations +------------------- -If ``use_gpu`` is ``true``, then the underlying type of ``my_res`` is a CUDA -resource. Therefore ``a1`` and ``a2`` will both be allocated on the GPU. If -``use_gpu`` is ``false``, then only ``a1`` is allocated on the GPU, and -``a2`` is allocated on the host. +The example discussed in this section illustrates most of the memory +operations that can be performed with resource objects. +A common use case for a resource is to manage arrays in the appropriate +memory space to use in a kernel. Consider the following code example:: + // create a resource for a host CPU and a CUDA GPU device + RAJA::resources::Resource host_res{RAJA::resources::Host()}; + RAJA::resources::Resource cuda_res{RAJA::resources::Cuda()}; ------- -Forall ------- + // allocate arrays in host memory and device memory + int N = 100; + + int* host_array = host_res.allocate<int>(N); + int* gpu_array = cuda_res.allocate<int>(N); + + // initialize values in host_array.... + + // initialize gpu_array values to zero + cuda_res.memset(gpu_array, 0, sizeof(int) * N); + + // copy host_array values to gpu_array + cuda_res.memcpy(gpu_array, host_array, sizeof(int) * N); + + // execute a CUDA kernel that uses gpu_array data + RAJA::forall<RAJA::cuda_exec<256>>(RAJA::TypedRangeSegment<int>(0, N), + [=] RAJA_DEVICE(int i) { + // modify values of gpu_array... + } + ); + + // copy gpu_array values to host_array + cuda_res.memcpy(host_array, gpu_array, sizeof(int) * N); + + // do something with host_array on CPU... + + // de-allocate array storage + host_res.deallocate(host_array); + cuda_res.deallocate(gpu_array); + +Here, we create a CUDA GPU device resource and a host CPU resource and use +them to allocate an array in GPU memory and one in host memory, respectively.
+Then, after initializing the host array, we use the CUDA resource to copy the +host array to the GPU array storage. Next, we run a CUDA device kernel +which modifies the GPU array. After using the CUDA resource to copy the GPU +array values into the host array, we can do something with the values +generated in the GPU kernel on the CPU host. Lastly, we de-allocate the +arrays. + +-------------------------------- +Kernel Execution and Resources +-------------------------------- + +Resources can be used with the following RAJA kernel execution interfaces: -A resource is an optional argument to a ``RAJA::forall`` call. When used, -it is passed as the first argument to the method:: + * ``RAJA::forall`` + * ``RAJA::kernel`` + * ``RAJA::launch`` + * ``RAJA::sort`` + * ``RAJA::scan`` - RAJA::forall(my_gpu_res, .... ) +Although we show examples using mainly ``RAJA::forall`` in the following +discussion, resource usage with the other methods listed is similar and +provides similar behavior. -When specifying a CUDA or HIP resource, the ``RAJA::forall`` is executed -aynchronously on a stream. Currently, CUDA and HIP are the only Resources -that enable asynchronous threading with a ``RAJA::forall``. All other calls -default to using the ``Host`` resource until further support is added. +Usage +^^^^^ + +Specifically, a resource can be passed optionally as the first argument in +a call to one of these methods. For example:: -The Resource type that is passed to a ``RAJA::forall`` call must be a concrete -type. This is to allow for a compile-time assertion that the resource is not -compatible with the given execution policy. For example:: + RAJA::forall(my_res, .... ); + +.. note:: When a resource is not passed when calling one of the methods listed + above, the *default* resource type associated with the execution + policy is used in the internal implementation. + +When passing a CUDA or HIP resource, the method will execute asynchronously +on a GPU stream. Currently, CUDA and HIP are the only resource types that +enable asynchronous threading. + +.. note:: Support for OpenMP CPU multithreading, which would use the + ``RAJA::resources::Host`` resource type, and OpenMP target offload + which would use the ``RAJA::resources::Omp`` resource type, + is incomplete and under development. + +The resource type passed to one of the methods listed above must be a +concrete type; i.e., not type erased. The reason is that this allows +consistency checking via a compile-time assertion to ensure that the passed +resource is compatible with the given execution policy. For example:: - using ExecPol = RAJA::cuda_exec_async; - RAJA::resources::Cuda my_cuda_res; - RAJA::resources::Resource my_res{RAJA::resources::Cuda()}; - RAJA::resources::Host my_host_res; - - RAJA::forall(my_cuda_res, .... ) // Compiles. - RAJA::forall(my_res, .... ) // Compilation Error. Not Concrete. - RAJA::forall(my_host_res, .... ) // Compilation Error. Mismatched Resource and Exec Policy. - -Below is a list of the currently available concrete resource types and their -execution policy suport. 
- - ======== ============================== - Resource Policies supported - ======== ============================== - Cuda | cuda_exec - | cuda_exec_async - | cuda_exec_explicit - Hip | hip_exec - | hip_exec_async - Omp* | omp_target_parallel_for_exec - | omp_target_parallel_for_exec_n - Host | loop_exec - | seq_exec - | openmp_parallel_exec - | omp_for_schedule_exec - | omp_for_nowait_schedule_exec - | simd_exec - | tbb_for_dynamic - | tbb_for_static - ======== ============================== - -.. note:: The ``RAJA::resources::Omp`` resource is still under development. - -IndexSet policies require two execution policies (see :ref:`indexsets-label`). -Currently, users may only pass a single resource to a forall method taking -an IndexSet argument. This resource is used for the inner execution of -each Segment in the IndexSet:: - - using ExecPol = RAJA::ExecPolicy>; - RAJA::forall(my_cuda_res, iset, .... ); - - -When a resource is not provided by the user, a *default* resource is assigned, -which can be accessed in a number of ways. It can be accessed directly from + using ExecPol = RAJA::cuda_exec_async<256>; + + RAJA::resources::Cuda my_cuda_res; + RAJA::forall<ExecPol>(my_cuda_res, .... ); // Successfully compiles + + RAJA::resources::Resource my_res{RAJA::resources::Cuda()}; + RAJA::forall<ExecPol>(my_res, .... ) // Compilation error since resource type is not concrete + + RAJA::resources::Host my_host_res; + RAJA::forall<ExecPol>(my_host_res, .... ) // Compilation error since resource type is incompatible with the execution policy + +IndexSet Usage +^^^^^^^^^^^^^^^ + +Recall that a kernel that uses a RAJA IndexSet to describe the kernel iteration +space requires two execution policies (see :ref:`indexsetpolicy-label`). +Currently, a user may only pass a single resource to a method taking +an IndexSet argument. The resource is used for the *inner* execution over +each segment in the IndexSet, not for the *outer* iteration over segments. +For example:: + + using ExecPol = RAJA::ExecPolicy<RAJA::seq_segit, RAJA::cuda_exec<256>>; + RAJA::forall<ExecPol>(my_cuda_res, iset, .... ); + +Default Resources +^^^^^^^^^^^^^^^^^^^^^^ + +When a resource is not provided by the user, a *default* resource that +corresponds to the execution policy is used. The default resource +can be accessed in multiple ways. It can be accessed directly from the concrete resource type:: - RAJA::resources::Cuda my_default_cuda = RAJA::resources::Cuda::get_default(); + RAJA::resources::Cuda my_default_cuda = RAJA::resources::Cuda::get_default(); -The resource type can also be deduced from an execution policy:: +The resource type can also be deduced in two different ways from an execution +policy:: - using Res = RAJA::resources::get_resource::type; - Res r = Res::get_default(); + using Res = RAJA::resources::get_resource<ExecPol>::type; + Res r = Res::get_default(); -Finally, the resource type can be deduced from an execution policy:: +Or:: - auto my_resource = RAJA::resources::get_default_resource(); + auto my_resource = RAJA::resources::get_default_resource<ExecPol>(); -.. note:: For CUDA and HIP, the default resource is *NOT* the CUDA or HIP - default stream. It is its own stream defined in - ``camp/include/resource/``. This is an attempt to break away - from some of the issues that arise from the synchronization behaviour +.. note:: For CUDA and HIP, the default resource is *NOT* associated with the + default CUDA or HIP stream. It is its own stream defined by the + underlying camp resource.
This is intentional to break away + from some issues that arise from the synchronization behavior of the CUDA and HIP default streams. It is still possible to use the CUDA and HIP default streams as the default resource. This can be enabled by defining the environment variable @@ -185,34 +242,40 @@ Finally, the resource type can be deduced from an execution policy:: Events ------ -Event objects allow users to wait or query the status of a resource's action. An -event can be returned from a resource:: +Event objects allow users to wait or query the status of a resource's action. +An event can be returned from a resource:: - RAJA::resources::Event e = my_res.get_event(); + RAJA::resources::Event e = my_res.get_event(); Getting an event like this enqueues an event object for the given back-end. Users can call the *blocking* ``wait`` function on the event:: - e.wait(); + e.wait(); -Preferably, users can enqueue the event on a specific resource, forcing only -that resource to wait for the event:: +This wait call will block all execution until all operations enqueued on a +resource complete. - my_res.wait_for(&e); +Alternatively, a user can enqueue the event on a specific resource, forcing +only the resource to wait for the operation associated with the event to +complete:: -The usage allows one to set up dependencies between resource objects and -``RAJA::forall`` calls. + my_res.wait_for(&e); + +All methods listed above near the beginning of the RAJA resource discussion +return an event object so users can access the event associated with the +method call. This allows one to set up dependencies between resource objects +and operations, as well as define and control asynchronous execution patterns. .. note:: An Event object is only created if a user explicitly sets the event returned by the ``RAJA::forall`` call to a variable. This avoids unnecessary event objects being created when not needed. For example:: - forall>(my_cuda_res, ... + RAJA::forall>(my_cuda_res, ...); will *not* generate a cudaStreamEvent, whereas:: - RAJA::resources::Event e = forall>(my_cuda_res, ... + RAJA::resources::Event e = RAJA::forall>(my_cuda_res, ...); will generate a cudaStreamEvent. @@ -220,66 +283,69 @@ The usage allows one to set up dependencies between resource objects and Example ------- -This example executes three kernels across two cuda streams on the GPU with -a requirement that the first and second kernel finish execution before -launching the third. It also demonstrates copying memory from the device -to host on a resource: +The example presented here executes three kernels across two CUDA streams on +a GPU with a requirement that the first and second kernel finish execution +before the third is launched. It also shows copying memory from the device +to host on a resource that we described earlier. -First, define two concrete CUDA resources and one host resource: +First, we define two concrete CUDA resources and one concrete host resource, +and define an asynchronous CUDA execution policy type: .. literalinclude:: ../../../../examples/resource-forall.cpp :start-after: _raja_res_defres_start :end-before: _raja_res_defres_end :language: C++ -Next, allocate data for two device arrays and one host array: +Next, we allocate data for two GPU arrays and one host array, all of length 'N': .. 
literalinclude:: ../../../../examples/resource-forall.cpp :start-after: _raja_res_alloc_start :end-before: _raja_res_alloc_end :language: C++ -Then, Execute a kernel on CUDA stream 1 ``res_gpu1``: +Then, we launch a GPU kernel on the CUDA stream associated with the resource +``res_gpu1``, without keeping a handle to the associated event: .. literalinclude:: ../../../../examples/resource-forall.cpp :start-after: _raja_res_k1_start :end-before: _raja_res_k1_end :language: C++ - -and execute another kernel on CUDA stream 2 ``res_gpu2`` storing a handle to -an ``Event`` object to a local variable: + +Next, we execute another GPU kernel on the CUDA stream associated with the +resource ``res_gpu2`` and keep a handle to the corresponding event object +by assigning it to a local variable ``e``: .. literalinclude:: ../../../../examples/resource-forall.cpp :start-after: _raja_res_k2_start :end-before: _raja_res_k2_end :language: C++ - -The next kernel on ``res_gpu1`` requires that the last kernel on ``res_gpu2`` -finish first. Therefore, we enqueue a wait on ``res_gpu1`` that enforces -this: + +We require that the next kernel we launch to wait for the kernel launched on +the stream associated with the resource ``res_gpu2`` to complete. Therefore, +we enqueue a wait on that event on the ``res_gpu1`` resource: .. literalinclude:: ../../../../examples/resource-forall.cpp :start-after: _raja_res_wait_start :end-before: _raja_res_wait_end :language: C++ - -Execute the second kernel on ``res_gpu1`` now that the two previous kernels -have finished: + +Now that the second GPU kernel is complete, we launch a second kernel on the +stream associated with the resource ``res_gpu1``: .. literalinclude:: ../../../../examples/resource-forall.cpp :start-after: _raja_res_k3_start :end-before: _raja_res_k3_end :language: C++ - -We can enqueue a memcpy operation on ``res_gpu1`` to move data from the device -to the host: + +Next, we enqueue a memcpy operation on the resource ``res_gpu1`` to copy +the GPU array ``d_array`` to the host array ``h_array``: .. literalinclude:: ../../../../examples/resource-forall.cpp :start-after: _raja_res_memcpy_start :end-before: _raja_res_memcpy_end :language: C++ -Lastly, we use the copied data on the host side: +Lastly, we use the copied data in a kernel executed on the host: .. literalinclude:: ../../../../examples/resource-forall.cpp :start-after: _raja_res_k4_start diff --git a/docs/sphinx/user_guide/feature/scan.rst b/docs/sphinx/user_guide/feature/scan.rst index 76c6eb6688..c74b444189 100644 --- a/docs/sphinx/user_guide/feature/scan.rst +++ b/docs/sphinx/user_guide/feature/scan.rst @@ -6,10 +6,10 @@ .. ## SPDX-License-Identifier: (BSD-3-Clause) .. ## -.. _scan-label: +.. _feat-scan-label: ================ -Scans +Scan Operations ================ RAJA provides portable parallel scan operations, which are basic @@ -20,7 +20,8 @@ A few important notes: .. note:: * All RAJA scan operations are in the namespace ``RAJA``. * Each RAJA scan operation is a template on an *execution policy* parameter. The same policy types used for ``RAJA::forall`` methods - may be used for RAJA scans. + may be used for RAJA scans. Please see :ref:`feat-policies-label` + for more information. * RAJA scan operations accept an optional *operator* argument so users can perform different types of scan operations. If no operator is given, the default is a 'plus' operation and @@ -28,20 +29,18 @@ A few important notes: Also: -.. 
note:: For scans using the CUDA back-end, RAJA uses the NVIDIA CUB library - internally. The CMake variable ``CUB_DIR`` will be automatically - set to the location of the CUB library when CUDA is enabled. Details - for using a different version of the CUB library are available in - the :ref:`getting_started-label` section. +.. note:: For scans using the CUDA or HIP back-end, RAJA implementation uses + the NVIDIA CUB library or AMD rocPRIM library, respectively. + Typically, the CMake variable ``CUB_DIR`` or ``ROCPRIM_DIR`` will + be automatically set to the location of the CUB or rocPRIM library + for the CUDA or rocPRIM installation specified when either back-end + is enabled. More details for configuring the CUB or rocPRIM library + for a RAJA build can be found in :ref:`getting_started_depend-label`. -.. note:: For scans using the HIP back-end, RAJA uses the AMD rocPRIM library - internally. The CMake variable ``ROCPRIM_DIR`` will be automatically - set to the location of the rocPRIM library when HIP is enabled. - Details for using a different version of the rocPRIM library are - available in the :ref:`getting_started-label` section. +Please see the following tutorial sections for detailed examples that use +RAJA scan operations: -Please see the :ref:`scan-label` tutorial section for usage examples of RAJA -scan operations. + * :ref:`tut-scan-label`. ----------------- Scan Operations @@ -97,11 +96,12 @@ scan operation above will be a *prefix-sum* since there is no operator argument given; i.e., the output array will contain partial sums of the input array. The second scan will apply the operator that is passed. Note that container arguments can be generated from iterators using ``RAJA::make_span(begin, len)``. +This is shown in the examples in :ref:`tut-scan-label`. RAJA also provides *in-place* scans: * ``RAJA::inclusive_scan_inplace< exec_policy >(in_container)`` - * ``RAJA::inclusive_scan_inplace< exec_policy >(in_container, )`` + * ``RAJA::inclusive_scan_inplace< exec_policy >(in_container, operator)`` An in-place scan generates the same output sequence as a non-inplace scan. However, an in-place scan does not take separate input and output arrays and @@ -121,7 +121,7 @@ and * ``RAJA::exclusive_scan_inplace< exec_policy >(in_container)`` * ``RAJA::exclusive_scan_inplace< exec_policy >(in_container, )`` -.. _scanops-label: +.. _feat-scanops-label: -------------------- RAJA Scan Operators @@ -139,11 +139,3 @@ types of scans, such as: .. note:: * All RAJA scan operators are in the namespace ``RAJA::operators``. -------------------- -Scan Policies -------------------- - -For information about RAJA execution policies to use with scan operations, -please see :ref:`policies-label`. - - diff --git a/docs/sphinx/user_guide/feature/sort.rst b/docs/sphinx/user_guide/feature/sort.rst index 115959a4dd..8d4db5032f 100644 --- a/docs/sphinx/user_guide/feature/sort.rst +++ b/docs/sphinx/user_guide/feature/sort.rst @@ -6,60 +6,61 @@ .. ## SPDX-License-Identifier: (BSD-3-Clause) .. ## -.. _sort-label: +.. _feat-sort-label: ================ -Sorts +Sort Operations ================ -RAJA provides portable parallel sort operations, which are basic -parallel algorithm building blocks. They are described in this section. +RAJA provides portable parallel sort operations, which are described in this +section. A few important notes: .. note:: * All RAJA sort operations are in the namespace ``RAJA``. * Each RAJA sort operation is a template on an *execution policy* parameter. 
The same policy types used for ``RAJA::forall`` methods - may be used for RAJA sorts. + may be used for RAJA sorts. Please see :ref:`feat-policies-label` + for more information. * RAJA sort operations accept an optional *comparator* argument so users can perform different types of sort operations. If no operator is given, the default is a *less than* operation and - the result is **non-decreasing**. + the result is a sequence sorted in **non-decreasing** order. Also: -.. note:: * For sorts using the CUDA back-end, RAJA uses the implementations - provided by the NVIDIA CUB library. For information please see - :ref:`build-external-tpl `. - * For sorts using the HIP back-end, RAJA uses the implementations - provided by the AMD rocPRIM library. For information please see - :ref:`build-external-tpl `. - * The RAJA CUDA and HIP back-ends only support sorting - arithmetic types using RAJA operators 'less than' and - 'greater than'. +.. note:: For sorts using the CUDA or HIP back-end, the RAJA implementation uses + the NVIDIA CUB library or AMD rocPRIM library, respectively. + Typically, the CMake variable ``CUB_DIR`` or ``ROCPRIM_DIR`` will + be automatically set to the location of the CUB or rocPRIM library + for the CUDA or rocPRIM installation specified when either back-end + is enabled. More details for configuring the CUB or rocPRIM library + for a RAJA build can be found in :ref:`getting_started_depend-label`. + +Please see the following tutorial sections for detailed examples that use +RAJA sort operations: -Please see the :ref:`sort-label` tutorial section for usage examples of RAJA -sort operations. + * :ref:`tut-sort-label` ----------------- Sort Operations ----------------- -In general, a sort operation takes a sequence of numbers ``x`` and a binary -comparison operator ``op`` that forms a strict weak ordering of elements in -input sequence ``x`` and produces a sequence of numbers ``y`` as output. The +In general, a sort operation takes a sequence of numbers 'x' and a binary +comparison operator 'op' to form a strict weak ordering of elements in +input sequence 'x' and produce a sequence of numbers 'y' as output. The output sequence is a permutation of the input sequence where each pair of -elements ``a`` and ``b``, where ``a`` is before ``b`` in the output sequence, -satisfies ``!(b op a)``. Sorts are stable if they always preserve the order of -equivalent elements, where equivalent elements satisfy ``!(a op b) && !(b op a)``. +elements 'a' and 'b', where 'a' is before 'b' in the output sequence, +satisfies '!(b op a)'. Sorts are stable if they always preserve the order of +equivalent elements, where equivalent means '!(a op b) && !(b op a)' is true. -A **stable sort** takes an input sequence ``x`` where a\ :sub:`i` appears -before a\ :sub:`j` if i < j when a\ :sub:`i` and a\ :sub:`j` are equivalent for -any i != j. +A **stable sort** takes an input sequence 'x' where a\ :sub:`i` appears +before a\ :sub:`j` if i < j when a\ :sub:`i` and a\ :sub:`j` are equivalent +for any i != j. x = { a\ :sub:`0`\, b\ :sub:`0`\, a\ :sub:`1`\, ... } -and calculates the stably sorted output sequence ``y`` that preserves the +and calculates the stably sorted output sequence 'y' that preserves the
That is, the sorted sequence where element a\ :sub:`i` appears before the equivalent element a\ :sub:`j` if i < j: @@ -83,13 +84,13 @@ RAJA unstable sort operations look like the following: * ``RAJA::sort< exec_policy >(container)`` * ``RAJA::sort< exec_policy >(container, comparator)`` -For example, sorting an array with this sequence of values:: +For example, sorting an integer array with this sequence of values:: 6 7 2 1 0 9 4 8 5 3 4 9 6 3 7 0 1 8 2 5 with a sequential unstable sort operation: -.. literalinclude:: ../../../../examples/tut_sort.cpp +.. literalinclude:: ../../../../exercises/sort_solution.cpp :start-after: _sort_seq_start :end-before: _sort_seq_end :language: C++ @@ -98,14 +99,24 @@ produces the ``out`` array with this sequence of values:: 0 0 1 1 2 2 3 3 4 4 5 5 6 6 7 7 8 8 9 9 -Note that the syntax is essentially the same as :ref:`scan-label`. +Note that the syntax is essentially the same as :ref:`feat-scan-label`. Here, ``container`` is a random access range of elements. ``container`` provides access to the input sequence and contains the output sequence at the end of -sort. The first sort operation listed above will be a *non-decreasing* sort +sort. The sort operation listed above will be a *non-decreasing* sort since there is no comparator argument given; i.e., the sequences will be -reordered *in-place* using operator::less. The second sort will apply the -comparator that is passed into the function. Note that the container argument -can be generated from iterators using ``RAJA::make_span(begin, len)``. +reordered *in-place* using the default RAJA less-than comparator. + +Equivalently, the ``RAJA::operators::less`` comparator operator could be +passed as the second argument to the sort routine to produce the same result: + +.. literalinclude:: ../../../../exercises/sort_solution.cpp + :start-after: _sort_seq_less_start + :end-before: _sort_seq_less_end + :language: C++ + +Note that container arguments can be generated from iterators using +``RAJA::make_span(out, N)``, where we pass the base pointer for the array +and its length. RAJA also provides sort operations that operate on key-value pairs stored separately: @@ -117,7 +128,8 @@ separately: ``keys_container`` as ``RAJA::sort`` does in ``container`` and reorders the sequence of values in ``vals_container`` by permuting the sequence of values in the same manner as the sequence of keys; i.e. the sequence of pairs is sorted -based on comparing their keys. +based on comparing their keys. Detailed examples are provided in +:ref:`tut-sort-label`. .. note:: The comparator used in ``RAJA::sort_pairs`` only compares keys. @@ -125,7 +137,7 @@ based on comparing their keys. RAJA Stable Sorts --------------------- -RAJA stable sorts are essentially the same as unstable sorts: +RAJA stable sort operations are used essentially the same as unstable sorts: * ``RAJA::stable_sort< exec_policy >(container)`` * ``RAJA::stable_sort< exec_policy >(container, comparator)`` @@ -136,11 +148,11 @@ separately: * ``RAJA::stable_sort_pairs< exec_policy >(keys_container, vals_container)`` * ``RAJA::stable_sort_pairs< exec_policy >(keys_container, vals_container, comparator)`` -.. _sortops-label: +.. _feat-sortops-label: --------------------- +-------------------------- RAJA Comparison Operators --------------------- +-------------------------- RAJA provides two operators that can be used to produce different ordered sorts: @@ -149,11 +161,3 @@ RAJA provides two operators that can be used to produce different ordered sorts: .. 
note:: All RAJA comparison operators are in the namespace ``RAJA::operators``. -------------------- -Sort Policies -------------------- - -For information about RAJA execution policies to use with sort operations, -please see :ref:`policies-label`. - - diff --git a/docs/sphinx/user_guide/feature/tiling.rst b/docs/sphinx/user_guide/feature/tiling.rst index 8b2e18d501..e803590e08 100644 --- a/docs/sphinx/user_guide/feature/tiling.rst +++ b/docs/sphinx/user_guide/feature/tiling.rst @@ -6,26 +6,27 @@ .. ## SPDX-License-Identifier: (BSD-3-Clause) .. ## -.. _tiling-label: +.. _feat-tiling-label: =========== Loop Tiling =========== In this section, we discuss RAJA statements that can be used to tile nested -for-loops. Typical loop tiling involves partitioning an iteration space into +loops. Typical loop tiling involves partitioning an iteration space into a collection of "tiles" and then iterating over tiles in outer loops and -entries within each tile in inner loops. Many scientific computing algorithms +indices within each tile in inner loops. Many scientific computing algorithms can benefit from loop tiling due to more efficient cache usage on a CPU or use of GPU shared memory. -For example, an operation performed using a for-loop with a range of [0, 10):: +For example, consider an operation performed using a C-style for-loop with +a range of [0, 10):: for (int i=0; i<10; ++i) { // loop body using index 'i' } -May be expressed as a loop nest that iterates over five tiles of size two:: +This may be written as a loop nest that iterates over five tiles of size two:: int numTiles = 5; int tileDim = 2; @@ -36,11 +37,10 @@ May be expressed as a loop nest that iterates over five tiles of size two:: } } -Next, we show how this tiled loop can be represented using RAJA. Then, we -present variations on it that illustrate the usage of different RAJA kernel -statement types. +Next, we show how loop tiling can be written using RAJA with variations that +use different ``RAJA::kernel`` execution policy statement types. -.. code-block:: cpp +Here is a way to write the tiled loop kernel above using ``RAJA::kernel``:: using KERNEL_EXEC_POL = RAJA::KernelPolicy< @@ -51,28 +51,30 @@ statement types. > >; - RAJA::kernel(RAJA::make_tuple(RAJA::RangeSegment(0,10)), + RAJA::kernel<KERNEL_EXEC_POL>( + RAJA::make_tuple(RAJA::TypedRangeSegment<int>(0,10)), [=] (int i) { - // loop body using index 'i' - }); - -In RAJA, the simplest way to tile an iteration space is to use RAJA -``statement::Tile`` and ``statement::For`` statement types. A -``statement::Tile`` type is similar to a ``statement::For`` type, but takes -a tile size as the second template argument. The ``statement::Tile`` -construct generates the outer loop over tiles and the ``statement::For`` -statement iterates over each tile. Nested together, as in the example, these -statements will pass the global index 'i' to the loop body in the lambda -expression as in the non-tiled version above. - -.. note:: When using ``statement::Tile`` and ``statement::For`` types together - to define a tiled loop structure, the integer passed as the first - template argument to each statement type must be the same. This - indicates that they both apply to the same item in the iteration - space tuple passed to the ``RAJA::kernel`` methods.
- -RAJA also provides alternative tiling and for statements that provide the tile -number and local tile index, if needed inside the kernel body, as shown below:: + // kernel body using index 'i' + } + ); + +In RAJA, the simplest way to tile an iteration space is to use +``RAJA::statement::Tile`` and ``RAJA::statement::For`` statement types. A +``RAJA::statement::Tile`` type is similar to a ``RAJA::statement::For`` type, +but takes a tile size as the second template argument. The +``RAJA::statement::Tile`` type generates the outer loop over tiles and +the ``RAJA::statement::For`` type iterates over each tile. Nested together, +these statements will pass the global index ('i' in the example) to the +lambda expression (kernel body), just as in the non-tiled version above. + +.. note:: When using ``RAJA::statement::Tile`` and ``RAJA::statement::For`` + types together to define a tiled loop structure, the integer passed + as the first template argument to each statement type must be the + same. This indicates that they both apply to the same iteration space + in the space tuple passed to the ``RAJA::kernel`` method. + +RAJA also provides alternative statements that provide the tile number and +local tile index, if needed inside the kernel body, as shown below:: using KERNEL_EXEC_POL2 = RAJA::KernelPolicy< @@ -86,8 +88,9 @@ number and local tile index, if needed inside the kernel body, as shown below:: >; - RAJA::kernel_param(RAJA::make_tuple(RAJA::RangeSegment(0,10)), - RAJA::make_tuple((int)0, (int)0), + RAJA::kernel_param<KERNEL_EXEC_POL2>( + RAJA::make_tuple(RAJA::TypedRangeSegment<int>(0,10)), + RAJA::make_tuple((int)0, (int)0), [=](int i, int t, int j) { // i - global index @@ -95,20 +98,22 @@ number and local tile index, if needed inside the kernel body, as shown below:: // j - index within tile // Then, i = j + 2*t (2 is tile size) - }); - -The ``statement::TileTCount`` type allows the tile number to be accessed as a -lambda argument and the ``statement::ForICount`` type allows the local tile -loop index to be accessed as a lambda argument. These values are specified in -the tuple, which is the second argument passed to the ``RAJA::kernel_param`` -method above. The ``statement::Param<#>`` type appearing as the second + } + ); + +The ``RAJA::statement::TileTCount`` type indicates that the tile number will +be passed to the lambda expression and the ``RAJA::statement::ForICount`` type +indicates that the local tile loop index will be passed to the lambda +expression. Storage for these values is specified in the parameter tuple, the +second argument passed to the ``RAJA::kernel_param`` method. The +``RAJA::statement::Param<#>`` type appearing as the second template parameter for each statement type indicates which parameter tuple -entry the tile number or local tile loop index is passed to the lambda, and +entry, the tile number or local tile loop index, is passed to the lambda and in which order. Here, the tile number is the second lambda argument (tuple parameter '0') and the local tile loop index is the third lambda argument (tuple parameter '1'). .. note:: The global loop indices always appear as the first lambda expression arguments. Then, the parameter tuples identified by the integers - in the ``Param`` statement types given for the loop statement - types follow. + in the ``RAJA::statement::Param`` statement types given for the loop statement + types follow.
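For readers who want a complete, compilable picture of the basic tiled kernel described above, here is a minimal sketch; the sequential policies, the tile size of two, and the policy name ``TILED_EXEC_POL`` are illustrative assumptions rather than the exact policy used in the documentation::

   using TILED_EXEC_POL =
     RAJA::KernelPolicy<
       RAJA::statement::Tile<0, RAJA::tile_fixed<2>, RAJA::seq_exec,  // outer loop over tiles
         RAJA::statement::For<0, RAJA::seq_exec,                      // inner loop within each tile
           RAJA::statement::Lambda<0>
         >
       >
     >;

   RAJA::kernel<TILED_EXEC_POL>(
     RAJA::make_tuple(RAJA::TypedRangeSegment<int>(0, 10)),
     [=] (int i) {
       // 'i' is the global loop index, the same as in the non-tiled loop
     }
   );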
diff --git a/docs/sphinx/user_guide/feature/vectorization.rst b/docs/sphinx/user_guide/feature/vectorization.rst index 081ebf9028..057c0695d6 100644 --- a/docs/sphinx/user_guide/feature/vectorization.rst +++ b/docs/sphinx/user_guide/feature/vectorization.rst @@ -14,86 +14,94 @@ Vectorization (SIMD/SIMT) .. warning:: **This section describes an initial draft of an incomplete, experimental RAJA capability. It is not considered ready - for production. A basic description is provided here so - that (potentially) interested users can take a look, try it - out, and provide input if they wish to do so.** + for production, but it is ready for interested users to try.** -The RAJA team is experimenting with an API for SIMD/SIMT programming. -The goal is to make the implementation perform as well as if one used -vectorization intrinsics directly in their code, but without the + * We provide a basic description here so that interested users + can take a look, try it out, and provide input if they wish to + do so. The RAJA team values early feedback from users on new + capabilities. + + * There are no usage examples available in RAJA yet, except for + tests. Examples will be made available as they are developed. + +The aim of the RAJA API for SIMD/SIMT programming described in this section +is to make an implementation perform as well as if one used +SIMD/SIMT intrinsics directly in her code, but without the software complexity and maintenance burden associated with doing that. -In particular, our goal is to *guarantee* that specified vectorization -occurs without needing to explicitly use intrinsics in user code or +In particular, we want to *guarantee* that specified vectorization +occurs without requiring users to manually insert intrinsics in their code or rely on compiler auto-vectorization implementations. -.. note:: All RAJA vectorization types are in the namespace ``RAJA::expt``. +.. note:: All RAJA vectorization types described here are in the namespace + ``RAJA::expt``. -Currently, the main abstractions developed in RAJA so far are: +Currently, the main abstractions in RAJA for SIMD/SIMT programming are: - * ``Register`` wraps underlying SIMD/SIMT hardware registers and - provides consistent uniform access to them, using intrinsics under the - API when possible. The RAJA register abstraction currently supports the - following hardware-specific ISAs : AVX, AVX2, AVX512, CUDA, and HIP. - * ``Vector`` builds on ``Register`` to provide arbitrary length + * ``Register`` which wraps underlying SIMD/SIMT hardware registers and + provides consistent uniform access to them, using intrinsics behind the + API when possible. The register abstraction currently supports the + following hardware-specific ISAs (instruction set architectures): + AVX, AVX2, AVX512, CUDA, and HIP. + * ``Vector`` which builds on ``Register`` to provide arbitrary length vectors and operations on them. - * ``Matrix`` builds on ``Register`` to provide arbitrary-sized - matrices, column-major and row-major layouts, and operations on them. + * ``Matrix`` which builds on ``Register`` to provide arbitrary-sized + matrices and operations on them, including support for column-major and + row-major data layouts. 
-Finally, these capabilities integrate with RAJA :ref:`view-label` -capabilities, which implements am expression-template system that allows -a user to write linear algebra expressions on arbitrarily sized scalars, +Using these abstractions, RAJA provides an expression-template system that +allows users to write linear algebra expressions on arbitrarily sized scalars, vectors, and matrices and have the appropriate SIMD/SIMT instructions -performed during expression evaluation. +performed during expression evaluation. These capabilities integrate with +RAJA :ref:`feat-view-label` capabilities, which insulate load/store and other +operations from user code. ------------------------ Why Are We Doing This? ------------------------ -Quoting Tim Foley in `Matt Pharr's blog `_: "Auto-vectorization is not a programming model". Unless, of -course, you consider "hope for the best" to be a sound plan. - -Auto-vectorization is problematic for multiple reasons. First, vectorization -is not explicit in the source code and so compilers must divine correctness -when attempting to apply vectorization optimizations. Since most compilers -are very conservative in this regard, many vectorization opportunities are -typically missed when one relies solely on compiler auto-vectorization. -Second, every compiler will treat your code differently since compiler -implementations use different heuristics, even for different versions of the -same compiler. So performance portability is not just an issue with respect to -hardware, but also across compilers. Third, it is impossible in general for -most application developers to clearly understand the decisions made by a -compiler during its optimization process. +Quoting Tim Foley in `Matt Pharr's blog `_ -- "Auto-vectorization is not a programming model". This is +true, of course, unless you consider "hope for the best" that the compiler +optimizes the way you want to be a sound code development strategy. + +Compiler auto-vectorization is problematic for multiple reasons. First, when +vectorization is not explicit in source code, compilers must divine correctness +when attempting to apply vectorization optimizations. Most compilers are very +conservative in this regard, due to the possibility of data aliasing in C and +C++ and prioritizing correctness over performance. Thus, many vectorization +opportunities are usually missed when one relies solely on compiler +auto-vectorization. Second, every compiler will treat your code differently +since compiler implementations use different optimization heuristics, even in +different versions of the same compiler. So performance portability is not +just an issue with respect to hardware, but also for compilers. Third, it is +generally impossible for most application developers to clearly understand +the choices made by compilers during optimization processes. Using vectorization intrinsics in application source code is also problematic because different processors support different instruction set architectures (ISAs) and so source code portability requires a mechanism that insulates it from architecture-specific code. -GPU programming makes us be explicit about parallelization, and SIMD +Writing GPU code makes a programmer be explicit about parallelization, and SIMD is really no different. RAJA enables single-source portable code across a variety of programming model back-ends. 
The RAJA vectorization abstractions -introduced here are an attempt to bring a level of convergence between SIMD +introduced here are an attempt to bring some convergence between SIMD and GPU programming by providing uniform access to hardware-specific acceleration. -.. note:: **Auto-vectorization is not a programming model.** --Tim Foley +.. important:: **Auto-vectorization is not a programming model.** --Tim Foley --------------------- Register --------------------- -``RAJA::expt::Register`` is a class template that takes a -a data type parameter ``T`` and a register policy ``REGISTER_POLICY`` that -indicates the hardware register type. The ``RAJA::expt::Register`` interface -provides uniform access to register-level operations. It is intended as a -building block for higher level abstractions. A ``RAJA::expt::Register`` type -represents one SIMD register on a CPU architecture and 1 value/SIMT lane on -a GPU architecture. - -.. note:: A user can use the ``RAJA::expt::Register`` type directly in their - code. However, we do not recommend this. Instead, we want users to - employ higher level abstractions that RAJA provides. +``RAJA::expt::Register`` is a class template with +parameters for a data type ``T`` and a register policy ``REGISTER_POLICY``, +which specifies the hardware register type. It is intended as a building block +for higher level abstractions. The ``RAJA::expt::Register`` interface provides +uniform access to register-level operations for different hardware features +and ISA models. A ``RAJA::expt::Register`` type represents one SIMD register +on a CPU architecture and 1 value/SIMT lane on a GPU architecture. ``RAJA::expt::Register`` supports four scalar element types, ``int32_t``, ``int64_t``, ``float``, and ``double``. These are the only types that are @@ -101,15 +109,20 @@ portable across all SIMD/SIMT architectures. ``Bfloat``, for example, is not portable, so we don't provide support for that type. ``RAJA::expt::Register`` supports the following SIMD/SIMT hardware-specific -ISAs: AVX, AVX2, and AVX512 for SIMD CPU vectorization, and CUDA warp, -HIP wavefront for GPUs. Scalar support is provided for all hardware for -portability and experimentation/analysis. Extensions to support other -architectures may be forthcoming and should be straightforward to implement. +ISAs: AVX, AVX2, and AVX512 for SIMD CPU vectorization, and CUDA warp and +HIP wavefront for NVIDIA and AMD GPUs, respectively. Scalar support is +provided for all hardware for portability and experimentation/analysis. +Extensions to support other architectures may be forthcoming as they are +needed and requested by users. + +.. note:: One can use the ``RAJA::expt::Register`` type directly in her + code. However, we do not recommend it. Instead, we want users to + employ higher level abstractions that RAJA provides. Register Operations ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -``RAJA::expt::Register`` provides various operations, including: +``RAJA::expt::Register`` provides various operations which include: * Basic SIMD handling: get element, broadcast * Memory operations: load (packed, strided, gather) and store (packed, strided, scatter) @@ -124,41 +137,43 @@ Register Operations Register DAXPY Example ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -The following is a code example that shows using the ``RAJA::expt::Register`` -class to perform a DAXPY kernel with AVX2 CPU SIMD instructions. -Again, we do not recommend that you write code directly using the Register -class, but use the higher level VectorRegister abstraction. 
-However, this example demonstrates how the higher level abstractions are -using the Register class:: +The following code example shows how to use the ``RAJA::expt::Register`` +class to perform a DAXPY kernel with AVX2 SIMD instructions. +While we do not recommend that you write code directly using the Register +class, and instead suggest using the higher level VectorRegister abstraction, we use +the Register type here to illustrate the basic mechanics of SIMD +vectorization:: - // define array length + // Define array length int len = ...; - // data used in kernel + // Define data used in kernel double a = ...; double const *X = ...; double const *Y = ...; double *Z = ...; + // Define an avx2 register, which has width of 4 doubles using reg_t = RAJA::expt::Register<double, RAJA::expt::avx2_register>; - int reg_width = reg_t::s_num_elem; // width of avx2 register is 4 doubles + int reg_width = reg_t::s_num_elem; - // Compute daxpy in chunks of 4 values at one time + // Compute daxpy in chunks of 4 values (register width) at a time for (int i = 0;i < len; i += reg_width){ reg_t x, y; - // load 4 consecutive values of X, Y arrays into registers + // Load 4 consecutive values of X, Y arrays into registers x.load_packed( X+i ); y.load_packed( Y+i ); - // perform daxpy on 4 values simultaneously (store in register) + // Perform daxpy on 4 values simultaneously and store in a register reg_t z = a * x + y; - // store register result in Z array + // Store register result in Z array z.store_packed( Z+i ); } - // loop postamble code + // Loop postamble code to complete daxpy operation when array length + // is not an integer multiple of the register width int remainder = len % reg_width; if (remainder) { reg_t x, y; @@ -166,119 +181,114 @@ using the Register class:: // 'i' is the starting array index of the remainder int i = len - remainder; - // load remainder values of X, Y arrays into registers + // Load remainder values of X, Y arrays into registers x.load_packed_n( X+i, remainder ); y.load_packed_n( Y+i, remainder ); - // perform daxpy on remainder values simultaneously (store in register) + // Perform daxpy on remainder values simultaneously and store in register reg_t z = a * x + y; - // store register result in Z array + // Store register result in Z array z.store_packed_n(Z+i, remainder); } This code is guaranteed to vectorize since the ``RAJA::expt::Register`` -operations insert the appropriate SIMD intrinsic operations into the method -calls. Note that ``RAJA::expt::Register`` provides overloads of basic -arithmetic operations so that the DAXPY operation itself (z = a * x + y) looks +operations insert the appropriate SIMD intrinsics into the operation +calls. Since ``RAJA::expt::Register`` provides overloads of basic +arithmetic operations, the SIMD DAXPY operation ``z = a * x + y`` looks like vanilla scalar code. -Note that since we are using bare pointers to the data, load and store +Because we are using bare pointers to the data, load and store operations are performed by explicit method calls in the code. Also, we must -write (duplicate) postamble code to handle cases where the array length -(len) is not an integer multiple of the register width. The postamble code -perform the DAXPY operation on the *remainder* of the array that remains after -the for-loop.
- -**These extra lines of code should make it clear why we do not recommend -using ``RAJA::Register`` directly in application code.** +write explicit *postamble* code to handle cases where the array length +``len`` is not an integer multiple of the register width ``reg_width``. The +postamble code performs the DAXPY operation on the *remainder* of the array +that is excluded from the for-loop, which is strided by the register width. +**The need to write extra postamble code should make clear one reason why we +do not recommend using ``RAJA::Register`` directly in application code.** -------------------- -Tensor Register -------------------- - -``RAJA::expt::TensorRegister< >`` is a class template that provides a -higher-level interface on top of the ``RAJA::expt::Register`` class. -``RAJA::expt::TensorRegister< >`` wraps one or more -``RAJA::expt::Register< >`` objects to create a tensor-like object. - -.. note:: As with ``RAJA::expt::Register``, we don't recommend using - ``RAJA::expt::TensorRegister`` directly. Rather, we recommend using - use-case specific types that RAJA provides and which are described - below. +------------------ +Vector Register +------------------ **To make code cleaner and more readable, the specific types are intended to be used with ``RAJA::View`` and ``RAJA::expt::TensorIndex`` objects.** -Vector Register -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - ``RAJA::expt::VectorRegister`` provides an abstraction for a vector of arbitrary length. It is implemented using one or more ``RAJA::expt::Register`` objects. The vector length is independent of the -underlying register width. The template parameters are: ``T`` data type, -``REGISTER_POLICY`` vector register policy, and ``NUM_ELEM`` number of -data elements of type ``T`` that fit in a register. The last two of these -have defaults for all cases, so they do not usually need to be provided by -a user. - -Earlier, we said that we do not recommended using ``RAJA::expt::Register`` -directly. The reason for this is that it is good to decouple -vector length from hardware register size since it allows one to write +underlying register width. The template parameters are: data type ``T``, +vector register policy ``REGISTER_POLICY``, and ``NUM_ELEM`` which +is the number of data elements of type ``T`` that fit in a register. The last +two of these template parameters have defaults for all cases, so a user +need not provide them in most cases. + +Recall that we said earlier that we do not recommend using +``RAJA::expt::Register`` directly. One important reason for this is that +decoupling the vector length from hardware register size allows one to write simpler, more readable code that is easier to get correct. This should be -clear from the code example below. +clear from the code example below, when compared to the previous code example. Vector Register DAXPY Example ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -The following code example shows the DAXPY computation shown above written -using ``RAJA::expt::VectorRegister``, ``RAJA::expt::VectorIndex``, and -``RAJA::View`` classes, which obviate the need for the extra lines of code -discussed earlier:: +The following code example shows the DAXPY computation discussed above, +but written using ``RAJA::expt::VectorRegister``, ``RAJA::expt::VectorIndex``, +and ``RAJA::View`` types. Using these types, we can write cleaner, more +concise code that is easier to get correct because it is simpler.
For example, +we do not have to write the postamble code discussed earlier:: - // define array length and data used in kernel (as before) + // Define array length and data used in kernel (as before) int len = ...; double a = ...; double const *X = ...; double const *Y = ...; double *Z = ...; - // define vector register and index types + // Define vector register and index types using vec_t = RAJA::expt::VectorRegister; using idx_t = RAJA::expt::VectorIndex; - // wrap array pointers in RAJA View objects + // Wrap array pointers in RAJA View objects auto vX = RAJA::make_view( X, len ); auto vY = RAJA::make_view( Y, len ); auto vZ = RAJA::make_view( Z, len ); - // 'all' knows the length of vX, vY, and vZ from the View objects - // and it encodes the vector type + // The 'all' variable gets the length of the arrays from the vX, vY, and + // vZ View objects and encodes the vector register type auto all = idx_t::all(); - // compute the complete array daxpy in one line of code - // this produces a vectorized loop, and the loop postamble + // Compute the complete array daxpy in one line of code + // this produces a vectorized loop and the loop postamble + // in the executable vZ( all ) = a * vX( all ) + vY( all ); -This code has several advantages over the previous example. It is guaranteed -to vectorize and is much easier to read, get correct, and maintain since -the ``RAJA::View`` class handles the looping and postamble code automatically -to allow arrays of arbitrary size. The ``RAJA::View`` class provides overloads -of the arithmetic operations based on the 'all' type and inserts the -appropriate SIMD instructions and load/store operations to vectorize the -operations as in the earlier example. It may be considered by some to be -inconvenient to have to use the ``RAJA::View`` class, but it is easy to wrap -bare pointers as can is shown in the example. +It should be clear that this code has several advantages over the previous +code example. It is guaranteed to vectorize as before, but it is much easier +to read, get correct, and maintain since the ``RAJA::View`` class handles the +looping and postamble code automatically for arrays of arbitrary size. The +``RAJA::View`` class provides overloads of the arithmetic operations based on +the ``all`` variable and inserts the appropriate SIMD instructions and +load/store operations to vectorize the operations that were explicit in the +earlier example. It may be considered by some to be inconvenient to have to +use the ``RAJA::View`` class, but it is easy to wrap bare pointers as is shown +here. Expression Templates -^^^^^^^^^^^^^^^^^^^^^^^^ +^^^^^^^^^^^^^^^^^^^^^ -The figure below shows the sequence of SIMD operations, in the form of an -*abstract syntax tree (AST)*, applied in the DAXPY code by the RAJA constructs -used in the code example. During compilation, a tree of *expression template* -objects is constructed based on the order of operations that appear in the -kernel. Specifically, the operation sequence is the following: +The figure below shows the sequence of SIMD operations, as they are parsed to +form of an *abstract syntax tree (AST)*, for the DAXPY code in the vector +register code example above. + +.. figure:: ../figures/vectorET.png + + An AST illustration of the SIMD operations in the DAXPY code. + +During compilation, a tree of *expression template* objects is constructed +based on the order of operations that appear in the DAXPY kernel. Specifically, +the operation sequence is the following: #. 
Load a chunk of values in 'vX' into a register. #. Broadcast the scalar value 'a' to each slot in a vector register. @@ -289,26 +299,21 @@ kernel. Specifically, the operation sequence is the following: #. Write the result in the register to the 'vZ' array. ``RAJA::View`` objects indexed by ``RAJA::TensorIndex`` objects -(``RAJA::VectorIndex`` in this case) return *LoadStore* expression +(``RAJA::VectorIndex`` in this case) return *Load/Store* expression template objects. Each expression template object is evaluated on assignment and a register chunk size of values is loaded into another register object. Finally, the left-hand side of the expression is evaluated by storing the -chunk of values in the right-hand side result register into the array on the -left-hand side of the equal sign. - -.. figure:: ../figures/vectorET.png - - An AST illustration of the SIMD operations in the DAXPY code. - +chunk of values in the right-hand side result register into the array associated +with the view ``vZ`` on the left-hand side of the equal sign. CPU/GPU Portability ^^^^^^^^^^^^^^^^^^^^^ -It is important to note that the code in the example in the previous section is -*not* portable to run on a GPU because it does not include a way to launch a -GPU kernel. The following code example shows how to enable the code to run on -either a CPU or GPU via a run time choice:: +It is important to note that the code in the example above can only run on a +CPU; i.e., it is *not* portable to run on either a CPU or GPU because it does +not include a way to launch a GPU kernel. The following code example shows +how to enable the code to run on either a CPU or GPU via a run time choice:: // array lengths and data used in kernel same as above @@ -339,53 +344,69 @@ either a CPU or GPU via a run time choice:: This version of the kernel can be run on a CPU or GPU depending on the run time chosen value of the variable ``cpu_or_gpu``. When compiled, the code will -generate versions of the kernel for the CPU and GPU based on the parameters -in the ``pol_t`` loop policy. The CPU version will be the same as the version -in the previous section. The GPU version is essentially the same but will -run in a GPU kernel. Note that there is only one template argument passed to -the register when ``vec_t`` is defined. ``RAJA::expt::VectorRegister`` -uses defaults for the register policy, based on the system hardware, and -number of data elements of type double that will fit in a register. +generate versions of the kernel for a CPU and a CUDA GPU based on the +parameters in the ``pol_t`` loop policy. The CPU version will be the same +as the version described earlier. The GPU version is essentially the same +but will run in a GPU kernel. Note that there is only one template argument +passed to the register when ``vec_t`` is defined. +``RAJA::expt::VectorRegister<double>`` uses defaults for the register policy, +based on the system hardware, and the number of data elements of type double that +will fit in a register. + +------------------- +Tensor Register +------------------- + +``RAJA::expt::TensorRegister< >`` is a class template that provides a +higher-level interface on top of ``RAJA::expt::Register``. +``RAJA::expt::TensorRegister< >`` wraps one or more +``RAJA::expt::Register< >`` objects to create a tensor-like object. + +.. note:: As with ``RAJA::expt::Register``, we don't recommend using + ``RAJA::expt::TensorRegister`` directly. Rather, we recommend using + higher-level abstraction types that RAJA provides and which are + described below.
+----------------------- Matrix Registers -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +----------------------- RAJA provides ``RAJA::expt::TensorRegister`` type aliases to support matrices of arbitrary size and shape. These are: - * ``RAJA::expt::SquaretMatrixRegister`` which + * ``RAJA::expt::SquareMatrixRegister`` which abstracts operations on an N x N square matrix. * ``RAJA::expt::RectMatrixRegister`` - which abstracts operations on an N x M rectangular matrix. + which abstracts operations on an N x M rectangular matrix. Matrices are implemented using one or more ``RAJA::expt::Register`` objects. Data layout can be row-major or column major. Matrices are intended to be used with ``RAJA::View`` and ``RAJA::expt::TensorIndex`` objects, -similar to what was shown above with ``RAJA::expt::VectorRegister`` example. +similar to what was shown above in the ``RAJA::expt::VectorRegister`` example. -Matrix operations support matrix-matrix, matrix-vector, and vector-matrix +Matrix operations support matrix-matrix, matrix-vector, vector-matrix multiplication, and transpose operations. Rows or columns can be represented with one or more registers, or a power-of-two fraction of a single register. -This is important for CUDA GPU warp/wavefront registers, which are 32-wide for +This is important for GPU warp/wavefront registers, which are 32-wide for CUDA and 64-wide for HIP. -Here is a simple code example that performs the matrix-analogue of the -vector DAXPY operation presented above using square matrices:: +Here is a code example that performs the matrix-analogue of the +vector DAXPY operation using square matrices:: - // define matrix size and data used in kernel (similar to before) + // Define matrix size and data used in kernel (similar to before) int N = ...; double a = ...; double const *X = ...; double const *Y = ...; double *Z = ...; - // define matrix register and row/column index types + // Define matrix register and row/column index types using mat_t = RAJA::expt::SquareMatrixRegister; using row_t = RAJA::expt::RowIndex; using col_t = RAJA::expt::ColIndex; - // wrap array pointers in RAJA View objects (similar to before) + // Wrap array pointers in RAJA View objects (similar to before) auto mX = RAJA::make_view( X, N, N ); auto mY = RAJA::make_view( Y, N, N ); auto mZ = RAJA::make_view( Z, N, N ); @@ -409,9 +430,10 @@ vector DAXPY operation presented above using square matrices:: ); Conceptually, as well as implementation-wise, this is similar to the previous -vector example except the operations are in two dimensions. The kernel code is -easy to read, it is guaranteed to vectorize, and iterating over the data is -handled by RAJA (register width sized chunk, plus postamble scalar operations). -Again, the ``RAJA::View`` arithmetic operation overloads insert the +vector example except the operations are on two-dimensional matrices. The +kernel code is easy to read, it is guaranteed to vectorize, and iterating +over the data is handled by RAJA view objects (register-width sized chunk, +plus postamble scalar operations), and it can run on a CPU or NVIDIA GPU. As +before, the ``RAJA::View`` arithmetic operation overloads insert the appropriate vector instructions in the code. diff --git a/docs/sphinx/user_guide/feature/view.rst b/docs/sphinx/user_guide/feature/view.rst index da10c83a25..79814cd348 100644 --- a/docs/sphinx/user_guide/feature/view.rst +++ b/docs/sphinx/user_guide/feature/view.rst @@ -6,7 +6,7 @@ .. ## SPDX-License-Identifier: (BSD-3-Clause) .. ## -.. _view-label: +.. 
_feat-view-label: =============== View and Layout =============== @@ -31,7 +31,16 @@ to access a matrix entry in row `r` and column `c`. However, this solution has limitations; e.g., additional macro definitions may be needed when adopting a different matrix data layout or when using other matrices. To facilitate multi-dimensional indexing and different indexing layouts, RAJA provides -``RAJA::View`` and ``RAJA::Layout`` classes. +``RAJA::View``, ``RAJA::Layout``, and ``RAJA::OffsetLayout`` classes. + +Please see the following tutorial sections for detailed examples that use +RAJA Views and Layouts: + + * :ref:`tut-view_layout-label` + * :ref:`tut-offsetlayout-label` + * :ref:`tut-permutedlayout-label` + * :ref:`tut-kernelexecpols-label` + * :ref:`tut-launchexecpols-label` ---------- RAJA Views ---------- @@ -167,7 +176,7 @@ stride, the third index (index 2 - extent 11) has stride 5, and the second index (index 1 - extent 7) has stride 55 (= 5*11). .. note:: If a permuted layout is created with the *identity permutation* - (e.g., {0,1,2}, the layout is the same as if it were created by + (e.g., {0,1,2}), the layout is the same as if it were created by calling the Layout constructor directly with no permutation. The first argument to ``RAJA::make_permuted_layout`` is a C++ array whose @@ -210,16 +219,16 @@ Offset Layout The ``RAJA::make_offset_layout`` method creates a ``RAJA::OffsetLayout`` object with offsets applied to the indices. For example,:: - double* C = new double[11]; + double* C = new double[10]; RAJA::Layout<1> layout = RAJA::make_offset_layout<1>( {{-5}}, {{5}} ); RAJA::View > Cview(C, layout); creates a one-dimensional view with a layout that allows one to index into -it using indices in :math:`[-5, 5]`. In other words, one can use the loop:: +it using indices in :math:`[-5, 5)`. In other words, one can use the loop:: - for (int i = -5; i < 6; ++i) { + for (int i = -5; i < 5; ++i) { CView(i) = ...; } @@ -228,21 +237,22 @@ to an array offset index by subtracting the lower offset from it; i.e., in the loop, each 'i' value has '-5' subtracted from it to properly access the array entry. That is, the sequence of indices generated by the for-loop:: - -5 -4 -3 ... 5 + -5 -4 -3 ... 4 will index into the data array as:: - 0 1 2 ... 10 + 0 1 2 ... 9 The arguments to the ``RAJA::make_offset_layout`` method are C++ arrays that -hold the start and end values of the indices. RAJA offset layouts support -any number of dimensions; for example:: +hold the begin and end values of indices in the half-open interval +:math:`[begin, end)`. RAJA offset layouts support any number of dimensions; +for example:: RAJA::OffsetLayout<2> layout = RAJA::make_offset_layout<2>({{-1, -5}}, {{2, 5}}); defines a two-dimensional layout that enables one to index into a view using -indices :math:`[-1, 2]` in the first dimension and indices :math:`[-5, 5]` in +indices :math:`[-1, 2)` in the first dimension and indices :math:`[-5, 5)` in the second dimension. As noted earlier, double braces are needed to properly initialize the internal data in the layout object. @@ -257,10 +267,10 @@ indices. For example,:: RAJA::OffsetLayout<2> layout = RAJA::make_permuted_offset_layout<2>( {{-1, -5}}, {{2, 5}}, perm ); -Here, the two-dimensional index space is :math:`[-1, 2] \times [-5, 5]`, the +Here, the two-dimensional index space is :math:`[-1, 2) \times [-5, 5)`, the same as above.
However, the index strides are permuted so that the first -index (index 0) has unit stride and the second index (index 1) has stride 4, -which is the extent of the first index (:math:`[-1, 2]`). +index (index 0) has unit stride and the second index (index 1) has stride 3, +which is the extent of the first index (:math:`[-1, 2)`). .. note:: It is important to note some facts about RAJA layout types. All layouts have a permutation. So a permuted layout and @@ -272,7 +282,7 @@ which is the extent of the first index (:math:`[-1, 2]`). ``RAJA::View`` data access operator when they are not needed. Complete examples illustrating ``RAJA::Layouts`` and ``RAJA::Views`` may -be found in the :ref:`offset-label` and :ref:`permuted-layout-label` +be found in the :ref:`tut-offsetlayout-label` and :ref:`tut-permutedlayout-label` tutorial sections. Typed Layouts @@ -282,7 +292,7 @@ RAJA provides typed variants of ``RAJA::Layout`` and ``RAJA::OffsetLayout`` that enable users to specify integral index types. Usage requires specifying types for the linear index and the multi-dimensional indicies. The following example creates two two-dimensional typed layouts where the -linear index is of type TIL and the '(x, y)' indices for accesingg the data +linear index is of type TIL and the '(x, y)' indices for accessing the data have types TIX and TIY:: RAJA_INDEX_VALUE(TIX, "TIX"); @@ -404,5 +414,5 @@ runtime bounds checking for RAJA views. This may be a useful debugging aid for users. When attempting to use an index value that is out of bounds, RAJA will abort the program and print the index that is out of bounds and the value of the index and bounds for it. Since the bounds checking is a runtime -operation, it incurs non-negligible overhead. When bounds checkoing is turned +operation, it incurs non-negligible overhead. When bounds checking is turned off (default case), there is no additional run time overhead incurred. diff --git a/docs/sphinx/user_guide/feature/workgroup.rst b/docs/sphinx/user_guide/feature/workgroup.rst index fba8e19310..25087d64dd 100644 --- a/docs/sphinx/user_guide/feature/workgroup.rst +++ b/docs/sphinx/user_guide/feature/workgroup.rst @@ -21,16 +21,15 @@ represents an executable form of those loops and when run makes a ``RAJA::WorkSi that the RAJA workgroup constructs API is still being developed and may change in later RAJA releases. -.. note:: * All **workgroup** constructs are in the namespace ``RAJA``. - * The ``RAJA::WorkPool``, ``RAJA::WorkGroup``, and ``RAJA::WorkSite`` class templates - are templated on: +.. note:: * All workgroup constructs are in the namespace ``RAJA``. + * The ``RAJA::WorkPool``, ``RAJA::WorkGroup``, and ``RAJA::WorkSite`` class templates are templated on: * a WorkGroup policy which is composed of: - * a work execution policy. - * a work ordering policy. - * a work storage policy. + * a work execution policy + * a work ordering policy + * a work storage policy + * a work dispatch policy * an index type that is the first argument to the loop bodies. - * a list of extra argument types that are the rest of the arguments to - the loop bodies. + * a list of extra argument types that are the rest of the arguments to the loop bodies. * an allocator type to be used for the memory used to store and manage the loop bodies. * The ``RAJA::WorkPool::enqueue`` method takes two arguments: @@ -43,7 +42,7 @@ Examples showing how to use RAJA workgroup methods may be found in the :ref:`tutorial-label`. 
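To make the enqueue/run workflow concrete before the policy details that follow, here is a minimal sketch. It assumes the sequential, ordered, function-pointer-dispatch policy and the ``std::allocator<char>`` allocator discussed in this section; the type aliases, array names, and loop bodies are illustrative only, not part of the RAJA API::

   using workgroup_policy = RAJA::WorkGroupPolicy < RAJA::seq_work,
                                                    RAJA::ordered,
                                                    RAJA::ragged_array_of_objects,
                                                    RAJA::indirect_function_call_dispatch >;

   using Allocator = std::allocator<char>;

   // WorkPool, WorkGroup, and WorkSite take the same template arguments:
   // policy, index type, extra loop-body argument types, and allocator.
   using pool_t  = RAJA::WorkPool < workgroup_policy, int, RAJA::xargs<>, Allocator >;
   using group_t = RAJA::WorkGroup< workgroup_policy, int, RAJA::xargs<>, Allocator >;
   using site_t  = RAJA::WorkSite < workgroup_policy, int, RAJA::xargs<>, Allocator >;

   pool_t pool( Allocator{} );

   // Enqueue two loops; nothing runs yet, the pool only stores them.
   pool.enqueue( RAJA::TypedRangeSegment<int>(0, N), [=] (int i) { a[i] += b[i]; } );
   pool.enqueue( RAJA::TypedRangeSegment<int>(0, M), [=] (int i) { c[i] *= 2.0; } );

   // Package the stored loops into a runnable WorkGroup, then run them in order.
   group_t group = pool.instantiate();
   site_t  site  = group.run();

Nothing executes until ``run`` is called on the ``RAJA::WorkGroup`` object produced by ``instantiate``; the policy components described next control where and in what order the enqueued loops run.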
For more information on RAJA work policies and iteration space constructs, -see :ref:`policies-label` and :ref:`index-label`, respectively. +see :ref:`feat-policies-label` and :ref:`feat-index-label`, respectively. .. _workgroup-Policies-label: -------- Policies -------- The behavior of the RAJA workgroup constructs is determined by a policy. -The ``RAJA::WorkGroupPolicy`` has three components, a work execution policy, -a work ordering policy, and a work storage policy. ``RAJA::WorkPool``, -``RAJA::WorkGroup``, and ``RAJA::WorkSite`` class templates all -take the same policy and template arguments. For example:: +The ``RAJA::WorkGroupPolicy`` has four components, a work execution policy, +a work ordering policy, a work storage policy, and a work dispatch policy. +``RAJA::WorkPool``, ``RAJA::WorkGroup``, and ``RAJA::WorkSite`` class templates +all take the same policy and template arguments. For example:: using workgroup_policy = RAJA::WorkGroupPolicy < RAJA::seq_work, RAJA::ordered, - RAJA::ragged_array_of_objects >; + RAJA::ragged_array_of_objects, + RAJA::indirect_function_call_dispatch >; is a workgroup policy that will run loops sequentially on the host in the order -they were enqueued and store the loop bodies sequentially in single buffer in -memory. +they were enqueued, store the loop bodies sequentially in a single buffer in +memory, and dispatch each loop using a function pointer. The work execution policy acts like the execution policies used with ``RAJA::forall`` and determines the backend used to run the loops and the parallelism within each @@ -97,25 +97,24 @@ The work ordering policy acts like the segment iteration execution policies when ``RAJA::forall`` is used with a ``RAJA::IndexSet`` and determines the backend used when iterating over the loops and the parallelism between each loop. - ====================================== ======================================== - Work Execution Policies Brief description - ====================================== ======================================== - ordered Execute loops sequentially in the order - they were enqueued using forall. - reverse_ordered Execute loops sequentially in the - reverse of the order order they were - enqueued using forall. - unordered_cuda_loop_y_block_iter_x_threadblock_average - Execute loops in parallel by mapping - each loop to a set of cuda blocks with - the same index in the y direction in - a cuda kernel. Each loop is given a - number of threads over one of more - blocks in the x direction equal to the - average number of iterations of all the - loops rounded up to a multiple of the - block size. - ====================================== ======================================== + ======================================================= ======================================== + Work Ordering Policies Brief description + ======================================================= ======================================== + ordered Execute loops sequentially in the order + they were enqueued using forall. + reverse_ordered Execute loops sequentially in the + reverse of the order they were + enqueued using forall. + unordered_cuda_loop_y_block_iter_x_threadblock_average Execute loops in parallel by mapping + each loop to a set of cuda blocks with + the same index in the y direction in + a cuda kernel. Each loop is given a + number of threads over one or more + blocks in the x direction equal to the + average number of iterations of all the + loops rounded up to a multiple of the + block size.
+ ======================================================= ======================================== The work storage policy determines the strategy used to allocate and layout the storage used to store the ranges, loop bodies, and other data necessary to @@ -140,6 +139,23 @@ implement the workstorage constructs. the loop data items as needed. ====================================== ======================================== +The work dispatch policy determines the technique used to dispatch from type +erased storage to the loops or iterations of each range and loop body pair. + + ====================================== ======================================== + Work Dispatch Policies Brief description + ====================================== ======================================== + indirect_function_call_dispatch Dispatch using function pointers. + indirect_virtual_function_dispatch Dispatch using virtual functions in a + class hierarchy. + direct_dispatch< Dispatch using a switch statement like + camp::list...> coding to pick the right pair of + Range and Callable types from the + template parameter pack. You may only + enqueue a range and callable pair that + is in the list of types in the policy. + ====================================== ======================================== + .. _workgroup-Arguments-label: @@ -187,16 +203,17 @@ policies:: using Allocator = std::allocator<char>; -.. note:: * The allocator type must use template argument char. +.. note:: * The allocator type must use template argument ``char``. * Allocators must provide memory that is accessible where it is used. * Ordered work order policies only require memory that is accessible where loop bodies are enqueued. * Unordered work order policies require memory that is accessible from both where the loop bodies are enqueued and from where the loop is executed based on the work execution policy. - * For example when using cuda work exeution policies with cuda - unordered work order policies pinned memory is a good choice - because it is always accessible on the host and device. + + For example, when using cuda work execution policies with CUDA + unordered work order policies, pinned memory is a good choice + because it is always accessible on the host and device. .. _workgroup-WorkPool-label: diff --git a/docs/sphinx/user_guide/features.rst b/docs/sphinx/user_guide/features.rst index 8752b546c9..d65384d782 100644 --- a/docs/sphinx/user_guide/features.rst +++ b/docs/sphinx/user_guide/features.rst @@ -12,7 +12,10 @@ RAJA Features ************************ -The following sections describe key aspects of the main RAJA features. +The following sections describe the main RAJA features. They are intended +to introduce users to the features and basic usage and also to provide +a syntax reference guide. The sections contain links to RAJA tutorial +materials that provide detailed examples of usage. .. toctree:: :maxdepth: 2
feature/iteration_spaces feature/view feature/reduction - feature/resource feature/atomic feature/scan feature/sort + feature/resource feature/local_array feature/tiling - feature/plugins feature/workgroup feature/vectorization + feature/plugins diff --git a/docs/sphinx/user_guide/figures/vertexsum.jpg b/docs/sphinx/user_guide/figures/vertexsum.jpg index ea61476db0..967c6aec06 100644 Binary files a/docs/sphinx/user_guide/figures/vertexsum.jpg and b/docs/sphinx/user_guide/figures/vertexsum.jpg differ diff --git a/docs/sphinx/user_guide/figures/vertexsum_color.png b/docs/sphinx/user_guide/figures/vertexsum_color.png new file mode 100644 index 0000000000..3071d526ad Binary files /dev/null and b/docs/sphinx/user_guide/figures/vertexsum_color.png differ diff --git a/docs/sphinx/user_guide/getting_started.rst b/docs/sphinx/user_guide/getting_started.rst index 0d66ada68c..b9f868d10a 100644 --- a/docs/sphinx/user_guide/getting_started.rst +++ b/docs/sphinx/user_guide/getting_started.rst @@ -13,145 +13,206 @@ Getting Started With RAJA ************************* -This section will help get you up and running with RAJA quickly. +This section should help get you up and running with RAJA quickly. ============ Requirements ============ -The primary requirement for using RAJA is a C++14 compliant compiler. -Accessing various programming model back-ends requires that they be supported -by the compiler you chose. Available options and how to enable or disable -them are described in :ref:`configopt-label`. To build RAJA in its most basic -form and use its simplest features: +The primary requirement for using RAJA is a C++14 standard compliant compiler. +Certain features, such as various programming model back-ends like CUDA or HIP, +must be supported by the compiler you choose in order to use them. Available RAJA +configuration options and how to enable or disable features are described +in :ref:`configopt-label`. + +To build RAJA and use its most basic features, you will need: - C++ compiler with C++14 support -- `CMake `_ version 3.14.5 or greater. +- `CMake `_ version 3.23 or greater when building the HIP back-end, and version 3.20 or greater otherwise. ================== Get the Code ================== -The RAJA project is hosted on `GitHub `_. -To get the code, clone the repository into a local working space using -the command:: +The RAJA project is hosted on GitHub: +`GitHub RAJA project `_. To get the code, clone +the repository into a local working space using the command:: $ git clone --recursive https://github.com/LLNL/RAJA.git -The ``--recursive`` argument above is needed to pull in necessary RAJA -dependencies as Git *submodules*. Current RAJA dependencies are: +The ``--recursive`` option above is used to pull RAJA Git *submodules*, on +which RAJA depends, into your local copy of the RAJA repository. + +After running the clone command, a copy of the RAJA repository will reside in +the ``RAJA`` subdirectory where you ran the clone command. You will be on the +``develop`` branch, which is the default RAJA branch. + +If you do not pass the ``--recursive`` argument to the ``git clone`` +command, you can also type the following commands after cloning:: + + $ cd RAJA + $ git submodule update --init --recursive + +Either way, the end result is the same and you should be good to configure the +code and build it. + ..
note:: * If you switch branches in a RAJA repo (e.g., you are on a branch, + with everything up-to-date, and you run the command + ``git checkout ``, you may need to run + the command ``git submodule update`` to set the Git submodule + versions to what is used by the new branch. + * If the set of submodules in a new branch is different than the + previous branch you were on, you may need to run the command + ``git submodule update --init --recursive`` to pull in the + correct set of submodule and versions. + +.. _getting_started_depend-label: + +================== +Dependencies +================== +RAJA has several dependencies that are required based on how you want to +build and use it. The RAJA Git repository has submodules that contain +most of these dependencies. + +RAJA includes other submodule dependencies, which are used to support our +Gitlab CI testing. These are described in the RAJA Developer Guide. + +Dependencies that are required to build the RAJA code are: + +- A C++ 14 standard compliant compiler - `BLT build system `_ +- `CMake `_ version 3.23 or greater when building the HIP back-end, and version 3.20 or greater otherwise. - `Camp compiler agnostic metaprogramming library `_ -- `CUB CUDA utilities library `_ -- `rocPRIM HIP parallel primitives library `_ -You probably don't need to know much about these other projects to start -using RAJA. But, if you want to know more about them, click on the links above. +Other dependencies that users should be aware of that support certain +features are: -After running the clone command, a copy of the RAJA repository will reside in -a ``RAJA`` subdirectory where you ran the clone command. You will be on the -``develop`` branch of RAJA, which is our default branch. +- `CUB CUDA utilities library `_, which is required for using the RAJA CUDA back-end. +- `rocPRIM HIP parallel primitives library `_, which is required for using the RAJA HIP back-end. +- `Desul `_, which is required if you want to use Desul atomics in RAJA instead of our current default atomics. Note that we plan to switch over to Desul atomics exclusively at some point. -If you do not pass the ``--recursive`` argument to the ``git clone`` -command, you can type the following commands after cloning:: +.. note:: You may want or need to use external versions of camp, CUB, or + rocPRIM instead of the RAJA submodules. This is usually the case + when you are using RAJA along with some other library that also + needs one of these. To do so, you need to use CMake variables to + pass a path to a valid installation of each library. Specifically: - $ cd RAJA - $ git submodule init - $ git submodule update + * External camp:: + + cmake \ + ... \ + -Dcamp_DIR=path/to/camp/install \ + ... + + * External CUB:: + + cmake \ + ... \ + -DRAJA_ENABLE_EXTERNAL_CUB=On \ + -DCUB_DIR=path/to/cub \ + ... + + * External rocPRIM:: + + cmake \ + ... \ + -DRAJA_ENABLE_EXTERNAL_ROCPRIM=On \ + -DROCPRIM_DIR=path/to/rocPRIM \ + ... + +More information about configuring GPU builds with CUDA or HIP is provided +in :ref:`getting_started_build_gpu-label` -Either way, the end result is the same and you should be good to go. +Additional discussion of these dependencies, with respect to building RAJA, is +provided in :ref:`getting_started_build-label`. Other than that, you probably +don't need to know much about them. If you are curious and want to know more, +please click on the link to the library you want to know about in the above +list. -.. 
note:: Any time you switch branches in RAJA, you need to re-run the - 'git submodule update' command to set the Git submodules to - what is used by the new branch. +.. _getting_started_build-label: ================== Build and Install ================== -Building and installing RAJA can be very easy or more complicated, depending -on which features you want to use and how easy it is to use your system. +The complexity of building and installing RAJA depends on which features you +want to use and how easy it is to do this on your system. --------------- -Building RAJA --------------- +.. note:: RAJA builds must be *out-of-source*. In particular, RAJA does not + allow building in its source directory. You must create a build + directory and run CMake in it. -RAJA uses CMake to configure a build. A "bare bones" configuration looks like:: +RAJA uses CMake to configure a build. To create a "bare bones" configuration, +build, and install it, you can do the following:: $ mkdir build-dir && cd build-dir $ cmake -DCMAKE_INSTALL_PREFIX=/path/to/install ../ + $ make (or make -j for a parallel build) + $ make install -.. note:: * RAJA requires a minimum CMake version of 3.14.5. - * Builds must be *out-of-source*. RAJA does not allow building in - the source directory, so you must create a build directory and - run CMake in it. - -When you run CMake, it will generate output about the build environment -(compiler and version, options, etc.). Some RAJA features, -like OpenMP support are enabled by default if, for example, the compiler -supports OpenMP. These can be disabled if desired. For a summary of -RAJA configuration options, please see :ref:`configopt-label`. - -After CMake successfully completes, you compile RAJA by executing the ``make`` -command in the build directory; i.e.,:: - - $ make +Running ``cmake`` generates the RAJA build configuration. Running ``make`` +compiles the code. Running ``make install`` copies RAJA header files +to an ``include`` directory and installs the RAJA library in a ``lib`` +directory, both in the directory location specified with the +``-DCMAKE_INSTALL_PREFIX`` CMake option. -If you have access to a multi-core system, you can compile in parallel by -running ``make -j`` (to build with all available cores) or ``make -j N`` to -build using N cores. +Other build configurations are accomplished by passing other options to CMake. +For example, if you want to use a C++ compiler other than the default on +your system, you would pass a path to the compiler using the standard +CMake option ``-DCMAKE_CXX_COMPILER=path/to/compiler``. +When you run CMake, it will generate output about the build configuration +(compiler and version, options, etc.), which is helpful to make sure CMake +is doing what you want. For a summary of RAJA configuration +options, please see :ref:`configopt-label`. -.. note:: * RAJA is configured to build its unit tests by default. If you do not - disable them with the appropriate CMake option (please see - :ref:`configopt-label`), you can run them after the build completes - to check if everything is built properly. +.. note:: RAJA is configured to build its tests, examples, and tutorial + exercises by default. If you do not disable them with the + appropriate CMake option (see :ref:`configopt-label`), + you can run them after the build completes to check if everything + is built properly. 
- The easiest way to run the full set of RAJA tests is to type:: + The easiest way to run the full set of RAJA tests is to type:: - $ make test + $ make test - in the build directory after the build completes. + in the build directory after the build completes. - You can also run individual tests by invoking test - executables directly. They will be located in the ``test`` - subdirectory in the build space directory. RAJA tests use the - `Google Test framework `_, - so you can also run tests via Google Test commands. + You can also run individual tests by invoking the corresponding + test executables directly. They will be located in the ``test`` + subdirectory in your build space. RAJA tests use the + `Google Test framework `_, + so you can also run and filter tests via Google Test commands. - * RAJA also contains example and tutorial exercise - programs you can run if you wish. Similar to the RAJA tests, - the examples and exercises are built by default and can be - disabled with CMake options (see :ref:`configopt-label`). The - source files for these are located in the ``RAJA/examples`` and - ``RAJA/exercises`` directories, respectively. When built, the - executables for the examples and exercises will be located in - the ``bin`` subdirectory in the build space directory. Feel free to - experiment by editing the source files and recompiling. + The source files for RAJA examples and exercises are located in + the ``RAJA/examples`` and ``RAJA/exercises`` directories, + respectively. When built, the executables for the examples and + exercises will be located in the ``bin`` subdirectory in your build + space. -.. _build-external-tpl-label: +.. _getting_started_build_gpu-label: -.. note:: You may use externally-supplied versions of the camp, CUB, and rocPRIM - libraries with RAJA if you wish. To do so, pass the following - options to CMake: - * External camp: -DEXTERNAL_CAMP_SOURCE_DIR= - * External CUB: -DRAJA_ENABLE_EXTERNAL_CUB=On -DCUB_DIR= - * External rocPRIM: -DRAJA_ENABLE_EXTERNAL_ROCPRIM=On - -DROCPRIM_DIR= +------------------------------------------- +Additional RAJA Back-end Build Information +------------------------------------------- ------------------ -GPU Builds, etc. ------------------ +Configuring a RAJA build to support a GPU back-end, such as CUDA, HIP, or +OpenMP target offload, typically requires additional CMake options, which +we describe next. CUDA ^^^^^^ To run RAJA code on NVIDIA GPUs, one typically must have a CUDA compiler -installed on your system, in addition to a host code compiler. You may need +installed on the system, in addition to a host code compiler. You may need to specify both when you run CMake. The host compiler is specified using the -``CMAKE_CXX_COMPILER`` CMake variable. The CUDA compiler is specified with -the ``CMAKE_CUDA_COMPILER`` variable. +``CMAKE_CXX_COMPILER`` CMake variable as described earlier. The CUDA software +stack and compiler are specified using the following CMake options: + + * -DCUDA_TOOLKIT_ROOT_DIR=path/to/cuda/toolkit + * -DCMAKE_CUDA_COMPILER=path/to/nvcc When using the NVIDIA nvcc compiler for RAJA CUDA functionality, the variables: @@ -159,21 +220,19 @@ When using the NVIDIA nvcc compiler for RAJA CUDA functionality, the variables: * CMAKE_CUDA_FLAGS_DEBUG * CMAKE_CUDA_FLAGS_RELWITHDEBINFO -which corresponding to the standard CMake build types are used to pass flags -to nvcc. +correspond to the standard CMake build types and are used to pass additional +compiler options to nvcc. -.. 
note:: When nvcc must pass options to the host compiler, the arguments - can be included using these CMake variables. Host compiler - options must be prepended with the `-Xcompiler` directive. +.. note:: Often, nvcc must pass options to the host compiler. These arguments + can be included using the ``CMAKE_CUDA_FLAGS...`` CMake variables + listed above. Host compiler options must be prepended with the + ``-Xcompiler`` directive to properly propagate. -To set the CUDA compute architecture for the nvcc compiler, which should be -chosen based on the NVIDIA GPU hardware you are using, you can use the -``CUDA_ARCH`` CMake variable. For example, the CMake option:: - - -DCUDA_ARCH=sm_60 - -will tell the compiler to use the `sm_60` SASS architecture in its second -stage of compilation. It will pick the PTX architecture to use in the first +To set the CUDA compute architecture, which should be chosen based on the +NVIDIA GPU hardware you are using, you can use the ``CUDA_ARCH`` CMake +variable. For example, the CMake option ``-DCUDA_ARCH=sm_70`` will tell the +compiler to use the `sm_70` SASS architecture in its second stage of +compilation. The compiler will pick the PTX architecture to use in the first stage of compilation that is suitable for the SASS architecture you specify. Alternatively, you may specify the PTX and SASS architectures, using @@ -182,29 +241,37 @@ appropriate nvcc options in the ``CMAKE_CUDA_FLAGS_*`` variables. .. note:: **RAJA requires a minimum CUDA architecture level of `sm_35` to use all supported CUDA features.** Mostly, the architecture level affects which RAJA CUDA atomic operations are available and how they are - implemented inside RAJA. This is described in :ref:`atomics-label`. + implemented inside RAJA. This is described in + :ref:`feat-atomics-label`. * If you do not specify a value for ``CUDA_ARCH``, it will be set to `sm_35` by default and CMake will emit a status message - indicatting this choice was made. + indicating this choice was made. * If you give a ``CUDA_ARCH`` value less than `sm_35` (e.g., `sm_30`), - CMake will report this and stop processing. - -Also, RAJA relies on the CUB CUDA utilities library for some CUDA functionality. -The CUB included in the CUDA toolkit is used by default if available. RAJA -includes a CUB submodule that is used if it is not available. To use -an external CUB install provide the following option to CMake: -``-DRAJA_ENABLE_EXTERNAL_CUB=On -DCUB_DIR=``. - -.. note:: **It is important to note that the CUDA toolkit version of cub is + CMake will report this as an error and stop processing. + +Also, RAJA relies on the CUB CUDA utilities library, mentioned earlier, for +some CUDA back-end functionality. The CUB version included in the CUDA toolkit +installation is used by default when available. This is the case for CUDA +version 11 and later. RAJA includes a CUB submodule that is used by default +with older versions of CUDA. To use an external CUB installation, provide the +following options to CMake:: + + cmake \ + ... \ + -DRAJA_ENABLE_EXTERNAL_CUB=On \ + -DCUB_DIR= \ + ... + +.. note:: The CUDA toolkit version of CUB is required for compatibility with the CUDA toolkit version of thrust - starting with CUDA toolkit version v11.0.0. So, if you build - RAJA with CUDA version 11 or higher you must use the CUDA - toolkit version of CUB to use Thrust and be compatible with libraries - that use Thrust. + starting with CUDA version 11.0.0.
So, if you build + RAJA with CUDA version 11 or higher, you should use the version of + CUB contained in the CUDA toolkit version you are using to use + Thrust and to be compatible with libraries that use Thrust. - *It is important to note that the version of Googletest that +.. note:: The version of Googletest that is used in RAJA version v0.11.0 or newer requires CUDA version 9.2.x or newer when compiling with nvcc. Thus, if you build RAJA with CUDA enabled and want to also enable RAJA tests, you @@ -213,74 +280,88 @@ an external CUB install provide the following option to CMake: HIP ^^^^ -To run RAJA code on AMD GPUs, one typically uses the HIP compiler and tool -chain (which can also be used to compile code for NVIDIA GPUs). +To run RAJA code on AMD GPUs, one typically uses a ROCm compiler and tool +chain (which can also be used to compile code for NVIDIA GPUs, which is not +covered in detail in RAJA user documentation). .. note:: RAJA requires version 3.5 or newer of the ROCm software stack to use the RAJA HIP back-end. -Also, RAJA relies on the rocPRIM HIP utilities library for some HIP +Unlike CUDA, you do not specify a host compiler and a device compiler when +using the AMD ROCm software stack. Typical CMake options to use when building +with a ROCm stack are: + + * -DROCM_ROOT_DIR=path/to/rocm + * -DHIP_ROOT_DIR=path/to/hip + * -DHIP_PATH=path/to/hip/binaries + * -DCMAKE_CXX_COMPILER=path/to/rocm/compiler + +Additionally, you use the CMake variable ``CMAKE_HIP_ARCHITECTURES`` to set +the target compute architecture. For example:: + + -DCMAKE_HIP_ARCHITECTURES=gfx908 + +RAJA relies on the rocPRIM HIP utilities library for some HIP functionality. The rocPRIM included in the ROCm install is used by default if available. RAJA includes a rocPRIM submodule that is used if it is not -available. To use an external rocPRIM install provide the following option to CMake: -``-DRAJA_ENABLE_EXTERNAL_ROCPRIM=On -DROCPRIM_DIR=``. +available. To use an external rocPRIM install provide the following options +to CMake:: + + cmake \ + ... \ + -DRAJA_ENABLE_EXTERNAL_ROCPRIM=On \ + -DROCPRIM_DIR= \ + ... -.. note:: When using HIP and targeting NVIDIA GPUs RAJA uses CUB instead of - rocPRIM. In this case you must use an external CUB install using the - CMake variables described in the CUDA section. +.. note:: When using HIP and targeting NVIDIA GPUs, RAJA uses CUB instead of + rocPRIM. In this case, you must configure with an external CUB + install using the CMake variables described in the CUDA section above. OpenMP ^^^^^^^ -To use OpenMP target offlad GPU execution, additional options may need to be +To use OpenMP target offload GPU execution, additional options may need to be passed to the compiler. The variable ``OpenMP_CXX_FLAGS`` is used for this. Option syntax follows the CMake *list* pattern. For example, to specify OpenMP target options for NVIDIA GPUs using a clang-based compiler, one may do something like:: cmake \ - .... - -DOpenMP_CXX_FLAGS="-fopenmp;-fopenmp-targets=nvptx64-nvidia-cuda" + ... \ + -DOpenMP_CXX_FLAGS="-fopenmp;-fopenmp-targets=nvptx64-nvidia-cuda" \ + ... ---------------------------------------- RAJA Example Build Configuration Files ---------------------------------------- -The ``RAJA/scripts`` directory contains subdirectories with a variety of -build scripts we use to build and test RAJA on various platforms with -various compilers. These scripts pass files (*CMake cache files*) located in -the ``RAJA/host-configs`` directory to CMake using the '-C' option. 
+The RAJA repository has subdirectories ``RAJA/scripts/*-builds`` that contain +a variety of build scripts we use to build and test RAJA on various platforms +with various compilers. These scripts pass files (*CMake cache files*) +located in the ``RAJA/host-configs`` directory to CMake using the '-C' option. These files serve as useful examples of how to configure RAJA prior to compilation. ----------------- -Installing RAJA ----------------- - -To install RAJA as a library, run the following command in your build -directory:: - - $ make install - -This will copy RAJA header files to the ``include`` directory and the RAJA -library will be installed in the ``lib`` directory you specified using the -``-DCMAKE_INSTALL_PREFIX`` CMake option. - - ====================== Learning to Use RAJA ====================== -If you want to view and run a very simple RAJA example code, a good place to -start is located in the file: ``RAJA/examples/daxpy.cpp``. After building -RAJA with the options you select, the executable for this code will reside -in the file: ``/examples/bin/daxpy``. Simply type the name -of the executable in your build directory to run it; i.e.,:: - - $ ./examples/bin/daxpy - -The ``RAJA/examples`` directory also contains many other RAJA example codes -you can run and experiment with. +The RAJA repository contains a variety of example source codes that you are +encouraged to view and run to learn about how to use RAJA: + + * The ``RAJA/examples`` directory contains various examples that illustrate + algorithm patterns. + * The ``RAJA/exercises`` directory contains exercises for users to work + through along with complete solutions. These are described in detail + in the :ref:`tutorial-label` section. + * Other examples can also be found in the ``RAJA/test`` directories. + +We mentioned earlier that RAJA examples, exercises, and tests are built by +default when RAJA is compiled. So, unless you explicitly disable them when +you run CMake to configure a RAJA build, you can run them after compiling RAJA. +Executables for the examples and exercises will be located in the +``/bin`` directory in your build space. Test executables will +be located in the ``/test`` directory. For an overview of all the main RAJA features, see :ref:`features-label`. A full tutorial with a variety of examples showing how to use RAJA features diff --git a/docs/sphinx/user_guide/index.rst b/docs/sphinx/user_guide/index.rst index 692bf6dd9f..bb4eb6d2db 100644 --- a/docs/sphinx/user_guide/index.rst +++ b/docs/sphinx/user_guide/index.rst @@ -26,11 +26,11 @@ Additional information about things to think about when considering whether to use RAJA in an application can be found in :ref:`app-considerations-label`. .. toctree:: - :maxdepth: 2 + :maxdepth: 3 getting_started + using_raja + config_options features app_considerations tutorial - using_raja - config_options diff --git a/docs/sphinx/user_guide/tutorial.rst b/docs/sphinx/user_guide/tutorial.rst index 3e2bb25f5d..a468c86984 100644 --- a/docs/sphinx/user_guide/tutorial.rst +++ b/docs/sphinx/user_guide/tutorial.rst @@ -8,38 +8,54 @@ .. _tutorial-label: -********************** +**************************** +RAJA Tutorial and Examples +**************************** + +The following sections contain tutorial material and examples that describe +how to use RAJA features. 
+ +=============== RAJA Tutorial -********************** - -In addition to the tutorial portion of this RAJA User Guide, we maintain -a repository of tutorial presentation materials here `RAJA Tutorials Repo `_. - -This RAJA tutorial introduces RAJA concepts and capabilities via a -sequence of examples of increasing complexity. Complete working codes for -the examples are located in the ``RAJA``examples`` directory. The RAJA -tutorial evolves as we add new features to RAJA, so refer to it periodically -if you are interested in learning about them. - -To understand the discussion and code examples, a working knowledge of C++ -templates and lambda expressions is required. So, before we begin, we provide -a bit of background discussion of basic aspects of how RAJA use employs C++ -templates and lambda expressions, which is essential to using RAJA successfully. - -To understand the GPU examples (e.g., CUDA), it is also important to know the -difference between CPU (host) and GPU (device) memory allocations and how -transfers between those memory spaces work. For a detailed discussion, see -`Device Memory `_. - -RAJA does not provide a memory model. This is by design as developers of many -of applications that use RAJA prefer to manage memory themselves. Thus, users -are responsible for ensuring that data is properly allocated and initialized -on a GPU device when running GPU code. This can be done using explicit host -and device allocation and copying between host and device memory spaces or via -unified memory (UM), if available. RAJA developers also support a library -called `CHAI `_ which complements RAJA by -providing a alternative to manual host-device memory copy calls or UM. -For more information, see :ref:`plugins-label`. +=============== + +This section contains a self-paced tutorial that shows how to use many RAJA +features by way of a sequence of examples and exercises. Each exercise is +located in files in the ``RAJA/exercises`` directory, one *exercise* file with +code sections removed and comments containing instructions to fill in the +missing code parts and one *solution* file containing complete working code to +compare with and for guidance if you get stuck working on the exercise file. +You are encouraged to build and run the exercises and modify them to try out +different variations. + +We also maintain a repository of tutorial slide presentations +`RAJA Tutorials Repo `_ which we use +when we give in-person or virtual online tutorials in various venues. The +presentations complement the material found here. The tutorial material +evolves as we add new features to RAJA, so refer to it periodically if you +are interested in learning about new things in RAJA. + +To understand the GPU examples (e.g., CUDA), it is also important to know the +difference between CPU (host) and GPU (device) memory allocations and how +transfers between those memory spaces work. For a detailed discussion, see +`Device Memory `_. + +It is important to note that RAJA does not provide a memory model. This is by +design as application developers who use RAJA prefer to manage memory +in different ways. Thus, users are responsible for ensuring that data is +properly allocated and initialized on a GPU device when running GPU code. +This can be done using explicit host and device allocation and copying between +host and device memory spaces or via unified memory (UM), if available. 
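As a concrete illustration of the point above, the following minimal sketch uses CUDA unified memory around a ``RAJA::forall`` kernel with a ``RAJA::cuda_exec`` policy. The array name, length, and block size are illustrative assumptions, and a real application would typically wrap such calls in its own memory manager or use CHAI/Umpire as noted below::

   int N = ...;
   double* a = nullptr;

   // Allocate unified memory, accessible from both host and device.
   cudaMallocManaged(&a, N * sizeof(double));

   // Initialize on the host.
   for (int i = 0; i < N; ++i) { a[i] = 1.0; }

   // Run a RAJA kernel on the device using the same pointer.
   RAJA::forall< RAJA::cuda_exec<256> >( RAJA::TypedRangeSegment<int>(0, N),
     [=] __device__ (int i) {
       a[i] *= 2.0;
   });

   // Wait for the device before the host reads the results, then free.
   cudaDeviceSynchronize();
   cudaFree(a);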
+The RAJA Portability Suite contains other libraries, namely +`CHAI `_ and +`Umpire `_, that complement RAJA by +providing alternatives to manual programming model specific memory operations. + +.. note:: Most of the CUDA GPU exercises use unified memory (UM) via a simple + memory manager capability provided in a file in the ``RAJA/exercises`` + directory. HIP GPU exercises use explicit host and device memory + allocations and explicit memory copy operations to move data between + the two. .. _tutorial-lambda-label: @@ -47,20 +63,26 @@ For more information, see :ref:`plugins-label`. =============================== A Little C++ Background =============================== -RAJA makes heavy use of C++ templates and using RAJA most easily and -effectively is done by representing the bodies of loop kernels as C++ lambda -expressions. Alternatively, C++ functors can be used, but they make -application source code more complex, potentially placing a significant -negative burden on source code readability and maintainability. +To understand the discussion and code examples, a working knowledge of C++ +templates and lambda expressions is required. So, before we begin, we provide +a bit of background discussion of basic aspects of how RAJA employs C++ +templates and lambda expressions, which is essential to using RAJA successfully. + +RAJA is almost entirely a header-only library that makes heavy use of +C++ templates. Using RAJA most easily and effectively is done by representing +the bodies of loop kernels as C++ lambda expressions. Alternatively, C++ +functors can be used, but they make application source code more complex, +potentially placing a significant negative burden on source code readability +and maintainability. ----------------------------------- C++ Templates ----------------------------------- -C++ templates enable one to write generic code and have the compiler generate -a specific implementation for each set of template parameter types you use. -For example, the ``RAJA::forall`` method to execute loop kernels is a -template method defined as:: +C++ templates enable one to write type-generic code and have the compiler +generate an implementation for each set of template parameter types specified. +For example, the ``RAJA::forall`` method to execute loop kernels is +essentially a template method defined as:: template ( RAJA::RangeSegment(0, N), [=](int i) { + RAJA::forall< RAJA::loop_exec >( RAJA::TypedRangeSegment<int>(0, N), [=](int i) { a[i] = b[i] + c[i]; }); -The "IdxType" and "LoopBody" types are deduced by the compiler based on what -arguments are passed to the ``RAJA::forall`` method. Here, the loop body type -is defined by the lambda expression:: +is a sequential CPU RAJA kernel that performs an element-by-element vector sum. +The C-style analogue of this kernel is:: + + for (int i = 0; i < N; ++i) { + a[i] = b[i] + c[i]; + } + +The execution policy type ``RAJA::loop_exec`` template argument +is used to choose a specific implementation of the +``RAJA::forall`` method. The ``IdxType`` and ``LoopBody`` types are deduced by +the compiler based on the arguments passed to the ``RAJA::forall`` method; +i.e., the ``IdxType`` is the stride-1 index range:: + + RAJA::TypedRangeSegment<int>(0, N) + +and the ``LoopBody`` type is the lambda expression:: [=](int i) { a[i] = b[i] + c[i]; } @@ -88,11 +124,11 @@ Elements of C++ Lambda Expressions Here, we provide a brief description of the basic elements of C++ lambda expressions.
A more technical and detailed discussion is available here: -`Lambda Functions in C++11 - the Definitive Guide `_ +`Lambda Functions in C++11 - the Definitive Guide `_ -Lambda expressions were introduced in C++ 11 to provide a lexical-scoped -name binding; specifically, a *closure* that stores a function with a data -environment. That is, a lambda expression can *capture* variables from an +Lambda expressions were introduced in C++ 11 to provide a lexical-scoped +name binding; specifically, a *closure* that stores a function with a data +environment. That is, a lambda expression can *capture* variables from an enclosing scope for use within the local scope of the function expression. A C++ lambda expression has the following form:: @@ -100,36 +136,39 @@ A C++ lambda expression has the following form:: [capture list] (parameter list) {function body} The ``capture list`` specifies how variables outside the lambda scope are pulled -into the lambda data environment. The ``parameter list`` defines arguments +into the lambda data environment. The ``parameter list`` defines arguments passed to the lambda function body -- for the most part, lambda arguments -are just like arguments in a regular C++ method. Variables in the capture list -are initialized when the lambda expression is created, while those in the -parameter list are set when the lambda expression is called. The body of a +are just like arguments in a regular C++ method. Variables in the capture list +are initialized when the lambda expression is created, while those in the +parameter list are set when the lambda expression is called. The body of a lambda expression is similar to the body of an ordinary C++ method. -RAJA templates, such as ``RAJA::forall`` and ``RAJA::kernel`` pass arguments -to lambdas based on usage and context; e.g., loop iteration indices. +RAJA kernel execution templates, such as ``RAJA::forall`` and ``RAJA::kernel`` +that we will describe in detail later, pass arguments +to lambdas based on usage and context such as loop iteration indices. -A C++ lambda expression can capture variables in the capture list by value -or by reference. This is similar to how arguments to C++ methods are passed; -i.e., *pass-by-reference* or *pass-by-value*. However, there are some subtle +A C++ lambda expression can capture variables in the capture list *by value* +or *by reference*. This is similar to how arguments to C++ methods are passed; +i.e., *pass-by-reference* or *pass-by-value*. However, there are some subtle differences between lambda variable capture rules and those for ordinary -methods. Variables mentioned in the capture list with no extra symbols are -captured by value. Capture-by-reference is accomplished by using the -reference symbol '&' before the variable name; for example:: +methods. **Variables included in the capture list with no extra symbols are +captured by value.** Variables captured by value are effectively *const* +inside the lambda expression body and cannot be written to. +Capture-by-reference is accomplished by using the reference symbol '&' before +the variable name similar to C++ method arguments. For example:: int x; int y = 100; [&x, &y](){ x = y; }; -generates a lambda expression that captures both 'x' and 'y' by reference -and assigns the value of 'y' to 'x' when called. The same outcome would be +generates a lambda expression that captures both 'x' and 'y' by reference +and assigns the value of 'y' to 'x' when called. 
The same outcome would be achieved by writing:: [&](){ x = y; }; // capture all lambda arguments by reference... or:: - [=, &x](){ x = y; }; // capture 'x' by reference and 'y' by value... + [=, &x](){ x = y; }; // capture 'x' by reference and 'y' by value... Note that the following two attempts will generate compilation errors:: @@ -138,80 +177,90 @@ Note that the following two attempts will generate compilation errors:: [x, &y](){ x = y; }; // error: cannot assign to 'x' since it is captured // by value. -**Specifically, a variable hat is captured by value is read-only.** +.. note:: A variable that is captured by value in a lambda expression is + **read-only.** ---------------------------------------- -A Few Notes About Lambda Usage With RAJA +A Few Notes About Lambda Usage With RAJA ---------------------------------------- -There are several issues to note about C++ lambda expressions; in particular, -with respect to RAJA usage. We describe them here. +There are several issues to note about using C++ lambda expressions to +represent kernel bodies with RAJA. We describe them here. - * **Prefer by-value lambda capture.** + * **Prefer by-value lambda capture.** - We recommended `capture by-value` for all lambda loop bodies passed to - RAJA execution methods. To execute a RAJA loop on a non-CPU device, such - as a GPU, all variables accessed in the loop body must be passed into the - GPU device data environment. Using capture by-value for all RAJA-based - lambda usage will allow your code to be portable for either CPU or GPU - execution. In addition, the read-only nature of variables captured - by-value can help avoid incorrect CPU code since the compiler will report + We recommend `capture by-value` for all lambda kernel bodies passed to + RAJA execution methods. To execute a RAJA loop on a non-CPU device, such + as a GPU, all variables accessed in the loop body must be passed into the + GPU device data environment. Using capture by-value for all RAJA-based + lambda usage will allow your code to be portable for either CPU or GPU + execution. In addition, the read-only nature of variables captured + by-value can help avoid incorrect CPU code since the compiler will report incorrect usage. +|br| - * **Must use 'device' annotation for CUDA device execution.** + * **The '__device__' annotation is required for device execution using CUDA or HIP.** - Any lambda passed to a CUDA execution context (or function called from a - CUDA device kernel, for that matter) must be decorated with + Any lambda passed to a CUDA or HIP execution context (or function called from a + device kernel, for that matter) must be decorated with the ``__device__`` annotation; for example:: - + RAJA::forall>( range, [=] __device__ (int i) { ... } ); Without this, the code will not compile and generate compiler errors - indicating that a 'host' lambda cannot be called from 'device' code. + indicating that a 'host' lambda cannot be called in 'device' code. RAJA provides the macro ``RAJA_DEVICE`` that can be used to help switch - between host-only or device-only CUDA compilation. - + between host-only or device-only compilation. + +|br| * **Use 'host-device' annotation on a lambda carefully.** RAJA provides the macro ``RAJA_HOST_DEVICE`` to support the dual - CUDA annotation ``__ host__ __device__``. This makes a lambda or function - callable from CPU or CUDA device code. 
However, when CPU performance is - important, **the host-device annotation should be applied carefully on a - lambda that is used in a host (i.e., CPU) execution context**. - Unfortunately, a loop kernel containing a lambda annotated in this way - may run noticeably slower on a CPU than the same lambda with no annotation - depending on the version of the nvcc compiler you are using. - + annotation ``__ host__ __device__``, which makes a lambda or function + callable from CPU or GPU device code. However, when CPU performance is + important, **the host-device annotation should be applied carefully on a + lambda that is used in a host (i.e., CPU) execution context**. Although + compiler improvements in recent years have significantly + improved support for host-device lambda expressions, a loop kernel + containing a lambda annotated in this way may run noticeably slower on + a CPU than the same lambda with no annotation depending on the version of + the compiler (e.g., nvcc) you are using. To be sure that your code does not + suffer in performance, we recommend comparing CPU execution timings of + important kernels with and without the ``__host__ __device__`` annotation. + +|br| + + * **Cannot use 'break' and 'continue' statements in a lambda.** - * **Cannot use 'break' and 'continue' statements in a lambda.** + In this regard, a lambda expression is similar to a function. So, if you + have loops in your code with these statements, they should be rewritten. - In this regard, a lambda expression is similar to a function. So, if you - have loops in your code with these statements, they should be rewritten. - +|br| - * **Global variables are not captured in a lambda.** + * **Global variables are not captured in a lambda.** - This fact is due to the C++ standard. If you need (read-only) access to a - global variable inside a lambda expression, one solution is to make a local + This fact is due to the C++ standard. If you need access to a + global variable inside a lambda expression, one solution is to make a local reference to it; for example:: double& ref_to_global_val = global_val; - RAJA::forall>( range, [=] __device__ (int i) { + RAJA::forall>( range, [=] __device__ (int i) { // use ref_to_global_val } ); - - * **Local stack arrays may not be captured by CUDA device lambdas.** +|br| + + * **Local stack arrays may not be captured by CUDA device lambdas.** Although this is inconsistent with the C++ standard (local stack arrays - are properly captured in lambdas for code that will execute on a CPU), - attempting to access elements in a local stack array in a CUDA device - lambda may generate a compilation error depending on the version of the - nvcc compiler you are using. One solution to this problem is to wrap the + are properly captured in lambdas for code that will execute on a CPU), + attempting to access elements in a local stack array in a CUDA device + lambda may generate a compilation error depending on the version of the + device compiler you are using. One solution to this problem is to wrap the array in a struct; for example:: struct array_wrapper { @@ -224,33 +273,32 @@ with respect to RAJA usage. We describe them here. // access entries of bounds.array } ); - This issue appears to be resolved in in the 10.1 release of CUDA. If you - are using an earlier version of nvcc, an implementation - similar to the one above will be required. - - -================ -RAJA Examples -================ + This issue was resolved in the 10.1 release of CUDA. 
If you are using an + earlier version, an implementation similar to the one above will be required. + +.. |br| raw:: html + +
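+To tie the notes above together, here is a minimal sketch (not one of the
+exercise codes; the execution policy, block size, and variable names are
+illustrative) of a by-value capture lambda decorated for CUDA device
+execution with the ``RAJA_DEVICE`` macro::
+
+   const int N = 256;
+   double* x = nullptr;
+   cudaMallocManaged(&x, N * sizeof(double));   // device-accessible memory
+   double c = 2.0;                              // captured by value below
+
+   RAJA::forall< RAJA::cuda_exec<256> >( RAJA::TypedRangeSegment<int>(0, N),
+     [=] RAJA_DEVICE (int i) {
+       x[i] = c * i;    // fine: 'x' and 'c' are captured by value
+       // c = 3.0;      // compile error: 'c' is read-only inside the lambda
+     } );
+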
+ +=========================== +RAJA Examples and Exercises +=========================== The remainder of this tutorial illustrates how to use RAJA features with -working code examples that are located in the ``RAJA/examples`` -directory. Additional information about the RAJA features -used can be found in :ref:`features-label`. - -The examples demonstrate CPU execution (sequential, SIMD, OpenMP -multithreading) and CUDA GPU execution. Examples that show how to use -RAJA with other parallel programming model back-ends that are in -development will appear in future RAJA releases. For adventurous users who -wish to try experimental features, usage is similar to what is shown in the +working code examples and interactive exercises. Files containing the +exercise source code are located in the ``RAJA/exercises`` directory. +Additional information about the RAJA features used can be found +in :ref:`features-label`. + +The examples demonstrate CPU execution (sequential and OpenMP +multithreading) and GPU execution (CUDA and/or HIP). Examples that show how +to use RAJA with other parallel programming model back-ends will appear in +future RAJA releases. For adventurous users who wish to try experimental +RAJA back-end support, usage is similar to what is shown in the examples here. All RAJA programming model support features are enabled via CMake options, -which are described in :ref:`configopt-label`. - -For the purposes of discussion of each example, we assume that any and all -data used has been properly allocated and initialized. This is done in the -example code files, but is not discussed further here. +which are described in :ref:`configopt-label`. .. _tutorialbasic-label: @@ -260,52 +308,147 @@ Simple Loops and Basic RAJA Features The examples in this section illustrate how to use ``RAJA::forall`` methods to execute simple loop kernels; i.e., non-nested loops. It also describes -iteration spaces, reductions, atomic operations, scans, and sorts. +iteration spaces, reductions, atomic operations, scans, sorts, and RAJA +data views. .. toctree:: :maxdepth: 1 tutorial/add_vectors.rst - tutorial/dot_product.rst tutorial/indexset_segments.rst tutorial/vertexsum_coloring.rst + tutorial/dot_product.rst tutorial/reductions.rst tutorial/atomic_histogram.rst tutorial/scan.rst tutorial/sort.rst + tutorial/view_layout.rst + tutorial/permuted-layout-batch-matrix-multiply.rst .. _tutorialcomplex-label: ================================================================= -Complex Loops: Transformations and Advanced RAJA Features +Complex Loops and Advanced RAJA Features ================================================================= -The examples in this section illustrate how to use ``RAJA::kernel`` methods -to execute complex loop kernels, such as nested loops. It also describes -how to construct kernel execution policies, use different view types and -tiling mechanisms to transform loop patterns. +RAJA provides two APIs for writing complex kernels involving nested +loops: ``RAJA::kernel`` that has been available for several years and +``RAJA::expt::launch``, which is more recent and which will be moved out of +the ``expt`` namespace soon. We briefly introduce both interfaces here. +The tutorial sections that follow provide much more detailed descriptions. + +``RAJA::kernel`` is analogous to ``RAJA::forall`` in that it involves +kernel execution templates, execution policies, iteration spaces, and lambda +expression kernel bodies. 
The main differences between ``RAJA::kernel`` and +``RAJA::forall`` are: + + * ``RAJA::kernel`` requires a tuple of iteration spaces, one for each level + in a loop nest, whereas ``RAJA::forall`` takes exactly one iteration + space. + * ``RAJA::kernel`` can accept multiple lambda expressions to express + different parts of a kernel body, whereas ``RAJA::forall`` accepts + exactly one lambda expression for a kernel body. + * ``RAJA::kernel`` execution policies are more complicated than those + for ``RAJA::forall``. ``RAJA::forall`` policies essentially represent + the kernel execution back-end only. ``RAJA::kernel`` execution policies + enable complex compile time algorithm transformations to be done without + changing the kernel code. + +The following exercises illustrate the common usage of ``RAJA::kernel`` +and ````RAJA::expt::launch``. Please see :ref:`loop_elements-kernelpol-label` +for more information about other execution policy constructs ``RAJA::kernel`` +provides. ``RAJA::expt::launch`` takes a ``RAJA::expt::Grid`` type argument for +representing a teams-thread launch configuration, and a lambda expression +which takes a ``RAJA::expt::LaunchContext`` argument. ``RAJA::expt::launch`` +allows an optional run time choice of execution environment, either CPU or GPU. +Code written inside the lambda expression body will execute in the chosen +execution environment. Within that environment, a user executes +kernel operations using ``RAJA::expt::loop`` method calls, which +take lambda expressions to express loop body operations. + +.. note:: A key difference between the ``RAJA::kernel`` and + ``RAJA::expt::launch`` approaches is that almost all of the + kernel execution pattern is expressed in the execution policy + when using ``RAJA::kernel``, whereas with ``RAJA::expt::launch`` the + kernel execution pattern is expressed mostly in the lambda + expression kernel body. + +One may argue that ``RAJA::kernel`` is more portable and flexible in that +the execution policy enables compile time code transformations without +changing kernel body code. On the other hand, ``RAJA::expt::launch`` is +less opaque and more intuitive, but may require kernel body code changes for +algorithm changes. Which interface to use depends on personal preference +and other concerns, such as portability requirements, the need for run time +execution selection, etc. Kernel structure is more explicit in application +source code with ``RAJA::expt::launch``, and more concise and arguably more +opaque with ``RAJA::kernel``. There is a large overlap of algorithms that can +be expressed with either interface. However, there are things that one can do +with one or the other but not both. + +In the following sections, we introduce the basic mechanics and features +of both APIs with examples and exercises. We also present a sequence of +execution policy examples and matrix transpose examples using both +``RAJA::kernel`` and ``RAJA::expt::launch`` to compare and contrast the +two interfaces. + +=========================================================================== +Nested Loops with ``RAJA::kernel`` +=========================================================================== + +The examples in this section illustrate various features of the +``RAJA::kernel`` API used to execute nested loop kernels. It describes how to +construct kernel execution policies and use different view types and tiling +mechanisms to transform loop patterns. More information can be found in +:ref:`loop_elements-kernel-label`. .. 
toctree:: :maxdepth: 1 - tutorial/matrix_multiply.rst - tutorial/nested_loop_reorder.rst - tutorial/permuted-layout.rst - tutorial/offset-layout.rst - tutorial/tiled_matrix_transpose.rst - tutorial/matrix_transpose_local_array.rst - tutorial/halo-exchange.rst + tutorial/kernel_nested_loop_reorder.rst + tutorial/kernel_exec_pols.rst + tutorial/offset-layout-5pt-stencil.rst ================================================================= -Team based Loops: Nested loops with a thread/team model +Nested Loops with ``RAJA::expt::launch`` ================================================================= The examples in this section illustrate how to use ``RAJA::expt::launch`` -to create an run-time selectable execution space for expressing algorithms -in terms of threads and teams. +to create an run time selectable execution space for expressing algorithms +as nested loops. + +.. toctree:: + :maxdepth: 1 + + tutorial/launch_basic.rst + tutorial/launch_exec_pols.rst + tutorial/launch_naming_kernels.rst + +.. _tutorialmatrixtranspose-label: + +=============================================================================== +Comparing ``RAJA::kernel`` and ``RAJA::expt::launch``: Matrix-Transpose +=============================================================================== + +In this section, we compare ``RAJA::kernel`` and ``RAJA::expt::launch`` +implementations of a matrix transpose algorithm. We illustrate +implementation differences of the two interfaces as we build upon each +example with more complex features. .. toctree:: :maxdepth: 1 - tutorial/teams_basic.rst - tutorial/naming_kernels.rst + tutorial/matrix_transpose.rst + tutorial/matrix_transpose_tiled.rst + tutorial/matrix_transpose_local_array.rst + +========================================== +Other RAJA Features and Usage Examples +========================================== + +.. toctree:: + :maxdepth: 1 + + tutorial/halo-exchange.rst + tutorial/matrix_multiply.rst + + diff --git a/docs/sphinx/user_guide/tutorial/add_vectors.rst b/docs/sphinx/user_guide/tutorial/add_vectors.rst index 73e7ed13d9..9b15a1a34d 100644 --- a/docs/sphinx/user_guide/tutorial/add_vectors.rst +++ b/docs/sphinx/user_guide/tutorial/add_vectors.rst @@ -6,23 +6,29 @@ .. ## SPDX-License-Identifier: (BSD-3-Clause) .. ## -.. _addvectors-label: +.. _tut-addvectors-label: -------------------------------------- -Vector Addition (Basic Loop Execution) +Basic Loop Execution: Vector Addition -------------------------------------- -Key RAJA features shown in this example: +This section contains an exercise file ``RAJA/exercises/vector-addition.cpp`` +for you to work through if you wish to get some practice with RAJA. The +file ``RAJA/exercises/vector-addition_solution.cpp`` contains complete +working code for the examples discussed in this section. You can use the +solution file to check your work and for guidance if you get stuck. To build +the exercises execute ``make vector-addition`` and ``make vector-addition_solution`` +from the build directory. - * ``RAJA::forall`` loop execution template - * ``RAJA::RangeSegment`` iteration space construct - * RAJA execution policies +Key RAJA features shown in this example are: + * ``RAJA::forall`` loop execution template and execution policies + * ``RAJA::TypedRangeSegment`` iteration space construct In the example, we add two vectors 'a' and 'b' of length N and store the result in vector 'c'. A simple C-style loop that does this is: -.. literalinclude:: ../../../../examples/tut_add-vectors.cpp +.. 
literalinclude:: ../../../../exercises/vector-addition_solution.cpp :start-after: _cstyle_vector_add_start :end-before: _cstyle_vector_add_end :language: C++ @@ -31,28 +37,24 @@ store the result in vector 'c'. A simple C-style loop that does this is: RAJA Variants ^^^^^^^^^^^^^^^^^^^^^ -The RAJA variants of the vector addition operation illustrate how the -same kernel can be run with a variety of different programming model -back-ends by simply swapping out the execution policy. This can be done -by defining type aliases in a header file so that execution policy types -can be easily switched, and the code can be compiled to run differently, -without changing the loop kernel code. In the example code, we -make all execution policy types explicit for clarity. - -For the RAJA variants, we replace the C-style for-loop with a call to the -``RAJA::forall`` loop execution template method. +For the RAJA variants of the vector addition kernel, we replace the C-style +for-loop with a call to the ``RAJA::forall`` loop execution template method. The method takes an iteration space and the vector addition loop body as -a C++ lambda expression. We pass a ``RAJA::RangeSegment`` object, which -describes a contiguous sequence of integral values [0, N) for the iteration -space (for more information about RAJA loop indexing concepts, -see :ref:`index-label`). The loop execution template method requires an +a C++ lambda expression. We pass the object:: + + RAJA::TypedRangeSegment(0, N) + +for the iteration space, which is contiguous sequence of integral +values [0, N) (for more information about RAJA loop indexing concepts, +see :ref:`feat-index-label`). The loop execution template method requires an execution policy template type that specifies how the loop is to run -(for more information about RAJA execution policies, see :ref:`policies-label`). +(for more information about RAJA execution policies, +see :ref:`feat-policies-label`). -For the RAJA sequential variant, we use the ``RAJA::seq_exec`` execution +For a RAJA sequential variant, we use the ``RAJA::seq_exec`` execution policy type: -.. literalinclude:: ../../../../examples/tut_add-vectors.cpp +.. literalinclude:: ../../../../exercises/vector-addition_solution.cpp :start-after: _rajaseq_vector_add_start :end-before: _rajaseq_vector_add_end :language: C++ @@ -65,59 +67,66 @@ execution policy:: RAJA::simd_exec -Alternatively, RAJA provides a *loop execution* policy:: +An alternative RAJA policy is:: RAJA::loop_exec -This policy allows the compiler to generate optimizations, such as SIMD if -compiler heuristics suggest that it is safe to do so and potentially +which allows the compiler to generate optimizations based on how its internal +heuristics suggest that it is safe to do so and potentially beneficial for performance, but the optimizations are not forced. To run the kernel with OpenMP multithreaded parallelism on a CPU, we use the ``RAJA::omp_parallel_for_exec`` execution policy: -.. literalinclude:: ../../../../examples/tut_add-vectors.cpp +.. literalinclude:: ../../../../exercises/vector-addition_solution.cpp :start-after: _rajaomp_vector_add_start :end-before: _rajaomp_vector_add_end :language: C++ This will distribute the loop iterations across CPU threads and run the -loop over threads in parallel. +loop over threads in parallel. 
In particular, this is what you would get if +you wrote the kernel using a C-style loop with an OpenMP pragma directly:: + + #pragma omp parallel for + for (int i = 0; i < N; ++i) { + c[i] = a[i] + b[i]; + } To run the kernel on a CUDA GPU device, we use the ``RAJA::cuda_exec`` policy: -.. literalinclude:: ../../../../examples/tut_add-vectors.cpp +.. literalinclude:: ../../../../exercises/vector-addition_solution.cpp :start-after: _rajacuda_vector_add_start :end-before: _rajacuda_vector_add_end :language: C++ -Note that the CUDA execution policy type accepts a template argument -``CUDA_BLOCK_SIZE``, which specifies that each CUDA thread block launched -to execute the kernel will have the given number threads in the block. +Since the lambda defining the loop body will be passed to a device kernel, +it must be decorated with the ``__device__`` attribute. +This can be done directly or by using the ``RAJA_DEVICE`` macro. + +Note that the CUDA execution policy type requires a template argument +``CUDA_BLOCK_SIZE``, which specifies the number of threads to run in each +CUDA thread block launched to run the kernel. -For performance tuning, the ``RAJA::cuda_exec_explicit`` policy is also -provided. This allows the user to specify the number of blocks allocated -per streaming multiprocessor (SM) to allow additional block level -parallelism. Note that the third boolean argument representing asynchronous -execution can be omitted, and is ``false`` by default: +For additional performance tuning options, the ``RAJA::cuda_exec_explicit`` +policy is also provided, which allows a user to specify the minimum number +of thread blocks to launch at a time on each streaming multiprocessor (SM): -.. literalinclude:: ../../../../examples/tut_add-vectors.cpp +.. literalinclude:: ../../../../exercises/vector-addition_solution.cpp :start-after: _rajacuda_explicit_vector_add_start :end-before: _rajacuda_explicit_vector_add_end :language: C++ -Since the lambda defining the loop body will be passed to a device kernel, -it must be decorated with the ``__device__`` attribute when it is defined. -This can be done directly or by using the ``RAJA_DEVICE`` macro. +Note that the third boolean template argument is used to express whether the +kernel launch is synchronous or asynchronous. This is optional and is +'false' by default. A similar defaulted optional argument is supported for +other RAJA GPU (e.g., CUDA or HIP) policies. -Similarly, to run the kernel on a GPU using the RAJA HIP back-end, +Lastly, to run the kernel on a GPU using the RAJA HIP back-end, we use the ``RAJA::hip_exec`` policy: -.. literalinclude:: ../../../../examples/tut_add-vectors.cpp +.. literalinclude:: ../../../../exercises/vector-addition_solution.cpp :start-after: _rajahip_vector_add_start :end-before: _rajahip_vector_add_end :language: C++ -The file ``RAJA/examples/tut_add-vectors.cpp`` contains the complete -working example code. diff --git a/docs/sphinx/user_guide/tutorial/atomic_histogram.rst b/docs/sphinx/user_guide/tutorial/atomic_histogram.rst index 72b136a269..970271c875 100644 --- a/docs/sphinx/user_guide/tutorial/atomic_histogram.rst +++ b/docs/sphinx/user_guide/tutorial/atomic_histogram.rst @@ -6,96 +6,108 @@ .. ## SPDX-License-Identifier: (BSD-3-Clause) .. ## -.. _atomichist-label: +.. 
_tut-atomichist-label: -------------------------------------------------- -Computing a Histogram with Atomic Operations +Atomic Operations: Computing a Histogram -------------------------------------------------- -Key RAJA features shown in this example: +This section contains an exercise file ``RAJA/exercises/atomic-histogram.cpp`` +for you to work through if you wish to get some practice with RAJA. The +file ``RAJA/exercises/atomic-histogram_solution.cpp`` contains complete +working code for the examples discussed in this section. You can use the +solution file to check your work and for guidance if you get stuck. To build +the exercises execute ``make atomic-histogram`` and ``make atomic-histogram_solution`` +from the build directory. - * ``RAJA::forall`` loop execution template - * ``RAJA::RangeSegment`` iteration space construct - * RAJA atomic add operation +Key RAJA features shown in this exercise are: + + * ``RAJA::forall`` kernel execution template and execution policies + * ``RAJA::TypedRangeSegment`` iteration space construct + * RAJA atomic add operation and RAJA atomic operation policies The example uses an integer array of length 'N' randomly initialized with -values in the interval [0, M). While iterating over the array, the kernel -accumulates the number of occurrences of each value in the array using atomic -add operations. Atomic operations allow one to update a memory location +values in the interval [0, M). + +.. literalinclude:: ../../../../exercises/atomic-histogram_solution.cpp + :start-after: _array_atomic_histogram_start + :end-before: _array_atomic_histogram_end + :language: C++ + +Each kernel iterates over the array and accumulates the number of occurrences +of each value in [0, M) in another array named 'hist'. The kernels use atomic +operations for the accumulation, which allow one to update a memory location referenced by a specific address in parallel without data races. The example shows how to use RAJA portable atomic operations and that they are used similarly for different programming model back-ends. -.. note:: Each RAJA reduction operation requires an atomic policy type +.. note:: Each RAJA atomic operation requires an atomic policy type parameter that must be compatible with the execution policy for - the kernel in which it is used. + the kernel in which it is used. This is similar to the reduction + policies we described in :ref:`tut-dotproduct-label`. For a complete description of supported RAJA atomic operations and -atomic policies, please see :ref:`atomics-label`. +atomic policies, please see :ref:`feat-atomics-label`. -All code snippets described below use the loop range: +All code snippets described below use the stride-1 iteration space range: -.. literalinclude:: ../../../../examples/tut_atomic-histogram.cpp +.. literalinclude:: ../../../../exercises/atomic-histogram_solution.cpp :start-after: _range_atomic_histogram_start :end-before: _range_atomic_histogram_end :language: C++ -and the integer array 'bins' of length 'M' to accumulate the number of -occurrences of each value in the array. - Here is the OpenMP version: -.. literalinclude:: ../../../../examples/tut_atomic-histogram.cpp +.. literalinclude:: ../../../../exercises/atomic-histogram_solution.cpp :start-after: _rajaomp_atomic_histogram_start :end-before: _rajaomp_atomic_histogram_end :language: C++ -Each slot in the 'bins' array is incremented by one when a value associated +One is added to a slot in the 'bins' array when a value associated with that slot is encountered. 
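+For orientation, the OpenMP variant has roughly the following form (a sketch
+only; the ``array`` and ``bins`` names and the ``array_range`` segment follow
+the surrounding text and may differ slightly from the exercise file)::
+
+   RAJA::forall< RAJA::omp_parallel_for_exec >( array_range, [=](int i) {
+     RAJA::atomicAdd< RAJA::omp_atomic >( &bins[array[i]], 1 );
+   } );
+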
Note that the ``RAJA::atomicAdd`` operation uses an OpenMP atomic policy, which is compatible with the OpenMP -loop execution policy. +kernel execution policy. The CUDA and HIP versions are similar: -.. literalinclude:: ../../../../examples/tut_atomic-histogram.cpp +.. literalinclude:: ../../../../exercises/atomic-histogram_solution.cpp :start-after: _rajacuda_atomic_histogram_start :end-before: _rajacuda_atomic_histogram_end :language: C++ and: -.. literalinclude:: ../../../../examples/tut_atomic-histogram.cpp +.. literalinclude:: ../../../../exercises/atomic-histogram_solution.cpp :start-after: _rajahip_atomic_histogram_start :end-before: _rajahip_atomic_histogram_end :language: C++ Here, the atomic add operations uses CUDA and HIP atomic policies, which are -compatible with the CUDA and HIP loop execution policies. +compatible with the CUDA and HIP kernel execution policies. Note that RAJA provides an ``auto_atomic`` policy for easier usage and -improved portability. This policy will do the right thing in most -circumstances. If OpenMP is enabled, the OpenMP atomic policy will be used, -which is correct in a sequential execution context as well. Otherwise, the -sequential atomic policy will be applied. Similarly, if it is encountered in -a CUDA or HIP execution context, the corresponding GPU back-end atomic policy +improved portability. This policy will choose the proper atomic operation +for the execution policy used to run the kernel. Specifically, when OpenMP +is enabled, the OpenMP atomic policy will be used, which is correct in a +sequential or OpenMP execution context. Otherwise, the sequential atomic +policy will be applied. Similarly, if it is encountered in a CUDA or HIP +execution context, the corresponding GPU back-end atomic policy will be applied. For example, here is the CUDA version that uses the 'auto' atomic policy: -.. literalinclude:: ../../../../examples/tut_atomic-histogram.cpp +.. literalinclude:: ../../../../exercises/atomic-histogram_solution.cpp :start-after: _rajacuda_atomicauto_histogram_start :end-before: _rajacuda_atomicauto_histogram_end :language: C++ and the HIP version: -.. literalinclude:: ../../../../examples/tut_atomic-histogram.cpp +.. literalinclude:: ../../../../exercises/atomic-histogram_solution.cpp :start-after: _rajahip_atomicauto_histogram_start :end-before: _rajahip_atomicauto_histogram_end :language: C++ -The same CUDA and HIP loop execution policies as in the previous examples +The same CUDA and HIP kernel execution policies as in the previous examples are used. -The file ``RAJA/examples/tut_atomic-histogram.cpp`` contains the complete -working example code. diff --git a/docs/sphinx/user_guide/tutorial/dot_product.rst b/docs/sphinx/user_guide/tutorial/dot_product.rst index 9c10217edb..feac719918 100644 --- a/docs/sphinx/user_guide/tutorial/dot_product.rst +++ b/docs/sphinx/user_guide/tutorial/dot_product.rst @@ -6,31 +6,37 @@ .. ## SPDX-License-Identifier: (BSD-3-Clause) .. ## -.. _dotproduct-label: +.. _tut-dotproduct-label: ----------------------------------- -Vector Dot Product (Sum Reduction) +Sum Reduction: Vector Dot Product ----------------------------------- -Key RAJA features shown in this example: +This section contains an exercise file ``RAJA/exercises/dot-product.cpp`` +for you to work through if you wish to get some practice with RAJA. The +file ``RAJA/exercises/dot-product_solution.cpp`` contains complete +working code for the examples discussed in this section. 
You can use the +solution file to check your work and for guidance if you get stuck. To build +the exercises execute ``make dot-product`` and ``make dot-product_solution`` +from the build directory. - * ``RAJA::forall`` loop execution template - * ``RAJA::RangeSegment`` iteration space construct - * RAJA execution policies - * ``RAJA::ReduceSum`` sum reduction template - * RAJA reduction policies +Key RAJA features shown in this example are: + * ``RAJA::forall`` loop execution template and execution policies + * ``RAJA::TypedRangeSegment`` iteration space construct + * ``RAJA::ReduceSum`` sum reduction template and reduction policies In the example, we compute a vector dot product, 'dot = (a,b)', where -'a' and 'b' are two vectors length N and 'dot' is a scalar. Typical +'a' and 'b' are two vectors of length N and 'dot' is a scalar. Typical C-style code to compute the dot product and print its value afterward is: -.. literalinclude:: ../../../../examples/tut_dot-product.cpp +.. literalinclude:: ../../../../exercises/dot-product_solution.cpp :start-after: _csytle_dotprod_start :end-before: _csytle_dotprod_end :language: C++ -Note that this operation performs a *reduction*, a computational pattern that +Although this kernel is serial, it is representative of a *reduction* +operation which is a common algorithm pattern that produces a single result from a set of values. Reductions present a variety of issues that must be addressed to operate properly in parallel. @@ -39,65 +45,69 @@ RAJA Variants ^^^^^^^^^^^^^^^^^^^^^ Different programming models support parallel reduction operations differently. -Some models, such as CUDA, do not provide support for reductions at all and +Some models, such as CUDA, do not provide direct support for reductions and so such operations must be explicitly coded by users. It can be challenging to generate a correct and high performance implementation. RAJA provides portable reduction types that make it easy to perform reduction operations -in loop kernels. The RAJA variants of the dot product computation show how +in kernels. The RAJA variants of the dot product computation show how to use the ``RAJA::ReduceSum`` sum reduction template type. RAJA provides -other reduction types and also allows multiple reduction operations to be -performed in a single kernel along with other computation. Please see -:ref:`reductions-label` for an example that does this. +other reduction types and allows multiple reduction operations to be +performed in a single kernel alongside other computations. Please see +:ref:`feat-reductions-label` for more information. Each RAJA reduction type takes a `reduce policy` template argument, which **must be compatible with the execution policy** applied to the kernel in which the reduction is used. Here is the RAJA sequential variant of the dot product computation: -.. literalinclude:: ../../../../examples/tut_dot-product.cpp - :start-after: _rajaseq_atomic_histogram_start - :end-before: _rajaseq_atomic_histogram_end +.. literalinclude:: ../../../../exercises/dot-product_solution.cpp + :start-after: _rajaseq_dotprod_start + :end-before: _rajaseq_dotprod_end :language: C++ The sum reduction object is defined by specifying the reduction -policy ``RAJA::seq_reduce``, which matches the loop execution policy, and -a reduction value type (i.e., 'double'). An initial value of zero for the -sum is passed to the reduction object constructor. After the kernel executes, -we use the 'get' method to retrieve the reduced value. 
+policy ``RAJA::seq_reduce`` matching the kernel execution policy +``RAJA::seq_exec``, and a reduction value type (i.e., 'double'). An initial +value of zero for the sum is passed to the reduction object constructor. After +the kernel executes, we use the 'get' method to retrieve the reduced value. -The OpenMP multithreaded variant of the loop is implemented similarly: +The OpenMP multithreaded variant of the kernel is implemented similarly: -.. literalinclude:: ../../../../examples/tut_dot-product.cpp +.. literalinclude:: ../../../../exercises/dot-product_solution.cpp :start-after: _rajaomp_dotprod_start :end-before: _rajaomp_dotprod_end :language: C++ Here, we use the ``RAJA::omp_reduce`` reduce policy to match the OpenMP -loop execution policy. +kernel execution policy. -The RAJA CUDA variant is achieved by using appropriate loop execution and +The RAJA CUDA variant is achieved by using appropriate kernel execution and reduction policies: -.. literalinclude:: ../../../../examples/tut_dot-product.cpp +.. literalinclude:: ../../../../exercises/dot-product_solution.cpp :start-after: _rajacuda_dotprod_start :end-before: _rajacuda_dotprod_end :language: C++ Here, the CUDA reduce policy ``RAJA::cuda_reduce`` matches the CUDA -loop execution policy. Note that the CUDA thread block size is not +kernel execution policy. Note that the CUDA thread block size is not specified in the reduce policy as it will use the same value as the loop execution policy. Similarly, for the RAJA HIP variant: -.. literalinclude:: ../../../../examples/tut_dot-product.cpp +.. literalinclude:: ../../../../exercises/dot-product_solution.cpp :start-after: _rajahip_dotprod_start :end-before: _rajahip_dotprod_end :language: C++ -It is worth noting how similar the code looks for each of these variants. +It is worth repeating how similar the code looks for each of these variants. The loop body is identical for each and only the loop execution policy and reduce policy types change. -The file ``RAJA/examples/tut_dot-product.cpp`` contains the complete -working example code. +.. note:: Currently available reduction capabilities in RAJA require a + *reduction policy* type that is compatible with the execution + policy for the kernel in which the reduction is used. We + are developing a new reduction interface for RAJA that will + provide an alternative for which the reduction policy is not + required. diff --git a/docs/sphinx/user_guide/tutorial/gaussSeidel.rst-KEEP b/docs/sphinx/user_guide/tutorial/gaussSeidel.rst-KEEP index 255bf0abdd..10b0dc44d7 100644 --- a/docs/sphinx/user_guide/tutorial/gaussSeidel.rst-KEEP +++ b/docs/sphinx/user_guide/tutorial/gaussSeidel.rst-KEEP @@ -50,7 +50,7 @@ index range, has been used in all the other examples. A ``RAJA::ListSegment`` represents an arbitrary collection of indices, similar to an indirection array that is common in unstructured mesh applications. In the example, we use two ``RAJA::ListSegment`` objects to hold these two sets of indices. See -:ref:`index-label` for more information about RAJA segments and index sets. +:ref:`feat-index-label` for more information about RAJA segments and index sets. The code in the example that constructs the segments and index set is: diff --git a/docs/sphinx/user_guide/tutorial/halo-exchange.rst b/docs/sphinx/user_guide/tutorial/halo-exchange.rst index babc04a6e9..c451340ae2 100644 --- a/docs/sphinx/user_guide/tutorial/halo-exchange.rst +++ b/docs/sphinx/user_guide/tutorial/halo-exchange.rst @@ -6,26 +6,31 @@ .. 
## SPDX-License-Identifier: (BSD-3-Clause) .. ## -.. _halo_exchange-label: +.. _tut-halo_exchange-label: ------------------------------------ -Halo Exchange (Workgroup Constructs) +Workgroup Constructs: Halo Exchange ------------------------------------ +The example code discussed in this section can be found in the file +``RAJA/examples/tut_halo-exchange.cpp``. The file contains complete working +code for multiple OpenMP, CUDA, and HIP RAJA variants. Here, we describe +a subset of these variants. + Key RAJA features shown in this example: * ``RAJA::WorkPool`` workgroup construct * ``RAJA::WorkGroup`` workgroup construct * ``RAJA::WorkSite`` workgroup construct - * ``RAJA::RangeSegment`` iteration space construct + * ``RAJA::TypedRangeSegment`` iteration space construct * RAJA workgroup policies In this example, we show how to use the RAJA workgroup constructs to implement buffer packing and unpacking for data halo exchange on a computational grid, -a common MPI communication operation. This may not provide a performance gain -on a CPU system, but it can significantly speedup halo exchange on a GPU -system compared to using ``RAJA::forall`` to run individual packing/unpacking -kernels. +a common MPI communication operation for distributed memory applications. +This technique may not provide a performance gain on a CPU system, but it can +significantly speedup halo exchange on a GPU system compared to running +many individual packing/unpacking kernels, for example. .. note:: Using an abstraction layer over RAJA can make it easy to switch between using individual ``RAJA::forall`` loops or the RAJA workgroup @@ -33,16 +38,17 @@ kernels. compile time or run time. We start by setting the parameters for the halo exchange by using default -values or values provided via command line input. These parameters determine -the size of the grid, the width of the halo, the number of grid variables -and the number of cycles. +values or values provided via command line input to the example code. These +parameters determine the size of the grid, the width of the halo, the number +of grid variables to pack/unpack, and the number of cycles; (iterations +to run). .. literalinclude:: ../../../../examples/tut_halo-exchange.cpp :start-after: _halo_exchange_input_params_start :end-before: _halo_exchange_input_params_end :language: C++ -Next, we allocate the variables array (the memory manager in +Next, we allocate the variable data arrays (the memory manager in the example uses CUDA Unified Memory if CUDA is enabled). These grid variables are reset each cycle to allow checking the results of the packing and unpacking. @@ -89,19 +95,24 @@ into the adjacent halo cells: | 7 | 7 | 8 | 9 | 9 | +---+---+---+---+---+ +Although the example code does not use MPI and multiple domains (one per +MPI rank, for example), as would be the case in a real distributed memory +parallel application, the data copy operations represent the spirit of how +data communication would be done. + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Packing and Unpacking (Basic Loop Execution) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -A sequential non-RAJA example of packing: +A sequential non-RAJA example of data packing and unpacking would look like: .. literalinclude:: ../../../../examples/tut_halo-exchange.cpp :start-after: _halo_exchange_sequential_cstyle_packing_start :end-before: _halo_exchange_sequential_cstyle_packing_end :language: C++ -and unpacking: +and: .. 
literalinclude:: ../../../../examples/tut_halo-exchange.cpp :start-after: _halo_exchange_sequential_cstyle_unpacking_start @@ -135,7 +146,7 @@ and unpack the buffer data into the grid variable array: :language: C++ -For parallel multi-threading execution via OpenMP, the example can be run +For parallel multithreading execution via OpenMP, the example can be run by replacing the execution policy with: .. literalinclude:: ../../../../examples/tut_halo-exchange.cpp @@ -151,6 +162,9 @@ policy: :end-before: _halo_exchange_cuda_forall_policies_end :language: C++ +Note that we can use an asynchronous execution policy because there are +no data races due to the intermediate buffer usage. + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ RAJA Variants using workgroup constructs @@ -165,8 +179,8 @@ policies and types: :language: C++ which are used in a slightly rearranged version of packing. See how the comment -indicating where a message could be sent has been moved down after the call to -run on the workgroup: +indicating where messages are sent has been moved down after the call to +run the operations enqueued on the workgroup: .. literalinclude:: ../../../../examples/tut_halo-exchange.cpp :start-after: _halo_exchange_loop_workgroup_packing_start @@ -184,7 +198,7 @@ unpacking the data: This reorganization has the downside of not overlapping the message sends with packing and the message receives with unpacking. -For parallel multi-threading execution via OpenMP, the example using workgroup +For parallel multithreading execution via OpenMP, the example using workgroup can be run by replacing the policies and types with: .. literalinclude:: ../../../../examples/tut_halo-exchange.cpp @@ -192,6 +206,10 @@ can be run by replacing the policies and types with: :end-before: _halo_exchange_openmp_workgroup_policies_end :language: C++ +The main differences between these types and the ones defined for the sequential +case above are the ``forall_policy`` and the ``workgroup_policy``, which use +OpenMP execution policy types. + Similarly, to run the loops in parallel on a CUDA GPU use these policies and types, taking note of the unordered work ordering policy that allows the enqueued loops to all be run using a single CUDA kernel: @@ -201,21 +219,28 @@ enqueued loops to all be run using a single CUDA kernel: :end-before: _halo_exchange_cuda_workgroup_policies_end :language: C++ +The main differences between these types and the ones defined for the +sequential and OpenMP cases above are the ``forall_policy`` and the +``workgroup_policy``, which use different template parameters, and the +``workpool``, ``workgroup``, and ``worksite`` types which use 'pinned' +memory allocation. + The packing is the same as the previous workgroup packing examples with the -exception of added synchronization after calling run and before sending the -messages. The previous CUDA example used forall to launch -``num_neighbors * num_vars`` CUDA kernels and performed ``num_neighbors`` -synchronizations to send each message in turn. Here, the reorganization to pack -all messages before sending lets us use an unordered CUDA work ordering policy -in the workgroup constructs that reduces the number of CUDA kernel launches to -one. It also allows us to synchronize once before sending all of the messages: +exception of added synchronization after calling the workgroup run method +and before sending the messages. 
In the example code, there is a CUDA version +that uses forall to launch ``num_neighbors * num_vars`` CUDA kernels and +performs ``num_neighbors`` synchronizations to send each message in turn. +Here, the reorganization to pack all messages before sending lets us use an +unordered CUDA work ordering policy in the ``workgroup_policy`` that reduces +the number of CUDA kernel launches to one. It also allows us to need to +synchronize only once before sending all of the messages: .. literalinclude:: ../../../../examples/tut_halo-exchange.cpp :start-after: _halo_exchange_cuda_workgroup_packing_start :end-before: _halo_exchange_cuda_workgroup_packing_end :language: C++ -After waiting to receive all of the messages we use workgroup constructs using +After waiting to receive all of the messages we use workgroup constructs with a CUDA unordered work ordering policy to unpack all of the messages using a single kernel launch: @@ -228,6 +253,3 @@ Note that the synchronization after unpacking is done to ensure that ``group_unpack`` and ``site_unpack`` survive until the unpacking loop has finished executing. - -The file ``RAJA/examples/tut_halo-exchange.cpp`` contains a complete -working example code, with OpenMP, CUDA, and HIP variants. diff --git a/docs/sphinx/user_guide/tutorial/indexset_segments.rst b/docs/sphinx/user_guide/tutorial/indexset_segments.rst index 729df37632..d651bd1a03 100644 --- a/docs/sphinx/user_guide/tutorial/indexset_segments.rst +++ b/docs/sphinx/user_guide/tutorial/indexset_segments.rst @@ -6,200 +6,349 @@ .. ## SPDX-License-Identifier: (BSD-3-Clause) .. ## -.. _indexset-label: +.. _tut-indexset-label: ------------------------------------------ -Iteration Spaces: IndexSets and Segments ------------------------------------------ +------------------------------------------------- +Iteration Spaces: Segments and IndexSets +------------------------------------------------- -Key RAJA features shown in this example: +This section contains an exercise file ``RAJA/exercises/segment-indexset-basics.cpp`` for you to work through if you wish to get some practice with RAJA. The +file ``RAJA/exercises/segment-indexset-basics_solution.cpp`` contains complete +working code for the examples discussed in this section. You can use the +solution file to check your work and for guidance if you get stuck. To build +the exercises execute ``make segment-indexset-basics`` and ``make segment-indexset-basics_solution`` +from the build directory. + +Key RAJA features shown in this example are: * ``RAJA::forall`` loop execution template - * ``RAJA::RangeSegment`` (i.e., ``RAJA::TypedRangeSegment``) iteration space construct - * ``RAJA::TypedListSegment`` iteration space construct - * ``RAJA::IndexSet`` iteration construct and associated execution policies - -The example uses a simple daxpy kernel and its usage of RAJA is similar to -previous simple loop examples. The example -focuses on how to use RAJA index sets and iteration space segments, such -as index ranges and lists of indices. These features are important for -applications and algorithms that use indirection arrays for irregular array + * ``RAJA::TypedRangeSegment``, ``RAJA::TypedRangeStrideSegment``, and + ``RAJA::TypedListSegment`` iteration space constructs + * ``RAJA::TypedIndexSet`` container and associated execution policies + +The concepts of iteration spaces and associated Loop variables are central to +writing kernels in RAJA. 
RAJA provides basic iteration space types +that serve as flexible building blocks that can be used to form a variety +of loop iteration patterns. These types can be used to define a particular +order for loop iterates, aggregate and partition iterates, as well as other +configurations. + +The examples in this section focus on how to use RAJA index sets and iteration +space segments, such as index ranges and lists of indices. Lists of indices +are important for algorithms that use indirection arrays for irregular array accesses. Combining different segment types, such as ranges and lists in an index set allows a user to launch different iteration patterns in a single loop execution construct (i.e., one kernel). This is something that is not supported by other programming models and abstractions and is unique to RAJA. -Applying these concepts judiciously can increase performance by allowing +Applying these concepts judiciously can help improve performance by allowing compilers to optimize for specific segment types (e.g., SIMD for range segments) while providing the flexibility of indirection arrays for general indexing patterns. -.. note:: For the following examples, it is useful to remember that all - RAJA segment types are templates, where the type of the index - value is the template argument. So for example, the basic RAJA - range segment type is ``RAJA::TypedRangeSegment``. The type - ``RAJA::RangeSegment`` used here (for convenience) is a type alias - for ``RAJA::TypedRangeSegment``, where the - template parameter is a default index type that RAJA defines. - -For a summary discussion of RAJA segment and index set concepts, please -see :ref:`index-label`. +Although the constructs described in the section are +useful in numerical computations and parallel execution, the examples only +contain print statements and sequential execution. The goal is to show you +how to use RAJA iteration space constructs. ^^^^^^^^^^^^^^^^^^^^^ RAJA Segments ^^^^^^^^^^^^^^^^^^^^^ -In previous examples, we have seen how to define a contiguous range of loop -indices [0, N) with a ``RAJA::RangeSegment`` object and use it in a RAJA -loop execution template to run a loop kernel over the range. For example: +A RAJA *Segment* represents a set of indices that one wants to execute as a +unit for a kernel. RAJA provides the following Segment types: -.. literalinclude:: ../../../../examples/tut_indexset-segments.cpp - :start-after: _rajaseq_daxpy_range_start - :end-before: _rajaseq_daxpy_range_end - :language: C++ + * ``RAJA::TypedRangeSegment`` represents a stride-1 range + * ``RAJA::TypedRangeStrideSegment`` represents a (non-unit) stride range + * ``RAJA::TypedListSegment`` represents an arbitrary set of indices + +These segment types are used in ``RAJA::forall`` and other RAJA kernel +execution mechanisms to define the iteration space for a kernel. + +After we briefly introduce these types, we will present several examples using +them. + +TypedRangeSegment +^^^^^^^^^^^^^^^^^^^ + +A ``RAJA::TypedRangeSegment`` is the fundamental type for defining a +stride-1 (i.e., contiguous) range of indices. This is illustrated in the +figure below. + +.. figure:: ../figures/RangeSegment.png + + A range segment defines a stride-1 index range [beg, end). + +One creates a range segment object as follows:: + + // A stride-1 index range [beg, end) using type int. + RAJA::TypedRangeSegment my_range(beg, end); + +Any integral type can be given as the template parameter. + +.. 
note:: When using a RAJA range segment, no loop iterations will be run when + begin >= end. + +TypedRangeStrideSegment +^^^^^^^^^^^^^^^^^^^^^^^^^ + +A ``RAJA::TypedRangeStrideSegment`` defines a range with a constant stride, +including negative stride values if needed. This is illustrated in the +figure below. + +.. figure:: ../figures/RangeStrideSegment.png + + A range-stride segment defines an index range with arbitrary stride [beg, end, stride). In the figure the stride is 2. + +One creates a range stride segment object as follows:: + + // A stride-2 index range [beg, end) using type int. + RAJA::TypedRangeStrideSegment my_stride2_range(beg, end, 2); + + // An index range with stride -1 covering [0, N-1] in reverse, using type int + RAJA::TypedRangeStrideSegment my_neg1_range( N-1, -1, -1); + +Any integral type can be given as the template parameter. + +When the negative-stride segment above is passed to a ``RAJA::forall`` method, +for example, the loop will run in reverse order with iterates:: + + N-1 N-2 N-3 ... 1 0 + +.. note:: When using a RAJA strided range, no loop iterations will be run + under the following conditions: + + * Stride > 0 and begin > end + * Stride < 0 and begin < end + * Stride == 0 + +TypedListSegment +^^^^^^^^^^^^^^^^^^ + +A ``RAJA::TypedListSegment`` is used to define an arbitrary set of +indices, akin to an indirection array. This is illustrated in the figure below. + +.. figure:: ../figures/ListSegment.png + + A list segment defines an arbitrary collection of indices. Here, we have a list segment with 5 irregularly-spaced indices. + +One creates a list segment object by passing a container of integral values to +a list segment constructor. For example:: + + // Create a vector holding some integer index values + std::vector idx = {0, 2, 3, 4, 7, 8, 9, 53}; + + // Create a list segment with these indices, where the indices are + // stored in the CUDA device memory space + camp::resources::Resource cuda_res{camp::resources::Cuda()}; + RAJA::TypedListSegment idx_list( idx, cuda_res ); + + // Alternatively + RAJA::TypedListSegment idx_list( &idx[0], idx.size(), + cuda_res ); + +When the list segment above is passed to a ``RAJA::forall`` method, +for example, the kernel will execute with iterates:: -We can accomplish the same result by enumerating the indices in a -``RAJA::TypedListSegment`` object. Here, we assemble the indices in a standard -vector, create a list segment from that, and then pass the list segment to the -forall execution template: + 0 2 3 4 7 8 9 53 -.. literalinclude:: ../../../../examples/tut_indexset-segments.cpp - :start-after: _rajaseq_daxpy_list_start - :end-before: _rajaseq_daxpy_list_end +Note that a ``RAJA::TypedListSegment`` constructor can take a pointer to +an array of indices and an array length. If the indices are +in a container, such as ``std::vector`` that provides ``begin()``, ``end()``, +and ``size()`` methods, the container can be passed to the constructor and +the length argument is not required. + +.. note:: Currently, a camp resource object must be passed to a list segment + constructor to copy the indices into the proper + memory space for a kernel to execute (as shown above). In the future, + this will change and the user will be responsible for providing + the indices in the proper memory space. + +^^^^^^^^^^^ +IndexSets +^^^^^^^^^^^ + +A ``RAJA::TypedIndexSet`` is a container that can hold an arbitrary collection +of segment objects. 
+
+.. note:: It is the responsibility of the user to ensure that segments are
+          defined properly when using RAJA index sets. For example, if the
+          same index appears in multiple segments, the corresponding loop
+          iteration will be run multiple times.
+
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+Segment and IndexSet Examples
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The examples in this section illustrate how the segment types that RAJA
+provides can be used to define kernel iteration spaces. We use the following
+type aliases to make the code more compact:
+
+.. literalinclude:: ../../../../exercises/segment-indexset-basics_solution.cpp
+   :start-after: _raja_segment_type_start
+   :end-before: _raja_segment_type_end
    :language: C++
-Note that we are using the following type aliases:
-.. literalinclude:: ../../../../examples/tut_indexset-segments.cpp
-   :start-after: _raja_list_segment_type_start
-   :end-before: _raja_list_segment_type_end
+
+Stride-1 Indexing
+^^^^^^^^^^^^^^^^^^^
+
+Consider a simple C-style kernel that prints a contiguous sequence of values:
+
+.. literalinclude:: ../../../../exercises/segment-indexset-basics_solution.cpp
+   :start-after: _cstyle_range1_start
+   :end-before: _cstyle_range1_end
    :language: C++
-Recall from discussion in :ref:`index-label` that ``RAJA::Index_type`` is
-a default index type that RAJA defines and which is used in some RAJA
-constructs as a convenience for users who want a simple mechanism to apply
-index types consistently.
-
-It is important to understand what happens when using list segments.
-During loop execution, indices stored in the list segment are passed to the
-loop body one-by-one, effectively mimicking an indirection array except that
-the indirection does not appear in the loop body. For example, we
-can reverse the order of the indices, run the loop with a new list segment
-object, and get the same result since the loop is `data-parallel`:
-
-.. literalinclude:: ../../../../examples/tut_indexset-segments.cpp
-   :start-after: _raja_list_segment_daxpy_reverse_start
-   :end-before: _raja_list_segment_daxpy_reverse_end
+
+When run, the kernel prints the following sequence, as expected::
+
+   0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
+
+Three RAJA variants of the kernel using a ``RAJA::TypedRangeSegment``, a
+``RAJA::TypedRangeStrideSegment``, and a ``RAJA::TypedListSegment`` are:
+
+.. literalinclude:: ../../../../exercises/segment-indexset-basics_solution.cpp
+   :start-after: _raja_range1_start
+   :end-before: _raja_range1_end
    :language: C++
-Alternatively, we can also use a RAJA strided range segment to run the loop
-in reverse by giving it a stride of -1. For example:
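+
+For readers following along without the exercise files, the first (stride-1
+range) variant referenced above is essentially of the following form -- a
+simplified sketch, not the exact exercise code::
+
+   // Hypothetical sketch of the stride-1 range variant; the exercise file
+   // contains the actual code.
+   RAJA::forall<RAJA::seq_exec>(RAJA::TypedRangeSegment<int>(0, 20),
+     [=] (int i) {
+       std::cout << i << "  ";
+   });
+
+.. 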
literalinclude:: ../../../../exercises/segment-indexset-basics_solution.cpp + :start-after: _raja_striderange1_start + :end-before: _raja_striderange1_end + :language: C++ -.. literalinclude:: ../../../../examples/tut_indexset-segments.cpp - :start-after: _raja_range_segment_daxpy_negstride_start - :end-before: _raja_range_segment_daxpy_negstride_end +.. literalinclude:: ../../../../exercises/segment-indexset-basics_solution.cpp + :start-after: _raja_list1_start + :end-before: _raja_list1_end :language: C++ -The fact that RAJA always passes loop index values to lambdas in a kernel -explains why we can run a kernel with multiple segment types in a single -RAJA construct as we discuss next. +Each of these variants prints the same integer sequence shown above. -^^^^^^^^^^^^^^^^^^^^^ -RAJA IndexSets -^^^^^^^^^^^^^^^^^^^^^ +One interesting thing to note is that with ``RAJA::TypedListSegment`` and +``RAJA::forall``, the actual iteration value is passed to the lambda loop body. +So the indirection array concept is not visible. In contrast, in C-style code, +one has to manually retrieve the index value from the indirection array to +achieve the desired result. For example: -The ``RAJA::TypedIndexSet`` template is a container that can hold -any number of segments, of the same or different types. An index set object -can be passed to a RAJA loop execution method, just like a segment, to -run a loop kernel. When the loop is run, the execution method iterates -over the segments and the loop indices in each segment. Thus, the loop -iterates can be grouped into different segments to partition the iteration -space and iterate over the loop kernel chunks (defined by segments), in -serial, in parallel, or in some specific dependency ordering. Individual -segments can be executed in serial or parallel. - -When an index set is defined, the segment types it may hold must be specified -as template arguments. For example, here we create an index set that can -hold list segments. Then, we add the list segment we created earlier to it -and run the loop: - -.. literalinclude:: ../../../../examples/tut_indexset-segments.cpp - :start-after: _raja_indexset_list_daxpy_start - :end-before: _raja_indexset_list_daxpy_end +.. literalinclude:: ../../../../exercises/segment-indexset-basics_solution.cpp + :start-after: _cstyle_list1_start + :end-before: _cstyle_list1_end :language: C++ -You are probably wondering: What is the 'SEQ_ISET_EXECPOL' type used for the -execution policy? +Non-unit Stride Indexing +^^^^^^^^^^^^^^^^^^^^^^^^^ -Well, it is similar to execution policy types we have seen up to this point, -except that it specifies a two-level policy -- one for iterating over the -segments and one for executing the iterates defined by each segment. In the -example, we specify that we should do each of these operations sequentially -by defining the policy as follows: +Consider the following C-style kernel that prints the integer sequence +discussed earlier in reverse order: -.. literalinclude:: ../../../../examples/tut_indexset-segments.cpp - :start-after: _raja_seq_indexset_policy_daxpy_start - :end-before: _raja_seq_indexset_policy_daxpy_end +.. literalinclude:: ../../../../exercises/segment-indexset-basics_solution.cpp + :start-after: _cstyle_negstriderange1_start + :end-before: _cstyle_negstriderange1_end :language: C++ -Next, we perform the daxpy operation by partitioning the iteration space into -two range segments: +We can accomplish the same result using a ``RAJA::TypedRangeStrideSegment``: -.. 
literalinclude:: ../../../../examples/tut_indexset-segments.cpp - :start-after: _raja_indexset_2ranges_daxpy_start - :end-before: _raja_indexset_2ranges_daxpy_end +.. literalinclude:: ../../../../exercises/segment-indexset-basics_solution.cpp + :start-after: _raja_negstriderange1_start + :end-before: _raja_negstriderange1_end :language: C++ -The first range segment is used to run the index range [0, N/2) and the -second is used to run the range [N/2, N). +Alternatively, we can use a ``RAJA::TypedListSegment``, where we reverse the +index array we used earlier to define the appropriate list segment: + +.. literalinclude:: ../../../../exercises/segment-indexset-basics_solution.cpp + :start-after: _raja_negstridelist1_start + :end-before: _raja_negstridelist1_end + :language: C++ + +The more common use of the ``RAJA::TypedRangeStrideSegment`` type is to run +constant strided loops with a positive non-unit stride. For example: + +.. literalinclude:: ../../../../exercises/segment-indexset-basics_solution.cpp + :start-after: _raja_range2_start + :end-before: _raja_range2_end + :language: C++ -We can also break up the iteration space into three segments, 2 ranges -and 1 list: +The C-style equivalent of this is: -.. literalinclude:: ../../../../examples/tut_indexset-segments.cpp - :start-after: _raja_indexset_2ranges_1list_daxpy_start - :end-before: _raja_indexset_2ranges_1list_daxpy_end +.. literalinclude:: ../../../../exercises/segment-indexset-basics_solution.cpp + :start-after: _cstyle_range2_start + :end-before: _cstyle_range2_end :language: C++ -The first range segment runs the index range [0, N/3), the list segment -enumerates the indices in the interval [N/3, 2*N/3), and the second range -segment runs the range [2*N/3, N). Note that we use the same execution -policy as before. +IndexSets: Complex Iteration Spaces +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +We noted earlier that ``RAJA::TypedIndexSet`` objects can be used to partition +iteration spaces into disjoint parts. Among other things, this can be useful to +expose parallelism in algorithms that would otherwise require significant +code transformation to do so. Please see :ref:`tut-vertexsum-label` for +discussion of an example that illustrates this. -Before we end the discussion of these examples, we demonstrate a few more -index set execution policy variations. To run the previous three segment -code by iterating over the segments sequentially and executing each -segment in parallel using OpenMP multithreading, we would use this policy -definition: +Here is an example that uses two ``RAJA::TypedRangeSegment`` objects in an +index set to represent an iteration space broken into two disjoint +contiguous intervals: -.. literalinclude:: ../../../../examples/tut_indexset-segments.cpp - :start-after: _raja_indexset_ompinnerpolicy_daxpy_start - :end-before: _raja_indexset_ompinnerpolicy_daxpy_end +.. literalinclude:: ../../../../exercises/segment-indexset-basics_solution.cpp + :start-after: _raja_indexset_2ranges_start + :end-before: _raja_indexset_2ranges_end :language: C++ -If we wanted to iterate over the segments in parallel using OpenMP -multi-threading and execute each segment sequentially, we would use the -following policy: - -.. 
literalinclude:: ../../../../examples/tut_indexset-segments.cpp - :start-after: _raja_indexset_ompouterpolicy_daxpy_start - :end-before: _raja_indexset_ompouterpolicy_daxpy_end - :language: C++ - -Finally, to iterate over the segments sequentially and execute each segment in -parallel on a GPU using either CUDA or HIP kernel, we would use a policy, -such as: - -.. literalinclude:: ../../../../examples/tut_indexset-segments.cpp - :start-after: _raja_indexset_cudapolicy_daxpy_start - :end-before: _raja_indexset_cudapolicy_daxpy_end +The integer sequence that is printed is:: + + 0 1 2 3 4 5 6 7 8 9 15 16 17 18 19 + +as we expect. + +The execution policy type when using a RAJA index set is a +*two-level* policy. The first level specifies how to iterate over the segments +in the index set, such as sequentially or in parallel using OpenMP. The second +level is the execution policy used to execute each segment. + +.. note:: Iterating over the indices of all segments in a RAJA index set + requires a two-level execution policy, with two template parameters, + as shown above. The first parameter specifies how to iterate over + the segments. The second parameter specifies how the kernel will + execute each segment over each segment. + See :ref:`indexsetpolicy-label` for more information about + RAJA index set execution policies. + +It is worth noting that a C-style version of this kernel requires either +an indirection array to run in one loop or two for-loops. For example: + +.. literalinclude:: ../../../../exercises/segment-indexset-basics_solution.cpp + :start-after: _cstyle_2ranges_start + :end-before: _cstyle_2ranges_end :language: C++ -or: +Finally, we show an example that uses an index set holding two range segments +and one list segment to partition an iteration space into three parts: -.. literalinclude:: ../../../../examples/tut_indexset-segments.cpp - :start-after: _raja_indexset_hippolicy_daxpy_start - :end-before: _raja_indexset_hippolicy_daxpy_end +.. literalinclude:: ../../../../exercises/segment-indexset-basics_solution.cpp + :start-after: _raja_indexset_3segs_start + :end-before: _raja_indexset_3segs_end :language: C++ -The file ``RAJA/examples/tut_indexset-segments.cpp`` contains working code -for these examples. +The integer sequence that is printed is:: + + 0 1 2 3 4 5 6 7 10 11 14 20 22 24 25 26 27 diff --git a/docs/sphinx/user_guide/tutorial/kernel_exec_pols.rst b/docs/sphinx/user_guide/tutorial/kernel_exec_pols.rst new file mode 100644 index 0000000000..3271e8a584 --- /dev/null +++ b/docs/sphinx/user_guide/tutorial/kernel_exec_pols.rst @@ -0,0 +1,249 @@ +.. ## +.. ## Copyright (c) 2016-22, Lawrence Livermore National Security, LLC +.. ## and RAJA project contributors. See the RAJA/LICENSE file +.. ## for details. +.. ## +.. ## SPDX-License-Identifier: (BSD-3-Clause) +.. ## + +.. _tut-kernelexecpols-label: + +----------------------------------------------------------- +``RAJA::kernel`` Execution Policies +----------------------------------------------------------- + +This section contains an exercise file +``RAJA/exercises/kernelintro-execpols.cpp`` for you to work through if you +wish to get some practice with RAJA. The file +``RAJA/exercises/kernelintro-execpols_solution.cpp`` contains +complete working code for the examples discussed in this section. You can use +the solution file to check your work and for guidance if you get stuck. To build +the exercises execute ``make kernelintro-execpols`` and +``make kernelintro-execpols_solution`` from the build directory. 
+ +Key RAJA features shown in this section are: + + * ``RAJA::kernel`` kernel execution template and execution policies + +The examples in this section illustrate various execution policies for +``RAJA::kernel``. The goal is for you to gain an understanding of how +execution policies are constructed and used to perform various nested +loop execution patterns. All examples use the same simple kernel, which +is a three-level loop nest to initialize the entries in an array. +The C++ lambda expression representing the kernel inner loop body is identical +for all kernel variants described here, whether we are executing the kernel +on a CPU sequentially or in parallel with OpenMP, or in parallel on a GPU +(CUDA or HIP). The kernels perform the same operations as the examples in the +:ref:`tut-launchexecpols-label` tutorial section, which uses +``RAJA::expt::launch``. By comparing the two sets of examples, you will gain +an understanding of the differences between the ``RAJA::kernel`` and the +``RAJA::expt::launch`` interfaces. + +We begin by defining some constants used throughout the examples and +allocating two arrays: + +.. literalinclude:: ../../../../exercises/kernelintro-execpols_solution.cpp + :start-after: _init_define_start + :end-before: _init_define_end + :language: C++ + +Note that we use the 'memory manager' routines contained in the exercise +directory to simplify the allocation process. In particular, CUDA unified +memory is used when CUDA is enabled to simplify accessing the data on the +host or device. + +Next, we execute a C-style nested for-loop version of the kernel to initialize +the entries in the 'reference' array that we will use to compare the results +of other variants for correctness: + +.. literalinclude:: ../../../../exercises/kernelintro-execpols_solution.cpp + :start-after: _cstyle_tensorinit_seq_start + :end-before: _cstyle_tensorinit_seq_end + :language: C++ + +Note that we manually compute pointer offsets for the (i,j,k) indices. +To simplify the remaining kernel variants we introduce a ``RAJA::View`` +object, which wraps the tensor data pointer and simplifies the multi-dimensional +indexing: + +.. literalinclude:: ../../../../exercises/kernelintro-execpols_solution.cpp + :start-after: _3D_raja_view_start + :end-before: _3D_raja_view_end + :language: C++ + +Here 'aView' is a three-dimensional View with extent 'N' in each +coordinate based on a three-dimensional ``RAJA::Layout`` object where the +array entries will be accessed using indices of type 'int'. Please see +:ref:`feat-view-label` for more information about the View and Layout types that +RAJA provides for various indexing patterns and data layouts. + +Using the View, the C-style kernel now looks like: + +.. literalinclude:: ../../../../exercises/kernelintro-execpols_solution.cpp + :start-after: _cstyle_tensorinit_view_seq_start + :end-before: _cstyle_tensorinit_view_seq_end + :language: C++ + +Notice how accessing each (i,j,k) entry in the array is more natural, +and less error prone, using the View. + +The corresponding RAJA sequential version using ``RAJA::kernel`` is: + +.. literalinclude:: ../../../../exercises/kernelintro-execpols_solution.cpp + :start-after: _raja_tensorinit_seq_start + :end-before: _raja_tensorinit_seq_end + :language: C++ + +This should be familiar to the reader who has read the preceding +:ref:`tut-kernelnestedreorder-label` section of this tutorial. + +Suppose we wanted to parallelize the outer 'k' loop using OpenMP multithreading. +A C-style version of this is: + +.. 
literalinclude:: ../../../../exercises/kernelintro-execpols_solution.cpp + :start-after: _cstyle_tensorinit_omp_outer_start + :end-before: _cstyle_tensorinit_omp_outer_end + :language: C++ + +where we have placed the OpenMP directive ``#pragma omp parallel for`` before +the outer loop of the kernel. + +To parallelize all iterations in the entire loop nest, we can apply the OpenMP +``collapse(3)`` clause to map the iterations for all loop levels to OpenMP +threads: + +.. literalinclude:: ../../../../exercises/kernelintro-execpols_solution.cpp + :start-after: _cstyle_tensorinit_omp_collapse_start + :end-before: _cstyle_tensorinit_omp_collapse_end + :language: C++ + +The corresponding RAJA versions of these two OpenMP variants are, +respectively: + +.. literalinclude:: ../../../../exercises/kernelintro-execpols_solution.cpp + :start-after: _raja_tensorinit_omp_outer_start + :end-before: _raja_tensorinit_omp_outer_end + :language: C++ + +and + +.. literalinclude:: ../../../../exercises/kernelintro-execpols_solution.cpp + :start-after: _raja_tensorinit_omp_collapse_start + :end-before: _raja_tensorinit_omp_collapse_end + :language: C++ + +The first of these, in which we parallelize the outer 'k' loop, replaces +the ``RAJA::loop_exec`` loop execution policy with the +``RAJA::omp_parallel_for_exec`` policy, which applies the same OpenMP +directive to the outer loop used in the C-style variant. + +The RAJA OpenMP collapse variant introduces the ``RAJA::statement::Collapse`` +statement type. We use the ``RAJA::omp_parallel_collapse_exec`` execution +policy type and indicate that we want to collapse all three loop levels +in the second template argument ``RAJA::ArgList<2, 1, 0>``. The integer +values in the list indicate the order of the loops in the collapse operation: +'k' (2) outer, 'j' (1) middle, and 'i' (0) inner. The integers represent +the order of the lambda arguments and the order of the range segments in the +iteration space tuple. + +The first RAJA-based kernel for parallel GPU execution using the RAJA CUDA +back-end we introduce is: + +.. literalinclude:: ../../../../exercises/kernelintro-execpols_solution.cpp + :start-after: _raja_tensorinit_cuda_start + :end-before: _raja_tensorinit_cuda_end + :language: C++ + +Here, we use the ``RAJA::statement::CudaKernel`` statement type to +indicate that we want a CUDA kernel to be launched. The 'k', 'j', 'i' +iteration variables are mapped to CUDA threads using the CUDA execution +policy types ``RAJA::cuda_thread_z_loop``, ``RAJA::cuda_thread_y_loop``, +and ``RAJA::cuda_thread_x_loop``, respectively. Thus, we use a +a three-dimensional CUDA thread-block to map the loop iterations to CUDA +threads. The ``_loop`` part of each execution policy name indicates that +the indexing in the associated portion of the mapping will use a block-stride +loop. This is useful to guarantee that the policy will work for any +array regardless of size in each coordinate dimension. + +To execute the kernel with a prescribed mapping of iterations to a +thread-block using RAJA, we could do the following: + +.. literalinclude:: ../../../../exercises/kernelintro-execpols_solution.cpp + :start-after: _raja_tensorinit_cuda_tiled_direct_start + :end-before: _raja_tensorinit_cuda_tiled_direct_end + :language: C++ + +where we have defined the CUDA thread-block dimensions as: + +.. 
literalinclude:: ../../../../exercises/kernelintro-execpols_solution.cpp + :start-after: _cuda_blockdim_start + :end-before: _cuda_blockdim_end + :language: C++ + +The ``RAJA::statement::CudaKernelFixed`` statement indicates that we want to +use a fixed thread-block size of 256. To ensure that we are mapping the kernel +iterations properly in chunks of 256 threads to each thread-block, we use RAJA +tiling statements in which we specify the tile size for each dimension/loop +index so that each tile has dimensions (32, 8, 1). For example, the statement +``RAJA::statement::Tile<1, RAJA::tile_fixed`` is used on the +'j' loop, which has a tile size of 8 associated with that dimension. Note that +we do not tile the 'k' loop, since the block size is one in that dimension. + +The other main difference with the previous block-stride loop kernel +version is that we map iterations within each tile directly to threads in +a block; for example, using a ``RAJA::cuda_block_y_direct`` policy type +for the 'j' loop. RAJA *direct* policy types eliminate the block-stride looping, +which is not necessary here since we prescribe a block-size of 256 which +fits within the thread-block size limitation of the CUDA device programming +model. + +For context and comparison, here is the same kernel implementation using +CUDA directly: + +.. literalinclude:: ../../../../exercises/kernelintro-execpols_solution.cpp + :start-after: _cuda_tensorinit_tiled_direct_start + :end-before: _cuda_tensorinit_tiled_direct_end + :language: C++ + +The ``nested_init`` device kernel used here is: + +.. literalinclude:: ../../../../exercises/kernelintro-execpols_solution.cpp + :start-after: _cuda_tensorinit_kernel_start + :end-before: _cuda_tensorinit_kernel_end + :language: C++ + +A few differences between the CUDA and RAJA-CUDA versions are worth noting. +First, the CUDA version uses the CUDA ``dim3`` construct to express the +threads-per-block and number of thread-blocks to use: i.e., the +``nthreads_per_block`` and ``nblocks`` variable definitions. Note that +RAJA provides a macro ``RAJA_DIVIDE_CEILING_INT`` to perform the proper +integer arithmetic to calculate the number of blocks based on the size of the +array and the block size in each dimension. Second, the mapping of thread +identifiers to the (i,j,k) indices is explicit in the device kernel. Third, +an explicit check of the (i,j,k) values is required in the CUDA implementation +to avoid addressing memory out-of-bounds; i.e., +``if ( i < N && j < N && k < N )...``. The RAJA kernel variants set similar +definitions internally and **mask out indices that would be out-of-bounds.** +Note that we also inserted additional error checking with ``static_assert`` +and ``cudaErrchk``, which is a RAJA macro, for printing CUDA device error +codes, to catch device errors if there are any. + +Lastly, we show the RAJA HIP variants of the kernel, which are semantically +identical to the RAJA CUDA variants we just described. First, the RAJA-HIP +block-stride loop variant: + +.. literalinclude:: ../../../../exercises/kernelintro-execpols_solution.cpp + :start-after: _raja_tensorinit_hip_start + :end-before: _raja_tensorinit_hip_end + :language: C++ + +and then the RAJA-HIP fixed thread-block size, tiled, direct thread mapping +version: + +.. 
literalinclude:: ../../../../exercises/kernelintro-execpols_solution.cpp + :start-after: _raja_tensorinit_hip_tiled_direct_start + :end-before: _raja_tensorinit_hip_tiled_direct_end + :language: C++ + +The only differences are that type names are changed to replace 'CUDA' types +with 'HIP' types to use the RAJA HIP back-end. diff --git a/docs/sphinx/user_guide/tutorial/kernel_nested_loop_reorder.rst b/docs/sphinx/user_guide/tutorial/kernel_nested_loop_reorder.rst new file mode 100644 index 0000000000..757b153ce2 --- /dev/null +++ b/docs/sphinx/user_guide/tutorial/kernel_nested_loop_reorder.rst @@ -0,0 +1,237 @@ +.. ## +.. ## Copyright (c) 2016-22, Lawrence Livermore National Security, LLC +.. ## and RAJA project contributors. See the RAJA/LICENSE file +.. ## for details. +.. ## +.. ## SPDX-License-Identifier: (BSD-3-Clause) +.. ## + +.. _tut-kernelnestedreorder-label: + +----------------------------------------------------------- +Basic ``RAJA::kernel`` Mechanics and Nested Loop Ordering +----------------------------------------------------------- + +This section contains an exercise file ``RAJA/exercises/kernelintro-nested-loop-reorder.cpp`` +for you to work through if you wish to get some practice with RAJA. The +file ``RAJA/exercises/kernelintro-nested-loop-reorder_solution.cpp`` contains +complete working code for the examples discussed in this section. You can use +the solution file to check your work and for guidance if you get stuck. To build +the exercises execute ``make kernelintro-nested-loop-reorder`` and ``make kernelintro-nested-loop-reorder_solution`` +from the build directory. + +Key RAJA features shown in this section are: + + * ``RAJA::kernel`` loop iteration templates and execution policies + * Nested loop reordering + * RAJA strongly-types indices + +The examples in this +section show the nested loop reordering process in more detail. +Specifically, we describe how to reorder execution policy statements, which +is conceptually analogous to how one would reorder for-loops in a C-style loop +nest. We also introduce strongly-typed index variables that can help users +write correct nested loop code with RAJA. The examples do not perform any +computation; each kernel simply prints out the loop indices in the +order that the iteration spaces are traversed. Thus, only sequential execution +policies are used to avoid complications resulting from print statements +used in parallel programs. The mechanics shown here work the same way for +parallel RAJA execution policies. + +Before we dive into code, we reiterate important features that +represent the main differences between nested-loop RAJA and the +``RAJA::forall`` construct for simple, non-nested loop kernels: + + * An index space (e.g., range segment) and lambda index argument are + required for each level in a loop nest. This example contains + triply-nested loops, so there will be three ranges and three index + arguments. + + * The index spaces for the nested loop levels are specified in a RAJA tuple + object. The order of spaces in the tuple must match the order of index + arguments to the lambda for this to be correct in general. RAJA provides + strongly-typed indices to help with this, which we show below. + + * An execution policy is required for each level in a loop nest. These + are specified as nested statements in the ``RAJA::KernelPolicy`` type. 
+ + * The loop nest ordering is specified in the nested kernel policy -- + the first ``statement::For`` type identifies the outermost loop, the + second ``statement::For`` type identifies the loop nested inside the + outermost loop, and so on. + +We begin by defining three named **strongly-typed** variables for the loop +index variables (i, j, k): + +.. literalinclude:: ../../../../exercises/kernelintro-nested-loop-reorder_solution.cpp + :start-after: _raja_typed_indices_start + :end-before: _raja_typed_indices_end + :language: C++ + +Specifically, the 'i' index variable type is ``IIDX``, the 'j' index variable +is ``JIDX``, and the 'k' variable is ``KIDX``, which are aliases to +``int`` type. + +We also define [min, max) intervals for each loop index: + +.. literalinclude:: ../../../../exercises/kernelintro-nested-loop-reorder_solution.cpp + :start-after: _range_min_max_start + :end-before: _range_min_max_end + :language: C++ + +and three corresponding **typed** range segments which bind the ranges to the +index variable types via template specialization: + +.. literalinclude:: ../../../../exercises/kernelintro-nested-loop-reorder_solution.cpp + :start-after: _raja_typed_index_ranges_start + :end-before: _raja_typed_index_ranges_end + :language: C++ + +When these features are used as in this example, the compiler will +generate error messages if the lambda expression index argument ordering +and types do not match the index ordering in the tuple. This is illustrated +at the end of this section. + +We begin with a C-style loop nest with 'i' in the inner loop, 'j' in the +middle loop, and 'k' in the outer loop, which prints the (i, j, k) triple +in the inner loop body: + +.. literalinclude:: ../../../../exercises/kernelintro-nested-loop-reorder_solution.cpp + :start-after: _cstyle_kji_loops_start + :end-before: _cstyle_kji_loops_end + :language: C++ + +The ``RAJA::kernel`` version of this is: + +.. literalinclude:: ../../../../exercises/kernelintro-nested-loop-reorder_solution.cpp + :start-after: _raja_kji_loops_start + :end-before: _raja_kji_loops_end + :language: C++ + +The integer template parameters in the ``RAJA::statement::For`` types +represent the lambda expression index argument and the range types in the +iteration space tuple argument to ``RAJA::kernel``. + +Both kernels generate the same output, as expected:: + + (I, J, K) + --------- + (0, 1, 2) + (1, 1, 2) + (0, 2, 2) + (1, 2, 2) + (0, 1, 3) + (1, 1, 3) + (0, 2, 3) + (1, 2, 3) + +which you can see by running the exercise code. + +Here, the ``RAJA::kernel`` execution template takes two arguments: a tuple of +ranges, one for each of the three levels in the loop nest, and the lambda +expression loop body. Note that the lambda has an index argument for each +range and that their order and types match. This is required for the code to +compile. + +.. note:: RAJA provides mechanisms to explicitly specify which loop variables, + for example, and in which order they appear in a lambda expression + argument list. Please refer to :ref:`loop_elements-kernel-label` + for more information. + +The execution policy for the loop nest is specified in the +``RAJA::KernelPolicy`` type. The policy uses two statement types: +``RAJA::statement::For`` and ``RAJA::statement::Lambda``. + +The ``RAJA::statement::Lambda`` is used to generate code that invokes the +lambda expression. The '0' template parameter refers to the index of the +lambda expression in the ``RAJA::kernel`` argument list following the +iteration space tuple. 
Since there is only one lambda expression, we reference +it with the '0' identifier. Sometimes more complicated kernels require multiple +lambda expressions, so we need a way to specify where they will appear in the +generated executable code. We show examples of this in the matrix transpose +discussion later in the tutorial. + +Each level in the loop nest is identified by a +``RAJA::statement::For`` type, which identifies the iteration space and +execution policy for the level. Here, each level uses a +sequential execution policy, which is for illustration purposes. +The integer that appears as the first template argument to each +``RAJA::statement::For`` type corresponds to the index of a range in the tuple +and also to the associated lambda index argument; i.e., '0' for 'i', +'1' for 'j', and '2' for 'k'. + +The integer argument to each ``RAJA::statement::For`` type is needed so +that the levels in the loop nest can be reordered by changing the policy +while the kernel remains the same. To illustrate, we permute the loop nest +ordering so that the 'j' loop is the outermost, the 'i' loop is in the middle, +and the 'k' loop is the innermost with the following policy: + +.. literalinclude:: ../../../../exercises/kernelintro-nested-loop-reorder_solution.cpp + :start-after: _raja_jik_loops_start + :end-before: _raja_jik_loops_end + :language: C++ + +This generates the following output:: + + (I, J, K) + --------- + (0, 1, 2) + (0, 1, 3) + (1, 1, 2) + (1, 1, 3) + (0, 2, 2) + (0, 2, 3) + (1, 2, 2) + (1, 2, 3) + +which is the same as the corresponding C-style version: + +.. literalinclude:: ../../../../exercises/kernelintro-nested-loop-reorder_solution.cpp + :start-after: _cstyle_jik_loops_start + :end-before: _cstyle_jik_loops_end + :language: C++ + +Note that we have simply reordered the nesting of the ``RAJA::statement::For`` +types in the execution policy. This is analogous to reordering the for-loops +in C-style version. + +For completeness, we permute the loops again so that the 'i' loop +is the outermost, the 'k' loop is in the middle, and the 'j' loop is the +innermost with the following policy: + +.. literalinclude:: ../../../../exercises/kernelintro-nested-loop-reorder_solution.cpp + :start-after: _raja_ikj_loops_start + :end-before: _raja_ikj_loops_end + :language: C++ + +The analogous C-style loop nest is: + +.. literalinclude:: ../../../../exercises/kernelintro-nested-loop-reorder_solution.cpp + :start-after: _cstyle_ikj_loops_start + :end-before: _cstyle_ikj_loops_end + :language: C++ + +The output generated by these two kernels is:: + + (I, J, K) + --------- + (0, 1, 2) + (0, 2, 2) + (0, 1, 3) + (0, 2, 3) + (1, 1, 2) + (1, 2, 2) + (1, 1, 3) + (1, 2, 3) + +Finally, we show an example that will generate a compilation error because +there is a type mismatch in the ordering of the range segments in the tuple +and the lambda expression argument list. + +.. literalinclude:: ../../../../exercises/kernelintro-nested-loop-reorder_solution.cpp + :start-after: _raja_compile_error_start + :end-before: _raja_compile_error_end + :language: C++ + +Do you see the problem? The last kernel is included in the exercise source +file, so you can see what happens when you attempt to compile it. diff --git a/docs/sphinx/user_guide/tutorial/launch_basic.rst b/docs/sphinx/user_guide/tutorial/launch_basic.rst new file mode 100644 index 0000000000..19f6f153ca --- /dev/null +++ b/docs/sphinx/user_guide/tutorial/launch_basic.rst @@ -0,0 +1,99 @@ +.. ## +.. 
## Copyright (c) 2016-20, Lawrence Livermore National Security, LLC
+.. ## and RAJA project contributors. See the RAJA/LICENSE file
+.. ## for details.
+.. ##
+.. ## SPDX-License-Identifier: (BSD-3-Clause)
+.. ##
+
+.. _tut-launchintro-label:
+
+------------------------------
+``RAJA::Launch`` Basics
+------------------------------
+
+There are no exercise files to work through for this section. Instead, there
+is an example source file ``RAJA/examples/tut_launch_basic.cpp`` which
+contains complete code examples of the concepts described here.
+
+Key RAJA features shown in the following examples are:
+
+  * ``RAJA::launch`` method to create a run-time
+    selectable host/device execution space.
+  * ``RAJA::loop`` methods to express algorithms
+    in terms of nested for loops.
+
+In this example, we introduce the RAJA Launch framework and discuss
+hierarchical loop-based parallelism. Kernel execution details
+with RAJA Launch occur inside the lambda expression
+passed to the ``RAJA::launch`` method, which defines an execution
+space::
+
+    RAJA::launch<launch_policy>(RAJA::ExecPlace ,
+       RAJA::LaunchParams(RAJA::Teams(Nteams,Nteams),
+                          RAJA::Threads(Nthreads,Nthreads)),
+       [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) {
+
+         /* Kernel code goes here */
+
+       });
+
+The ``RAJA::launch`` method accepts a ``RAJA::LaunchPolicy``
+template parameter that can be defined using up to two policies
+(host and device). For example, the following creates an execution space
+for a sequential and CUDA kernel dispatch::
+
+   using launch_policy = RAJA::LaunchPolicy
+     <RAJA::seq_launch_t, RAJA::cuda_launch_t<false>>;
+
+Whether a kernel executes on the host or device is determined by the first
+argument passed to the ``RAJA::launch`` method, which is a
+``RAJA::ExecPlace`` enum value, either ``HOST`` or ``DEVICE``.
+Similar to GPU thread and block programming models, RAJA Launch carries out
+computation in a predefined compute grid made up of threads which are
+then grouped into teams when executing on the device. The execution space is
+then enclosed by a host/device lambda which takes a
+``RAJA::LaunchContext`` object, which may be used to control the flow
+within the kernel, for example by creating thread-team synchronization points.
+
+Inside the execution space, developers write a kernel using nested
+``RAJA::loop`` methods. The manner in which each loop is executed
+is determined by a template parameter type, which
+indicates how the corresponding iterates are mapped to the Teams/Threads
+configuration defined by the ``RAJA::LaunchParams`` type passed as the second
+argument to the ``RAJA::launch`` method. Following the CUDA and HIP
+programming models, this defines a hierarchical structure in which outer loops
+are executed by thread-teams and inner loops are executed by threads in a team.
+
+.. literalinclude:: ../../../../examples/tut_launch_basic.cpp
+   :start-after: // _team_loops_start
+   :end-before: // _team_loops_end
+   :language: C++
+
+The mapping of teams and threads to the underlying programming
+model depends on how the ``RAJA::loop`` template parameter types are
+defined. For example, we may define host and device mapping strategies as::
+
+   using teams_x = RAJA::LoopPolicy<RAJA::loop_exec,
+                                    RAJA::cuda_block_x_direct>;
+   using thread_x = RAJA::LoopPolicy<RAJA::loop_exec,
+                                     RAJA::cuda_thread_x_direct>;
+
+Here, the ``RAJA::LoopPolicy`` type holds both the host (CPU) and
+device (CUDA GPU) loop mapping strategies. On the host, both the team/thread
+strategies expand out to standard C-style loops for execution:
+
+.. 
literalinclude:: ../../../../examples/tut_launch_basic.cpp + :start-after: // _c_style_loops_start + :end-before: // _c_style_loops_end + :language: C++ + +On the device the ``teams_x/y`` policies will map loop iterations directly to +CUDA (or HIP) thread blocks, while the ``thread_x/y`` policies will map loop +iterations directly to threads in a CUDA (or HIP) thread block. The direct CUDA +equivalent of the kernel body using the policy shown above is: + +.. literalinclude:: ../../../../examples/tut_launch_basic.cpp + :start-after: // _device_loop_start + :end-before: // _device_loop_end + :language: C++ diff --git a/docs/sphinx/user_guide/tutorial/launch_exec_pols.rst b/docs/sphinx/user_guide/tutorial/launch_exec_pols.rst new file mode 100644 index 0000000000..76866804c1 --- /dev/null +++ b/docs/sphinx/user_guide/tutorial/launch_exec_pols.rst @@ -0,0 +1,225 @@ +.. ## +.. ## Copyright (c) 2016-22, Lawrence Livermore National Security, LLC +.. ## and RAJA project contributors. See the RAJA/LICENSE file +.. ## for details. +.. ## +.. ## SPDX-License-Identifier: (BSD-3-Clause) +.. ## + +.. _tut-launchexecpols-label: + +----------------------------------------------------------- +``RAJA::Launch`` Execution Policies +----------------------------------------------------------- + +This section contains an exercise file +``RAJA/exercises/launchintro-execpols.cpp`` for you to work through if you +wish to get some practice with RAJA. The file +``RAJA/exercises/launchintro-execpols_solution.cpp`` contains complete working +code for the examples discussed in this section. You can use the solution file +to check your work and for guidance if you get stuck. To build the exercises +execute ``make launchintro-execpols`` and ``make launchintro-execpols_solution`` +from the build directory. + +Key RAJA features shown in this section are: + + * ``RAJA::launch`` kernel execution environment template + * ``RAJA::loop`` loop execution template and execution policies + +The examples in this section illustrate how to construct nested loop kernels +inside an ``RAJA::launch`` execution environment. In particular, +the goal is for you to gain an understanding of how to use execution policies +with nested ``RAJA::loop`` method calls to perform various nested +loop execution patterns. All examples use the same simple kernel, which +is a three-level loop nest to initialize the entries in an array. The kernels +perform the same operations as the examples in :ref:`tut-kernelexecpols-label`. +By comparing the two sets of examples, you will gain an understanding of the +differences between the ``RAJA::kernel`` and the ``RAJA::launch`` +interfaces. + +We begin by defining some constants used throughout the examples and allocating +arrays to represent the array data: + +.. literalinclude:: ../../../../exercises/launchintro-execpols_solution.cpp + :start-after: _init_define_start + :end-before: _init_define_end + :language: C++ + +Note that we use the 'memory manager' routines contained in the exercise +directory to simplify the allocation process. In particular, CUDA unified +memory is used when CUDA is enabled to simplify accessing the data on the +host or device. + +Next, we execute a C-style nested for-loop version of the kernel to initialize +the entries in the 'reference' array that we will use to compare the results +of other variants for correctness: + +.. 
literalinclude:: ../../../../exercises/launchintro-execpols_solution.cpp + :start-after: _cstyle_tensorinit_seq_start + :end-before: _cstyle_tensorinit_seq_end + :language: C++ + +Note that we manually compute the pointer offsets for the (i,j,k) indices. +To simplify the remaining kernel variants we introduce a ``RAJA::View`` +object, which wraps the array data pointer and simplifies the multi-dimensional +indexing: + +.. literalinclude:: ../../../../exercises/launchintro-execpols_solution.cpp + :start-after: _3D_raja_view_start + :end-before: _3D_raja_view_end + :language: C++ + +Here 'aView' is a three-dimensional View with extent 'N' in each +coordinate based on a three-dimensional ``RAJA::Layout`` object where the +array entries will be accessed using indices of type 'int'. +indices of type ``int``. Please see :ref:`feat-view-label` for more information +about the View and Layout types that RAJA provides for various indexing +patterns and data layouts. + +Using the View, the C-style kernel looks like: + +.. literalinclude:: ../../../../exercises/launchintro-execpols_solution.cpp + :start-after: _cstyle_tensorinit_view_seq_start + :end-before: _cstyle_tensorinit_view_seq_end + :language: C++ + +Notice how accessing each (i,j,k) entry in the array is more natural, +and less error prone, using the View. + +The corresponding RAJA sequential version using ``RAJA::launch`` is: + +.. literalinclude:: ../../../../exercises/launchintro-execpols_solution.cpp + :start-after: _raja_tensorinit_seq_start + :end-before: _raja_tensorinit_seq_end + :language: C++ + +This should be familiar to the reader who has read through the preceding +:ref:`tut-launchintro-label` section of this tutorial. As the +``RAJA::launch`` method is templated on a host execution policy, the +``RAJA::LaunchParams`` object can be defined without arguments as loop methods +will get dispatched as standard C-Style for-loops. + +Suppose we wanted to parallelize the outer 'k' loop using OpenMP multithreading. +A C-style version of this is: + +.. literalinclude:: ../../../../exercises/launchintro-execpols_solution.cpp + :start-after: _cstyle_tensorinit_omp_outer_start + :end-before: _cstyle_tensorinit_omp_outer_end + :language: C++ + +where we have placed the OpenMP directive ``#pragma omp parallel for`` before +the outer loop of the kernel. + +The corresponding RAJA versions of the C-style OpenMP variant is: + +.. literalinclude:: ../../../../exercises/launchintro-execpols_solution.cpp + :start-after: _raja_tensorinit_omp_outer_start + :end-before: _raja_tensorinit_omp_outer_end + :language: C++ + +With the OpenMP version above, ``RAJA::launch`` method is templated on +a ``RAJA::omp_launch_t`` execution policy. The policy is used +to create an OpenMP parallel region, loop iterations may then be distributed +using ``RAJA::loop`` methods templated on ``RAJA::omp_for_exec`` +execution policies. As before, the ``RAJA::LaunchParams`` object may be +initialized without grid dimensions as the CPU does not require specifying a +compute grid. + +The first RAJA-based kernel for parallel GPU execution using the RAJA CUDA +back-end we introduce is: + +.. literalinclude:: ../../../../exercises/launchintro-execpols_solution.cpp + :start-after: _raja_tensorinit_cuda_start + :end-before: _raja_tensorinit_cuda_end + :language: C++ + +where we have defined the CUDA thread-block dimensions as: + +.. 
literalinclude:: ../../../../exercises/launchintro-execpols_solution.cpp + :start-after: _cuda_blockdim_start + :end-before: _cuda_blockdim_end + :language: C++ + +Here, we use the ``RAJA::cuda_launch_t`` policy type to +indicate that we want a CUDA kernel to be launched. The 'k', 'j', 'i' +iteration variables are mapped to CUDA threads and blocks using the CUDA +execution policy types ``RAJA::cuda_block_z_direct``, +``RAJA::cuda_global_thread_y``, and ``RAJA::cuda_global_thread_x``, +respectively. Thus, we use a two-dimensional CUDA thread-block and +three-dimensional compute grid to map the loop iterations to CUDA threads. In +comparison to the RAJA CUDA example in :ref:`tut-kernelexecpols-label` , +``RAJA::loop`` methods support execution policies, which enable mapping +directly to the global thread ID of a compute grid. + +Using a combination of ``RAJA::tile`` and ``RAJA::loop`` methods, +we can create a loop tiling platform portable implementation. Here, is a +CUDA variant: + +.. literalinclude:: ../../../../exercises/launchintro-execpols_solution.cpp + :start-after: _raja_tensorinit_cuda_tiled_direct_start + :end-before: _raja_tensorinit_cuda_tiled_direct_end + :language: C++ + +We consider the kernel to be portable, because all of the execution policy types +and execution parameters can be replaced by other types and values without +changing the kernel code directly. + +The ``RAJA::tile`` methods are used to partition an iteration space into +tiles to be used within a ``RAJA::loop`` method. The '{i,j,k}_block_sz' +arguments passed to the ``RAJA::tile`` function specify the tile size +for each loop. In the case of GPU programming models, we define the tile size +to correspond to the number of threads in a given dimension. Execution tile +and loop execution policies are chosen to have CUDA blocks and threads map +directly to tiles and entries in a tile. + +For context and comparison, here is the same kernel implementation using +CUDA directly: + +.. literalinclude:: ../../../../exercises/launchintro-execpols_solution.cpp + :start-after: _cuda_tensorinit_tiled_direct_start + :end-before: _cuda_tensorinit_tiled_direct_end + :language: C++ + +The ``nested_init`` device kernel used here is: + +.. literalinclude:: ../../../../exercises/launchintro-execpols_solution.cpp + :start-after: _cuda_tensorinit_kernel_start + :end-before: _cuda_tensorinit_kernel_end + :language: C++ + +A few differences between the CUDA and RAJA-CUDA versions are worth noting. +First, the CUDA version uses the CUDA ``dim3`` construct to express the +threads-per-block and number of thread-blocks to use: i.e., the +``nthreads_per_block`` and ``nblocks`` variable definitions. The +``RAJA::launch`` interface takes compute dimensions through a +``RAJA::LaunchParams`` object. RAJA provides a macro ``RAJA_DIVIDE_CEILING_INT`` +to perform the proper integer arithmetic to calculate the number of blocks +based on the size of the array and the block size in each dimension. Second, the +mapping of thread identifiers to the (i,j,k) indices is explicit in the device +kernel. Third, an explicit check of the (i,j,k) values is required in the CUDA +implementation to avoid addressing memory out-of-bounds; i.e., +``if ( i < N && j < N && k < N )...``. 
The RAJA variants set similar +definitions internally and **mask out indices that would be out-of-bounds.** +Note that we also inserted additional error checking with ``static_assert`` +and ``cudaErrchk``, which is a RAJA macro, for printing CUDA device error +codes, to catch device errors if there are any. + +Lastly, we show the RAJA HIP variants of the kernel, which are semantically +identical to the RAJA CUDA variants. First, the RAJA-HIP global-thread +variant: + +.. literalinclude:: ../../../../exercises/launchintro-execpols_solution.cpp + :start-after: _raja_tensorinit_hip_start + :end-before: _raja_tensorinit_hip_end + :language: C++ + +and then the RAJA Launch HIP fixed thread-block size, tiled, direct thread +mapping version: + +.. literalinclude:: ../../../../exercises/launchintro-execpols_solution.cpp + :start-after: _raja_tensorinit_hip_tiled_direct_start + :end-before: _raja_tensorinit_hip_tiled_direct_end + :language: C++ + +The only differences are that type names are changed to replace 'CUDA' types +with 'HIP' types to use the RAJA HIP back-end. diff --git a/docs/sphinx/user_guide/tutorial/launch_naming_kernels.rst b/docs/sphinx/user_guide/tutorial/launch_naming_kernels.rst new file mode 100644 index 0000000000..7fa02a0fcb --- /dev/null +++ b/docs/sphinx/user_guide/tutorial/launch_naming_kernels.rst @@ -0,0 +1,65 @@ +.. ## +.. ## Copyright (c) 2016-20, Lawrence Livermore National Security, LLC +.. ## and RAJA project contributors. See the RAJA/LICENSE file +.. ## for details. +.. ## +.. ## SPDX-License-Identifier: (BSD-3-Clause) +.. ## + +.. _tut-teamsbasic-label: + +------------------------------------ +Naming kernels for NVTX/ROCTX tools +------------------------------------ + +There are no exercise files to work through for this section. Instead, there +is an example source file ``RAJA/examples/teams_reductions.cpp`` which +contains complete code examples of the concepts described here. + +Key RAJA feature shown in the following example: + + * Naming kernels using an optional argument in ``RAJA::launch`` methods. + +In this example, we illustrate kernel naming capabilities within the RAJA Launch +framework for use with NVTX or ROCTX region naming capabilities. + +To name a ``RAJA::launch`` kernel, a string name is passed as an argument +before the lambda :: + + RAJA::launch(RAJA::ExecPlace , + RAJA::LaunchParams(RAJA::Teams(Nteams,Nteams), + RAJA::Threads(Nthreads,Nthreads)), + "myKernel", + [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { + + /* Kernel body code goes here */ + + } + ); + +The kernel name is used to create NVTX (NVIDIA) or ROCTX (AMD) ranges enabling +developers to identify kernels using NVIDIA `Nsight `_ +and NVIDIA `nvprof `_ profiling +tools or `ROCm `_ +profiling tools when using ROCTX. As an illustration, nvprof +kernels are identified as ranges of GPU activity using the provided kernel +name:: + + ==73220== NVTX result: + ==73220== Thread "" (id = 290832) + ==73220== Domain "" + ==73220== Range "myKernel" + Type Time(%) Time Calls Avg Min Max Name + Range: 100.00% 32.868us 1 32.868us 32.868us 32.868us myKernel + GPU activities: 100.00% 2.0307ms 1 2.0307ms 2.0307ms 2.0307ms _ZN4RAJA4expt17launch_global_fcnIZ4mainEUlNS0_13LaunchContextEE_EEvS2_T_ + API calls: 100.00% 27.030us 1 27.030us 27.030us 27.030us cudaLaunchKernel + +Similarly, ROCm tools can be used to generate traces of a profile and +the resulting json file can be viewed using tools such as `Perfetto +`_. + +In future work, we plan to add support to other profiling tools. 
Thus, API +changes may occur based on user feedback and integration with other tools. +Enabling NVTX profiling with RAJA Launch requires RAJA to be configured with +RAJA_ENABLE_NV_TOOLS_EXT=ON. +or RAJA_ENABLE_ROCTX=ON for ROCTX profiling on AMD platforms platforms. diff --git a/docs/sphinx/user_guide/tutorial/matrix_multiply.rst b/docs/sphinx/user_guide/tutorial/matrix_multiply.rst index 74afc5fc4b..5760627743 100644 --- a/docs/sphinx/user_guide/tutorial/matrix_multiply.rst +++ b/docs/sphinx/user_guide/tutorial/matrix_multiply.rst @@ -6,18 +6,23 @@ .. ## SPDX-License-Identifier: (BSD-3-Clause) .. ## -.. _matrixmultiply-label: +.. _tut-matrixmultiply-label: ------------------------------------ -Matrix Multiplication (Nested Loops) +Matrix Multiplication: RAJA::kernel ------------------------------------ +The file ``RAJA/examples/tut_matrix-multiply.cpp`` contains the complete +working code for all examples described in this section, plus others that +show a variety of ``RAJA::kernel`` execution policy types. It also contains +raw CUDA and HIP versions of the kernel for comparison. + Key RAJA features shown in the following examples: * ``RAJA::kernel`` template for nested-loop execution * RAJA kernel execution policies * ``RAJA::View`` multi-dimensional data access - * Basic RAJA nested-loop interchange + * RAJA nested-loop interchange * Specifying lambda arguments through statements In this example, we present different ways to perform multiplication of two @@ -31,8 +36,8 @@ C-version: :end-before: _matmult_macros_end :language: C++ -Then, a typical C-style sequential matrix multiplication operation looks like -this: +Then, a typical C-style sequential matrix multiplication operation might +look like this: .. literalinclude:: ../../../../examples/tut_matrix-multiply.cpp :start-after: _matmult_cstyle_start @@ -43,7 +48,7 @@ For the RAJA variants of the matrix multiple operation presented below, we use ``RAJA::View`` objects, which allow us to access matrix entries in a multi-dimensional manner similar to the C-style version that uses macros. We create a two-dimensional N x N 'view' -for each of the three matrices: +for each matrix: .. literalinclude:: ../../../../examples/tut_matrix-multiply.cpp :start-after: _matmult_views_start @@ -53,70 +58,16 @@ for each of the three matrices: We show the most basic RAJA view usage here -- to simplify multi-dimensional array indexing. RAJA views can be used to abstract a variety of different data layouts and access patterns, including stride permutations, offsets, etc. -For more information about RAJA views, see :ref:`view-label`. +For more information about RAJA views, see :ref:`feat-view-label`. -We also use the following ``RAJA::RangeSegment`` objects to define the matrix -row and column and dot product iteration spaces: +We also use the following ``RAJA::TypedRangeSegment`` objects to define the +matrix row and column and dot product iteration spaces: .. literalinclude:: ../../../../examples/tut_matrix-multiply.cpp :start-after: _matmult_ranges_start :end-before: _matmult_ranges_end :language: C++ -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Should I Use RAJA::forall For Nested Loops? -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -We begin by walking through some RAJA variants of the matrix multiplication -operation that show RAJA usage that **we do not recommend**, but which helps -to motivate the ``RAJA::kernel`` interface. We noted some rationale behind -this preference in :ref:`loop_elements-kernel-label`. 
Here, we discuss this -in more detail. - -Starting with the C-style kernel above, we first convert the outermost -'row' loop to a ``RAJA::forall`` method call with a sequential execution policy: - -.. literalinclude:: ../../../../examples/tut_matrix-multiply.cpp - :start-after: _matmult_outerforall_start - :end-before: _matmult_outerforall_end - :language: C++ - -Here, the lambda expression for the loop body contains the inner -'col' and 'k' loops. - -Note that changing the RAJA execution policy to an OpenMP or CUDA policy -enables the outer 'row' loop to run in parallel. When this is done, -each thread executes the lambda expression body, which contains the 'col' -and 'k' loops. Although this enables some parallelism, there is still more -available. In a bit, we will how the ``RAJA::kernel`` interface helps us to -expose all available parallelism. - -Next, we nest a ``RAJA::forall`` method call for the 'column' loop inside the -outer lambda expression: - -.. literalinclude:: ../../../../examples/tut_matrix-multiply.cpp - :start-after: _matmult_nestedforall_start - :end-before: _matmult_nestedforall_end - :language: C++ - -Here, the innermost lambda expression contains the row-column dot product -initialization, the inner 'k' loop for the dot product, and the operation -that assigns the dot product to the proper location in the result matrix. - -Note that we can replace either RAJA execution policy with an OpenMP -execution policy to parallelize either the 'row' or 'col' loop. For example, -we can use an OpenMP execution policy on the outer 'row' loop and the result -will be the same as using an OpenMP execution policy in the earlier case that -used a ``RAJA::forall`` statement for the outer loop. - -We do not recommend using a parallel execution policy for both loops in -this type of kernel as the results may not be what is expected and RAJA -provides better mechanisms for parallelizing nested loops. Also, changing -the outer loop policy to a CUDA policy will not compile. This is by design -in RAJA since nesting forall statements inside lambdas in this way has limited -utility, is inflexible, and can hinder performance when compared to -``RAJA::kernel`` constructs, which we describe next. - .. _matmultkernel-label: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -142,14 +93,15 @@ second argument is the lambda loop body. Unlike ``RAJA::forall``, the iteration space for ``RAJA::kernel`` is defined as a *tuple* of ranges (created via the ``RAJA::make_tuple`` method), one for the 'col' loop and one for the 'row' loop. Also, the lambda expression takes an iteration index -argument for entry in the iteration space tuple. +argument for each entry in the iteration space tuple. .. note :: The number and order of lambda arguments must match the number and - order of the elements in the tuple for this to be correct. + order of the elements in the tuple for this type of ``RAJA::kernel`` + usage to be correct. Another important difference between ``RAJA::forall`` and ``RAJA::kernel`` involves the execution policy template parameter. The execution policy defined -by the ``RAJA::KernelPolicy`` type used here specifies a policy for each level +by the ``RAJA::KernelPolicy`` type shown here specifies a policy for each level in the loop nest via nested ``RAJA::statement::For`` types. Here, the row and column loops will both execute sequentially. 
The integer that appears as the first template parameter to each 'For' statement corresponds to the position of @@ -157,18 +109,14 @@ a range in the iteration space tuple and also to the associated iteration index argument to the lambda. Here, '0' is the 'col' range and '1' is the 'row' range because that is the order those ranges appear in the tuple. The innermost type ``RAJA::statement::Lambda<0>`` indicates that the first lambda -expression (the only one in this case!) argument passed to the -``RAJA::kernel`` method will be invoked inside the nested loops. +expression (the only one in this case) argument passed to the +``RAJA::kernel`` method will be invoked inside the inner loop. The integer arguments to the ``RAJA::statement::For`` types are needed to -enable a variety of kernel execution patterns and transformations. Since the -kernel policy is a single unified construct, it can be used to parallelize -the nested loop iterations together, which we will show later. Also, the -levels in the loop nest can be permuted by reordering the policy arguments; -this is analogous to how one would reorder C-style nested loops; i.e., -reorder for-statements for each loop nest level. These execution patterns -and transformations can be achieved by changing only the policy and leaving the -loop kernel code as is. +enable the desired kernel execution pattern and potential transformations, +without changing the kernel code. Since the kernel policy is a single unified +construct, it can be used to parallelize the nested loop iterations together, +which we show next. If we want to execute the row loop using OpenMP multithreaded parallelism and keep the column loop sequential, the policy we would use is: @@ -181,7 +129,7 @@ and keep the column loop sequential, the policy we would use is: To swap the loop nest ordering and keep the same execution policy on each loop, we would use the following policy, which swaps the ``RAJA::statement::For`` types. The inner loop is now the 'row' loop and is run in parallel; -the outer loop is now the 'col' loop and is still sequential: +the outer loop is now the 'col' loop and is run sequentially: .. literalinclude:: ../../../../examples/tut_matrix-multiply.cpp :start-after: _matmult_ompkernel_swap_start @@ -192,8 +140,8 @@ the outer loop is now the 'col' loop and is still sequential: and others, can be done by switching the ``RAJA::KernelPolicy`` type with no changes to the loop kernel code. -In :ref:`nestedreorder-label`, we provide a more detailed discussion of the -mechanics of loop nest reordering. Next, we show other variations of the +In :ref:`tut-kernelnestedreorder-label`, we provide a more detailed discussion +of the mechanics of loop nest ordering. Next, we show other variations of the matrix multiplication kernel that illustrate other ``RAJA::kernel`` features. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -214,13 +162,13 @@ The first example uses sequential execution for all loops: Note that we use a ``RAJA::kernel_param`` method to execute the kernel. It is similar to ``RAJA::kernel`` except that it accepts a tuple as the second -argument (between the iteration space tuple and the lambda expressions). The -tuple is a set of *parameters* that can be used in the kernel to pass data -into lambda expressions. Here, the parameter tuple holds a single scalar -variable for the dot product. +argument (between the iteration space tuple and the lambda expressions). 
In +general, the tuple is a set of *parameters* that can be used in the lambda +expressions comprising the kernel. Here, the parameter tuple holds a single +scalar variable for the dot product of each row-column pair. The remaining arguments include a sequence of lambda expressions representing -different parts of the inner loop body. We use three lambda expressions that: +different parts of the kernel body. We use three lambda expressions that: initialize the dot product variable (lambda 0), define the 'k' inner loop row-col dot product operation (lambda 1), and store the computed row-col dot product in the proper location in the result matrix (lambda 2). Note that @@ -228,39 +176,47 @@ all lambdas take the same arguments in the same order, which is required for the kernel to be well-formed. In addition to the loop index variables, we pass the scalar dot product variable into each lambda. This enables the same variables to be used in all three lambda expressions. However, observe that -not all lambda expressions use all three index variables. They are declared, -but left unnamed to prevent compiler warnings. - -Alternatively, the lambda statements in the execution policy may be used -to specify which arguments each lambda takes and in which order. For example: +not all lambda expressions use all three index variables. This is the +result of using the ``RAJA::Params`` and ``RAJA::Segs`` template parameter +types in the ``RAJA::statement::Lambda`` types for lambdas '0' and '2'. +Specifically, ``RAJA::statement::Lambda<0, RAJA::Params<0>>`` indicates that +lambda '0' will take only the scalar parameter as an argument, and +``RAJA::statement::Lambda<2, RAJA::Segs<0, 1>, RAJA::Params<0>>`` indicates +that lambda '2' will take index values for the column and row ranges and +the scalar parameter as arguments, in that order. Since lambda '1' takes all +arguments, we do not specify them. + +Alternatively, the statement to invoke lambda '1' could be augmented to +specify the arguments it takes: .. literalinclude:: ../../../../examples/tut_matrix-multiply.cpp :start-after: _matmult_3lambdakernel_args_seq_start :end-before: _matmult_3lambdakernel_args_seq_end :language: C++ +The result is the same. + By using ``RAJA::statement::Lambda`` parameters in this way, the code -potentially indicates more clearly which areguments are used. Of course, this +potentially indicates more clearly which arguments are used. Of course, this makes the execution policy more verbose, but that is typically hidden away -in a header file. Statements such as ``RAJA::Segs``, and -``RAJA::Params`` identify the positions of the segments and params -in the tuples to be used as arguments to the lambda expressions. +in a header file, so it need not make the code harder to read. + +.. note::: ``RAJA::Segs`` and ``RAJA::Params`` types can be used in a + ``RAJA::statement::Lambda`` type to identify which segment + indices and params are passed as arguments to a lambda expression. As we noted earlier, the execution policy type passed to the ``RAJA::kernel_param`` method as a template parameter describes how the statements and lambda expressions are assembled to form the complete kernel. To illustrate this, we describe various policies that enable the kernel to run in different ways. In each case, the ``RAJA::kernel_param`` method call, -including its arguments is the same. The curious reader will inspect the -example code in the file listed below to see that this is indeed the case. 
-In the interest of simplicity, the remaining matrix multiplication examples -do not use ``RAJA::statement::Lambda`` parameters to specify arguments to -the lambda expressions. +including its arguments is the same. The curious reader may inspect the +example code in the file noted above to see that this is indeed the case. Next, we show how to collapse nested loops in an OpenMP parallel region using a ``RAJA::statement::Collapse`` type in the execution policy. This allows one to parallelize multiple levels in a loop nest using OpenMP -directives, for instance. The following policy will collapse the two outer +directives. The following policy will collapse the two outer loops into one OpenMP parallel region: .. literalinclude:: ../../../../examples/tut_matrix-multiply.cpp @@ -271,9 +227,9 @@ loops into one OpenMP parallel region: The ``RAJA::ArgList`` type indicates which loops in the nest are to be collapsed and their nesting order within the collapse region. The integers passed to ``ArgList`` are indices of entries in the tuple of iteration spaces -and indicate inner to outer loop levels when read from right to left (i.e., -here '1, 0' indicates the column loop is the inner loop and the row loop is -the outer). For this transformation there are no ``statement::For`` types +and indicate inner to outer loop levels when read from right to left. Here, +'1, 0' indicates that the column loop is the inner loop and the row loop is +the outer loop. For this transformation there are no ``statement::For`` types and policies for the individual loop levels inside the OpenMP collapse region. Lastly, we show how to use ``RAJA::statement::CudaKernel`` and @@ -320,18 +276,14 @@ Note that the tiling mechanism requires a ``RAJA::statement::Tile`` type, with a tile size and a tiling execution policy, plus a ``RAJA::statement::For`` type with an execution execution policy for each tile dimension. -The analogous HIP policy is: +The analogous HIP execution policy is: .. literalinclude:: ../../../../examples/tut_matrix-multiply.cpp :start-after: _matmult_3lambdakernel_hiptiled_start :end-before: _matmult_3lambdakernel_hiptiled_end :language: C++ -In :ref:`tiledmatrixtranspose-label` and :ref:`matrixtransposelocalarray-label`, -we will discuss loop tiling in more detail including how it can be used to -improve performance of certain algorithms. +In :ref:`tut-tiledmatrixtranspose-label` and +:ref:`tut-matrixtransposelocalarray-label`, +we discuss loop tiling in more detail. -The file ``RAJA/examples/tut_matrix-multiply.cpp`` contains the complete -working code for all examples described in this section, plus others that -show a variety of ``RAJA::kernel`` execution policy types. It also contains -a raw CUDA version of the kernel for comparison. diff --git a/docs/sphinx/user_guide/tutorial/matrix_transpose.rst b/docs/sphinx/user_guide/tutorial/matrix_transpose.rst new file mode 100644 index 0000000000..5a36ccbee9 --- /dev/null +++ b/docs/sphinx/user_guide/tutorial/matrix_transpose.rst @@ -0,0 +1,123 @@ +.. ## +.. ## Copyright (c) 2016-22, Lawrence Livermore National Security, LLC +.. ## and RAJA project contributors. See the RAJA/LICENSE file +.. ## for details. +.. ## +.. ## SPDX-License-Identifier: (BSD-3-Clause) +.. ## + +.. 
_tut-matrixtranspose-label: + +---------------------- +Matrix Transpose +---------------------- + +In :ref:`tut-kernelexecpols-label` and :ref:`tut-launchexecpols-label`, +we presented a simple array initialization kernel using ``RAJA::kernel`` and +``RAJA::launch`` interfaces, respectively, and compared the two. This +section describes the implementation of a matrix transpose kernel using both +``RAJA::kernel`` and ``RAJA::launch`` interfaces. The intent is to +compare and contrast the two, as well as introduce additional features of the +interfaces. + +There are exercise files +``RAJA/exercises/kernel-matrix-transpose.cpp`` and +``RAJA/exercises/launch-matrix-transpose.cpp`` for you to work through if you +wish to get some practice with RAJA. The files +``RAJA/exercises/kernel-matrix-transpose_solution.cpp`` and +``RAJA/exercises/launch-matrix-transpose_solution.cpp`` contain +complete working code for the examples. You can use the solution files to +check your work and for guidance if you get stuck. To build +the exercises execute ``make (kernel/launch)-matrix-transpose`` and ``make (kernel/launch)-matrix-transpose_solution`` +from the build directory. + +Key RAJA features shown in this example are: + + * ``RAJA::kernel`` method and kernel execution policies + * ``RAJA::launch`` method and kernel execution interface + +In the example, we compute the transpose of an input matrix +:math:`A` of size :math:`N_r \times N_c` and store the result in a second +matrix :math:`At` of size :math:`N_c \times N_r`. + +First we define our matrix dimensions + +.. literalinclude:: ../../../../exercises/kernel-matrix-transpose_solution.cpp + :start-after: // _mattranspose_dims_start + :end-before: // _mattranspose_dims_end + :language: C++ + +and wrap the data pointers for the matrices in ``RAJA::View`` objects to +simplify the multi-dimensional indexing: + +.. literalinclude:: ../../../../exercises/kernel-matrix-transpose_solution.cpp + :start-after: // _mattranspose_views_start + :end-before: // _mattranspose_views_end + :language: C++ + +Then, a C-style for-loop implementation looks like this: + +.. literalinclude:: ../../../../exercises/kernel-matrix-transpose_solution.cpp + :start-after: // _cstyle_mattranspose_start + :end-before: // _cstyle_mattranspose_end + :language: C++ + +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +``RAJA::kernel`` Implementation +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +For ``RAJA::kernel`` variants, we use ``RAJA::statement::For`` and +``RAJA::statement::Lambda`` statement types in the execution policies. +The complete sequential ``RAJA::kernel`` variant is: + +.. literalinclude:: ../../../../exercises/kernel-matrix-transpose_solution.cpp + :start-after: // _raja_mattranspose_start + :end-before: // _raja_mattranspose_end + :language: C++ + +A CUDA ``RAJA::kernel`` variant for the GPU is similar with different policies +in the ``RAJA::statement::For`` statements: + +.. literalinclude:: ../../../../exercises/kernel-matrix-transpose_solution.cpp + :start-after: // _raja_mattranspose_cuda_start + :end-before: // _raja_mattranspose_cuda_end + :language: C++ + +A notable difference between the CPU and GPU execution policy is the insertion +of the ``RAJA::statement::CudaKernel`` type in the GPU version, which indicates +that the execution will launch a CUDA device kernel. 
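To make the comparison concrete without opening the exercise files, a hedged sketch of the CUDA variant follows. The view and range names (``Aview``, ``Atview``, ``col_range``, ``row_range``) stand in for the objects described above, and the particular thread-mapping policies shown are illustrative; the solution file may use a different mapping.

.. code-block:: cpp

   // Illustrative CUDA RAJA::kernel policy for the matrix transpose.
   // CudaKernel wraps the enclosed loop nest in a CUDA device kernel launch.
   using KERNEL_EXEC_POL_CUDA =
     RAJA::KernelPolicy<
       RAJA::statement::CudaKernel<
         RAJA::statement::For<1, RAJA::cuda_thread_y_loop,    // 'row' loop
           RAJA::statement::For<0, RAJA::cuda_thread_x_loop,  // 'col' loop
             RAJA::statement::Lambda<0>
           >
         >
       >
     >;

   RAJA::kernel<KERNEL_EXEC_POL_CUDA>(
     RAJA::make_tuple(col_range, row_range),
     [=] RAJA_DEVICE (int col, int row) {
       Atview(col, row) = Aview(row, col);
     });
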
+ +In the CUDA ``RAJA::kernel`` variant above, the thread-block size +and number of blocks to launch are determined by the implementation of the +``RAJA::kernel`` execution policy constructs using the sizes of the +``RAJA::TypedRangeSegment`` objects in the iteration space tuple. + +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +``RAJA::launch`` Implementation +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +For ``RAJA::launch`` variants, we use ``RAJA::loop`` methods +to write a loop hierarchy within the kernel execution space. For a sequential +implementation, we pass the ``RAJA::seq_launch_t`` template parameter +to the launch method and pass the ``RAJA::loop_exec`` parameter to the loop +methods. The complete sequential ``RAJA::launch`` variant is: + +.. literalinclude:: ../../../../exercises/launch-matrix-transpose_solution.cpp + :start-after: // _raja_mattranspose_start + :end-before: // _raja_mattranspose_end + :language: C++ + +A CUDA ``RAJA::launch`` variant for the GPU is similar with CUDA +policies in the ``RAJA::loop`` methods. The complete +``RAJA::launch`` variant is: + +.. literalinclude:: ../../../../exercises/launch-matrix-transpose_solution.cpp + :start-after: // _raja_mattranspose_cuda_start + :end-before: // _raja_mattranspose_cuda_end + :language: C++ + +A notable difference between the CPU and GPU ``RAJA::launch`` +implementations is the definition of the compute grid. For the CPU +version, the argument list is empty for the ``RAJA::LaunchParams`` constructor. +For the CUDA GPU implementation, we define a 'Team' of one two-dimensional +thread-block with 16 x 16 = 256 threads. diff --git a/docs/sphinx/user_guide/tutorial/matrix_transpose_local_array.rst b/docs/sphinx/user_guide/tutorial/matrix_transpose_local_array.rst index 7fb10a7299..3d5aa4f316 100644 --- a/docs/sphinx/user_guide/tutorial/matrix_transpose_local_array.rst +++ b/docs/sphinx/user_guide/tutorial/matrix_transpose_local_array.rst @@ -6,60 +6,72 @@ .. ## SPDX-License-Identifier: (BSD-3-Clause) .. ## -.. _matrixtransposelocalarray-label: +.. _tut-matrixtransposelocalarray-label: ---------------------------------- -Matrix Transpose with Local Array ---------------------------------- +----------------------------------------- +Tiled Matrix Transpose with Local Array +----------------------------------------- -This section extends the discussion in :ref:`tiledmatrixtranspose-label`, -where only loop tiling is considered. Here, we combine loop tiling with -``RAJA::LocalArray`` objects which enable us to store data for each tile in +This section extends the discussion in :ref:`tut-tiledmatrixtranspose-label` +by adding *local array* objects which are used to store data for each tile in CPU stack-allocated arrays or GPU thread local and shared memory to be used -within kernels. For more information about ``RAJA::LocalArray``, please -see :ref:`local_array-label`. +within kernels. -Key RAJA features shown in this example include: +There are exercise files +``RAJA/exercises/kernel-matrix-transpose-local-array.cpp`` and +``RAJA/exercises/launch-matrix-transpose-local-array.cpp`` for you to work +through if you wish to get some practice with RAJA. The files +``RAJA/exercises/kernel-matrix-transpose-local-array_solution.cpp`` and +``RAJA/exercises/launch-matrix-transpose-local-array_solution.cpp`` contain +complete working code for the examples. You can use the solution files to +check your work and for guidance if you get stuck.
To build +the exercises execute ``make (kernel/launch)-matrix-transpose-local-array`` and ``make (kernel/launch)-matrix-transpose-local-array_solution`` +from the build directory. - * ``RAJA::kernel_param`` method with multiple lambda expressions - * ``RAJA::statement::Tile`` type - * ``RAJA::statement::ForICount`` type - * ``RAJA::LocalArray`` - * Specifying lambda arguments through statements +Key RAJA features shown in this example are: -As in :ref:`tiledmatrixtranspose-label`, this example computes the transpose -of an input matrix :math:`A` of size :math:`N_r \times N_c` and stores the -result in a second matrix :math:`At` of size :math:`N_c \times N_r`. The -operation uses a local memory tiling algorithm. The algorithm tiles the outer + * ``RAJA::kernel_param`` method and execution policy usage with multiple lambda expressions + * ``RAJA::statement::Tile`` type for loop tiling + * ``RAJA::statement::ForICount`` type for generating local tile indices + * ``RAJA::LocalArray`` type for thread-local tile memory arrays + * ``RAJA::launch`` kernel execution interface + * ``RAJA::expt::tile`` type for loop tiling + * ``RAJA::expt::loop_icount`` method to generate local tile indices for Launch + * ``RAJA_TEAM_SHARED`` macro for thread-local tile memory arrays + +As in :ref:`tut-tiledmatrixtranspose-label`, this example computes the +transpose of an input matrix :math:`A` of size :math:`N_r \times N_c` and +stores the result in a second matrix :math:`At` of size :math:`N_c \times N_r`. +The operation uses a local memory tiling algorithm, which tiles the outer loops and iterates over tiles in inner loops. The algorithm first loads input matrix entries into a local two-dimensional array for a tile, and then reads from the tile swapping the row and column indices to generate the output matrix. -We start with a non-RAJA C++ implementation to show the algorithm pattern. We choose tile dimensions smaller than the dimensions of the matrix and note that it is not necessary for the tile dimensions to divide evenly the number -of rows and columns in the matrix A. As in the :ref:`tiledmatrixtranspose-label` -example, we start by defining the number of rows and columns in the matrices, -the tile dimensions, and the number of tiles. +of rows and columns in the matrix. As in the +:ref:`tut-tiledmatrixtranspose-label` example, we start by defining the number +of rows and columns in the matrices, the tile dimensions, and the number of +tiles. -.. literalinclude:: ../../../../examples/tut_matrix-transpose-local-array.cpp +.. literalinclude:: ../../../../exercises/kernel-matrix-transpose-local-array_solution.cpp :start-after: // _mattranspose_localarray_dims_start :end-before: // _mattranspose_localarray_dims_end :language: C++ We also use RAJA View objects to simplify the multi-dimensional indexing -as in the :ref:`tiledmatrixtranspose-label` example. +as in the :ref:`tut-tiledmatrixtranspose-label` example. -.. literalinclude:: ../../../../examples/tut_matrix-transpose-local-array.cpp +.. literalinclude:: ../../../../exercises/kernel-matrix-transpose-local-array_solution.cpp :start-after: // _mattranspose_localarray_views_start :end-before: // _mattranspose_localarray_views_end :language: C++ -The complete sequential C++ implementation of the tiled transpose operation +The complete sequential C-style implementation of the tiled transpose operation using a stack-allocated local array for the tiles is: -.. literalinclude:: ../../../../examples/tut_matrix-transpose-local-array.cpp +.. 
literalinclude:: ../../../../exercises/kernel-matrix-transpose-local-array_solution.cpp :start-after: // _mattranspose_localarray_cstyle_start :end-before: // _mattranspose_localarray_cstyle_end :language: C++ @@ -72,15 +84,16 @@ using a stack-allocated local array for the tiles is: stride-1 data access. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RAJA::kernel Version of Tiled Loops with Local Array +``RAJA::kernel`` Variants ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RAJA provides mechanisms to tile loops and use *local arrays* -in kernels so that algorithm patterns like we just described can be -implemented with RAJA. A ``RAJA::LocalArray`` type specifies an object whose -memory is created inside a kernel using a ``RAJA::statement`` type in a RAJA -kernel execution policy. The local array data is only usable within the kernel. -See :ref:`local_array-label` for more information. +The ``RAJA::kernel`` interface provides mechanisms to tile loops and use +*local arrays* in kernels so that algorithm patterns like the C-style kernel +above can be implemented with RAJA. When using ``RAJA::kernel``, a +``RAJA::LocalArray`` type specifies an object whose memory is created inside +a kernel using a statement type in a RAJA kernel execution policy. The local +array data is only usable within the kernel. See :ref:`feat-local_array-label` +for more information. ``RAJA::kernel`` methods also support loop tiling statements which determine the number of tiles needed to perform an operation based on tile size and @@ -94,91 +107,98 @@ For the RAJA version of the matrix transpose kernel above, we define the type of the ``RAJA::LocalArray`` used for matrix entries in a tile and create an object to represent it: -.. literalinclude:: ../../../../examples/tut_matrix-transpose-local-array.cpp +.. literalinclude:: ../../../../exercises/kernel-matrix-transpose-local-array_solution.cpp :start-after: // _mattranspose_localarray_start :end-before: // _mattranspose_localarray_end :language: C++ -The template parameters that define the type are: array data type, data stride -permutation for the array indices (here the identity permutation is given, so -the default RAJA conventions apply; i.e., the rightmost array index will be -stride-1), and the array dimensions. Next, we compare two RAJA implementations -of matrix transpose with RAJA. +The template parameters that define the type are: the array data type, the +data stride permutation for the array indices (here the identity permutation +is given, so the default RAJA conventions apply; i.e., the rightmost array +index will be stride-1), and the array dimensions. Next, we compare two +``RAJA::kernel`` implementations of the matrix transpose operation. The complete RAJA sequential CPU variant with kernel execution policy and kernel is: -.. literalinclude:: ../../../../examples/tut_matrix-transpose-local-array.cpp +.. literalinclude:: ../../../../exercises/kernel-matrix-transpose-local-array_solution.cpp :start-after: // _mattranspose_localarray_raja_start :end-before: // _mattranspose_localarray_raja_end :language: C++ -The ``RAJA::statement::Tile`` types in the execution policy define +In the execution policy, the ``RAJA::statement::Tile`` types define tiling of the outer 'row' (iteration space tuple index '1') and 'col' -(iteration space tuple index '0') loops, including tile sizes +(iteration space tuple index '0') loops, as well as tile sizes (``RAJA::tile_fixed`` types) and loop execution policies. 
Next, -the ``RAJA::statement::InitLocalMem`` type initializes the local stack array +the ``RAJA::statement::InitLocalMem`` type allocates the local tile array based on the memory policy type (here, we use ``RAJA::cpu_tile_mem`` for a CPU stack-allocated array). The ``RAJA::ParamList<2>`` parameter indicates that the local array object is associated with position '2' in the parameter tuple argument passed to the ``RAJA::kernel_param`` method. The first two entries in the parameter tuple indicate storage for the local tile indices -which can be used in multiple lambdas in the kernel. Finally, we have two sets -of nested inner loops for reading the input matrix entries into the local -array and writing them out to the output matrix transpose. The inner bodies of -each of these loop nests are identified by lambda expression arguments -'0' and '1', respectively. - -Note that the loops over tiles use ``RAJA::statement::ForICount`` types -rather than ``RAJA::statement::For`` types that we have seen in other -nested loop examples. The ``RAJA::statement::ForICount`` type generates -local tile indices that are passed to lambda loop body expressions. As -the reader will observe, there is no local tile index computation -needed in the lambdas for the RAJA version of the kernel as a result. The -first integer template parameter for each ``RAJA::statement::ForICount`` type -indicates the item in the iteration space tuple passed to the -``RAJA::kernel_param`` method to which it applies; this is similar to -``RAJA::statement::For`` usage. The second template parameter for each +that are used in the two lambda expressions that comprise the kernel body. +Finally, we have two sets of nested inner loops for reading the input matrix +entries into the local tile array and writing them out to the output matrix +transpose. The inner bodies of each of these loop nests are identified by +lambda expression invocation statements ``RAJA::statement::Lambda<0>`` for +the first lambda passed as an argument to the ``RAJA::kernel_param`` method +and ``RAJA::statement::Lambda<1>`` for the second lambda argument. + +Note that the loops within tiles use ``RAJA::statement::ForICount`` types +rather than ``RAJA::statement::For`` types that we saw in the +tiled matrix transpose example in :ref:`tut-tiledmatrixtranspose-label`. +The ``RAJA::statement::ForICount`` type generates local tile indices that +are passed to lambda loop body expressions to index into the local tile +memory array. As the reader will observe, there is no local tile index +computation needed in the lambdas for the RAJA version of the kernel as a +result. The first integer template parameter for each +``RAJA::statement::ForICount`` type indicates the item in the iteration space +tuple passed to the ``RAJA::kernel_param`` method to which it applies. +The second template parameter for each ``RAJA::statement::ForICount`` type indicates the position in the parameter tuple passed to the ``RAJA::kernel_param`` method that will hold the -associated local tile index. The loop execution policy template -argument that follows works the same as in ``RAJA::statement::For`` usage. -For more detailed discussion of RAJA loop tiling statement types, please see -:ref:`tiling-label`. +associated local tile index. For more detailed discussion of RAJA loop tiling +statement types, please see :ref:`feat-tiling-label`. 
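Since the policy just described has several moving parts, here is a hedged skeleton of what it looks like assembled, together with the corresponding ``RAJA::kernel_param`` call. The names (``TILE_DIM``, ``TILE_MEM``, ``Tile_Array``, ``Aview``, ``Atview``, ``col_range``, ``row_range``) stand in for the types and objects defined earlier in the example; the exercise solution file is the authoritative version.

.. code-block:: cpp

   // Skeleton of a sequential tiled, local-array kernel policy.
   using SEQ_EXEC_POL =
     RAJA::KernelPolicy<
       RAJA::statement::Tile<1, RAJA::tile_fixed<TILE_DIM>, RAJA::seq_exec,    // 'row' tiles
         RAJA::statement::Tile<0, RAJA::tile_fixed<TILE_DIM>, RAJA::seq_exec,  // 'col' tiles
           RAJA::statement::InitLocalMem<RAJA::cpu_tile_mem, RAJA::ParamList<2>,

             // Loop nest that reads an input tile into the local array (lambda 0).
             RAJA::statement::ForICount<1, RAJA::statement::Param<1>, RAJA::seq_exec,
               RAJA::statement::ForICount<0, RAJA::statement::Param<0>, RAJA::seq_exec,
                 RAJA::statement::Lambda<0>
               >
             >,

             // Loop nest that writes the local array to the output matrix (lambda 1).
             RAJA::statement::ForICount<0, RAJA::statement::Param<0>, RAJA::seq_exec,
               RAJA::statement::ForICount<1, RAJA::statement::Param<1>, RAJA::seq_exec,
                 RAJA::statement::Lambda<1>
               >
             >

           >
         >
       >
     >;

   RAJA::kernel_param<SEQ_EXEC_POL>(
     RAJA::make_tuple(col_range, row_range),
     RAJA::make_tuple((int)0, (int)0, Tile_Array),

     // lambda 0: global (col, row) indices, local (tx, ty) tile indices.
     [=](int col, int row, int tx, int ty, TILE_MEM& Tile_Array) {
       Tile_Array(ty, tx) = Aview(row, col);
     },

     // lambda 1: write the transposed entries out of the local array.
     [=](int col, int row, int tx, int ty, TILE_MEM& Tile_Array) {
       Atview(col, row) = Tile_Array(ty, tx);
     });
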
Now that we have described the execution policy in some detail, let's pull everything together by briefly walking though the call to the -``RAJA::kernel_param`` method. The first argument is a tuple of iteration -spaces that define the iteration ranges for the level in the loop nest. -Again, the first integer parameters given to the ``RAJA::statement::Tile`` and -``RAJA::statement::ForICount`` types identify the tuple entry they apply to. -The second argument is a tuple of data parameters that will hold the local -tile indices and ``RAJA::LocalArray`` tile memory. The tuple entries are +``RAJA::kernel_param`` method, which is similar to ``RAJA::kernel`` but takes +additional arguments needed to execute the operations involving local +tile indices and the local memory array. The first argument is a tuple of +iteration spaces that define the iteration ranges for the levels in the loop +nest. Again, the first integer parameters given to the ``RAJA::statement::Tile`` +and ``RAJA::statement::ForICount`` types identify the tuple entry to which +they apply. The second argument:: + + RAJA::make_tuple((int)0, (int)0, Tile_Array) + +is a tuple of data parameters that will hold the local tile indices and +``RAJA::LocalArray`` tile memory. The tuple entries are associated with various statements in the execution policy as we described earlier. Next, two lambda expression arguments are passed to the ``RAJA::kernel_param`` method for reading and writing the input and output matrix entries, respectively. -Note that each lambda expression takes five arguments. The first two are -the matrix column and row indices associated with the iteration space tuple. -The next three arguments correspond to the parameter tuple entries. The first -two of these are the local tile indices used to access entries in the +.. note:: ``RAJA::kernel_param`` accepts a parameter tuple argument after + the iteration space tuple, which enables the parameters to be + used in multiple lambda expressions in a kernel. + +In the kernel, both lambda expressions take the same five arguments. The first +two are the matrix global column and row indices associated with the iteration +space tuple. The next three arguments correspond to the parameter tuple entries. +The first two of these are the local tile indices used to access entries in the ``RAJA::LocalArray`` object memory. The last argument is a reference to the ``RAJA::LocalArray`` object itself. -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RAJA::kernel Version of Tiled Loops with Local Array Specifying Lambda Arguments -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +The next ``RAJA::kernel_param`` variant we present works the same as the one +above. It is different from the previous version since we include +additional template parameters in the ``RAJA::statement::Lambda`` types to +indicate which arguments each lambda expression takes and in which order. +Here is the complete version including execution policy and kernel: -The second RAJA variant works the same as the one above. The main differences -between the two variants is due to the fact that in this second one, we use -``RAJA::statement::Lambda`` types to indicate which arguments each lambda -takes and in which order. Here is the complete version including -execution policy and kernel: - -.. 
literalinclude:: ../../../../examples/tut_matrix-transpose-local-array.cpp - :start-after: // _mattranspose_localarray_raja_lambdaargs_start - :end-before: // _mattranspose_localarray_raja_lambdaargs_end +.. literalinclude:: ../../../../exercises/kernel-matrix-transpose-local-array_solution.cpp + :start-after: // _raja_mattranspose_lambdaargs_start + :end-before: // _raja_mattranspose_lambdaargs_end :language: C++ Here, the two ``RAJA::statement::Lambda`` types in the execution policy show @@ -193,11 +213,55 @@ As a consequence of specifying lambda arguments, there are two main differences. The local tile indices are properly computed and passed to the lambda expressions as a result of the ``RAJA::Offsets`` types that appear in the lambda statement types. The ``RAJA::statement::Lambda`` type for each -lambda shows the two ways to specify the local tile index args; we can use an -``Offsets`` statement for each argument, or include multiple segment ids in one -statement. Lastly, there is only one entry in the parameter -tuple in this case, the local tile array. The placeholders are not needed. - -The file ``RAJA/examples/tut_matrix-transpose-local-array.cpp`` contains the -complete working example code for the examples described in this section along -with OpenMP, CUDA, and HIP variants. +lambda shows the two ways to specify the local tile index arguments; we can +use an ``Offsets`` statement for each argument, or include multiple segment +ids in one statement. Lastly, there is only one entry in the parameter +tuple in this case, the local tile array. The placeholders in the +previous example are not needed. + +.. note:: In this example, we need all five arguments in each lambda + expression so the lambda expression argument lists are + the same. Another use case for the template parameter argument + specification described here is to be able to pass only the + arguments used in a lambda expression. In particular, when we use + multiple lambda expressions to represent a kernel, each lambda + can have a different argument list from the others. + +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +``RAJA::expt::launch`` Variants +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The ``RAJA::expt::launch`` interface provides mechanisms to tile loops and use +*local arrays* in kernels to support algorithm patterns like the C-style kernel +above. When using ``RAJA::expt::launch``, the ``RAJA_TEAM_SHARED`` macro is +used to create a GPU shared memory array or a CPU stack memory array inside +a kernel. + +``RAJA::expt::launch`` supports methods for tiling over an iteration space +using ``RAJA::expt::tile`` and ``RAJA::expt::loop_icount`` methods to tile +loops and generate global iteration indices and local tile offsets. +Moreover, lambda expressions for these methods will not be invoked for +iterations outside the bounds of an iteration space when tile dimensions +do not divide evenly the size of the iteration space; thus, no conditional +checks on loop bounds are needed inside inner loops. + +A complete RAJA sequential CPU variant with kernel execution policy and +kernel is: + +.. literalinclude:: ../../../../exercises/launch-matrix-transpose-local-array_solution.cpp + :start-after: // _mattranspose_localarray_raja_start + :end-before: // _mattranspose_localarray_raja_end + :language: C++ + +Here, the ``RAJA::expt::tile`` method is used to create tilings of the outer +'row' and 'col' iteration spaces.
The ``RAJA::expt::tile`` method +takes an additional argument specifying the tile size for the corresponding +loop. To traverse the tile, we use the ``RAJA::expt::loop_icount`` method, +which is similar to the ``RAJA::ForICount`` statement used in a +``RAJA::kernel`` execution policy as shown above. A +``RAJA::expt::loop_icount`` method call +will generate the local tile index associated with the outer global index. +The local tile index is necessary because we use it to read entries from +global memory into the ``RAJA_TEAM_SHARED`` memory array and to write them +back out. + + diff --git a/docs/sphinx/user_guide/tutorial/matrix_transpose_tiled.rst b/docs/sphinx/user_guide/tutorial/matrix_transpose_tiled.rst new file mode 100644 index 0000000000..46d92b4c66 --- /dev/null +++ b/docs/sphinx/user_guide/tutorial/matrix_transpose_tiled.rst @@ -0,0 +1,152 @@ +.. ## +.. ## Copyright (c) 2016-22, Lawrence Livermore National Security, LLC +.. ## and RAJA project contributors. See the RAJA/LICENSE file +.. ## for details. +.. ## +.. ## SPDX-License-Identifier: (BSD-3-Clause) +.. ## + +.. _tut-tiledmatrixtranspose-label: + +---------------------- +Tiled Matrix Transpose +---------------------- + +This section describes the implementation of a tiled matrix transpose kernel +using both ``RAJA::kernel`` and ``RAJA::launch`` interfaces. The intent +is to compare and contrast the two. The discussion builds on +:ref:`tut-matrixtranspose-label` by adding tiling to the matrix transpose +implementation. + +There are exercise files +``RAJA/exercises/kernel-matrix-transpose-tiled.cpp`` and +``RAJA/exercises/launch-matrix-transpose-tiled.cpp`` for you to work through +if you wish to get some practice with RAJA. The files +``RAJA/exercises/kernel-matrix-transpose-tiled_solution.cpp`` and +``RAJA/exercises/launch-matrix-transpose-tiled_solution.cpp`` contain +complete working code for the examples. You can use the solution files to +check your work and for guidance if you get stuck. To build +the exercises execute ``make (kernel/launch)-matrix-transpose-tiled`` and +``make (kernel/launch)-matrix-transpose-tiled_solution`` +from the build directory. + +Key RAJA features shown in this example are: + + * ``RAJA::kernel`` method and execution policies, and the ``RAJA::statement::Tile`` type + * ``RAJA::launch`` method and execution policies, and the ``RAJA::tile`` type + +As in :ref:`tut-matrixtranspose-label`, we compute the transpose of an input +matrix :math:`A` of size :math:`N_r \times N_c` and store the result in a +second matrix :math:`At` of size :math:`N_c \times N_r`. + +We will compute the matrix transpose using a tiling algorithm, which iterates +over tiles and transposes the matrix entries in each tile. +The algorithm involves outer and inner loops to iterate over the tiles and +matrix entries within each tile, respectively. + +As in :ref:`tut-matrixtranspose-label`, we start by defining the matrix +dimensions. Additionally, we define a tile size smaller than the matrix +dimensions and determine the number of tiles in each dimension. Note that we +do not assume that tiles divide evenly the number of rows and columns of +the matrix. However, we do assume square tiles. + +.. literalinclude:: ../../../../exercises/kernel-matrix-transpose-tiled_solution.cpp + :start-after: // _tiled_mattranspose_dims_start + :end-before: // _tiled_mattranspose_dims_end + :language: C++ + +Then, we wrap the matrix data pointers in ``RAJA::View`` objects to +simplify the multi-dimensional indexing: + +.. 
literalinclude:: ../../../../exercises/kernel-matrix-transpose-tiled_solution.cpp + :start-after: // _tiled_mattranspose_views_start + :end-before: // _tiled_mattranspose_views_end + :language: C++ + +The C-style for-loop implementation looks like this: + +.. literalinclude:: ../../../../exercises/kernel-matrix-transpose-tiled_solution.cpp + :start-after: // _cstyle_tiled_mattranspose_start + :end-before: // _cstyle_tiled_mattranspose_end + :language: C++ + +.. note:: To prevent indexing out of bounds, when the tile dimensions do not + divide evenly the matrix dimensions, the algorithm requires a + bounds check in the inner loops. + +^^^^^^^^^^^^^^^^^^^^^^^^^^^ +``RAJA::kernel`` Variants +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +For ``RAJA::kernel`` variants, we use ``RAJA::statement::Tile`` types +for the outer loop tiling and ``RAJA::tile_fixed`` types to +indicate the tile dimensions. The complete sequential RAJA variant is: + +.. literalinclude:: ../../../../exercises/kernel-matrix-transpose-tiled_solution.cpp + :start-after: // _raja_tiled_mattranspose_start + :end-before: // _raja_tiled_mattranspose_end + :language: C++ + +The ``RAJA::statement::Tile`` types compute the number of tiles needed to +iterate over all matrix entries in each dimension and generate iteration +index bounds for each tile, which are used to generate loops for the inner +``RAJA::statement::For`` types. Thus, the explicit bounds checking logic in the +C-style variant is not needed. Note that the integer template parameters +in the ``RAJA::statement::For`` types refer to the entries in the iteration +space tuple passed to the ``RAJA::kernel`` method. + +The ``RAJA::kernel`` CUDA variant is similar with sequential policies replaced +with CUDA execution policies: + +.. literalinclude:: ../../../../exercises/kernel-matrix-transpose-tiled_solution.cpp + :start-after: // _raja_mattranspose_cuda_start + :end-before: // _raja_mattranspose_cuda_end + :language: C++ + +A notable difference between the CPU and GPU execution policy is the insertion +of the ``RAJA::statement::CudaKernel`` type in the GPU version, which indicates +that the execution will launch a CUDA device kernel. + +The CUDA thread-block dimensions are set based on the tile dimensions and the +iterates withing each tile are mapped directly to GPU threads in each block +due to the ``RAJA::cuda_thread_{x, y}_direct`` policies. + +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +``RAJA::launch`` Variants +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +For ``RAJA::launch`` variants, we use ``RAJA::tile`` methods +for the outer loop tiling and ``RAJA::loop`` methods +to iterate within the tiles. The complete sequential tiled +``RAJA::launch`` variant is: + +.. literalinclude:: ../../../../exercises/launch-matrix-transpose-tiled_solution.cpp + :start-after: // _raja_tiled_mattranspose_start + :end-before: // _raja_tiled_mattranspose_end + :language: C++ + +Similar to the ``RAJA::statement::Tile`` type in the ``RAJA::kernel`` variant +above, the ``RAJA::tile`` method computes the number of tiles needed to +iterate over all matrix entries in each dimension and generates a corresponding +iteration space for each tile, which is used to generate loops for the inner +``RAJA::loop`` methods. Thus, the explicit bounds checking logic in the +C-style variant is not needed. + +A CUDA ``RAJA::launch`` tiled variant for the GPU is similar with +CUDA policies in the ``RAJA::loop`` methods. The complete +``RAJA::launch`` variant is: + +.. 
literalinclude:: ../../../../exercises/launch-matrix-transpose-tiled_solution.cpp + :start-after: // _raja_mattranspose_cuda_start + :end-before: // _raja_mattranspose_cuda_end + :language: C++ + +A notable difference between the CPU and GPU ``RAJA::launch`` +implementations is the definition of the compute grid. For the CPU +version, the argument list is empty for the ``RAJA::LaunchParams`` constructor. +For the CUDA GPU implementation, we define a 'Team' of one two-dimensional +thread-block with 16 x 16 = 256 threads. + + + + diff --git a/docs/sphinx/user_guide/tutorial/naming_kernels.rst b/docs/sphinx/user_guide/tutorial/naming_kernels.rst deleted file mode 100644 index 4e30eb1dcf..0000000000 --- a/docs/sphinx/user_guide/tutorial/naming_kernels.rst +++ /dev/null @@ -1,59 +0,0 @@ -.. ## -.. ## Copyright (c) 2016-20, Lawrence Livermore National Security, LLC -.. ## and RAJA project contributors. See the RAJA/LICENSE file -.. ## for details. -.. ## -.. ## SPDX-License-Identifier: (BSD-3-Clause) -.. ## - -.. _teamsbasic-label: - ------------------------------------- -Naming kernels with NVTX/ROCTX tools ------------------------------------- - -Key RAJA feature shown in the following example: - - * Naming kernels using the ``Grid`` object in ``RAJA::ext::Launch`` methods. - -In this example we illustrate kernel naming capabilities within the RAJA Teams -framework for use with NVTX or ROCTX region naming capabilities. - -Recalling the ``RAJA::expt::launch`` API, naming a kernel is done using the third -argument of the ``Resources`` constructor as illustrated below:: - RAJA::expt::launch(RAJA::expt::ExecPlace , - RAJA::expt::Grid(RAJA::expt::Teams(Nteams,Nteams), - RAJA::expt::Threads(Nthreads,Nthreads) - "myKernel"), - [=] RAJA_HOST_DEVICE (RAJA::expt::LaunchContext ctx) { - - /* Express code here */ - - }); - -The kernel name is used to create NVTX (NVIDIA) or ROCTX (AMD) ranges enabling -developers to identify kernels using NVIDIA `Nsight `_ -and NVIDIA `Nvprof `_ profiling -tools or `ROCm `_ -profiling tools when using ROCTX. As an illustration, using Nvprof -kernels are identified as ranges of GPU activity through the user specified name:: - - ==73220== NVTX result: - ==73220== Thread "" (id = 290832) - ==73220== Domain "" - ==73220== Range "myKernel" - Type Time(%) Time Calls Avg Min Max Name - Range: 100.00% 32.868us 1 32.868us 32.868us 32.868us myKernel - GPU activities: 100.00% 2.0307ms 1 2.0307ms 2.0307ms 2.0307ms _ZN4RAJA4expt17launch_global_fcnIZ4mainEUlNS0_13LaunchContextEE_EEvS2_T_ - API calls: 100.00% 27.030us 1 27.030us 27.030us 27.030us cudaLaunchKernel - -In a similar fashion ROCm tools can be used to generate traces of the profile and -the resulting json file can be viewed using tools such as `perfetto -`_. - -As future work we plan to add support to other profiling tools; API changes may occur -based on user feedback and integration with other tools. Enabling NVTX profiling -with RAJA Teams requires RAJA to be configured with RAJA_ENABLE_NV_TOOLS_EXT=ON. -or RAJA_ENABLE_ROCTX=ON for ROCTX profiling on AMD platforms platforms. - -The file RAJA/examples/teams_reductions.cpp contains a complete working example code. diff --git a/docs/sphinx/user_guide/tutorial/nested_loop_reorder.rst b/docs/sphinx/user_guide/tutorial/nested_loop_reorder.rst deleted file mode 100644 index 0542d93216..0000000000 --- a/docs/sphinx/user_guide/tutorial/nested_loop_reorder.rst +++ /dev/null @@ -1,172 +0,0 @@ -.. ## -.. ## Copyright (c) 2016-22, Lawrence Livermore National Security, LLC -.. 
## and RAJA project contributors. See the RAJA/LICENSE file -.. ## for details. -.. ## -.. ## SPDX-License-Identifier: (BSD-3-Clause) -.. ## - -.. _nestedreorder-label: - ---------------------------------- -Nested Loop Interchange ---------------------------------- - -Key RAJA features shown in this example: - - * ``RAJA::kernel`` loop iteration templates - * RAJA nested loop execution policies - * Nested loop reordering (i.e., loop interchange) - * RAJA strongly-types indices - -In :ref:`loop_elements-kernel-label`, we introduced the basic mechanics in -RAJA for representing nested loops. In :ref:`matrixmultiply-label`, we -presented a complete example using RAJA nested loop features. The following -example shows the nested loop interchange process in more detail. -Specifically, we describe how to reorder nested policy arguments and introduce -strongly-typed index variables that can help users write correct nested loop -code with RAJA. The example does not perform any actual computation; each -kernel simply prints out the loop indices in the order that the iteration -spaces are traversed. Thus, only sequential execution policies are used. -However, the mechanics work the same way for other RAJA execution policies. - -Before we dive into the example, we note important features applied here that -represent the main differences between nested-loop RAJA and the -``RAJA::forall`` loop construct for simple (i.e., non-nested) loops: - - * An index space (e.g., range segment) and lambda index argument are - required for each level in a loop nest. This example contains - triply-nested loops, so there will be three ranges and three index - arguments. - - * The index spaces for the nested loop levels are specified in a RAJA tuple - object. The order of spaces in the tuple must match the order of index - arguments to the lambda for this to be correct, in general. RAJA provides - strongly-typed indices to help with this, which we show here. - - * An execution policy is required for each level in a loop nest. These - are specified as nested statements in the ``RAJA::KernelPolicy`` type. - - * The loop nest ordering is specified in the nested kernel policy -- - the first ``statement::For`` type identifies the outermost loop, the - second ``statement::For`` type identifies the loop nested inside the - outermost loop, and so on. - -We begin by defining three named **strongly-typed** variables for the loop -index variables. - -.. literalinclude:: ../../../../examples/tut_nested-loop-reorder.cpp - :start-after: _nestedreorder_idxtypes_start - :end-before: _nestedreorder_idxtypes_end - :language: C++ - -We also define three **typed** range segments which bind the ranges to the -index variable types via template specialization: - -.. literalinclude:: ../../../../examples/tut_nested-loop-reorder.cpp - :start-after: _nestedreorder_ranges_start - :end-before: _nestedreorder_ranges_end - :language: C++ - -When these features are used as in this example, the compiler will -generate error messages if the lambda expression index argument ordering -and types do not match the index ordering in the tuple. - -We present a complete example, and then describe its key elements: - -.. literalinclude:: ../../../../examples/tut_nested-loop-reorder.cpp - :start-after: _nestedreorder_kji_start - :end-before: _nestedreorder_kji_end - :language: C++ - -Here, the ``RAJA::kernel`` execution template takes two arguments: a tuple of -ranges, one for each of the three levels in the loop nest, and the lambda -expression loop body. 
Note that the lambda has an index argument for each -range and that their order and types match. - -The execution policy for the loop nest is specified in the -``RAJA::KernelPolicy`` type. Each level in the loop nest is identified by a -``statement::For`` type, which identifies the iteration space and -execution policy for the level. Here, each level uses a -sequential execution policy. This is for -illustration purposes; if you run the example code, you will see the loop -index triple printed in the exact order in which the kernel executes. -The integer that appears as the first template argument to each -``statement::For`` type corresponds to the index of a range in the tuple -and also to the associated lambda index argument; i.e., '0' is for 'i', -'1' is for 'j', and '2' is for 'k'. - -Here, the 'k' index corresponds to the outermost loop (slowest index), -the 'j' index corresponds to the middle loop, and the 'i' index is for the -innermost loop (fastest index). In other words, if written using C-style -for-loops, the loop would appear as:: - - for (int k = 2; k< 4; ++k) { - for (int j = 1; j < 3; ++j) { - for (int i = 0; i < 2; ++i) { - // print loop index triple... - } - } - } - -The integer argument to each ``statement::For`` type is needed so -that the levels in the loop nest can be reordered by changing the policy -while the kernel remains the same. Next, we permute the loop nest ordering -so that the 'j' loop is the outermost, the 'i' loop is in the middle, and -the 'k' loop is the innermost with the following policy: - -.. literalinclude:: ../../../../examples/tut_nested-loop-reorder.cpp - :start-after: _nestedreorder_jik_start - :end-before: _nestedreorder_jik_end - :language: C++ - -Note that we have simply reordered the nesting of the ``RAJA::statement::For`` -types. This is analogous to reordering 'for' statements in traditional C-style -nested loops. Here, the analogous C-style loop nest would appear as:: - - for (int j = 1; j < 3; ++j) { - for (int i = 0; i < 2; ++i) { - for (int k = 2; k< 4; ++k) { - // print loop index triple... - } - } - } - -Finally, for completeness, we permute the loops again so that the 'i' loop -is the outermost, the 'k' loop is in the middle, and the 'j' loop is the -innermost with the following policy: - -.. literalinclude:: ../../../../examples/tut_nested-loop-reorder.cpp - :start-after: _nestedreorder_ikj_start - :end-before: _nestedreorder_ikj_end - :language: C++ - -The analogous C-style loop nest would appear as:: - - for (int i = 0; j < 2; ++i) { - for (int k = 2; k< 4; ++k) { - for (int j = 1; j < 3; ++j) { - // print loop index triple... - } - } - } - -Hopefully, it should be clear how this works at this point. If not, -the typed indices and typed range segments can help by enabling the -compiler to let you know when something is not correct. - -For example, this version of the loop will generate a compilation error -(note that the kernel execution policy is the same as in the previous example): - -.. literalinclude:: ../../../../examples/tut_nested-loop-reorder.cpp - :start-after: _nestedreorder_typemismatch_start - :end-before: _nestedreorder_typemismatch_end - :language: C++ - -If you carefully compare the range ordering in the tuple to the -lambda argument types, you will see what's wrong. - -Do you see the problem? - -The file ``RAJA/examples/tut_nested-loop-reorder.cpp`` contains the complete -working example code. 
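The interchange mechanism described above comes down to swapping the nesting order of the ``RAJA::statement::For`` types while the kernel call itself is untouched. A minimal sketch follows, using plain ``RAJA::TypedRangeSegment<int>`` iteration spaces named ``IRange``, ``JRange``, and ``KRange`` for brevity instead of the strongly-typed indices discussed above.

.. code-block:: cpp

   // 'k' outermost (tuple entry 2), 'j' middle (entry 1), 'i' innermost (entry 0).
   using KJI_POL =
     RAJA::KernelPolicy<
       RAJA::statement::For<2, RAJA::seq_exec,
         RAJA::statement::For<1, RAJA::seq_exec,
           RAJA::statement::For<0, RAJA::seq_exec,
             RAJA::statement::Lambda<0>
           >
         >
       >
     >;

   // Same statements, permuted: 'j' outermost, 'i' middle, 'k' innermost.
   using JIK_POL =
     RAJA::KernelPolicy<
       RAJA::statement::For<1, RAJA::seq_exec,
         RAJA::statement::For<0, RAJA::seq_exec,
           RAJA::statement::For<2, RAJA::seq_exec,
             RAJA::statement::Lambda<0>
           >
         >
       >
     >;

   // The kernel call is identical for both policies; only the policy changes.
   RAJA::kernel<KJI_POL>(RAJA::make_tuple(IRange, JRange, KRange),
     [=](int i, int j, int k) {
       printf(" (%d, %d, %d)\n", i, j, k);  // prints indices in traversal order
     });
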
diff --git a/docs/sphinx/user_guide/tutorial/offset-layout-5pt-stencil.rst b/docs/sphinx/user_guide/tutorial/offset-layout-5pt-stencil.rst new file mode 100644 index 0000000000..8b75eb37cb --- /dev/null +++ b/docs/sphinx/user_guide/tutorial/offset-layout-5pt-stencil.rst @@ -0,0 +1,203 @@ +.. ## +.. ## Copyright (c) 2016-22, Lawrence Livermore National Security, LLC +.. ## and RAJA project contributors. See the RAJA/LICENSE file +.. ## for details. +.. ## +.. ## SPDX-License-Identifier: (BSD-3-Clause) +.. ## + +.. _tut-offsetlayout-label: + +------------------------------------------------ +OffsetLayout: Five-point Stencil +------------------------------------------------ + +This section contains an exercise file ``RAJA/exercises/offset-layout-stencil.cpp`` +for you to work through if you wish to get some practice with RAJA. The +file ``RAJA/exercises/offset-layout-stencil.cpp`` contains +complete working code for the examples discussed in this section. You can use +the solution file to check your work and for guidance if you get stuck. To build +the exercises execute ``make offset-layout-stencil`` and ``make offset-layout-stencil_solution`` +from the build directory. + +Key RAJA features shown in the following example: + + * ``RAJA::kernel`` loop execution template and execution policies + * ``RAJA::View`` multi-dimensional data access + * ``RAJA:make_offset_layout`` method to create an offset Layout + +The examples in this section apply a five-point stencil to the interior cells +of a two-dimensional lattice and store a resulting sum in a second +lattice of equal size. The five-point stencil associated with a lattice cell +accumulates the value in the cell and each of its four neighbors. We use +``RAJA::View`` and ``RAJA::OffsetLayout`` constructs to simplify +the multi-dimensional indexing so that we can write the stencil operation +naturally, as such:: + + output(row, col) = input(row, col) + + input(row - 1, col) + input(row + 1, col) + + input(row, col - 1) + input(row, col + 1) + +A lattice is assumed to have :math:`N_r \times N_c` interior cells with unit +values surrounded by a halo of cells containing zero values for a total +dimension of :math:`(N_r + 2) \times (N_c + 2)`. For example, when +:math:`N_r = N_c = 3`, the input lattice and values are: + + +---+---+---+---+---+ + | 0 | 0 | 0 | 0 | 0 | + +---+---+---+---+---+ + | 0 | 1 | 1 | 1 | 0 | + +---+---+---+---+---+ + | 0 | 1 | 1 | 1 | 0 | + +---+---+---+---+---+ + | 0 | 1 | 1 | 1 | 0 | + +---+---+---+---+---+ + | 0 | 0 | 0 | 0 | 0 | + +---+---+---+---+---+ + +After applying the stencil, the output lattice and values are: + + +---+---+---+---+---+ + | 0 | 0 | 0 | 0 | 0 | + +---+---+---+---+---+ + | 0 | 3 | 4 | 3 | 0 | + +---+---+---+---+---+ + | 0 | 4 | 5 | 4 | 0 | + +---+---+---+---+---+ + | 0 | 3 | 4 | 3 | 0 | + +---+---+---+---+---+ + | 0 | 0 | 0 | 0 | 0 | + +---+---+---+---+---+ + +For this :math:`(N_r + 2) \times (N_c + 2)` lattice case, here is our +(row, col) indexing scheme. 
+ + +----------+---------+---------+---------+---------+ + | (-1, 3) | (0, 3) | (1, 3) | (2, 3) | (3, 3) | + +----------+---------+---------+---------+---------+ + | (-1, 2) | (0, 2) | (1, 2) | (2, 2) | (3, 2) | + +----------+---------+---------+---------+---------+ + | (-1, 1) | (0, 1) | (1, 1) | (2, 1) | (3, 1) | + +----------+---------+---------+---------+---------+ + | (-1, 0) | (0, 0) | (1, 0) | (2, 0) | (3, 0) | + +----------+---------+---------+---------+---------+ + | (-1, -1) | (0, -1) | (1, -1) | (2, -1) | (3, -1) | + +----------+---------+---------+---------+---------+ + +Notably, :math:`[0, N_r) \times [0, N_c)` corresponds to the interior index +range over which we apply the stencil, and :math:`[-1,N_r+1) \times [-1, N_c+1)` +is the full lattice index range. + +For reference and comparison to the ``RAJA::kernel`` implementations +described below, we begin by walking through a C-style version of the stencil +computation. First, we define the size of our lattice: + +.. literalinclude:: ../../../../exercises/offset-layout-stencil_solution.cpp + :start-after: _stencil_define_start + :end-before: _stencil_define_end + :language: C++ + +Then, after allocating input and output arrays, we initialize the input: + +.. literalinclude:: ../../../../exercises/offset-layout-stencil_solution.cpp + :start-after: _stencil_input_init_start + :end-before: _stencil_input_init_end + :language: C++ + +and compute the reference output solution: + +.. literalinclude:: ../../../../exercises/offset-layout-stencil_solution.cpp + :start-after: _stencil_output_ref_start + :end-before: _stencil_output_ref_end + :language: C++ + + +^^^^^^^^^^^^^^^^^^^ +RAJA Offset Layouts +^^^^^^^^^^^^^^^^^^^ + +We use the ``RAJA::make_offset_layout`` method to construct a +``RAJA::OffsetLayout`` object that we use to create ``RAJA::View`` objects +for our input and output data arrays: + +.. literalinclude:: ../../../../exercises/offset-layout-stencil_solution.cpp + :start-after: _offsetlayout_views_start + :end-before: _offsetlayout_views_end + :language: C++ + +Here, the row index range is :math:`[-1, N_r+1)`, and the column index +range is :math:`[-1, N_c+1)`. The first argument to each call to the +``RAJA::View`` constructor is the pointer to the array that holds the View +data. The second argument is the ``RAJA::OffsetLayout`` object. + +``RAJA::OffsetLayout`` objects allow us to write loops over +data arrays using non-zero based indexing and without having to manually +compute offsets into the arrays. + +For more information about RAJA View and Layout types, please see +:ref:`feat-view-label`. + +^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RAJA Kernel Variants +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +For the RAJA implementations of the stencil computation, we use two +``RAJA::TypedRangeSegment`` objects to define the row and column iteration +spaces for the interior cells: + +.. literalinclude:: ../../../../exercises/offset-layout-stencil_solution.cpp + :start-after: _offsetlayout_ranges_start + :end-before: _offsetlayout_ranges_end + :language: C++ + +Now, we have all the ingredients to implement the stencil computation using +``RAJA::kernel``. Here is a sequential CPU variant: + +.. literalinclude:: ../../../../exercises/offset-layout-stencil_solution.cpp + :start-after: _offsetlayout_rajaseq_start + :end-before: _offsetlayout_rajaseq_end + :language: C++ + +This RAJA variant does the computation as the C-style variant +introduced above. + +Since the input and output arrays are distinct, the stencil computation is +data parallel. 
Thus, we can use ``RAJA::kernel`` and an appropriate +execution policy to run the computation in parallel. Here is an OpenMP +collapse variant that maps the row-column product index space to OpenMP +threads: + +.. literalinclude:: ../../../../exercises/offset-layout-stencil_solution.cpp + :start-after: _offsetlayout_rajaomp_start + :end-before: _offsetlayout_rajaomp_end + :language: C++ + +Note that the lambda expression representing the kernel body is identical to +the ``RAJA::kernel`` sequential version. + +Here are variants for CUDA + +.. literalinclude:: ../../../../exercises/offset-layout-stencil_solution.cpp + :start-after: _offsetlayout_rajacuda_start + :end-before: _offsetlayout_rajacuda_end + :language: C++ + +and HIP + +.. literalinclude:: ../../../../exercises/offset-layout-stencil_solution.cpp + :start-after: _offsetlayout_rajahip_start + :end-before: _offsetlayout_rajahip_end + :language: C++ + +The only difference between the CPU and GPU variants is that the RAJA macro +``RAJA_DEVICE`` is used to decorate the lambda expression with the +``__device__`` annotation, which is required when capturing a lambda for use +in a GPU device environment as we have discussed in other examples in this +tutorial. + +One other point to note is that the CUDA variant in the exercise files +uses Unified Memory and the HIP variant uses distinct host and device memory +arrays, with explicit host-device data copy operations. Thus, new +``RAJA::View`` objects were created for the HIP variant to wrap the +device data pointers used in the HIP kernel. Please see the exercise files +for this example for details. diff --git a/docs/sphinx/user_guide/tutorial/offset-layout.rst b/docs/sphinx/user_guide/tutorial/offset-layout.rst deleted file mode 100644 index 7692738408..0000000000 --- a/docs/sphinx/user_guide/tutorial/offset-layout.rst +++ /dev/null @@ -1,133 +0,0 @@ -.. ## -.. ## Copyright (c) 2016-22, Lawrence Livermore National Security, LLC -.. ## and RAJA project contributors. See the RAJA/LICENSE file -.. ## for details. -.. ## -.. ## SPDX-License-Identifier: (BSD-3-Clause) -.. ## - -.. _offset-label: - ---------------------------------------------- -Stencil Computations (View Offsets) ---------------------------------------------- - -Key RAJA features shown in the following example: - - * ``RAJA::Kernel`` loop execution template - * RAJA kernel execution policies - * ``RAJA::View`` multi-dimensional data access - * ``RAJA:make_offset_layout`` method to apply index offsets - -This example applies a five-cell stencil sum to the interior cells of a -two-dimensional square lattice and stores the resulting sums in a second -lattice of equal size. The five-cell stencil accumulates values from each -interior cell and its four neighbors. We use ``RAJA::View`` and -``RAJA::Layout`` constructs to simplify the multi-dimensional indexing so -that we can write the stencil operation as follows:: - - output(row, col) = input(row, col) + - input(row - 1, col) + input(row + 1, col) + - input(row, col - 1) + input(row, col + 1) - -A lattice is assumed to have :math:`N_r \times N_c` interior cells with unit -values surrounded by a halo of cells containing zero values for a total -dimension of :math:`(N_r + 2) \times (N_c + 2)`. 
For example, when -:math:`N_r = N_c = 3`, the input lattice and values are: - - +---+---+---+---+---+ - | 0 | 0 | 0 | 0 | 0 | - +---+---+---+---+---+ - | 0 | 1 | 1 | 1 | 0 | - +---+---+---+---+---+ - | 0 | 1 | 1 | 1 | 0 | - +---+---+---+---+---+ - | 0 | 1 | 1 | 1 | 0 | - +---+---+---+---+---+ - | 0 | 0 | 0 | 0 | 0 | - +---+---+---+---+---+ - -After applying the stencil, the output lattice and values are: - - +---+---+---+---+---+ - | 0 | 0 | 0 | 0 | 0 | - +---+---+---+---+---+ - | 0 | 3 | 4 | 3 | 0 | - +---+---+---+---+---+ - | 0 | 4 | 5 | 4 | 0 | - +---+---+---+---+---+ - | 0 | 3 | 4 | 3 | 0 | - +---+---+---+---+---+ - | 0 | 0 | 0 | 0 | 0 | - +---+---+---+---+---+ - -For this :math:`(N_r + 2) \times (N_c + 2)` lattice case, here is our -(row, col) indexing scheme. - - +----------+---------+---------+---------+---------+ - | (-1, 3) | (0, 3) | (1, 3) | (2, 3) | (3, 3) | - +----------+---------+---------+---------+---------+ - | (-1, 2) | (0, 2) | (1, 2) | (2, 2) | (3, 2) | - +----------+---------+---------+---------+---------+ - | (-1, 1) | (0, 1) | (1, 1) | (2, 1) | (3, 1) | - +----------+---------+---------+---------+---------+ - | (-1, 0) | (0, 0) | (1, 0) | (2, 0) | (3, 0) | - +----------+---------+---------+---------+---------+ - | (-1, -1) | (0, -1) | (1, -1) | (2, -1) | (3, -1) | - +----------+---------+---------+---------+---------+ - -Notably :math:`[0, N_r) \times [0, N_c)` corresponds to the interior index -range over which we apply the stencil, and :math:`[-1,N_r] \times [-1, N_c]` -is the full lattice index range. - -^^^^^^^^^^^^^^^^^^^ -RAJA Offset Layouts -^^^^^^^^^^^^^^^^^^^ - -We use the ``RAJA::make_offset_layout`` method to construct a -``RAJA::OffsetLayout`` object that defines our two-dimensional indexing scheme. -Then, we create two ``RAJA::View`` objects for each of the input and output -lattice arrays. - -.. literalinclude:: ../../../../examples/tut_offset-layout.cpp - :start-after: _offsetlayout_views_start - :end-before: _offsetlayout_views_end - :language: C++ - -Here, the row index range is :math:`[-1, N_r]`, and the column index -range is :math:`[-1, N_c]`. The first argument to each call to the -``RAJA::View`` constructor is a pointer to an array that holds the data for -the view; we assume the arrays are properly allocated before these calls. - -The offset layout mechanics of RAJA allow us to write loops over -data arrays using non-zero based indexing and without having to manually -compute the proper offsets into the arrays. For more details on the -``RAJA::View`` and ``RAJA::Layout`` concepts we use in this example, please -refer to :ref:`view-label`. - -^^^^^^^^^^^^^^^^^^^^^^^^^^ -RAJA Kernel Implementation -^^^^^^^^^^^^^^^^^^^^^^^^^^ - -For the RAJA implementations of the example computation, we use two -``RAJA::RangeSegment`` objects to define the row and column iteration -spaces for the interior cells: - -.. literalinclude:: ../../../../examples/tut_offset-layout.cpp - :start-after: _offsetlayout_ranges_start - :end-before: _offsetlayout_ranges_end - :language: C++ - -Here, is an implementation using ``RAJA::kernel`` multi-dimensional loop -execution with a sequential execution policy. - -.. literalinclude:: ../../../../examples/tut_offset-layout.cpp - :start-after: _offsetlayout_rajaseq_start - :end-before: _offsetlayout_rajaseq_end - :language: C++ - -Since the stencil operation is data parallel, any parallel execution policy -may be used. 
The file ``RAJA/examples/tut_offset-layout.cpp`` contains a -complete working example code with various parallel implementations. For more -information about using the ``RAJA::kernel`` interface, please see -:ref:`loop_elements-kernel-label`. diff --git a/docs/sphinx/user_guide/tutorial/permuted-layout-batch-matrix-multiply.rst b/docs/sphinx/user_guide/tutorial/permuted-layout-batch-matrix-multiply.rst new file mode 100644 index 0000000000..9f229912db --- /dev/null +++ b/docs/sphinx/user_guide/tutorial/permuted-layout-batch-matrix-multiply.rst @@ -0,0 +1,161 @@ +.. ## +.. ## Copyright (c) 2016-22, Lawrence Livermore National Security, LLC +.. ## and RAJA project contributors. See the RAJA/LICENSE file +.. ## for details. +.. ## +.. ## SPDX-License-Identifier: (BSD-3-Clause) +.. ## + +.. _tut-permutedlayout-label: + +----------------------------------------------- +Permuted Layout: Batched Matrix-Multiplication +----------------------------------------------- + +This section contains an exercise file +``RAJA/exercises/permuted-layout-batch-matrix-multiply.cpp`` for you to work +through if you wish to get some practice with RAJA. The file +``RAJA/exercises/permuted-layout-batch-matrix-multiply_solution.cpp`` contains +complete working code for the examples discussed in this section. You can use +the solution file to check your work and for guidance if you get stuck. +To build the exercises execute ``make permuted-layout-batch-matrix-multiply`` +and ``make permuted-layout-batch-matrix-multiply_solution`` from the build +directory. + +Key RAJA features shown in the following example: + + * ``RAJA::forall`` loop traversal template + * RAJA execution policies + * ``RAJA::View`` multi-dimensional data access + * ``RAJA::make_permuted_layout`` method to permute data ordering + +This example performs a "batched" matrix multiplication operation for a +collection of :math:`3 \times 3` matrices. Each pair of matrices +:math:`A^{e}` and :math:`B^{e}`, indexed by 'e', is multiplied and the product +is stored in a matrix :math:`C^{e}`. :math:`A^{e}` matrix entries, for all +values of e, are stored in an array :math:`A`, all :math:`B^{e}` matrices +are stored in an array :math:`B`, and all :math:`C^{e}` matrices are stored in +an array :math:`C`. In the following discussion, the notation +:math:`A^{e}_{rc}` indicates the row r and column c entry of the +:math:`3 \times 3` matrix :math:`A^{e}`. + +In the exercise, we use two different data layouts for the arrays :math:`A`, +:math:`B`, and :math:`C` to represent different storage patterns for the +:math:`3 \times 3` matrices. Below, we describe these layouts +for two :math:`3 \times 3` matrices. The extension to more than two +matrices is straightforward as you will see in the exercise code. In the +exercise code, we time the execution of the batched matrix multiplication +operation to compare the performance for each layout and execution policy. +These comparisons are not completely conclusive as to which layout is best since +there may be additional performance to be gained by more specific tuning of +the memory layouts for an architecture and execution back-end. A complete, +detailed analysis of the performance implications of memory layout and access +patterns is beyond the scope of the exercise. + +In **layout 1**, the entries for each :math:`3 \times 3` matrix are contiguous +in memory following row major ordering; i.e., the ordering is column index, +then row index, then matrix index: + +.. 
math:: + A = [A^{0}_{00}, A^{0}_{01}, A^{0}_{02}, + A^{0}_{10}, A^{0}_{11}, A^{0}_{12}, + A^{0}_{20}, A^{0}_{21}, A^{0}_{22},\\ + A^{1}_{00}, A^{1}_{01}, A^{1}_{02}, + A^{1}_{10}, A^{1}_{11}, A^{1}_{12}, + A^{1}_{20}, A^{1}_{21}, A^{1}_{22}] + +In **layout 2**, the matrix entries are first ordered by matrix index, +then by column index, and finally by row index: + +.. math:: + A = [A^{0}_{00}, A^{1}_{00}, A^{0}_{01}, + A^{1}_{01}, A^{0}_{02}, A^{1}_{02}, + A^{0}_{10}, A^{1}_{10}, A^{0}_{11},\\ + A^{1}_{11}, A^{0}_{12}, A^{1}_{12}, + A^{0}_{20}, A^{1}_{20}, A^{0}_{21}, + A^{1}_{21}, A^{0}_{22}, A^{1}_{22}] + +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Permuted Layouts +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Next, we show how to construct the two data layouts described above using +``RAJA::View`` and ``RAJA::Layout`` objects. For more information on these +RAJA concepts, please see :ref:`feat-view-label`. + +The views to access data for layout 1 are constructed as follows: + +.. literalinclude:: ../../../../exercises/permuted-layout-batch-matrix-multiply_solution.cpp + :start-after: _permutedlayout_defviews_start + :end-before: _permutedlayout_defviews_end + :language: C++ + +The first argument to ``RAJA::make_permuted_layout`` is an array +whose entries correspond to the extent of each layout dimension. Here, we have +:math:`N` :math:`N_r \times N_c` matrices. The second argument, the layout +permutation, describes the striding order of the array indices. Note that +since this case follows the default RAJA ordering convention +(see :ref:`feat-view-label`), we use the identity permutation '(0,1,2)'. For each +matrix, the column index (index 2) has unit stride and the row index (index 1) +has stride :math:`N_c`, the number of columns in each matrix. The matrix index +(index 0) has stride :math:`N_r \times N_c`, the number of entries in each +matrix. + +The views for layout 2 are constructed similarly, with a different index +striding order: + +.. literalinclude:: ../../../../exercises/permuted-layout-batch-matrix-multiply_solution.cpp + :start-after: _permutedlayout_permviews_start + :end-before: _permutedlayout_permviews_end + :language: C++ + +Here, the first argument to ``RAJA::make_permuted_layout`` is the same as in +layout 1 since we have the same number of matrices with the same matrix +dimensions, and we will use the same indexing scheme to access the matrix +entries. However, the permutation we use is '(1,2,0)'. This makes the matrix +index (index 0) have unit stride, the column index (index 2) have stride +N, which is the number of matrices, and the row index (index 1) has +stride :math:`N \times N_c`. + +^^^^^^^^^^^^^^^^^^^^^^ +RAJA Kernel Variants +^^^^^^^^^^^^^^^^^^^^^^ + +The exercise files contain RAJA variants that run the batched matrix +multiplication kernel with different execution back-ends. As mentioned +earlier, we print out execution timings for each so you can compare the run +times of the different layouts described above. For example, the sequential +CPU variant using layout 1 is: + +.. literalinclude:: ../../../../exercises/permuted-layout-batch-matrix-multiply_solution.cpp + :start-after: _permutedlayout_batchedmatmult_loop_start + :end-before: _permutedlayout_batchedmatmult_loop_end + :language: C++ + +The sequential CPU variant using layout 2 is: + +.. 
literalinclude:: ../../../../exercises/permuted-layout-batch-matrix-multiply_solution.cpp + :start-after: _permutedlayout2_batchedmatmult_loop_start + :end-before: _permutedlayout2_batchedmatmult_loop_end + :language: C++ + +The only differences between these two are the names of the views that appear +in the lambda expression loop body since a different layout is used to create +view objects for each layout case. To make the algorithm code identical for all +cases, we could use type aliases for the view and layout types in a header +file similar to how we may abstract the execution policy out of the +algorithm, and compile the code for the case we want to run. + +For comparison, here is an OpenMP CPU variant using layout 1: + +.. literalinclude:: ../../../../exercises/permuted-layout-batch-matrix-multiply_solution.cpp + :start-after: _permutedlayout_batchedmatmult_omp_start + :end-before: _permutedlayout_batchedmatmult_omp_end + :language: C++ + +The only difference between this variant and the sequential CPU variant shown +above is the execution policy. The lambda expression loop body is identical +to the sequential CPU variant. + +The exercise files also contain variants for RAJA CUDA and HIP back-ends. +Their similarities and differences are the same as what we've just described. diff --git a/docs/sphinx/user_guide/tutorial/permuted-layout.rst b/docs/sphinx/user_guide/tutorial/permuted-layout.rst deleted file mode 100644 index 9e56c77bfe..0000000000 --- a/docs/sphinx/user_guide/tutorial/permuted-layout.rst +++ /dev/null @@ -1,116 +0,0 @@ -.. ## -.. ## Copyright (c) 2016-22, Lawrence Livermore National Security, LLC -.. ## and RAJA project contributors. See the RAJA/LICENSE file -.. ## for details. -.. ## -.. ## SPDX-License-Identifier: (BSD-3-Clause) -.. ## - -.. _permuted-layout-label: - ---------------------------------------------- -Batched Matrix-Multiply (Permuted Layouts) ---------------------------------------------- - -Key RAJA features shown in the following example: - - * ``RAJA::forall`` loop traversal template - * RAJA execution policies - * ``RAJA::View`` multi-dimensional data access - * ``RAJA::make_permuted_layout`` method to permute data ordering - -This example performs batched matrix multiplication for a set of -:math:`3 \times 3` matrices using two different data layouts. - -Matrices :math:`A` and :math:`B` are multiplied with the product stored in -matrix :math:`C`. The notation :math:`A^{e}_{rc}` indicates the row r and -column c entry of matrix e. We describe the two data layouts we use for two -matrices. The extension to more than two matrices is straightforward. Using -different data layouts, we can assess which performs best for a given -execution policy and computing environment. - -Layout 1: -Entries in each matrix are grouped together with each each having row major -ordering; i.e., - -.. math:: - A = [A^{0}_{00}, A^{0}_{01}, A^{0}_{02}, - A^{0}_{10}, A^{0}_{11}, A^{0}_{12}, - A^{0}_{20}, A^{0}_{21}, A^{0}_{22},\\ - A^{1}_{00}, A^{1}_{01}, A^{1}_{02}, - A^{1}_{10}, A^{1}_{11}, A^{1}_{12}, - A^{1}_{20}, A^{1}_{21}, A^{1}_{22}]; - -Layout 2: -Matrix entries are first ordered by matrix index, -then by column index, and finally by row index; i.e., - -.. 
math:: - A = [A^{0}_{00}, A^{1}_{00}, A^{0}_{01}, - A^{1}_{01}, A^{0}_{02}, A^{1}_{02}, - A^{0}_{10}, A^{1}_{10}, A^{0}_{11},\\ - A^{1}_{11}, A^{0}_{12}, A^{1}_{12}, - A^{0}_{20}, A^{1}_{20}, A^{0}_{21}, - A^{1}_{21}, A^{0}_{22}, A^{1}_{22}]; - -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Permuted Layouts -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Next, we show how to construct the two data layouts using ``RAJA::View`` and -``RAJA::Layout`` objects. For more details on these RAJA concepts, please -refer to :ref:`view-label`. - -The views for layout 1 are constructed as follows: - -.. literalinclude:: ../../../../examples/tut_batched-matrix-multiply.cpp - :start-after: _permutedlayout_defviews_start - :end-before: _permutedlayout_defviews_end - :language: C++ - -The first argument to ``RAJA::make_permuted_layout`` is a C++ array -whose entries correspond to the size of each array dimension; i.e., we have -'N' :math:`N_r \times N_c` matrices. The second argument describes the -striding order of the array dimensions. Note that since this case follows -the default RAJA ordering convention (see :ref:`view-label`), we use the -identity permutation '(0,1,2)'. For each matrix, the column index (index 2) -has unit stride and the row index (index 1) has stride 3 (number of columns). -The matrix index (index 0) has stride 9 (:math:`N_c \times N_r`). - -The views for layout 2 are constructed similarly: - -.. literalinclude:: ../../../../examples/tut_batched-matrix-multiply.cpp - :start-after: _permutedlayout_permviews_start - :end-before: _permutedlayout_permviews_end - :language: C++ - -Here, the first argument to ``RAJA::make_permuted_layout`` is the same as in -layout 1 since we have the same number of matrices, matrix dimensions and we -will use the same indexing scheme to access the matrix entries. However, the -permutation we use is '(1,2,0)'. This makes the matrix index (index 0) have -unit stride, the column index (index 2) for each matrix has stride N, which -is the number of matrices, and the row index (index 1) has -stride :math:`N \times N_c`. - -^^^^^^^^^^^^^^^^^^^ -Example Code -^^^^^^^^^^^^^^^^^^^ - -Complete working examples that run the batched matrix-multiplication -computation for both layouts and various RAJA execution policies is located -in the file ``RAJA/examples/tut_batched-matrix-multiply.cpp``. - -It compares the execution run times of the two layouts described above -using four RAJA back-ends (Sequential, OpenMP, CUDA, and HIP). The OpenMP -version for layout 1 looks like this: - -.. literalinclude:: ../../../../examples/tut_batched-matrix-multiply.cpp - :start-after: _permutedlayout_batchedmatmult_omp_start - :end-before: _permutedlayout_batchedmatmult_omp_end - :language: C++ - -The only differences between the lambda loop body for layout 1 and layout 2 -cases are the names of the views. To make the algorithm code identical for all -cases, we would use type aliases for the view and layout types in a header -file similarly to how we would abstract the execution policy out of the -algorithm. diff --git a/docs/sphinx/user_guide/tutorial/reductions.rst b/docs/sphinx/user_guide/tutorial/reductions.rst index bfd4344aff..effcb65378 100644 --- a/docs/sphinx/user_guide/tutorial/reductions.rst +++ b/docs/sphinx/user_guide/tutorial/reductions.rst @@ -6,47 +6,61 @@ .. ## SPDX-License-Identifier: (BSD-3-Clause) .. ## -.. _reductions-label: +.. 
_tut-reduction-label: ---------------------------------- -Reductions ---------------------------------- +----------------------------------------------------- +Reduction Types and Kernels with Multiple Reductions +----------------------------------------------------- -Key RAJA features shown in this example: +This section contains an exercise file ``RAJA/exercises/reductions.cpp`` +for you to work through if you wish to get some practice with RAJA. The +file ``RAJA/exercises/reductions_solution.cpp`` contains complete +working code for the examples discussed in this section. You can use the +solution file to check your work and for guidance if you get stuck. To build +the exercises execute ``make reductions`` and ``make reductions_solution`` +from the build directory. - * ``RAJA::forall`` loop execution template - * ``RAJA::RangeSegment`` iteration space construct - * RAJA reduction types - * RAJA reduction policies +Key RAJA features shown in this section are: -In the :ref:`dotproduct-label` example, we showed how to use the RAJA sum + * ``RAJA::forall`` loop execution template and execution policies + * ``RAJA::TypedRangeSegment`` iteration space construct + * RAJA reduction types and reduction policies + +In the :ref:`tut-dotproduct-label` exercise, we showed how to use the RAJA sum reduction type. The following example uses all supported RAJA reduction types: min, max, sum, min-loc, max-loc. +.. note:: RAJA 'min-loc' and 'max-loc' reductions determine the min and max + reduction value, respectively, along with an iteration index at + which the main/max value is found. + .. note:: Multiple RAJA reductions can be combined in any RAJA loop kernel execution method, and reduction operations can be combined with any other kernel operations. -We start by allocating an array (the memory manager in the example uses -CUDA Unified Memory if CUDA is enabled) and initializing its values in a +.. note:: Each RAJA reduction type requires a reduction policy that must + be compatible with the execution policy for the kernel in which + it is used. + +We start by allocating an array and initializing its values in a manner that makes the example mildly interesting and able to show what the different reduction types do. Specifically, the array is initialized to a sequence of alternating values ('1' and '-1'). Then, two values near the middle of the array are set to '-100' and '100': -.. literalinclude:: ../../../../examples/tut_reductions.cpp +.. literalinclude:: ../../../../exercises/reductions_solution.cpp :start-after: _reductions_array_init_start :end-before: _reductions_array_init_end :language: C++ We also define a range segment to iterate over the array: -.. literalinclude:: ../../../../examples/tut_reductions.cpp +.. literalinclude:: ../../../../exercises/reductions_solution.cpp :start-after: _reductions_range_start - :end-before: _reductions_arange_end + :end-before: _reductions_range_end :language: C++ -With these parameters and data initialization, all the code examples +With these parameters and data initialization, the code example presented below will generate the following results: * the sum will be zero @@ -55,25 +69,27 @@ presented below will generate the following results: * the min loc will be N/2 * the max loc will be N/2 + 1 -A sequential kernel that exercises all RAJA sequential reduction types is: +A sequential kernel that exercises all RAJA sequential reduction types +along with operations after the kernel to print the reduced values is: -.. 
literalinclude:: ../../../../examples/tut_reductions.cpp +.. literalinclude:: ../../../../exercises/reductions_solution.cpp :start-after: _reductions_raja_seq_start :end-before: _reductions_raja_seq_end :language: C++ Note that each reduction object takes an initial value at construction. Also, within the kernel, updating each reduction is done via an operator or method -that is basically what you would expect (i.e., '+=' for sum, 'min()' for min, -etc.). After the kernel executes, the reduced value computed by each reduction +that is basically what you would expect for the type of reduction +(e.g., '+=' for sum, 'min()' for min, etc.). After the kernel executes, the +reduced value computed by each reduction object is retrieved after the kernel by calling a 'get()' method on the reduction object. The min-loc/max-loc index values are obtained using 'getLoc()' methods. -For parallel multithreading execution via OpenMP, the example can be run -by replacing the execution and reduction policies with: +For parallel multithreading execution via OpenMP, the exercise can be run with +the execution and reduction policies: -.. literalinclude:: ../../../../examples/tut_reductions.cpp +.. literalinclude:: ../../../../exercises/reductions_solution.cpp :start-after: _reductions_raja_omppolicy_start :end-before: _reductions_raja_omppolicy_end :language: C++ @@ -81,21 +97,15 @@ by replacing the execution and reduction policies with: Similarly, the kernel containing the reductions can be run in parallel on a GPU using CUDA policies: -.. literalinclude:: ../../../../examples/tut_reductions.cpp +.. literalinclude:: ../../../../exercises/reductions_solution.cpp :start-after: _reductions_raja_cudapolicy_start :end-before: _reductions_raja_cudapolicy_end :language: C++ or HIP policies: -.. literalinclude:: ../../../../examples/tut_reductions.cpp +.. literalinclude:: ../../../../exercises/reductions_solution.cpp :start-after: _reductions_raja_hippolicy_start :end-before: _reductions_raja_hippolicy_end :language: C++ -.. note:: Each RAJA reduction type requires a reduction policy that must - be compatible with the execution policy for the kernel in which - it is used. - -The file ``RAJA/examples/tut_reductions.cpp`` contains the complete -working example code. diff --git a/docs/sphinx/user_guide/tutorial/scan.rst b/docs/sphinx/user_guide/tutorial/scan.rst index a06c7f2eb6..10cc0535fb 100644 --- a/docs/sphinx/user_guide/tutorial/scan.rst +++ b/docs/sphinx/user_guide/tutorial/scan.rst @@ -6,81 +6,105 @@ .. ## SPDX-License-Identifier: (BSD-3-Clause) .. ## -.. _scan-label: +.. _tut-scan-label: -------------------------------------------------- Parallel Scan Operations -------------------------------------------------- -Key RAJA features shown in this section: +This section contains an exercise file ``RAJA/exercises/scan.cpp`` +for you to work through if you wish to get some practice with RAJA. The +file ``RAJA/exercises/scan_solution.cpp`` contains complete +working code for the examples discussed in this section. You can use the +solution file to check your work and for guidance if you get stuck. To build +the exercises execute ``make scan`` and ``make scan_solution`` +from the build directory. - * ``RAJA::inclusive_scan`` operation - * ``RAJA::inclusive_scan_inplace`` operation - * ``RAJA::exclusive_scan`` operation - * ``RAJA::exclusive_scan_inplace`` operation - * RAJA operators for different types of scans; e.g., plus, minimum, maximum, etc. 
+Key RAJA features shown in this section are: -Below, we present examples of RAJA sequential, OpenMP, -and CUDA scan operations and show how different scan operations can be + * ``RAJA::inclusive_scan``, ``RAJA::inclusive_scan_inplace``, + ``RAJA::exclusive_scan``, and ``RAJA::exclusive_scan_inplace`` operations + and execution policies + * RAJA operators for different types of scans; e.g., plus, minimum, maximum, + etc. + +In this section, we present examples of various RAJA scan operations using +multiple RAJA execution back-ends. Different scan operations can be performed by passing different RAJA operators to the RAJA scan template methods. Each operator is a template type, where the template argument is the type of the values it operates on. For a summary of RAJA scan -functionality, please see :ref:`scan-label`. +functionality, please see :ref:`feat-scan-label`. .. note:: RAJA scan operations use the same execution policy types that - ``RAJA::forall`` loop execution templates do. + ``RAJA::forall`` kernel execution templates do. + +.. note:: RAJA scan operations take 'span' arguments to express the sequential + index range of array entries used in the scan. Typically, these + span objects are created using the ``RAJA::make_span`` method + as shown in the examples below. Each of the examples below uses the same integer arrays for input -and output values. We set the input array and print them as follows: +and output values. We initialize the input array and print its values as such: -.. literalinclude:: ../../../../examples/tut_scan.cpp +.. literalinclude:: ../../../../exercises/scan_solution.cpp :start-after: _scan_array_init_start :end-before: _scan_array_init_end :language: C++ -This generates the following sequence of values in the 'in' array:: +This generates the following sequence of values. This sequence will be used as +the 'in' array for each of the following examples.:: - 3 -1 2 15 7 5 17 9 6 18 1 10 0 14 13 4 11 12 8 16 + -1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 ^^^^^^^^^^^^^^^^ Inclusive Scans ^^^^^^^^^^^^^^^^ -A sequential inclusive scan operation is performed by: +RAJA's scan operations are standalone operations. That is, they cannot be +combined with other operations in a kernel. A sequential +inclusive scan operation can be executed like so: -.. literalinclude:: ../../../../examples/tut_scan.cpp +.. literalinclude:: ../../../../exercises/scan_solution.cpp :start-after: _scan_inclusive_seq_start :end-before: _scan_inclusive_seq_end :language: C++ -Since no operator is passed to the scan method, the default 'sum' operation +Since no operator is passed to the scan method, the default 'plus' operation is applied and the result generated in the 'out' array is a prefix-sum based on the 'in' array. The resulting 'out' array contains the values:: - 3 2 4 19 26 31 48 57 63 81 82 92 92 106 119 123 134 146 154 170 + -1 -1 0 2 5 9 14 20 27 35 44 54 65 77 90 104 119 135 152 170 + +In particular, each entry in the output array is a *partial sum* of all +input array entries up to that array index. -We can be explicit about the operation used in the scan by passing the -'plus' operator to the scan method: +We can be explicit about the operation used in the scan by passing the RAJA +'plus' operator ``RAJA::operators::plus`` to the scan method: -.. literalinclude:: ../../../../examples/tut_scan.cpp +.. 
literalinclude:: ../../../../exercises/scan_solution.cpp :start-after: _scan_inclusive_seq_plus_start :end-before: _scan_inclusive_seq_plus_end :language: C++ -The result in the 'out' array is the same. +The result in the 'out' array is the same as above. An inclusive parallel scan operation using OpenMP multithreading is accomplished similarly by replacing the execution policy type: -.. literalinclude:: ../../../../examples/tut_scan.cpp +.. literalinclude:: ../../../../exercises/scan_solution.cpp :start-after: _scan_inclusive_omp_plus_start :end-before: _scan_inclusive_omp_plus_end :language: C++ -As is commonly done with RAJA, the only difference between this code and -the previous one is that the execution policy is different. If we want to -run the scan on a GPU using CUDA, we would use a CUDA execution policy. This -will be shown shortly. +As expected, this produces the same result as the previous two examples. + +As is commonly the case with RAJA, the only difference between this code and +the previous one is the execution policy. If we want to +run the scan on a GPU using CUDA, we would use a CUDA execution policy as +is shown in examples below. + +.. note:: If no operator is passed to a RAJA scan operation, the default + plus operator is used, resulting in a prefix-sum. ^^^^^^^^^^^^^^^^ Exclusive Scans @@ -88,89 +112,102 @@ Exclusive Scans A sequential exclusive scan (plus) operation is performed by: -.. literalinclude:: ../../../../examples/tut_scan.cpp +.. literalinclude:: ../../../../exercises/scan_solution.cpp :start-after: _scan_exclusive_seq_plus_start :end-before: _scan_exclusive_seq_plus_end :language: C++ This generates the following sequence of values in the output array:: - 0 3 2 4 19 26 31 48 57 63 81 82 92 92 106 119 123 134 146 154 + 0 -1 -1 0 2 5 9 14 20 27 35 44 54 65 77 90 104 119 135 152 + +The result of an exclusive scan is similar to the result of an +inclusive scan, but differs in two ways. First, the first entry in +the exclusive scan output array is the `identity` of the operator used. +In the example here, it is zero, since the operator is 'plus'. +Second, the output sequence is shifted one position to the right +when compared to an inclusive scan. + +.. note:: The `identity` of an operator is the default value of a given type + for that operation. For example: + - The identity of an int for a sum operation is 0. + - The identity of an int for a maximum operation is -2147483648. -Note that the exclusive scan result is different than the inclusive scan -result in two ways. The first entry in the result is the `identity` of the -operator used (here, it is zero, since the operator is 'plus') and, after -that, the output sequence is shifted one position to the right. Running the same scan operation on a GPU using CUDA is done by: -.. literalinclude:: ../../../../examples/tut_scan.cpp +.. literalinclude:: ../../../../exercises/scan_solution.cpp :start-after: _scan_exclusive_cuda_plus_start :end-before: _scan_exclusive_cuda_plus_end :language: C++ Note that we pass the number of threads per CUDA thread block as the template -argument to the CUDA execution policy as we do in other cases. +argument to the CUDA execution policy as we do when using ``RAJA::forall``. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ In-place Scans and Other Operators ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -*In-place* scan operations generate the same results as the scan operations +*In-place* scan variants generate the same results as the scan operations we have just described. 
However, the result is generated in the input array directly so **only one array is passed to in-place scan methods.** Here is a sequential inclusive in-place scan that uses the 'minimum' operator: -.. literalinclude:: ../../../../examples/tut_scan.cpp +.. literalinclude:: ../../../../exercises/scan_solution.cpp :start-after: _scan_inclusive_inplace_seq_min_start :end-before: _scan_inclusive_inplace_seq_min_end :language: C++ -Note that, before the scan, we copy the input array into the output array so -the result is generated in the output array. Doing this, we avoid having to -re-initialize the input array to use it in other examples. +Note that, before the scan operation is invoked, we copy the +input array into the output array to provide the scan input array we want. This generates the following sequence in the output array:: - 3 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 + -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 + +Since the operator used in the scan is 'minimum' and the smallest values in +the input array is the first entry, the result is an array with that value +in all array slots. Here is a sequential exclusive in-place scan that uses the 'maximum' operator: -.. literalinclude:: ../../../../examples/tut_scan.cpp +.. literalinclude:: ../../../../exercises/scan_solution.cpp :start-after: _scan_exclusive_inplace_seq_max_start :end-before: _scan_exclusive_inplace_seq_max_end :language: C++ This generates the following sequence in the output array:: - -2147483648 3 3 3 15 15 15 17 17 17 18 18 18 18 18 18 18 18 18 18 + -2147483648 -1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 -Note that the first value in the result is the negative of the max int value; -i.e., the identity of the maximum operator. +Since it is an exclusive scan, the first value in the result is the negative +of the max int value, which is the identity of the 'maximum' operator. As you may expect at this point, running an exclusive in-place prefix-sum operation using OpenMP is accomplished by: -.. literalinclude:: ../../../../examples/tut_scan.cpp +.. literalinclude:: ../../../../exercises/scan_solution.cpp :start-after: _scan_exclusive_inplace_omp_plus_start :end-before: _scan_exclusive_inplace_omp_plus_end :language: C++ This generates the following sequence in the output array (as we saw earlier):: - 0 3 2 4 19 26 31 48 57 63 81 82 92 92 106 119 123 134 146 15 + 0 -1 -1 0 2 5 9 14 20 27 35 44 54 65 77 90 104 119 135 152 and the only difference is the execution policy template parameter. Lastly, we show a parallel inclusive in-place prefix-sum operation using CUDA: -.. literalinclude:: ../../../../examples/tut_scan.cpp +.. literalinclude:: ../../../../exercises/scan_solution.cpp :start-after: _scan_inclusive_inplace_cuda_plus_start :end-before: _scan_inclusive_inplace_cuda_plus_end :language: C++ -.. note:: RAJA scans for the HIP back-end are similar to those for CUDA. +and the same using the RAJA HIP back-end: -The file ``RAJA/examples/tut_scan.cpp`` contains the complete -working example code. +.. literalinclude:: ../../../../exercises/scan_solution.cpp + :start-after: _scan_inclusive_inplace_hip_plus_start + :end-before: _scan_inclusive_inplace_hip_plus_end + :language: C++ diff --git a/docs/sphinx/user_guide/tutorial/sort.rst b/docs/sphinx/user_guide/tutorial/sort.rst index a44fe9b7b2..1eefab9de9 100644 --- a/docs/sphinx/user_guide/tutorial/sort.rst +++ b/docs/sphinx/user_guide/tutorial/sort.rst @@ -6,87 +6,111 @@ .. ## SPDX-License-Identifier: (BSD-3-Clause) .. ## -.. 
_sort-label: +.. _tut-sort-label: -------------------------------------------------- Parallel Sort Operations -------------------------------------------------- -Key RAJA features shown in this section: +This section contains an exercise file ``RAJA/exercises/sort.cpp`` +for you to work through if you wish to get some practice with RAJA. The +file ``RAJA/exercises/sort_solution.cpp`` contains complete +working code for the examples discussed in this section. You can use the +solution file to check your work and for guidance if you get stuck. To build +the exercises execute ``make sort`` and ``make sort_solution`` +from the build directory. - * ``RAJA::sort`` operation - * ``RAJA::sort_pairs`` operation - * ``RAJA::stable_sort`` operation - * ``RAJA::stable_sort_pairs`` operation +Key RAJA features shown in this section are: + + * ``RAJA::sort``, ``RAJA::sort_pairs``, ``RAJA::stable_sort``, and ``RAJA::stable_sort_pairs`` operations and execution policies * RAJA comparators for different types of sorts; e.g., less, greater -Below, we present examples of RAJA sequential, OpenMP, -and CUDA sort operations and show how different sort orderings can be -achieved by passing different RAJA comparators to the RAJA sort template -methods. Each comparator is a template type, where the template argument is -the type of the values it compares. For a summary of RAJA sort -functionality, please see :ref:`sort-label`. +We show examples of RAJA sort operations using multiple RAJA execution +back-ends and describe how different sort orderings can be achieved by +passing different RAJA comparators to the RAJA sort template methods. Each +comparator is a template type, where the template argument is the type of +the values it compares. For a summary of available RAJA sorts, please see +:ref:`feat-sort-label`. .. note:: RAJA sort operations use the same execution policy types that ``RAJA::forall`` loop execution templates do. +.. note:: RAJA sort operations take 'span' arguments to express the sequential + index range of array entries used in the sort. Typically, these + span objects are created using the ``RAJA::make_span`` method + as shown in the examples below. + Each of the examples below uses the same integer arrays for input and output values. We set the input array and print them as follows: -.. literalinclude:: ../../../../examples/tut_sort.cpp +.. literalinclude:: ../../../../exercises/sort_solution.cpp :start-after: _sort_array_init_start :end-before: _sort_array_init_end :language: C++ -This generates the following sequence of values in the ``in`` array:: +This produces the following sequence of values in the ``in`` array:: 6 7 2 1 0 9 4 8 5 3 4 9 6 3 7 0 1 8 2 5 -and the following sequence of (key, value) pairs in the ``in`` and ``in_vals`` -arrays:: +and the following sequence of (key, value) pairs shown as pairs of values +in the ``in`` and ``in_vals`` arrays, respectively:: (6,0) (7,0) (2,0) (1,0) (0,0) (9,0) (4,0) (8,0) (5,0) (3,0) (4,1) (9,1) (6,1) (3,1) (7,1) (0,1) (1,1) (8,1) (2,1) (5,1) +.. note:: In the following sections, we discuss *stable* and *unstable* sort + operations. The difference between them is that a stable sort + preserves the relative order of equal elements, with respect to the + sort comparator operation, while an unstable sort may not preserve + the relative order of equal elements. For the examples below that + use integer arrays, there is no way to tell by inspecting the + output whether relative ordering is preserved for unstable sorts. 
+ However, the preservation of relative ordering can be seen in the + sort pairs examples below. + ^^^^^^^^^^^^^^^^ Unstable Sorts ^^^^^^^^^^^^^^^^ A sequential unstable sort operation is performed by: -.. literalinclude:: ../../../../examples/tut_sort.cpp +.. literalinclude:: ../../../../exercises/sort_solution.cpp :start-after: _sort_seq_start :end-before: _sort_seq_end :language: C++ -Since no comparator is passed to the sort method, the default less operation -is applied and the result generated in the ``out`` array is non-decreasing sort -on the ``out`` array. The resulting ``out`` array contains the values:: +Since no comparator is passed to the sort method, the default 'less' operator +``RAJA::operators::less`` is applied and the result generated in the +``out`` array is a non-decreasing sequence of values from the ``in`` array; +i.e.,:: 0 0 1 1 2 2 3 3 4 4 5 5 6 6 7 7 8 8 9 9 -We can be explicit about the operation used in the sort by passing the -less operator to the sort method: +We can be explicit about the operation used in the sort operation by passing +the 'less' operator to the sort method manually: -.. literalinclude:: ../../../../examples/tut_sort.cpp +.. literalinclude:: ../../../../exercises/sort_solution.cpp :start-after: _sort_seq_less_start :end-before: _sort_seq_less_end :language: C++ -The result in the ``out`` array is the same. +The result in the ``out`` array is the same as before:: + + 0 0 1 1 2 2 3 3 4 4 5 5 6 6 7 7 8 8 9 9 -An unstable parallel sort operation using OpenMP multi-threading is -accomplished similarly by replacing the execution policy type: +An unstable parallel sort operation using OpenMP multithreading is +accomplished similarly by replacing the execution policy type with +and OpenMP policy: -.. literalinclude:: ../../../../examples/tut_sort.cpp +.. literalinclude:: ../../../../exercises/sort_solution.cpp :start-after: _sort_omp_less_start :end-before: _sort_omp_less_end :language: C++ -As is commonly done with RAJA, the only difference between this code and +As is common with RAJA, the only difference between this code and the previous one is that the execution policy is different. If we want to -run the sort on a GPU using CUDA, we would use a CUDA execution policy. This -will be shown shortly. +run the sort on a GPU using CUDA or HIP, we would use a CUDA or HIP execution +policy. This is shown in examples that follow. ^^^^^^^^^^^^^^^^ Stable Sorts @@ -94,62 +118,80 @@ Stable Sorts A sequential stable sort (less) operation is performed by: -.. literalinclude:: ../../../../examples/tut_sort.cpp +.. literalinclude:: ../../../../exercises/sort_solution.cpp :start-after: _sort_stable_seq_less_start :end-before: _sort_stable_seq_less_end :language: C++ -This generates the following sequence of values in the output array:: +This generates the following sequence of values in the output array +as expected based on the examples we discussed above:: 0 0 1 1 2 2 3 3 4 4 5 5 6 6 7 7 8 8 9 9 Note that the stable sort result is the same as the unstable sort in this case -because we are sorting integers. We will show an example of sorting pairs later -where this is not the case. +because we are sorting an array of integers. We will show an example of +sorting pairs later where this is not the case. Running the same sort operation on a GPU using CUDA is done by: -.. literalinclude:: ../../../../examples/tut_sort.cpp +.. 
literalinclude:: ../../../../exercises/sort_solution.cpp :start-after: _sort_stable_cuda_less_start :end-before: _sort_stable_cuda_less_end :language: C++ Note that we pass the number of threads per CUDA thread block as the template -argument to the CUDA execution policy as we do in other cases. +argument to the CUDA execution policy as we do when using ``RAJA::forall``. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Other Comparators ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Using a different comparator allows sorting in a different order. -Here is a sequential stable sort that uses the greater operator: +Using a different comparator operator allows sorting in a different order. +Here is a sequential stable sort that uses the 'greater' operator +``RAJA::operators::greater``: -.. literalinclude:: ../../../../examples/tut_sort.cpp +.. literalinclude:: ../../../../exercises/sort_solution.cpp :start-after: _sort_stable_seq_greater_start :end-before: _sort_stable_seq_greater_end :language: C++ -This generates the following sequence of values in non-increasing order in -the output array:: +and similarly for HIP: + +.. literalinclude:: ../../../../exercises/sort_solution.cpp + :start-after: _sort_stable_hip_greater_start + :end-before: _sort_stable_hip_greater_end + :language: C++ + +Both of these sorts generate the following sequence of values in +non-increasing order in the output array:: 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 -Note that the only operators provided by RAJA that are valid to use in sort -because they form a strict weak ordering of elements for arithmetic types are -less and greater. Also note that the the cuda sort backend only supports -RAJA's operators less and greater. +.. note:: * The only operators provided by RAJA that are valid to use in sort + because they enforce a strict weak ordering of elements for + arithmetic types are 'less' and 'greater'. Users may provide other + operators for different sorting operations. + * Also the RAJA CUDA sort back-end only supports RAJA operators + 'less' and 'greater' because it uses the NVIDIA CUB library. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Sort Pairs ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Sort *Pairs* operations generate the same results as the sort operations -we have just described. However, an additional array of values is also permuted -to match the sorted array so **two arrays are passed to sort pairs methods.** +*Sort pairs* operations generate the same results as the sort operations +we have just described. Additionally, a second array of values is +reordered using the ordering of the first sorted array so +**two arrays are passed to sort pairs methods.** + +.. note:: For ``RAJA::sort_pairs`` algorithms, two arrays are passed. The + first array (*keys*) will be sorted according to the given + comparator operator. The elements in the second array (*values*) + will be reordered based on the final order of the first sorted array. -Here is a sequential unstable sort pairs that uses the less operator: +Here is a sequential unstable sort pairs operation that uses the 'less' +operator: -.. literalinclude:: ../../../../examples/tut_sort.cpp +.. 
literalinclude:: ../../../../exercises/sort_solution.cpp :start-after: _sort_pairs_seq_less_start :end-before: _sort_pairs_seq_less_end :language: C++ @@ -159,13 +201,14 @@ This generates the following sequence in the output array:: (0,0) (0,1) (1,0) (1,1) (2,0) (2,1) (3,0) (3,1) (4,0) (4,1) (5,1) (5,0) (6,1) (6,0) (7,0) (7,1) (8,0) (8,1) (9,1) (9,0) -Note that some of the pairs with equivalent keys stayed in the same order -they appeared in the unsorted arrays like ``(8,0) (8,1)``, while others are -reversed like ``(9,1) (9,0)``. +Note that some of the pairs with equivalent *keys* stayed in the same order +that they appeared in the unsorted arrays like ``(8,0) (8,1)``, while others are +reversed like ``(9,1) (9,0)``. This illustrates that relative ordering of +equal elements may not be preserved in an unstable sort. Here is a sequential stable sort pairs that uses the greater operator: -.. literalinclude:: ../../../../examples/tut_sort.cpp +.. literalinclude:: ../../../../exercises/sort_solution.cpp :start-after: _sort_stable_pairs_seq_greater_start :end-before: _sort_stable_pairs_seq_greater_end :language: C++ @@ -176,12 +219,12 @@ This generates the following sequence in the output array:: (4,0) (4,1) (3,0) (3,1) (2,0) (2,1) (1,0) (1,1) (0,0) (0,1) Note that all pairs with equivalent keys stayed in the same order that they -appeared in the unsorted arrays. +appeared in the unsorted input arrays. As you may expect at this point, running an stable sort pairs operation using OpenMP is accomplished by: -.. literalinclude:: ../../../../examples/tut_sort.cpp +.. literalinclude:: ../../../../exercises/sort_solution.cpp :start-after: _sort_stable_pairs_omp_greater_start :end-before: _sort_stable_pairs_omp_greater_end :language: C++ @@ -195,12 +238,12 @@ and the only difference is the execution policy template parameter. Lastly, we show a parallel unstable sort pairs operation using CUDA: -.. literalinclude:: ../../../../examples/tut_sort.cpp +.. literalinclude:: ../../../../exercises/sort_solution.cpp :start-after: _sort_pairs_cuda_greater_start :end-before: _sort_pairs_cuda_greater_end :language: C++ -.. note:: RAJA sorts for the HIP back-end are similar to those for CUDA. +.. note:: RAJA sorts for the HIP back-end are similar to those for CUDA. + The only difference is that a HIP execution policy template + parameter type is used. -The file ``RAJA/examples/tut_sort.cpp`` contains the complete -working example code. diff --git a/docs/sphinx/user_guide/tutorial/teams_basic.rst b/docs/sphinx/user_guide/tutorial/teams_basic.rst deleted file mode 100644 index 9ad9e99d70..0000000000 --- a/docs/sphinx/user_guide/tutorial/teams_basic.rst +++ /dev/null @@ -1,90 +0,0 @@ -.. ## -.. ## Copyright (c) 2016-20, Lawrence Livermore National Security, LLC -.. ## and RAJA project contributors. See the RAJA/LICENSE file -.. ## for details. -.. ## -.. ## SPDX-License-Identifier: (BSD-3-Clause) -.. ## - -.. _teamsbasic-label: - ------------------------------- -Team based loops (RAJA Teams) ------------------------------- - -Key RAJA features shown in the following examples: - - * ``RAJA::expt::launch`` method to create a run-time - selectable host/device execution space. - * ``RAJA::expt::loop`` methods to express algorithms - in terms of nested for loops. - -In this example, we introduce the RAJA Teams framework and discuss -hierarchical loop-based parallelism. Development with RAJA Teams occurs -inside an execution space. 
The execution space is launched using the -``RAJA::expt::launch`` method:: - - RAJA::expt::launch(RAJA::expt::ExecPlace , - RAJA::expt::Grid(RAJA::expt::Teams(Nteams,Nteams), - RAJA::expt::Threads(Nthreads,Nthreads)), - [=] RAJA_HOST_DEVICE (RAJA::expt::LaunchContext ctx) { - - /* Express code here */ - - }); - -The ``RAJA::expt::launch`` method is templated on both a host and a device launch policy. -As an example, the following constructs an execution space for a sequential -and CUDA kernel:: - - using launch_policy = RAJA::expt::LaunchPolicy - >; - -Kernel execution on either the host or device is driven by the first argument of -the method which takes a ``RAJA::expt::ExecPlace`` enum type, either ``HOST`` or ``DEVICE``. -Similar to thread, and block programming models, RAJA Teams carries out -computation in a predefined compute grid made up of threads which are -then grouped into teams. The execution space is then enclosed by a host/device -lambda which takes a ``RAJA::expt::LaunchContext`` object. The ``RAJA::expt::LaunchContext`` -may then be used to control the flow within the kernel, for example creating thread-team -synchronization points. - -Inside the execution space the ``RAJA::expt::loop`` methods enable developers -to express their code in terms of nested loops. The manner in which the loops -are executed depends on the template. Following the CUDA/HIP programming models -we follow a hierarchical structure in which outer loops are executed by thread-teams -and inner loops are executed by a thread in a team. - -.. literalinclude:: ../../../../examples/tut_teams_basic.cpp - :start-after: // _team_loops_start - :end-before: // _team_loops_end - :language: C++ - -The mapping between the thread and teams to programming model depends on -how they are defined. For example, we may define host and device mapping -strategies as the following:: - - using teams_x = RAJA::expt::LoopPolicy; - using thread_x = RAJA::expt::LoopPolicy; - -In the example above the ``RAJA::expt::LoopPolicy`` struct holds both the host and -device loop mapping strategies. On the host, both the team/thread strategies expand -out to standard C-style loops for execution: - -.. literalinclude:: ../../../../examples/tut_teams_basic.cpp - :start-after: // _c_style_loops_start - :end-before: // _c_style_loops_end - :language: C++ - -On the device the ``teams_x/y`` policies will map loop iterations directly to -CUDA thread blocks, while the ``thread_x/y`` policies will map loop iterations -directly to threads in a CUDA block. The CUDA equivalent is illustrated below: - -.. literalinclude:: ../../../../examples/tut_teams_basic.cpp - :start-after: // _device_loop_start - :end-before: // _device_loop_end - :language: C++ - -The file RAJA/examples/tut_teams_basic.cpp contains the complete working example code. diff --git a/docs/sphinx/user_guide/tutorial/tiled_matrix_transpose.rst b/docs/sphinx/user_guide/tutorial/tiled_matrix_transpose.rst deleted file mode 100644 index 8554e273a4..0000000000 --- a/docs/sphinx/user_guide/tutorial/tiled_matrix_transpose.rst +++ /dev/null @@ -1,84 +0,0 @@ -.. ## -.. ## Copyright (c) 2016-22, Lawrence Livermore National Security, LLC -.. ## and RAJA project contributors. See the RAJA/LICENSE file -.. ## for details. -.. ## -.. ## SPDX-License-Identifier: (BSD-3-Clause) -.. ## - -.. 
_tiledmatrixtranspose-label: - ----------------------- -Tiled Matrix Transpose ----------------------- - -Key RAJA features shown in this example are: - - * ``RAJA::kernel`` usage with multiple lambdas - * ``RAJA::statement::Tile`` type - -In this example, we compute the transpose of an input matrix -:math:`A` of size :math:`N_r \times N_c` and store the result in a second -matrix :math:`At` of size :math:`N_c \times N_r`. - -We compute the matrix transpose using a tiling algorithm, which iterates -over tiles of the matrix A and performs a transpose copy of a tile without -storing the tile in another array. The algorithm is expressed as a collection -of outer and inner loops. Iterations of the inner loop will transpose each tile, -while outer loops iterate over the tiles. - -We start with a non-RAJA C++ implementation, where we choose tile -dimensions smaller than the matrix dimensions. Note that we do not assume -that tiles divide evenly the number of rows and and columns of the matrix. -However, we do assume square tiles. First, we define matrix dimensions: - -.. literalinclude:: ../../../../examples/tut_tiled-matrix-transpose.cpp - :start-after: // _tiled_mattranspose_dims_start - :end-before: // _tiled_mattranspose_dims_end - :language: C++ - -Then, we wrap the matrix data pointers in ``RAJA::View`` objects to -simplify the multi-dimensional indexing: - -.. literalinclude:: ../../../../examples/tut_tiled-matrix-transpose.cpp - :start-after: // _tiled_mattranspose_views_start - :end-before: // _tiled_mattranspose_views_end - :language: C++ - -Then, the non-RAJA C++ implementation looks like this: - -.. literalinclude:: ../../../../examples/tut_tiled-matrix-transpose.cpp - :start-after: // _cstyle_tiled_mattranspose_start - :end-before: // _cstyle_tiled_mattranspose_end - :language: C++ - -Note that we need to include a bounds check in the code to avoid indexing out -of bounds when the tile sizes do not divide the matrix dimensions evenly. - -^^^^^^^^^^^^^^^^^^^^^ -RAJA::kernel Variants -^^^^^^^^^^^^^^^^^^^^^ - -For ``RAJA::kernel`` variants, we use ``RAJA::statement::Tile`` types -for the outer loop tiling and ``RAJA::tile_fixed`` types to -indicate the tile dimensions. The complete sequential RAJA variant is: - -.. literalinclude:: ../../../../examples/tut_tiled-matrix-transpose.cpp - :start-after: // _raja_tiled_mattranspose_start - :end-before: // _raja_tiled_mattranspose_end - :language: C++ - -The ``RAJA::statement::Tile`` types compute the number of tiles needed to -iterate over all matrix entries in each dimension and generate iteration -index bounds for each tile, which are used to generate loops for the inner -``RAJA::statement::For`` types. Thus, the bounds checking logic in the -non-RAJA variant is not needed. Note that the integer template parameters -to these statement types refer to the entries in the iteration space tuple -passed to the ``RAJA::kernel`` method. - -The file ``RAJA/examples/tut_tiled-matrix-transpose.cpp`` contains the complete working example code for the examples described in this section, including -OpenMP, CUDA, and HIP variants. - -A more advanced version using RAJA local arrays for CPU cache blocking and -using GPU shared memory is discussed in :ref:`matrixtransposelocalarray-label`. 
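For reference, the tiled-transpose pattern described above with ``RAJA::statement::Tile`` and ``RAJA::statement::For`` types can be written out roughly as follows. This is a minimal illustrative sketch, not the code in ``RAJA/examples/tut_tiled-matrix-transpose.cpp``; the function name ``tiledTranspose``, the tile size, and the use of ``RAJA::seq_exec`` with ``int`` data are assumptions made for illustration::

   #include "RAJA/RAJA.hpp"

   constexpr int TILE_DIM = 16;

   // Transpose an N_r x N_c matrix A into the N_c x N_r matrix At using
   // fixed-size square tiles. The Tile statements handle partial tiles at
   // the matrix edges, so no explicit bounds check is needed.
   void tiledTranspose(int* A, int* At, int N_r, int N_c)
   {
     RAJA::View<int, RAJA::Layout<2>> Aview(A, N_r, N_c);
     RAJA::View<int, RAJA::Layout<2>> Atview(At, N_c, N_r);

     using TILED_POL =
       RAJA::KernelPolicy<
         RAJA::statement::Tile<1, RAJA::tile_fixed<TILE_DIM>, RAJA::seq_exec,
           RAJA::statement::Tile<0, RAJA::tile_fixed<TILE_DIM>, RAJA::seq_exec,
             RAJA::statement::For<1, RAJA::seq_exec,    // row within a tile
               RAJA::statement::For<0, RAJA::seq_exec,  // column within a tile
                 RAJA::statement::Lambda<0>
               >
             >
           >
         >
       >;

     RAJA::kernel<TILED_POL>(
       RAJA::make_tuple(RAJA::TypedRangeSegment<int>(0, N_c),   // lambda arg 0: col
                        RAJA::TypedRangeSegment<int>(0, N_r)),  // lambda arg 1: row
       [=](int col, int row) {
         Atview(col, row) = Aview(row, col);
       });
   }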
- diff --git a/docs/sphinx/user_guide/tutorial/vertexsum_coloring.rst b/docs/sphinx/user_guide/tutorial/vertexsum_coloring.rst index f8423ba5d1..8e73076df0 100644 --- a/docs/sphinx/user_guide/tutorial/vertexsum_coloring.rst +++ b/docs/sphinx/user_guide/tutorial/vertexsum_coloring.rst @@ -6,127 +6,165 @@ .. ## SPDX-License-Identifier: (BSD-3-Clause) .. ## -.. _vertexsum-label: +.. _tut-vertexsum-label: -------------------------------------------------- -Mesh Vertex Sum Example: Iteration Space Coloring +Iteration Space Coloring: Mesh Vertex Sum -------------------------------------------------- -Key RAJA features shown in this example: +This section contains an exercise file ``RAJA/exercises/vertexsum-indexset.cpp`` +for you to work through if you wish to get some practice with RAJA. The +file ``RAJA/exercises/vertexsum-indexset_solution.cpp`` contains complete +working code for the examples discussed in this section. You can use the +solution file to check your work and for guidance if you get stuck. To build +the exercises execute ``make vertexsum-indexset`` and ``make vertexsum-indexset_solution`` +from the build directory. - * ``RAJA::forall`` loop execution template method - * ``RAJA::ListSegment`` iteration space construct - * ``RAJA::IndexSet`` iteration space segment container and associated execution policies +Key RAJA features shown in this example are: + * ``RAJA::forall`` loop execution template method + * ``RAJA::TypedListSegment`` iteration space construct + * ``RAJA::TypedIndexSet`` iteration space segment container and + associated execution policies The example computes a sum at each vertex on a logically-Cartesian 2D mesh as shown in the figure. .. figure:: ../figures/vertexsum.jpg - A portion of the area of each mesh element is summed to the vertices surrounding the element. + The "area" of each vertex is the sum of an area contribution from each element sharing the vertex (left). In particular, one quarter of the area of each mesh element is summed to the vertices surrounding the element (right). -Each sum is an average of the area of the mesh elements that share the vertex. -In many "staggered mesh" applications, such an operation is common and is -often written in a way that presents the algorithm clearly but prevents +Each sum is an average of the area of the four mesh elements that share the +vertex. In many "staggered mesh" applications, an operation like this is common +and is often written in a way that presents the algorithm clearly but prevents parallelization due to potential data races. That is, multiple loop iterates over mesh elements may attempt to write to the same shared vertex memory location at the same time. The example shows how RAJA constructs can be used to enable one to express such an algorithm in parallel and have it run correctly without fundamentally changing how it looks in source code. -After defining the number of elements in the mesh, necessary array offsets -and an array that indicates the mapping between an element and its four -surrounding vertices, a C-style version of the vertex sum calculation is: +We start by setting the size of the mesh, specifically, the total number of +elements and vertices and the number of elements and vertices in each direction: -.. literalinclude:: ../../../../examples/tut_vertexsum-coloring.cpp - :start-after: _cstyle_vertexsum_start - :end-before: _cstyle_vertexsum_end +.. 
literalinclude:: ../../../../exercises/vertexsum-indexset_solution.cpp + :start-after: _vertexsum_define_start + :end-before: _vertexsum_define_end :language: C++ -^^^^^^^^^^^^^^^^^^^^^^^ -RAJA Sequential Variant -^^^^^^^^^^^^^^^^^^^^^^^ +We also set up an array to map each element to its four surrounding vertices +and set the area of each element: + +.. literalinclude:: ../../../../exercises/vertexsum-indexset_solution.cpp + :start-after: _vertexsum_elemarea_start + :end-before: _vertexsum_elemarea_end + :language: C++ -A nested loop RAJA variant of this kernel is: +Then, a sequential C-style version of the vertex area calculation looks like +this: -.. literalinclude:: ../../../../examples/tut_vertexsum-coloring.cpp - :start-after: _raja_seq_vertexsum_start - :end-before: _raja_seq_vertexsum_end +.. literalinclude:: ../../../../exercises/vertexsum-indexset_solution.cpp + :start-after: _cstyle_vertexarea_seq_start + :end-before: _cstyle_vertexarea_seq_end + :language: C++ + +We can't parallelize the entire computation at once due to potential race +conditions where multiple threads may attempt to sum to a shared element +vertex simultaneously. However, we can parallelize the computation in +parts. Here is a C-style OpenMP parallel implementation: + +.. literalinclude:: ../../../../exercises/vertexsum-indexset_solution.cpp + :start-after: _cstyle_vertexarea_omp_start + :end-before: _cstyle_vertexarea_omp_end + :language: C++ + +What we've done is broken up the computation into four parts, each of which +can safely run in parallel because there are no overlapping writes to the +same entry in the vertex area array in each parallel section. Note that there +is an outer loop on length four, one iteration for each of the elements that +share a vertex. Inside the loop, we iterate over a subset of elements in +parallel using an indexing area that guarantees that we will have no +data races. In other words, we have "colored" the elements as shown in the +figure below. + +.. figure:: ../figures/vertexsum_color.png + :scale: 30 + :align: center + + We partition the mesh elements into four disjoint subsets shown by the colors and numbers so that within each subset no two elements share a vertex. + +For completeness, the computation of the four element indexing arrays is: + +.. literalinclude:: ../../../../exercises/vertexsum-indexset_solution.cpp + :start-after: _vertexarea_color_start + :end-before: _vertexarea_color_end :language: C++ -Note that this version cannot be guaranteed to run correctly in parallel -by simply changing the loop execution policies as we have done in other -examples. We would like to use RAJA to enable parallel execution and without -changing the way the kernel looks in source code. By applying a RAJA index -set and suitably-defined list segments, we can accomplish this. ^^^^^^^^^^^^^^^^^^^^^^^ RAJA Parallel Variants ^^^^^^^^^^^^^^^^^^^^^^^ -To enable the kernel to run safely in parallel, by eliminating the race -conditions, we partition the element iteration space into four subsets -(or `colors`) indicated by the numbers in the figure below, which represents -a portion of our logically-Cartesian 2D mesh. - - +---+---+---+---+ - | 2 | 3 | 2 | 3 | - +---+---+---+---+ - | 0 | 1 | 0 | 1 | - +---+---+---+---+ - | 2 | 3 | 2 | 3 | - +---+---+---+---+ - | 0 | 1 | 0 | 1 | - +---+---+---+---+ - -Note that none of the elements with the same number share a common vertex. -Thus, we can iterate over all elements with the same number (i.e., color) -in parallel. 
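As a rough sketch of that partitioning idea, the elements of the structured
mesh can be bucketed by the parity of their logical ``(i, j)`` indices; the
extent ``N_elem`` and the flattened element index used below are illustrative
assumptions, not the exercise's exact code::

   // Group elements into four disjoint "colors"; no two elements in the
   // same bucket share a vertex, so each bucket can be summed in parallel.
   std::vector<int> colored_elems[4];

   for (int j = 0; j < N_elem; ++j) {
     for (int i = 0; i < N_elem; ++i) {
       int color = (i % 2) + 2 * (j % 2);               // 0, 1, 2, or 3
       colored_elems[color].push_back(i + j * N_elem);  // flattened element id
     }
   }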
- -First, we define four vectors to gather the mesh element indices for each -color: - -.. literalinclude:: ../../../../examples/tut_vertexsum-coloring.cpp - :start-after: _colorvectors_vertexsum_start - :end-before: _colorvectors_vertexsum_end +To implement the vertex sum calculation using RAJA, we employ +``RAJA::TypedListSegment`` iteration space objects to enumerate the mesh +elements for each color and put them in a ``RAJA::TypedIndexSet`` object. +This allows us to execute the entire calculation using one ``RAJA::forall`` +call. + +We declare a type alias for the list segments to make the code more compact: + +.. literalinclude:: ../../../../exercises/vertexsum-indexset_solution.cpp + :start-after: _vertexarea_listsegtype_start + :end-before: _vertexarea_listsegtype_end :language: C++ -Then, we create a RAJA index set with four list segments, one for each color, -using the vectors: +Then, we build the index set: -.. literalinclude:: ../../../../examples/tut_vertexsum-coloring.cpp - :start-after: _colorindexset_vertexsum_start - :end-before: _colorindexset_vertexsum_end +.. literalinclude:: ../../../../exercises/vertexsum-indexset_solution.cpp + :start-after: _vertexarea_indexset_start + :end-before: _vertexarea_indexset_end :language: C++ -Now, we can use an index set execution policy that iterates over the +Note that we construct the list segments using the arrays we made earlier +to partition the elements. Then, we push them onto the index set. + +Now, we can use a two-level index set execution policy that iterates over the segments sequentially and executes each segment in parallel using OpenMP -multithreading (and ``RAJA::forall``): +multithreading to run the kernel: -.. literalinclude:: ../../../../examples/tut_vertexsum-coloring.cpp - :start-after: _raja_seq_colorindexset_vertexsum_start - :end-before: _raja_seq_colorindexset_vertexsum_end +.. literalinclude:: ../../../../exercises/vertexsum-indexset_solution.cpp + :start-after: _raja_vertexarea_omp_start + :end-before: _raja_vertexarea_omp_end :language: C++ -Note that we no longer need to use the offset variable to compute the -element index in terms of 'i' and 'j' since the loop is no longer nested -and the element indices are directly encoded in the list segments. +The execution of the RAJA version is similar to the C-style OpenMP variant +shown earlier, where we executed four OpenMP parallel loops in sequence, +but the code is more concise. In particular, we execute four parallel OpenMP +loops, one for each list segment in the index set. Also, note that we do +not have to manually extract the element index from the segments like we +did earlier since RAJA passes the segment entries directly to the lambda +expression. -For completeness, here is the RAJA variant where we iterate over the +Here is the RAJA variant where we iterate over the segments sequentially, and execute each segment in parallel via a CUDA -kernel launch on a GPU: +kernel launched on a GPU: -.. literalinclude:: ../../../../examples/tut_vertexsum-coloring.cpp - :start-after: _raja_cuda_colorindexset_vertexsum_start - :end-before: _raja_cuda_colorindexset_vertexsum_end +.. literalinclude:: ../../../../exercises/vertexsum-indexset_solution.cpp + :start-after: _raja_vertexarea_cuda_start + :end-before: _raja_vertexarea_cuda_end :language: C++ -Here, we have marked the lambda loop body with the 'RAJA_DEVICE' macro -and specified the number of threads in a CUDA thread block in the segment -execution policy. 
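A compact sketch of the CUDA flavor of this pattern follows; the color arrays
``colored_elems``, the element-to-vertex map ``e2v_map``, the per-element areas
``areae``, and the vertex areas ``areav`` are assumed names, and all arrays are
assumed to be device-accessible (e.g., CUDA unified memory)::

   using SegmentType = RAJA::TypedListSegment<int>;

   camp::resources::Resource cuda_res{camp::resources::Cuda()};

   RAJA::TypedIndexSet<SegmentType> cuda_colorset;
   for (int c = 0; c < 4; ++c) {
     cuda_colorset.push_back( SegmentType(colored_elems[c].data(),
                                          colored_elems[c].size(), cuda_res) );
   }

   // Outer level: iterate over the four segments sequentially.
   // Inner level: launch each segment as a CUDA kernel.
   using EXEC_POL = RAJA::ExecPolicy<RAJA::seq_segit, RAJA::cuda_exec<256>>;

   RAJA::forall<EXEC_POL>(cuda_colorset, [=] RAJA_DEVICE (int ie) {
     int* iv = &(e2v_map[4*ie]);
     areav[ iv[0] ] += areae[ie] / 4.0;
     areav[ iv[1] ] += areae[ie] / 4.0;
     areav[ iv[2] ] += areae[ie] / 4.0;
     areav[ iv[3] ] += areae[ie] / 4.0;
   });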
+The only differences here are that we have marked the lambda loop body with the +``RAJA_DEVICE`` macro, used a CUDA segment execution policy, and built a new +index set with list segments created using a CUDA resource so that the indices +live in device memory. + +The RAJA HIP variant, which we show for completeness, is similar: + +.. literalinclude:: ../../../../exercises/vertexsum-indexset_solution.cpp + :start-after: _raja_vertexarea_hip_start + :end-before: _raja_vertexarea_hip_end + :language: C++ -The RAJA HIP variant is similar. +The main difference for the HIP variant is that we use explicit device +memory allocation/deallocation and host-device memory copy operations. -The file ``RAJA/examples/tut_vertexsum-coloring.cpp`` contains a complete -working example code, including a RAJA HIP variant. diff --git a/docs/sphinx/user_guide/tutorial/view_layout.rst b/docs/sphinx/user_guide/tutorial/view_layout.rst new file mode 100644 index 0000000000..df20a8af3a --- /dev/null +++ b/docs/sphinx/user_guide/tutorial/view_layout.rst @@ -0,0 +1,303 @@ +.. ## +.. ## Copyright (c) 2016-22, Lawrence Livermore National Security, LLC +.. ## and RAJA project contributors. See the RAJA/LICENSE file +.. ## for details. +.. ## +.. ## SPDX-License-Identifier: (BSD-3-Clause) +.. ## + +.. _tut-view_layout-label: + +----------------------------------------------------------- +Data Views and Layouts +----------------------------------------------------------- + +This section contains an exercise file ``RAJA/exercises/view-layout.cpp`` +for you to work through if you wish to get some practice with RAJA. The +file ``RAJA/exercises/view-layout_solution.cpp`` contains complete +working code for the examples discussed in this section. You can use the +solution file to check your work and for guidance if you get stuck. To build +the exercises execute ``make view-layout`` and ``make view-layout_solution`` +from the build directory. + +Key RAJA features shown in this section are: + + * ``RAJA::View`` + * ``RAJA::Layout`` and ``RAJA::OffsetLayout`` constructs + * Layout permutations + +The examples in this section illustrate RAJA View and Layout concepts +and usage patterns. The goal is for you to gain an understanding of how +to use RAJA Views and Layouts to simplify and transform array data access +patterns. None of the examples use RAJA kernel execution methods, such +as ``RAJA::forall``. The intent is to focus on RAJA View and Layout mechanics. + +Consider a basic C-style implementation of a matrix-matrix multiplication +operation, using :math:`N \times N` matrices: + +.. literalinclude:: ../../../../exercises/view-layout_solution.cpp + :start-after: _cstyle_matmult_start + :end-before: _cstyle_matmult_end + :language: C++ + +As is commonly done for efficiency in C and C++, we have allocated the data +for the matrices as one-dimensional arrays. Thus, we need to manually compute +the data pointer offsets for the row and column indices in the kernel. +Here, we use the array ``Cref`` to hold a reference solution matrix that +we use to compare with results generated by the examples below. + +To simplify the multi-dimensional indexing, we can use ``RAJA::View`` objects, +which we define as: + +.. literalinclude:: ../../../../exercises/view-layout_solution.cpp + :start-after: _matmult_views_start + :end-before: _matmult_views_end + :language: C++ + +Here we define three ``RAJA::View`` objects, 'Aview', 'Bview', and 'Cview', +that *wrap* the array data pointers, 'A', 'B', and 'C', respectively. 
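A minimal sketch of what those view declarations look like, assuming square
``N x N`` matrices of type double as above, is::

   RAJA::View<double, RAJA::Layout<2, int>> Aview(A, N, N);
   RAJA::View<double, RAJA::Layout<2, int>> Bview(B, N, N);
   RAJA::View<double, RAJA::Layout<2, int>> Cview(C, N, N);

   // ...so the kernel body can read, e.g., Aview(row, k) * Bview(k, col)
   // instead of A[k + N*row] * B[col + N*k].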
We +pass a data pointer as the first argument to each view constructor and then +the extent of each matrix dimension as the second and third arguments. There +are two extent arguments since we indicate in the ``RAJA::Layout`` template +parameter list. The matrices are square and each extent is 'N'. Here, the +template parameters to ``RAJA::View`` are the array data type 'double' and +a ``RAJA::Layout`` type. Specifically:: + + RAJA::Layout<2, int> + +means that each View represents a two-dimensional default data layout, and +that we will use values of type 'int' to index into the arrays. + +Using the ``RAJA::View`` objects, we can access the data entries for the rows +and columns using a more natural, less error-prone syntax: + +.. literalinclude:: ../../../../exercises/view-layout_solution.cpp + :start-after: _cstyle_matmult_views_start + :end-before: _cstyle_matmult_views_end + :language: C++ + +Default Layouts Use Row-major Ordering +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The default data layout ordering in RAJA is *row-major*, which is the +convention for multi-dimensional array indexing in C and C++. This means that +the rightmost index will be stride-1, the index to the left of the rightmost +index will have stride equal to the extent of the rightmost dimension, and +so on. + +.. note:: RAJA Layouts and Views support any number of dimensions and + the default data access ordering is *row-major*. Please + see :ref:`feat-view-label` for more details. + +To illustrate the default data layout striding, we next show simple +one-, two-, and three-dimensional examples where the for-loop ordering +for the different dimensions is such that all data access is stride-1. We +begin by defining some dimensions, allocate and initialize arrays: + +.. literalinclude:: ../../../../exercises/view-layout_solution.cpp + :start-after: _default_views_init_start + :end-before: _default_views_init_end + :language: C++ + +The version of the array initialization kernel using a one-dimensional +``RAJA::View`` is: + +.. literalinclude:: ../../../../exercises/view-layout_solution.cpp + :start-after: _default_view1D_start + :end-before: _default_view1D_end + :language: C++ + +The version of the array initialization using a two-dimensional +``RAJA::View`` is: + +.. literalinclude:: ../../../../exercises/view-layout_solution.cpp + :start-after: _default_view2D_start + :end-before: _default_view2D_end + :language: C++ + +The three-dimensional version is: + +.. literalinclude:: ../../../../exercises/view-layout_solution.cpp + :start-after: _default_view3D_start + :end-before: _default_view3D_end + :language: C++ + +It's worth repeating that the data array access in all three variants shown +here using ``RAJA::View`` objects is stride-1 since we order the for-loops +in the loop nests to match the row-major ordering. + +RAJA Layout types support other data access patterns with different striding +orders, offsets, and permutations. To this point, we have used the default +Layout constructor. RAJA provides methods to generate Layouts for different +indexing patterns. We describe these in the next several sections. Next, we +show how to permute the data striding order using permuted Layouts. + +Permuted Layouts Change Data Striding Order +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Every ``RAJA::Layout`` object has a permutation. When a permutation is not +specified at creation, a Layout will use the identity permutation. Here are +examples where the identity permutation is explicitly provided. 
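A sketch of the basic pattern, with assumed extents ``Nx`` and ``Ny`` and an
assumed data pointer ``a``, is::

   std::array<RAJA::idx_t, 2> id_perm {{0, 1}};   // identity permutation

   RAJA::Layout<2> id_layout =
       RAJA::make_permuted_layout( {{Nx, Ny}}, id_perm );

   RAJA::View<int, RAJA::Layout<2>> IDview(a, id_layout);

Such a view strides through the data exactly as one built with the default
layout.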
First, in +two dimensions: + +.. literalinclude:: ../../../../exercises/view-layout_solution.cpp + :start-after: _default_perm_view2D_start + :end-before: _default_perm_view2D_end + :language: C++ + +Then, in three dimensions: + +.. literalinclude:: ../../../../exercises/view-layout_solution.cpp + :start-after: _default_perm_view3D_start + :end-before: _default_perm_view3D_end + :language: C++ + +These two examples access the data with stride-1 ordering, the same as in +the earlier examples, which is shown by the nested loop ordering. +The identity permutation in two dimensions is '{0, 1}' and is '{0, 1, 2}' +for three dimensions. The method ``RAJA::make_permuted_layout`` is used to +create a ``RAJA::Layout`` object with a permutation. The method takes two +arguments, the extents of each dimension and the permutation. + +.. note:: If a permuted Layout is created with the *identity permutation* + (e.g., {0,1,2}), the Layout is the same as if it were created by + +Next, we permute the striding order for the two-dimensional example: + +.. literalinclude:: ../../../../exercises/view-layout_solution.cpp + :start-after: _perm_2D_start + :end-before: _perm_2D_end + :language: C++ + +Read from right to left, the permutation '{1, 0}' specifies that the first +(zero) index 'i' is stride-1 and the second index (one) 'j' has stride equal +to the extent of the first Layout dimension 'Nx'. This is evident in the +for-loop ordering. + +Here is the three-dimensional case, where we have reversed the striding order +using the permutation '{2, 1, 0}': + +.. literalinclude:: ../../../../exercises/view-layout_solution.cpp + :start-after: _perma_view3D_start + :end-before: _perma_view3D_end + :language: C++ + +The data access remains stride-1 due to the for-loop reordering. For fun, +here is another three-dimensional permutation: + +.. literalinclude:: ../../../../exercises/view-layout_solution.cpp + :start-after: _permb_view3D_start + :end-before: _permb_view3D_end + :language: C++ + +The permutation is '{1, 2, 0}' so to make the data access stride-1, we +swap the 'j' and 'k' loops and leave the 'i' loop as the inner loop. + +Multi-dimensional Indices and Linear Indices +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +``RAJA::Layout`` types provide methods to convert between linear indices and +multi-dimensional indices and vice versa. Recall the Layout 'perm3a_layout' +from above that was created with the permutation '{2, 1, 0}'. To get the +linear index corresponding to the index triple '(1, 2, 0)', you can do +this:: + + int lin = perm3a_layout(1, 2, 0); + +The value of 'lin' is 7 = 1 + 2 * Nx + 0 * Nx * Ny. To get the index triple +for linear index 7, you can do:: + + int i, j, k; + perm3a_layout.toIndices(7, i, j, k); + +This sets 'i' to 1, 'j' to 2, and 'k' to 0. + +Similarly for the Layout 'permb_layout', which was created with the +permutation '{1, 2, 0}':: + + lin = perm3b_layout(1, 2, 0); + +sets 'lin' to 13 = 1 + 0 * Nx + 2 * Nx * Nz and:: + + perm3b_layout.toIndices(13, i, j, k); + +sets 'i' to 1, 'j' to 2, and 'k' to 0. + +There are more examples in the exercise file associated with this section. +Feel free to experiment with them. + +One important item to note is that, by default, there is no bounds checking +on indices passed to a ``RAJA::View`` data access method or ``RAJA::Layout`` +index computation methods. 
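For example, with assumed extents ``Nx`` and ``Ny``, an out-of-range index is
silently folded into the linear offset computation::

   RAJA::Layout<2> layout(Nx, Ny);

   auto last = layout(Nx - 1, Ny - 1);   // Nx * Ny - 1, the last valid offset
   auto oops = layout(Nx, 0);            // Nx * Ny, one past the end, computed
                                         // without any error or warning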
Therefore, it is the responsibility of a user +to ensure that indices passed to ``RAJA::View`` and ``RAJA::Layoout`` +methods are in bounds to avoid accessing data outside +of the View or computing invalid indices. + +.. note:: RAJA provides a CMake variable ``RAJA_ENABLE_BOUNDS_CHECK`` to + turn run time bounds checking on or off when the code is compiled. + Enabling bounds checking is useful for debugging and to ensure + your code is correct. However, when enabled, bounds checking adds + noticeable run time overhead. So it should not be enabled for + a production build of your code. + +Offset Layouts Apply Offsets to Indices +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The last topic we cover in this exercise is the ``RAJA::OffsetLayout`` type. +We first illustrate the concept of an offset with a C-style for-loop: + +.. literalinclude:: ../../../../exercises/view-layout_solution.cpp + :start-after: _cstyle_offlayout1D_start + :end-before: _cstyle_offlayout1D_end + :language: C++ + +Here, the for-loop runs from 'imin' to 'imax-1' (i.e., -5 to 5). To avoid +out-of-bounds negative indexing, we subtract 'imin' (i.e., -5) from the loop +index 'i'. + +To do the same thing with RAJA, we create a ``RAJA::OffsetLayout`` object +and use it to index into the array: + +.. literalinclude:: ../../../../exercises/view-layout_solution.cpp + :start-after: _raja_offlayout1D_start + :end-before: _raja_offlayout1D_end + :language: C++ + +``RAJA::OffsetLayout`` is a different type than ``RAJA::Layout`` because +it contains offset information. The arguments to the +``RAJA::make_offset_layout`` method are the index bounds. + +As expected, the two dimensional case is similar. First, a C-style loop: + +.. literalinclude:: ../../../../exercises/view-layout_solution.cpp + :start-after: _cstyle_offlayout2D_start + :end-before: _cstyle_offlayout2D_end + :language: C++ + +and then the same operation using a ``RAJA::OffsetLayout`` object: + +.. literalinclude:: ../../../../exercises/view-layout_solution.cpp + :start-after: _raja_offlayout2D_start + :end-before: _raja_offlayout2D_end + :language: C++ + +Note that the first argument passed to ``RAJA::make_offset_layout`` contains +the lower bounds for 'i' and 'j' and the second argument contains the upper +bounds. Also, the 'j' index is stride-1 by default since we did not pass +a permutation to the ``RAJA::make_offset_layout`` method, which is the same +as the non-offset Layout usage. + +Just like ``RAJA::Layout`` has a permutation, so does ``RAJA::OffsetLayout``. +Here is an example where we permute the (i, j) index stride ordering: + +.. literalinclude:: ../../../../exercises/view-layout_solution.cpp + :start-after: _raja_permofflayout2D_start + :end-before: _raja_permofflayout2D_end + :language: C++ + +The permutation '{1, 0}' is passed as the third argument to +``RAJA::make_offset_layout``. From the ordering of the for-loops, we can see +that the 'i' index is stride-1 and the 'j' index has stride equal to the +extent of the 'i' dimension so the for-loop nest strides through +the data with unit stride. + diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 2f2351ac7c..13c1395ee8 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -1,89 +1,51 @@ -############################################################################### -# Copyright (c) 2016-22, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJA/LICENSE file for details. 
# # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################### raja_add_executable( - NAME tut_teams_basic - SOURCES tut_teams_basic.cpp) + NAME tut_launch_basic + SOURCES tut_launch_basic.cpp) raja_add_executable( NAME resource-forall SOURCES resource-forall.cpp) raja_add_executable( - NAME tut_daxpy - SOURCES tut_daxpy.cpp) - -raja_add_executable( - NAME tut_add-vectors - SOURCES tut_add-vectors.cpp) - -raja_add_executable( - NAME tut_dot-product - SOURCES tut_dot-product.cpp) + NAME dynamic-forall + SOURCES dynamic-forall.cpp) raja_add_executable( - NAME tut_indexset-segments - SOURCES tut_indexset-segments.cpp) + NAME forall-param-reductions + SOURCES forall-param-reductions.cpp) raja_add_executable( - NAME tut_matrix-multiply - SOURCES tut_matrix-multiply.cpp) - -raja_add_executable( - NAME tut_nested-loop-reorder - SOURCES tut_nested-loop-reorder.cpp) + NAME resource-dynamic-forall + SOURCES resource-dynamic-forall.cpp) raja_add_executable( - NAME tut_vertexsum-coloring - SOURCES tut_vertexsum-coloring.cpp) - -raja_add_executable( - NAME tut_reductions - SOURCES tut_reductions.cpp) - -raja_add_executable( - NAME teams_flatten - SOURCES teams_flatten.cpp) - -raja_add_executable( - NAME teams_reductions - SOURCES teams_reductions.cpp) - -raja_add_executable( - NAME resource-runtime-teams - SOURCES resource-runtime-teams.cpp) - -raja_add_executable( - NAME tut_scan - SOURCES tut_scan.cpp) - -raja_add_executable( - NAME tut_sort - SOURCES tut_sort.cpp) + NAME tut_daxpy + SOURCES tut_daxpy.cpp) raja_add_executable( - NAME tut_atomic-histogram - SOURCES tut_atomic-histogram.cpp) + NAME dynamic_mat_transpose + SOURCES dynamic_mat_transpose.cpp) raja_add_executable( - NAME tut_offset-layout - SOURCES tut_offset-layout.cpp) + NAME tut_matrix-multiply + SOURCES tut_matrix-multiply.cpp) raja_add_executable( - NAME tut_batched-matrix-multiply - SOURCES tut_batched-matrix-multiply.cpp) + NAME launch_flatten + SOURCES launch_flatten.cpp) raja_add_executable( - NAME tut_matrix-transpose-local-array - SOURCES tut_matrix-transpose-local-array.cpp) + NAME launch_reductions + SOURCES launch_reductions.cpp) raja_add_executable( - NAME tut_tiled-matrix-transpose - SOURCES tut_tiled-matrix-transpose.cpp) + NAME resource-runtime-launch + SOURCES resource-runtime-launch.cpp) raja_add_executable( NAME tut_halo-exchange @@ -94,12 +56,12 @@ raja_add_executable( SOURCES pi-reduce_vs_atomic.cpp) raja_add_executable( - NAME raja-teams - SOURCES raja-teams.cpp) + NAME raja-launch + SOURCES raja-launch.cpp) raja_add_executable( - NAME teams_matrix-multiply - SOURCES teams_matrix-multiply.cpp) + NAME launch_matrix-multiply + SOURCES launch_matrix-multiply.cpp) raja_add_executable( NAME jacobi @@ -136,7 +98,7 @@ raja_add_executable( SOURCES resource-kernel.cpp) raja_add_executable( - NAME resource-teams - SOURCES resource-teams.cpp) + NAME resource-launch + SOURCES resource-launch.cpp) add_subdirectory(plugin) diff --git a/examples/dynamic-forall.cpp b/examples/dynamic-forall.cpp new file mode 100644 index 0000000000..18dbde8243 --- /dev/null +++ b/examples/dynamic-forall.cpp @@ -0,0 +1,145 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-22, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include +#include +#include + +#include "memoryManager.hpp" + +#include "RAJA/RAJA.hpp" + +/* + * Vector Addition Example with dynamic policy selection + * + * Computes c = a + b, where a, b, c are vectors of ints. + * a policy selected at run-time + * + * If CUDA is enabled, CUDA unified memory is used. + */ + +// +// Functions for checking and printing results +// +void checkResult(int* res, int len); +void printResult(int* res, int len); + +using policy_list = camp::list + ,RAJA::cuda_exec<512> +#endif + >; + +int main(int argc, char *argv[]) +{ + + if(argc != 2) { + RAJA_ABORT_OR_THROW("Usage ./dynamic-forall N, where N is the index of the policy to run"); + } + + // + // Run time policy section is demonstrated in this example by specifying + // kernel exection space as a command line argument + // Example usage ./dynamic_forall policy N + // + + const int pol = std::stoi(argv[1]); + + std::cout << "\n\nRAJA vector addition example...\n"; + std::cout << "Using policy # "<(N); + int *b = memoryManager::allocate(N); + int *c = memoryManager::allocate(N); + + for (int i = 0; i < N; ++i) { + a[i] = -i; + b[i] = i; + } + + +//----------------------------------------------------------------------------// + + std::cout << "\n Running C-style vector addition...\n"; + + // _cstyle_vector_add_start + for (int i = 0; i < N; ++i) { + c[i] = a[i] + b[i]; + } + // _cstyle_vector_add_end + + checkResult(c, N); +//printResult(c, N); + + +//----------------------------------------------------------------------------// +// Example of dynamic policy selection for forall +//----------------------------------------------------------------------------// + + //policy is chosen from the list + RAJA::expt::dynamic_forall(pol, RAJA::RangeSegment(0, N), [=] RAJA_HOST_DEVICE (int i) { + c[i] = a[i] + b[i]; + }); + // _rajaseq_vector_add_end + + checkResult(c, N); +//printResult(c, N); + + +//----------------------------------------------------------------------------// +// +// Clean up. +// + memoryManager::deallocate(a); + memoryManager::deallocate(b); + memoryManager::deallocate(c); + + std::cout << "\n DONE!...\n"; + + return 0; +} + +// +// Function to check result and report P/F. +// +void checkResult(int* res, int len) +{ + bool correct = true; + for (int i = 0; i < len; i++) { + if ( res[i] != 0 ) { correct = false; } + } + if ( correct ) { + std::cout << "\n\t result -- PASS\n"; + } else { + std::cout << "\n\t result -- FAIL\n"; + } +} + +// +// Function to print result. +// +void printResult(int* res, int len) +{ + std::cout << std::endl; + for (int i = 0; i < len; i++) { + std::cout << "result[" << i << "] = " << res[i] << std::endl; + } + std::cout << std::endl; +} diff --git a/examples/dynamic_mat_transpose.cpp b/examples/dynamic_mat_transpose.cpp new file mode 100644 index 0000000000..6386c0a42d --- /dev/null +++ b/examples/dynamic_mat_transpose.cpp @@ -0,0 +1,436 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-22, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include +#include +#include +#include + +#include "RAJA/RAJA.hpp" +#include "memoryManager.hpp" + +/* + * Matrix Transpose Example + * + * In this example, an input matrix A of dimension N_r x N_c is + * transposed and returned as a second matrix At of size N_c x N_r. + * + * This operation is carried out using a local memory tiling + * algorithm. The algorithm first loads matrix entries into an + * iteraion shared tile, a two-dimensional array, and then + * reads from the tile with row and column indices swapped for + * the output matrix. + * + * The algorithm is expressed as a collection of ``outer`` + * and ``inner`` for loops. Iterations of the inner loops will load/read + * data into the tile; while outer loops will iterate over the number + * of tiles needed to carry out the transpose. + * + * RAJA variants of the example use RAJA dynamic shared memory as tile memory. + * RAJA shared memory is mapped to device shared memory which + * enables threads in the same thread block to share data. Host versions + * of the algorithms will use a dynamically sized array + * + * RAJA features shown: + * - Basic usage of 'RAJA::launch' abstractions for nested loops + * - Hierachial parallism + * - Dynamic shared memory + * + * If CUDA is enabled, CUDA unified memory is used. + */ + +// +// Define dimensionality of matrices +// +const int DIM = 2; + +// +// Function for checking results +// +template +void checkResult(RAJA::View> Atview, int N_r, int N_c); + +// +// Function for printing results +// +template +void printResult(RAJA::View> Atview, int N_r, int N_c); + +using launch_policy = RAJA::LaunchPolicy< +#if defined(RAJA_ENABLE_OPENMP) + RAJA::omp_launch_t +#else + RAJA::seq_launch_t +#endif +#if defined(RAJA_ENABLE_CUDA) + , + RAJA::cuda_launch_t +#endif +#if defined(RAJA_ENABLE_HIP) + , + RAJA::hip_launch_t +#endif +#if defined(RAJA_ENABLE_SYCL) + , + RAJA::sycl_launch_t +#endif + >; + +/* + * Define team policies. + * Up to 3 dimension are supported: x,y,z + */ +using outer0 = RAJA::LoopPolicy< + RAJA::loop_exec +#if defined(RAJA_ENABLE_CUDA) + , + RAJA::cuda_block_x_direct +#endif +#if defined(RAJA_ENABLE_HIP) + , + RAJA::hip_block_x_direct +#endif +#if defined(RAJA_ENABLE_SYCL) + , + RAJA::sycl_group_0_direct +#endif + >; + +using outer1 = RAJA::LoopPolicy< +#if defined(RAJA_ENABLE_OPENMP) + RAJA::omp_for_exec +#else + RAJA::loop_exec +#endif +#if defined(RAJA_ENABLE_CUDA) + , + RAJA::cuda_block_y_direct +#endif +#if defined(RAJA_ENABLE_HIP) + , + RAJA::hip_block_y_direct +#endif +#if defined(RAJA_ENABLE_SYCL) + , + RAJA::sycl_group_1_direct +#endif + >; +/* + * Define thread policies. 
+ * Up to 3 dimension are supported: x,y,z + */ +using inner0 = RAJA::LoopPolicy< + RAJA::loop_exec +#if defined(RAJA_ENABLE_CUDA) + , + RAJA::cuda_thread_x_direct +#endif +#if defined(RAJA_ENABLE_HIP) + , + RAJA::hip_thread_x_direct +#endif +#if defined(RAJA_ENABLE_SYCL) + , + RAJA::sycl_local_0_direct +#endif + >; + +using inner1 = RAJA::LoopPolicy; + +template +void switch_ptrs(T *A, T *d_A) +{ + T *tmp_ptr; + tmp_ptr = d_A; + d_A = A; + A = tmp_ptr; +} + +int main(int argc, char *argv[]) +{ + + std::cout << "\n\nRAJA matrix transpose example...\n"; + + if(argc != 2) { + RAJA_ABORT_OR_THROW("Usage ./dynamic_mat_transpose host or ./dynamic_mat_transpose device"); + } + + // + // Run time policy section is demonstrated in this example by specifying + // kernel exection space as a command line argument (host or device). + // Example usage ./dynamic_mat_transpose host or ./dynamic_mat_transpose device + // + std::string exec_space = argv[1]; + if(!(exec_space.compare("host") == 0 || exec_space.compare("device") == 0 )){ + RAJA_ABORT_OR_THROW("Usage ./dynamic_mat_transpose host or ./dynamic_mat_transpose device"); + return 0; + } + + RAJA::ExecPlace select_cpu_or_gpu; + if(exec_space.compare("host") == 0) + { select_cpu_or_gpu = RAJA::ExecPlace::HOST; printf("Running RAJA::launch reductions example on the host \n"); } + if(exec_space.compare("device") == 0) + { select_cpu_or_gpu = RAJA::ExecPlace::DEVICE; printf("Running RAJA::launch reductions example on the device \n"); } + + + +#if defined(RAJA_ENABLE_SYCL) + memoryManager::sycl_res = new camp::resources::Resource{camp::resources::Sycl()}; + ::RAJA::sycl::detail::setQueue(memoryManager::sycl_res); +#endif + + // + // Define num rows/cols in matrix, tile dimensions, and number of tiles + // + // _mattranspose_localarray_dims_start + const int N_r = 267; + const int N_c = 251; + + const int TILE_DIM = 16; + + const int outer_Dimc = (N_c - 1) / TILE_DIM + 1; + const int outer_Dimr = (N_r - 1) / TILE_DIM + 1; + // _mattranspose_localarray_dims_end + + // + // Allocate matrix data + // + int *A = memoryManager::allocate(N_r * N_c); + int *At = memoryManager::allocate(N_r * N_c); + + // + // In the following implementations of matrix transpose, we + // use RAJA 'View' objects to access the matrix data. A RAJA view + // holds a pointer to a data array and enables multi-dimensional indexing + // into the data. + // + // _mattranspose_localarray_views_start + RAJA::View> Aview(A, N_r, N_c); + RAJA::View> Atview(At, N_c, N_r); + // _mattranspose_localarray_views_end + + // + // Initialize matrix data + // + for (int row = 0; row < N_r; ++row) { + for (int col = 0; col < N_c; ++col) { + Aview(row, col) = col; + } + } + //printResult(Aview, N_r, N_c); + + //----------------------------------------------------------------------------// + std::cout << "\n Running C-version of shared matrix transpose...\n"; + + std::memset(At, 0, N_r * N_c * sizeof(int)); + + // _mattranspose_localarray_cstyle_start + // + // (0) Outer loops to iterate over tiles + // + for (int by = 0; by < outer_Dimr; ++by) { + for (int bx = 0; bx < outer_Dimc; ++bx) { + + // Stack-allocated local array for data on a tile + int Tile[TILE_DIM][TILE_DIM]; + + // + // (1) Inner loops to read input matrix tile data into the array + // + // Note: loops are ordered so that input matrix data access + // is stride-1. 
+ // + for (int ty = 0; ty < TILE_DIM; ++ty) { + for (int tx = 0; tx < TILE_DIM; ++tx) { + + int col = bx * TILE_DIM + tx; // Matrix column index + int row = by * TILE_DIM + ty; // Matrix row index + + // Bounds check + if (row < N_r && col < N_c) { + Tile[ty][tx] = Aview(row, col); + } + } + } + + // + // (2) Inner loops to write array data into output array tile + // + // Note: loop order is swapped from above so that output matrix + // data access is stride-1. + // + for (int tx = 0; tx < TILE_DIM; ++tx) { + for (int ty = 0; ty < TILE_DIM; ++ty) { + + int col = bx * TILE_DIM + tx; // Matrix column index + int row = by * TILE_DIM + ty; // Matrix row index + + // Bounds check + if (row < N_r && col < N_c) { + Atview(col, row) = Tile[ty][tx]; + } + } + } + + } + } + // _mattranspose_localarray_cstyle_end + + checkResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); + + //----------------------------------------------------------------------------// + + std::cout << "\n Running RAJA matrix transpose w/ dynamic shared memory ...\n"; + +#if defined(RAJA_ENABLE_HIP) + + //Hip requires device side pointers + int *d_A = nullptr, *d_At = nullptr; + + if(select_cpu_or_gpu == RAJA::ExecPlace::DEVICE) { + d_A = memoryManager::allocate_gpu(N_r * N_c); + d_At = memoryManager::allocate_gpu(N_r * N_c); + + hipErrchk(hipMemcpy( d_A, A, N_r * N_c * sizeof(int), hipMemcpyHostToDevice )); + + //switch host/device pointers so we can reuse the views + switch_ptrs(d_A, A); + switch_ptrs(d_At, At); + } +#endif + + + constexpr size_t dynamic_shared_mem_size = TILE_DIM * TILE_DIM * sizeof(int); + + RAJA::launch + (select_cpu_or_gpu, + RAJA::LaunchParams(RAJA::Teams(outer_Dimr, outer_Dimc), + RAJA::Threads(TILE_DIM, TILE_DIM), dynamic_shared_mem_size), + "Matrix tranpose with dynamic shared memory kernel", + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) + { + + RAJA::loop(ctx, RAJA::RangeSegment(0, outer_Dimr), [&] (int by){ + RAJA::loop(ctx, RAJA::RangeSegment(0, outer_Dimc), [&] (int bx){ + + //Request memory from shared memory pool + int * tile_ptr = ctx.getSharedMemory(TILE_DIM * TILE_DIM); + + //Use RAJA View for simplified indexing + RAJA::View> Tile(tile_ptr, TILE_DIM, TILE_DIM); + + RAJA::loop(ctx, RAJA::RangeSegment(0, TILE_DIM), [&] (int ty){ + RAJA::loop(ctx, RAJA::RangeSegment(0, TILE_DIM), [&] (int tx){ + + int col = bx * TILE_DIM + tx; // Matrix column index + int row = by * TILE_DIM + ty; // Matrix row index + + // Bounds check + if (row < N_r && col < N_c) { + Tile(ty,tx) = Aview(row, col); + } + + }); + }); + + //Barrier is needed to ensure all threads have written to Tile + ctx.teamSync(); + + RAJA::loop(ctx, RAJA::RangeSegment(0, TILE_DIM), [&] (int ty){ + RAJA::loop(ctx, RAJA::RangeSegment(0, TILE_DIM), [&] (int tx){ + + int col = bx * TILE_DIM + tx; // Matrix column index + int row = by * TILE_DIM + ty; // Matrix row index + + // Bounds check + if (row < N_r && col < N_c) { + Atview(col, row) = Tile(ty, tx); + } + + }); + }); + + //The launch context uses bump style allocator to return different segments of shared memory + //to avoid requesting beyond the pre-allocated memory quantity we reset the allocator offset counter + //effectively releasing shared memory. 
+ ctx.releaseSharedMemory(); + + }); + }); + + }); + + +#if defined(RAJA_ENABLE_HIP) + if(select_cpu_or_gpu == RAJA::ExecPlace::DEVICE) { + switch_ptrs(d_At, At); + switch_ptrs(d_A, A); + + hipErrchk(hipMemcpy( d_At, At, N_r * N_c * sizeof(int), hipMemcpyDeviceToHost )); + } +#endif + + + checkResult(Atview, N_c, N_r); + //----------------------------------------------------------------------------// + + return 0; +} + + +// +// Function to check result and report P/F. +// +template +void checkResult(RAJA::View> Atview, int N_r, int N_c) +{ + bool match = true; + for (int row = 0; row < N_r; ++row) { + for (int col = 0; col < N_c; ++col) { + if (Atview(row, col) != row) { + match = false; + } + } + } + if (match) { + std::cout << "\n\t result -- PASS\n"; + } else { + std::cout << "\n\t result -- FAIL\n"; + } +}; + +// +// Function to print result. +// +template +void printResult(RAJA::View> Atview, int N_r, int N_c) +{ + std::cout << std::endl; + for (int row = 0; row < N_r; ++row) { + for (int col = 0; col < N_c; ++col) { + //std::cout << "At(" << row << "," << col << ") = " << Atview(row, col) + //<< std::endl; + printf("%d ",Atview(row, col)); + } + std::cout << "" << std::endl; + } + std::cout << std::endl; +} diff --git a/examples/forall-param-reductions.cpp b/examples/forall-param-reductions.cpp new file mode 100644 index 0000000000..9bb1aa62cb --- /dev/null +++ b/examples/forall-param-reductions.cpp @@ -0,0 +1,336 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-22, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include +#include +#include + +#include "memoryManager.hpp" + +#include "RAJA/RAJA.hpp" + +/* + * Reduction Example + * + * This example illustrates use of the RAJA reduction types: min, max, + * sum, min-loc, and max-loc. + * + * RAJA features shown: + * - `forall` loop iteration template method + * - Index range segment + * - Execution policies + * - Reduction types + * + * If CUDA is enabled, CUDA unified memory is used. + */ + +/* + Specify the number of threads in a GPU thread block +*/ +#if defined(RAJA_ENABLE_CUDA) +constexpr int CUDA_BLOCK_SIZE = 256; +#endif + +#if defined(RAJA_ENABLE_HIP) +constexpr int HIP_BLOCK_SIZE = 256; +#endif + +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) +{ + + std::cout << "\n\nRAJA reductions example...\n"; + + // _reductions_array_init_start +// +// Define array length +// + constexpr int N = 1000000; + +// +// Allocate array data and initialize data to alternating sequence of 1, -1. 
+// + int* a = memoryManager::allocate(N); + + for (int i = 0; i < N; ++i) { + if ( i % 2 == 0 ) { + a[i] = 1; + } else { + a[i] = -1; + } + } + +// +// Set min and max loc values +// + constexpr int minloc_ref = N / 2; + a[minloc_ref] = -100; + + constexpr int maxloc_ref = N / 2 + 1; + a[maxloc_ref] = 100; + // _reductions_array_init_end + +// +// Note: with this data initialization scheme, the following results will +// be observed for all reduction kernels below: +// +// - the sum will be zero +// - the min will be -100 +// - the max will be 100 +// - the min loc will be N/2 +// - the max loc will be N/2 + 1 +// +// + +// +// Define index range for iterating over a elements in all examples +// + // _reductions_range_start + RAJA::TypedRangeSegment arange(0, N); + // _reductions_range_end + +// +// Define ValLoc Type +// + + using VALLOC_INT = RAJA::expt::ValLoc; +//----------------------------------------------------------------------------// + + std::cout << "\n Running RAJA sequential reductions...\n"; + + // _reductions_raja_seq_start + using EXEC_POL1 = RAJA::seq_exec; + + int seq_sum = 0; + int seq_min = std::numeric_limits::max(); + int seq_max = std::numeric_limits::min(); + VALLOC_INT seq_minloc(std::numeric_limits::max(), -1); + VALLOC_INT seq_maxloc(std::numeric_limits::min(), -1); + + RAJA::forall(arange, + RAJA::expt::Reduce(&seq_sum), + RAJA::expt::Reduce(&seq_min), + RAJA::expt::Reduce(&seq_max), + RAJA::expt::Reduce(&seq_minloc), + RAJA::expt::Reduce(&seq_maxloc), + [=](int i, int &_seq_sum, int &_seq_min, int &_seq_max, VALLOC_INT &_seq_minloc, VALLOC_INT &_seq_maxloc) { + _seq_sum += a[i]; + + _seq_min = RAJA_MIN(a[i], _seq_min); + _seq_max = RAJA_MAX(a[i], _seq_max); + + _seq_minloc = RAJA_MIN(VALLOC_INT(a[i], i), _seq_minloc); + _seq_maxloc = RAJA_MAX(VALLOC_INT(a[i], i), _seq_maxloc); + //_seq_minloc.min(a[i], i); + //_seq_maxloc.max(a[i], i); + // Note : RAJA::expt::ValLoc objects provide min() and max() methods + // that are equivalent to the assignments with RAJA_MIN and RAJA_MAX + // above. 
+ } + ); + + std::cout << "\tsum = " << seq_sum << std::endl; + std::cout << "\tmin = " << seq_min << std::endl; + std::cout << "\tmax = " << seq_max << std::endl; + std::cout << "\tmin, loc = " << seq_minloc.getVal() << " , " + << seq_minloc.getLoc() << std::endl; + std::cout << "\tmax, loc = " << seq_maxloc.getVal() << " , " + << seq_maxloc.getLoc() << std::endl; + // _reductions_raja_seq_end + + +//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_OPENMP) + std::cout << "\n Running RAJA OpenMP reductions...\n"; + + // _reductions_raja_omppolicy_start + using EXEC_POL2 = RAJA::omp_parallel_for_exec; + // _reductions_raja_omppolicy_end + + int omp_sum = 0; + int omp_min = std::numeric_limits::max(); + int omp_max = std::numeric_limits::min(); + VALLOC_INT omp_minloc(std::numeric_limits::max(), -1); + VALLOC_INT omp_maxloc(std::numeric_limits::min(), -1); + + RAJA::forall(arange, + RAJA::expt::Reduce(&omp_sum), + RAJA::expt::Reduce(&omp_min), + RAJA::expt::Reduce(&omp_max), + RAJA::expt::Reduce(&omp_minloc), + RAJA::expt::Reduce(&omp_maxloc), + [=](int i, int &_omp_sum, int &_omp_min, int &_omp_max, VALLOC_INT &_omp_minloc, VALLOC_INT &_omp_maxloc) { + _omp_sum += a[i]; + + _omp_min = RAJA_MIN(a[i], _omp_min); + _omp_max = RAJA_MAX(a[i], _omp_max); + + _omp_minloc = RAJA_MIN(VALLOC_INT(a[i], i), _omp_minloc); + _omp_maxloc = RAJA_MAX(VALLOC_INT(a[i], i), _omp_maxloc); + //_omp_minloc.min(a[i], i); + //_omp_maxloc.max(a[i], i); + } + ); + + std::cout << "\tsum = " << omp_sum << std::endl; + std::cout << "\tmin = " << omp_min << std::endl; + std::cout << "\tmax = " << omp_max << std::endl; + std::cout << "\tmin, loc = " << omp_minloc.getVal() << " , " + << omp_minloc.getLoc() << std::endl; + std::cout << "\tmax, loc = " << omp_maxloc.getVal() << " , " + << omp_maxloc.getLoc() << std::endl; + +#endif + +//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_TARGET_OPENMP) + std::cout << "\n Running RAJA OpenMP Target reductions...\n"; + + // _reductions_raja_omppolicy_start + using EXEC_POL3 = RAJA::omp_target_parallel_for_exec_nt; + // _reductions_raja_omppolicy_end + + int omp_t_sum = 0; + int omp_t_min = std::numeric_limits::max(); + int omp_t_max = std::numeric_limits::min(); + VALLOC_INT omp_t_minloc(std::numeric_limits::max(), -1); + VALLOC_INT omp_t_maxloc(std::numeric_limits::min(), -1); + + RAJA::forall(arange, + RAJA::expt::Reduce(&omp_t_sum), + RAJA::expt::Reduce(&omp_t_min), + RAJA::expt::Reduce(&omp_t_max), + RAJA::expt::Reduce(&omp_t_minloc), + RAJA::expt::Reduce(&omp_t_maxloc), + [=](int i, int &_omp_t_sum, int &_omp_t_min, int &_omp_t_max, VALLOC_INT &_omp_t_minloc, VALLOC_INT &_omp_t_maxloc) { + _omp_t_sum += a[i]; + + _omp_t_min = RAJA_MIN(a[i], _omp_t_min); + _omp_t_max = RAJA_MAX(a[i], _omp_t_max); + + _omp_t_minloc = RAJA_MIN(VALLOC_INT(a[i], i), _omp_t_minloc); + _omp_t_maxloc = RAJA_MAX(VALLOC_INT(a[i], i), _omp_t_maxloc); + //_omp_t_minloc.min(a[i], i); + //_omp_t_maxloc.max(a[i], i); + } + ); + + std::cout << "\tsum = " << omp_t_sum << std::endl; + std::cout << "\tmin = " << omp_t_min << std::endl; + std::cout << "\tmax = " << omp_t_max << std::endl; + std::cout << "\tmin, loc = " << omp_t_minloc.getVal() << " , " + << omp_t_minloc.getLoc() << std::endl; + std::cout << "\tmax, loc = " << omp_t_maxloc.getVal() << " , " + << omp_t_maxloc.getLoc() << std::endl; + +#endif + + +//----------------------------------------------------------------------------// + +#if 
defined(RAJA_ENABLE_CUDA) + std::cout << "\n Running RAJA CUDA reductions...\n"; + + // _reductions_raja_cudapolicy_start + using EXEC_POL3 = RAJA::cuda_exec; + // _reductions_raja_cudapolicy_end + + int cuda_sum = 0; + int cuda_min = std::numeric_limits::max(); + int cuda_max = std::numeric_limits::min(); + VALLOC_INT cuda_minloc(std::numeric_limits::max(), -1); + VALLOC_INT cuda_maxloc(std::numeric_limits::min(), -1); + + RAJA::forall(arange, + RAJA::expt::Reduce(&cuda_sum), + RAJA::expt::Reduce(&cuda_min), + RAJA::expt::Reduce(&cuda_max), + RAJA::expt::Reduce(&cuda_minloc), + RAJA::expt::Reduce(&cuda_maxloc), + [=] RAJA_DEVICE (int i, int &_cuda_sum, int &_cuda_min, int &_cuda_max, VALLOC_INT &_cuda_minloc, VALLOC_INT &_cuda_maxloc) { + _cuda_sum += a[i]; + + _cuda_min = RAJA_MIN(a[i], _cuda_min); + _cuda_max = RAJA_MAX(a[i], _cuda_max); + + _cuda_minloc = RAJA_MIN(VALLOC_INT(a[i], i), _cuda_minloc); + _cuda_maxloc = RAJA_MAX(VALLOC_INT(a[i], i), _cuda_maxloc); + //_cuda_minloc.min(a[i], i); + //_cuda_maxloc.max(a[i], i); + } + ); + + std::cout << "\tsum = " << cuda_sum << std::endl; + std::cout << "\tmin = " << cuda_min << std::endl; + std::cout << "\tmax = " << cuda_max << std::endl; + std::cout << "\tmin, loc = " << cuda_minloc.getVal() << " , " + << cuda_minloc.getLoc() << std::endl; + std::cout << "\tmax, loc = " << cuda_maxloc.getVal() << " , " + << cuda_maxloc.getLoc() << std::endl; + +#endif + +//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_HIP) + std::cout << "\n Running RAJA HIP reductions...\n"; + + int* d_a = memoryManager::allocate_gpu(N); + hipErrchk(hipMemcpy( d_a, a, N * sizeof(int), hipMemcpyHostToDevice )); + + // _reductions_raja_hippolicy_start + using EXEC_POL3 = RAJA::hip_exec; + // _reductions_raja_hippolicy_end + + int hip_sum = 0; + int hip_min = std::numeric_limits::max(); + int hip_max = std::numeric_limits::min(); + VALLOC_INT hip_minloc(std::numeric_limits::max(), -1); + VALLOC_INT hip_maxloc(std::numeric_limits::min(), -1); + + RAJA::forall(arange, + RAJA::expt::Reduce(&hip_sum), + RAJA::expt::Reduce(&hip_min), + RAJA::expt::Reduce(&hip_max), + RAJA::expt::Reduce(&hip_minloc), + RAJA::expt::Reduce(&hip_maxloc), + [=] RAJA_DEVICE (int i, int &_hip_sum, int &_hip_min, int &_hip_max, VALLOC_INT &_hip_minloc, VALLOC_INT &_hip_maxloc) { + _hip_sum += d_a[i]; + + _hip_min = RAJA_MIN(d_a[i], _hip_min); + _hip_max = RAJA_MAX(d_a[i], _hip_max); + + _hip_minloc = RAJA_MIN(VALLOC_INT(d_a[i], i), _hip_minloc); + _hip_maxloc = RAJA_MAX(VALLOC_INT(d_a[i], i), _hip_maxloc); + //_hip_minloc.min(d_a[i], i); + //_hip_maxloc.max(d_a[i], i); + } + ); + + std::cout << "\tsum = " << hip_sum << std::endl; + std::cout << "\tmin = " << hip_min << std::endl; + std::cout << "\tmax = " << hip_max << std::endl; + std::cout << "\tmin, loc = " << hip_minloc.getVal() << " , " + << hip_minloc.getLoc() << std::endl; + std::cout << "\tmax, loc = " << hip_maxloc.getVal() << " , " + << hip_maxloc.getLoc() << std::endl; + + memoryManager::deallocate_gpu(d_a); +#endif + +//----------------------------------------------------------------------------// + +// +// Clean up. 
+// + memoryManager::deallocate(a); + + std::cout << "\n DONE!...\n"; + + return 0; +} diff --git a/examples/teams_flatten.cpp b/examples/launch_flatten.cpp similarity index 64% rename from examples/teams_flatten.cpp rename to examples/launch_flatten.cpp index 9ea516e5a3..c8171126c1 100644 --- a/examples/teams_flatten.cpp +++ b/examples/launch_flatten.cpp @@ -14,10 +14,10 @@ #include "RAJA/RAJA.hpp" /* - * Thread Flatten Example using RAJA Teams + * Thread Flatten Example using RAJA Launch * * This example illustrates use of the "flatten" - * policy inside RAJA Teams + * policy inside RAJA Launch * * The flatten policy enables reshaping * multi-dimensional thread teams to 1D @@ -34,16 +34,16 @@ */ #if defined(RAJA_ENABLE_CUDA) -using device_launch = RAJA::expt::LaunchPolicy>; -using device_inner_pol0 = RAJA::expt::LoopPolicy; -using device_inner_pol1 = RAJA::expt::LoopPolicy; -using device_flatten_pol = RAJA::expt::LoopPolicy; +using device_launch = RAJA::LaunchPolicy>; +using device_inner_pol0 = RAJA::LoopPolicy; +using device_inner_pol1 = RAJA::LoopPolicy; +using device_flatten_pol = RAJA::LoopPolicy; using reduce_policy = RAJA::cuda_reduce; #elif defined(RAJA_ENABLE_HIP) -using device_launch = RAJA::expt::LaunchPolicy>; -using device_inner_pol0 = RAJA::expt::LoopPolicy; -using device_inner_pol1 = RAJA::expt::LoopPolicy; -using device_flatten_pol = RAJA::expt::LoopPolicy; +using device_launch = RAJA::LaunchPolicy>; +using device_inner_pol0 = RAJA::LoopPolicy; +using device_inner_pol1 = RAJA::LoopPolicy; +using device_flatten_pol = RAJA::LoopPolicy; using reduce_policy = RAJA::hip_reduce; #endif @@ -51,8 +51,8 @@ using reduce_policy = RAJA::hip_reduce; * Define device launch policies */ -using host_launch = RAJA::expt::LaunchPolicy; -using host_loop = RAJA::expt::LoopPolicy; +using host_launch = RAJA::LaunchPolicy; +using host_loop = RAJA::LoopPolicy; int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) { @@ -68,9 +68,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Configure grid size // - RAJA::expt::Grid grid(RAJA::expt::Teams(1), - RAJA::expt::Threads(N, N), - "Teams Flatten Kernel"); + RAJA::LaunchParams launch_params(RAJA::Teams(1), + RAJA::Threads(N, N)); + // // Resource object for host, used to allocate memory @@ -97,13 +97,12 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::View> d_A_2DView(d_A_ptr, N, N); RAJA::View> d_A_1DView(d_A_ptr, NN); - - RAJA::expt::launch - (grid, [=] RAJA_HOST_DEVICE (RAJA::expt::LaunchContext ctx) + RAJA::launch + (launch_params, [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, N), [&] (int j) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, N), [&] (int i) { + RAJA::loop(ctx, RAJA::RangeSegment(0, N), [&] (int j) { + RAJA::loop(ctx, RAJA::RangeSegment(0, N), [&] (int i) { d_A_2DView(j, i) = i + j; }); }); @@ -112,7 +111,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // RAJA flatten policy will reshape a 2/3D thread team to 1D simplifying // accumulating memory contents - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, NN), [&] (int i) { + RAJA::loop(ctx, RAJA::RangeSegment(0, NN), [&] (int i) { device_kernel_sum += d_A_1DView(i); }); @@ -126,12 +125,12 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::View> h_A_2DView(h_A_ptr, N, N); RAJA::View> h_A_1DView(h_A_ptr, NN); - RAJA::expt::launch - (grid, [=] RAJA_HOST_DEVICE (RAJA::expt::LaunchContext ctx) + RAJA::launch + (launch_params, 
[=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, N), [&] (int j) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, N), [&] (int i) { + RAJA::loop(ctx, RAJA::RangeSegment(0, N), [&] (int j) { + RAJA::loop(ctx, RAJA::RangeSegment(0, N), [&] (int i) { h_A_2DView(j, i) = i + j; }); }); @@ -140,7 +139,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) //As loops are dispatched as standard C loops we can revert to using //a regular loop_exec policy - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, NN), [&] (int i) { + RAJA::loop(ctx, RAJA::RangeSegment(0, NN), [&] (int i) { host_kernel_sum += h_A_1DView(i); }); diff --git a/examples/teams_matrix-multiply.cpp b/examples/launch_matrix-multiply.cpp similarity index 77% rename from examples/teams_matrix-multiply.cpp rename to examples/launch_matrix-multiply.cpp index 01282ddce5..bf2f403573 100644 --- a/examples/teams_matrix-multiply.cpp +++ b/examples/launch_matrix-multiply.cpp @@ -15,15 +15,15 @@ #include "RAJA/RAJA.hpp" /* - * Matrix Multiplication Examples using RAJA Teams + * Matrix Multiplication Examples using RAJA Launch * * Example computes the product of two square matrices and introduces - * RAJA Teams loop capabilities via a sequence of implementations. + * RAJA Launch loop capabilities via a sequence of implementations. * * RAJA features shown: * - Index range segment * - View abstraction - * - Basic usage of 'RAJA Teams' abstractions for nested loops + * - Basic usage of 'RAJA Launch' abstractions for nested loops * * If CUDA is enabled, CUDA unified memory is used. */ @@ -37,15 +37,15 @@ /* * Define host/device launch policies */ -using launch_policy = RAJA::expt::LaunchPolicy< - RAJA::expt::seq_launch_t +using launch_policy = RAJA::LaunchPolicy< + RAJA::seq_launch_t #if defined(RAJA_ENABLE_CUDA) , - RAJA::expt::cuda_launch_t + RAJA::cuda_launch_t #endif #if defined(RAJA_ENABLE_HIP) , - RAJA::expt::hip_launch_t + RAJA::hip_launch_t #endif >; @@ -56,9 +56,9 @@ using gpu_block_x_policy = RAJA::cuda_block_x_direct; using gpu_block_y_policy = RAJA::cuda_block_y_direct; using gpu_thread_x_policy = RAJA::cuda_thread_x_loop; using gpu_thread_y_policy = RAJA::cuda_thread_y_loop; -using gpu_global_thread_x_policy = RAJA::expt::cuda_global_thread_x; -using gpu_global_thread_y_policy = RAJA::expt::cuda_global_thread_y; -using gpu_global_thread_xy_policy = RAJA::expt::cuda_global_thread_xy; +using gpu_global_thread_x_policy = RAJA::cuda_global_thread_x; +using gpu_global_thread_y_policy = RAJA::cuda_global_thread_y; +using gpu_global_thread_xy_policy = RAJA::cuda_global_thread_xy; #endif #if defined(RAJA_ENABLE_HIP) @@ -66,56 +66,50 @@ using gpu_block_x_policy = RAJA::hip_block_x_direct; using gpu_block_y_policy = RAJA::hip_block_y_direct; using gpu_thread_x_policy = RAJA::hip_thread_x_loop; using gpu_thread_y_policy = RAJA::hip_thread_y_loop; -using gpu_global_thread_x_policy = RAJA::expt::hip_global_thread_x; -using gpu_global_thread_y_policy = RAJA::expt::hip_global_thread_y; -using gpu_global_thread_xy_policy = RAJA::expt::hip_global_thread_xy; +using gpu_global_thread_x_policy = RAJA::hip_global_thread_x; +using gpu_global_thread_y_policy = RAJA::hip_global_thread_y; +using gpu_global_thread_xy_policy = RAJA::hip_global_thread_xy; #endif /* Define RAJA Team/Thread policies, if a device is available add a device policy. 
*/ -using teams_x = RAJA::expt::LoopPolicy; + >; -using teams_y = RAJA::expt::LoopPolicy; + >; -using threads_x = RAJA::expt::LoopPolicy; + >; -using threads_y = RAJA::expt::LoopPolicy; + >; -using global_thread_x = RAJA::expt::LoopPolicy; + >; -using global_thread_y = RAJA::expt::LoopPolicy; + >; // // Define dimensionality of matrices. @@ -320,13 +314,13 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) //two for loops. // _matmult_basickernel_start - RAJA::expt::launch(RAJA::expt::HOST, - RAJA::expt::Grid(RAJA::expt::Teams(NTeams,NTeams), - RAJA::expt::Threads(THREAD_SZ,THREAD_SZ)), - [=] RAJA_HOST_DEVICE(RAJA::expt::LaunchContext ctx) { + RAJA::launch(RAJA::ExecPlace::HOST, + RAJA::LaunchParams(RAJA::Teams(NTeams,NTeams), + RAJA::Threads(THREAD_SZ,THREAD_SZ)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { - RAJA::expt::loop(ctx, col_range, [&] (int col) { - RAJA::expt::loop(ctx, row_range, [&] (int row) { + RAJA::loop(ctx, col_range, [&] (int col) { + RAJA::loop(ctx, row_range, [&] (int row) { double dot = 0.0; for (int k = 0; k < N; ++k) { @@ -355,17 +349,17 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) //recompiling execution policies. When running exclusively on the host //the compute grid may be left uninitialized as loop methods get expanded to //standard C style loops. - using omp_launch_policy = RAJA::expt::LaunchPolicy; + using omp_launch_policy = RAJA::LaunchPolicy; - using omp_col_policy0 = RAJA::expt::LoopPolicy; + using omp_col_policy0 = RAJA::LoopPolicy; - using omp_row_policy0 = RAJA::expt::LoopPolicy; + using omp_row_policy0 = RAJA::LoopPolicy; - RAJA::expt::launch(RAJA::expt::Grid(), - [=] RAJA_HOST_DEVICE(RAJA::expt::LaunchContext ctx) { + RAJA::launch(RAJA::LaunchParams(), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { - RAJA::expt::loop(ctx, col_range, [&] (int col) { - RAJA::expt::loop(ctx, row_range, [&] (int row) { + RAJA::loop(ctx, col_range, [&] (int col) { + RAJA::loop(ctx, row_range, [&] (int row) { double dot = 0.0; for (int k = 0; k < N; ++k) { @@ -391,11 +385,11 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // This is the same as using an OpenMP 'parallel for' directive on the // outer loop with a 'collapse(2) clause. // - using global_thread_xy = RAJA::expt::LoopPolicy; + using global_thread_xy = RAJA::LoopPolicy; - RAJA::expt::launch(RAJA::expt::HOST, - RAJA::expt::Grid(), - [=] RAJA_HOST_DEVICE(RAJA::expt::LaunchContext ctx) { + RAJA::launch(RAJA::ExecPlace::HOST, + RAJA::LaunchParams(), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { RAJA::expt::loop(ctx, col_range, row_range, [&] (int col, int row) { @@ -431,13 +425,13 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // and col = threadIdx.x in the kernel. // // - RAJA::expt::launch(RAJA::expt::DEVICE, - RAJA::expt::Grid(RAJA::expt::Teams(N), - RAJA::expt::Threads(N)), - [=] RAJA_HOST_DEVICE(RAJA::expt::LaunchContext ctx) { + RAJA::launch(RAJA::ExecPlace::DEVICE, + RAJA::LaunchParams(RAJA::Teams(N), + RAJA::Threads(N)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { - RAJA::expt::loop(ctx, col_range, [&] (int col) { - RAJA::expt::loop(ctx, row_range, [&] (int row) { + RAJA::loop(ctx, col_range, [&] (int col) { + RAJA::loop(ctx, row_range, [&] (int row) { double dot = 0.0; for (int k = 0; k < N; ++k) { @@ -467,18 +461,18 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // The tiling capabilities in RAJA will also mask out of bounds iterations. 
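A minimal, host-only sketch of the RAJA::tile / RAJA::loop pattern the tiled kernels below rely on, assuming a RAJA build in which launch has been promoted out of the expt namespace (the point of this change); the policy aliases, the 10-element range, and the tile size of 4 are illustrative choices, not part of the example files. Because 10 is not a multiple of 4, the final tile is shortened automatically, which is the out-of-bounds masking the comment above refers to.

#include "RAJA/RAJA.hpp"
#include <cstdio>

// Host-only policies; the example files add CUDA/HIP alternatives.
using launch_pol = RAJA::LaunchPolicy<RAJA::seq_launch_t>;
using tile_pol   = RAJA::LoopPolicy<RAJA::loop_exec>;
using loop_pol   = RAJA::LoopPolicy<RAJA::loop_exec>;

int main()
{
  constexpr int N = 10;       // not a multiple of the tile size
  constexpr int TILE_SZ = 4;  // RAJA::tile trims the last, partial tile

  RAJA::launch<launch_pol>(
    RAJA::LaunchParams(),  // grid may stay empty for host-only execution
    [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) {

      RAJA::tile<tile_pol>(ctx, TILE_SZ, RAJA::RangeSegment(0, N),
        [&] (RAJA::RangeSegment const &tile) {

          // Each tile is itself a segment; iterate over its entries.
          RAJA::loop<loop_pol>(ctx, tile, [&] (int i) {
            printf("tile covering i = %d\n", i);
          });
        });
    });

  return 0;
}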
// - RAJA::expt::launch(RAJA::expt::DEVICE, - RAJA::expt::Grid(RAJA::expt::Teams(NTeams,NTeams), - RAJA::expt::Threads(THREAD_SZ,THREAD_SZ)), - [=] RAJA_HOST_DEVICE(RAJA::expt::LaunchContext ctx) { + RAJA::launch(RAJA::ExecPlace::DEVICE, + RAJA::LaunchParams(RAJA::Teams(NTeams,NTeams), + RAJA::Threads(THREAD_SZ,THREAD_SZ)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { - RAJA::expt::tile + RAJA::tile (ctx, THREAD_SZ, row_range, [&] (RAJA::RangeSegment const &row_tile) { - RAJA::expt::tile + RAJA::tile (ctx, THREAD_SZ, col_range, [&] (RAJA::RangeSegment const &col_tile) { - RAJA::expt::loop(ctx, row_tile, [&] (int col) { - RAJA::expt::loop(ctx, col_tile, [&] (int row) { + RAJA::loop(ctx, row_tile, [&] (int col) { + RAJA::loop(ctx, col_tile, [&] (int row) { double dot = 0.0; for (int k = 0; k < N; ++k) { @@ -527,13 +521,13 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // and col = threadIdx.x in the kernel. // // - RAJA::expt::launch(RAJA::expt::DEVICE, - RAJA::expt::Grid(RAJA::expt::Teams(N), - RAJA::expt::Threads(N)), - [=] RAJA_HOST_DEVICE(RAJA::expt::LaunchContext ctx) { + RAJA::launch(RAJA::ExecPlace::DEVICE, + RAJA::LaunchParams(RAJA::Teams(N), + RAJA::Threads(N)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { - RAJA::expt::loop(ctx, col_range, [&] (int col) { - RAJA::expt::loop(ctx, row_range, [&] (int row) { + RAJA::loop(ctx, col_range, [&] (int col) { + RAJA::loop(ctx, row_range, [&] (int row) { double dot = 0.0; for (int k = 0; k < N; ++k) { @@ -567,18 +561,18 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // The tiling capabilities in RAJA will also mask out of bounds iterations. // - RAJA::expt::launch(RAJA::expt::DEVICE, - RAJA::expt::Grid(RAJA::expt::Teams(NTeams,NTeams), - RAJA::expt::Threads(THREAD_SZ,THREAD_SZ)), - [=] RAJA_HOST_DEVICE(RAJA::expt::LaunchContext ctx) { + RAJA::launch(RAJA::ExecPlace::DEVICE, + RAJA::LaunchParams(RAJA::Teams(NTeams,NTeams), + RAJA::Threads(THREAD_SZ,THREAD_SZ)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { - RAJA::expt::tile + RAJA::tile (ctx, THREAD_SZ, row_range, [&] (RAJA::RangeSegment const &row_tile) { - RAJA::expt::tile + RAJA::tile (ctx, THREAD_SZ, col_range, [&] (RAJA::RangeSegment const &col_tile) { - RAJA::expt::loop(ctx, row_tile, [&] (int col) { - RAJA::expt::loop(ctx, col_tile, [&] (int row) { + RAJA::loop(ctx, row_tile, [&] (int col) { + RAJA::loop(ctx, col_tile, [&] (int row) { double dot = 0.0; for (int k = 0; k < N; ++k) { @@ -604,7 +598,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::memset(C, 0, N*N * sizeof(double)); - using seq_loop = RAJA::expt::LoopPolicy; + using seq_loop = RAJA::LoopPolicy; // // This example builds on the RAJA tiling capabilies presented earlier @@ -616,49 +610,49 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // This example also uses the teamSync() method in the launch context // to add a barrier ensuring all threads have loaded/read from shared memory // - RAJA::expt::launch(RAJA::expt::DEVICE, - RAJA::expt::Grid(RAJA::expt::Teams(NTeams,NTeams), - RAJA::expt::Threads(THREAD_SZ,THREAD_SZ)), - [=] RAJA_HOST_DEVICE(RAJA::expt::LaunchContext ctx) { + RAJA::launch(RAJA::ExecPlace::DEVICE, + RAJA::LaunchParams(RAJA::Teams(NTeams,NTeams), + RAJA::Threads(THREAD_SZ,THREAD_SZ)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { // // Loop over teams // - RAJA::expt::tile + RAJA::tile (ctx, THREAD_SZ, row_range, [&] (RAJA::RangeSegment const &y_tile) { - RAJA::expt::tile + RAJA::tile (ctx, 
THREAD_SZ, col_range, [&] (RAJA::RangeSegment const &x_tile) { RAJA_TEAM_SHARED double As[THREAD_SZ][THREAD_SZ]; RAJA_TEAM_SHARED double Bs[THREAD_SZ][THREAD_SZ]; RAJA_TEAM_SHARED double Cs[THREAD_SZ][THREAD_SZ]; - RAJA::expt::loop_icount(ctx, y_tile, [&](int row, int ty) { - RAJA::expt::loop_icount(ctx, x_tile, [&](int col, int tx) { + RAJA::loop_icount(ctx, y_tile, [&](int row, int ty) { + RAJA::loop_icount(ctx, x_tile, [&](int col, int tx) { Cs[ty][tx] = 0.0; }); }); - RAJA::expt::tile + RAJA::tile (ctx, THREAD_SZ, dot_range, [&] (RAJA::RangeSegment const &k_tile) { - RAJA::expt::loop_icount(ctx, y_tile, [&](int row, int ty) { - RAJA::expt::loop_icount(ctx, k_tile, [&](int k_id, int tx) { + RAJA::loop_icount(ctx, y_tile, [&](int row, int ty) { + RAJA::loop_icount(ctx, k_tile, [&](int k_id, int tx) { As[ty][tx] = Aview(row,k_id); }); }); - RAJA::expt::loop_icount(ctx, k_tile, [&](int k_id, int ty) { - RAJA::expt::loop_icount(ctx, x_tile, [&](int col, int tx) { + RAJA::loop_icount(ctx, k_tile, [&](int k_id, int ty) { + RAJA::loop_icount(ctx, x_tile, [&](int col, int tx) { Bs[ty][tx] = Bview(k_id,col); }); }); ctx.teamSync(); - RAJA::expt::loop_icount(ctx, y_tile, [&](int row, int ty) { - RAJA::expt::loop_icount(ctx, x_tile, [&](int col, int tx) { + RAJA::loop_icount(ctx, y_tile, [&](int row, int ty) { + RAJA::loop_icount(ctx, x_tile, [&](int col, int tx) { - RAJA::expt::loop_icount(ctx, k_tile, [&] (int gid, int e) { + RAJA::loop_icount(ctx, k_tile, [&] (int gid, int e) { Cs[ty][tx] += As[ty][e] * Bs[e][tx]; }); @@ -669,8 +663,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); // slide across matrix - RAJA::expt::loop_icount(ctx, y_tile, [&](int row, int ty) { - RAJA::expt::loop_icount(ctx, x_tile, [&](int col, int tx) { + RAJA::loop_icount(ctx, y_tile, [&](int row, int ty) { + RAJA::loop_icount(ctx, x_tile, [&](int col, int tx) { Cview(col,row) = Cs[ty][tx]; }); }); diff --git a/examples/teams_reductions.cpp b/examples/launch_reductions.cpp similarity index 73% rename from examples/teams_reductions.cpp rename to examples/launch_reductions.cpp index 5d2a4d6cbf..2929bf5075 100644 --- a/examples/teams_reductions.cpp +++ b/examples/launch_reductions.cpp @@ -14,7 +14,7 @@ #include "RAJA/RAJA.hpp" /* - * Reduction Example using RAJA Teams + * Reduction Example using RAJA Launch * * This example illustrates use of the RAJA reduction types: min, max, * sum, min-loc, and max-loc. 
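A minimal, host-only sketch of how the reducers named in the comment above are used from inside RAJA::launch, assuming seq_launch_t, loop_exec, and seq_reduce are available in the build; the alias names, array contents, and initial values are illustrative.

#include "RAJA/RAJA.hpp"
#include <cstdio>

using launch_pol = RAJA::LaunchPolicy<RAJA::seq_launch_t>;
using loop_pol   = RAJA::LoopPolicy<RAJA::loop_exec>;
using reduce_pol = RAJA::seq_reduce;

int main()
{
  constexpr int N = 100;
  int* a = new int[N];
  for (int i = 0; i < N; ++i) { a[i] = i - N/2; }

  // Reducers are captured by value and read back on the host after the kernel.
  RAJA::ReduceSum<reduce_pol, int> kernel_sum(0);
  RAJA::ReduceMin<reduce_pol, int> kernel_min(N);

  RAJA::launch<launch_pol>(
    RAJA::LaunchParams(),
    [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) {
      RAJA::loop<loop_pol>(ctx, RAJA::RangeSegment(0, N), [&] (int i) {
        kernel_sum += a[i];
        kernel_min.min(a[i]);
      });
    });

  printf("sum = %d, min = %d\n", kernel_sum.get(), kernel_min.get());
  delete[] a;
  return 0;
}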
@@ -28,32 +28,32 @@ */ #if defined(RAJA_ENABLE_OPENMP) -using host_launch = RAJA::expt::omp_launch_t; +using host_launch = RAJA::omp_launch_t; using host_loop = RAJA::omp_for_exec; #else -using host_launch = RAJA::expt::seq_launch_t; +using host_launch = RAJA::seq_launch_t; using host_loop = RAJA::loop_exec; #endif #if defined(RAJA_ENABLE_CUDA) -using device_launch = RAJA::expt::cuda_launch_t; -using device_loop = RAJA::expt::cuda_global_thread_x; +using device_launch = RAJA::cuda_launch_t; +using device_loop = RAJA::cuda_global_thread_x; #elif defined(RAJA_ENABLE_HIP) -using device_launch = RAJA::expt::hip_launch_t; -using device_loop = RAJA::expt::hip_global_thread_x; +using device_launch = RAJA::hip_launch_t; +using device_loop = RAJA::hip_global_thread_x; #endif -using launch_policy = RAJA::expt::LaunchPolicy; + >; -using loop_pol = RAJA::expt::LoopPolicy; + >; #if defined(RAJA_ENABLE_CUDA) using reduce_policy = RAJA::cuda_reduce; @@ -70,25 +70,25 @@ int main(int argc, char *argv[]) { if(argc != 2) { - RAJA_ABORT_OR_THROW("Usage ./teams_reductions host or ./tut_reductions device"); + RAJA_ABORT_OR_THROW("Usage ./launch_reductions host or ./launch_reductions device"); } // // Run time policy section is demonstrated in this example by specifying // kernel exection space as a command line argument (host or device). - // Example usage ./teams_reductions host or ./teams_reductions device + // Example usage ./launch_reductions host or ./launch_reductions device // std::string exec_space = argv[1]; if(!(exec_space.compare("host") == 0 || exec_space.compare("device") == 0 )){ - RAJA_ABORT_OR_THROW("Usage ./teams_reductions host or ./teams_reductions device"); + RAJA_ABORT_OR_THROW("Usage ./launch_reductions host or ./launch_reductions device"); return 0; } - RAJA::expt::ExecPlace select_cpu_or_gpu; + RAJA::ExecPlace select_cpu_or_gpu; if(exec_space.compare("host") == 0) - { select_cpu_or_gpu = RAJA::expt::HOST; printf("Running RAJA-Teams reductions example on the host \n"); } + { select_cpu_or_gpu = RAJA::ExecPlace::HOST; printf("Running RAJA-Launch reductions example on the host \n"); } if(exec_space.compare("device") == 0) - { select_cpu_or_gpu = RAJA::expt::DEVICE; printf("Running RAJA-Teams reductions example on the device \n"); } + { select_cpu_or_gpu = RAJA::ExecPlace::DEVICE; printf("Running RAJA-Launch reductions example on the device \n"); } // _reductions_array_init_start // @@ -149,15 +149,15 @@ int main(int argc, char *argv[]) const int TEAM_SZ = 256; const int GRID_SZ = RAJA_DIVIDE_CEILING_INT(N,TEAM_SZ); - RAJA::expt::launch + RAJA::launch (select_cpu_or_gpu, - RAJA::expt::Grid(RAJA::expt::Teams(GRID_SZ), - RAJA::expt::Threads(TEAM_SZ), - "Reduction Kernel"), - [=] RAJA_HOST_DEVICE(RAJA::expt::LaunchContext ctx) + RAJA::LaunchParams(RAJA::Teams(GRID_SZ), + RAJA::Threads(TEAM_SZ)), + "Launch Reductions", + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { - RAJA::expt::loop(ctx, arange, [&] (int i) { + RAJA::loop(ctx, arange, [&] (int i) { kernel_sum += a[i]; diff --git a/examples/raja-teams.cpp b/examples/raja-launch.cpp similarity index 73% rename from examples/raja-teams.cpp rename to examples/raja-launch.cpp index 4c54221b0f..24ed83dda7 100644 --- a/examples/raja-teams.cpp +++ b/examples/raja-launch.cpp @@ -15,9 +15,9 @@ /* - * RAJA Teams Example: Upper Triangular Pattern + Shared Memory + * RAJA Launch Example: Upper Triangular Pattern + Shared Memory * - * Teams introduces hierarchal parallelism through the concept of + * Launch introduces hierarchical parallelism through the 
concept of * teams and threads. Computation is executed in a pre-defined grid * composed of threads and grouped into teams. The teams model enables * developers to express parallelism through loops over teams, and inner loops @@ -34,19 +34,19 @@ /* * Define host/device launch policies */ -using launch_policy = RAJA::expt::LaunchPolicy< +using launch_policy = RAJA::LaunchPolicy< #if defined(RAJA_ENABLE_OPENMP) - RAJA::expt::omp_launch_t + RAJA::omp_launch_t #else - RAJA::expt::seq_launch_t + RAJA::seq_launch_t #endif #if defined(RAJA_ENABLE_CUDA) , - RAJA::expt::cuda_launch_t + RAJA::cuda_launch_t #endif #if defined(RAJA_ENABLE_HIP) , - RAJA::expt::hip_launch_t + RAJA::hip_launch_t #endif >; @@ -54,7 +54,7 @@ using launch_policy = RAJA::expt::LaunchPolicy< * Define team policies. * Up to 3 dimension are supported: x,y,z */ -using teams_x = RAJA::expt::LoopPolicy< +using teams_x = RAJA::LoopPolicy< #if defined(RAJA_ENABLE_OPENMP) RAJA::omp_parallel_for_exec #else @@ -73,7 +73,7 @@ using teams_x = RAJA::expt::LoopPolicy< * Define thread policies. * Up to 3 dimension are supported: x,y,z */ -using threads_x = RAJA::expt::LoopPolicy(N_tri * N_tri); } -#if defined(RAJA_DEVICE_ACTIVE) - if (select_cpu_or_gpu == RAJA::expt::DEVICE) { +#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) + if (select_cpu_or_gpu == RAJA::ExecPlace::DEVICE) { Ddat = device_res.allocate(N_tri * N_tri); } #endif /* - * RAJA::expt::launch just starts a "kernel" and doesn't provide any looping. + * RAJA::launch just starts a "kernel" and doesn't provide any looping. * * The first argument determines which policy should be executed, * @@ -144,7 +141,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) * and is used to perform thread synchronizations within a team. 
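The call shape this comment describes, condensed into a host-only sketch: the first argument picks the execution place, LaunchParams sets the team/thread grid, and the LaunchContext supplies teamSync() for barriers within a team. Only a host policy is wired in here, and the alias names and sizes are illustrative.

#include "RAJA/RAJA.hpp"
#include <cstdio>

using launch_pol = RAJA::LaunchPolicy<RAJA::seq_launch_t>;
using team_pol   = RAJA::LoopPolicy<RAJA::loop_exec>;
using thread_pol = RAJA::LoopPolicy<RAJA::loop_exec>;

int main()
{
  constexpr int NT = 4;

  RAJA::launch<launch_pol>(
    RAJA::ExecPlace::HOST,                                   // where to run
    RAJA::LaunchParams(RAJA::Teams(NT), RAJA::Threads(NT)),  // compute grid
    [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) {

      RAJA::loop<team_pol>(ctx, RAJA::RangeSegment(0, NT), [&] (int team) {

        RAJA_TEAM_SHARED int s_val[1];   // visible to all threads of a team

        RAJA::loop<thread_pol>(ctx, RAJA::RangeSegment(0, 1), [&] (int t) {
          s_val[t] = team;
        });

        ctx.teamSync();                  // barrier before other threads read it

        RAJA::loop<thread_pol>(ctx, RAJA::RangeSegment(0, NT), [&] (int t) {
          printf("team %d, thread %d sees s_val = %d\n", team, t, s_val[0]);
        });
      });
    });

  return 0;
}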
*/ - if (select_cpu_or_gpu == RAJA::expt::HOST){ + if (select_cpu_or_gpu == RAJA::ExecPlace::HOST){ std::cout << "\n Running upper triangular pattern example on the host...\n"; }else { std::cout << "\n Running upper triangular pattern example on the device...\n"; @@ -153,35 +150,37 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::View> D(Ddat, N_tri, N_tri); - RAJA::expt::launch(select_cpu_or_gpu, - RAJA::expt::Grid(RAJA::expt::Teams(N_tri), RAJA::expt::Threads(N_tri)), - [=] RAJA_HOST_DEVICE(RAJA::expt::LaunchContext ctx) { + RAJA::launch + (select_cpu_or_gpu, + RAJA::LaunchParams(RAJA::Teams(N_tri), RAJA::Threads(N_tri)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, N_tri), [&](int r) { + RAJA::loop(ctx, RAJA::RangeSegment(0, N_tri), [&](int r) { // Array shared within threads of the same team RAJA_TEAM_SHARED int s_A[1]; - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, 1), [&](int c) { + RAJA::loop(ctx, RAJA::RangeSegment(0, 1), [&](int c) { s_A[c] = r; }); // loop c ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(r, N_tri), [&](int c) { + RAJA::loop(ctx, RAJA::RangeSegment(r, N_tri), [&](int c) { D(r, c) = r * N_tri + c; printf("r=%d, c=%d : D=%d : s_A = %d \n", r, c, D(r, c), s_A[0]); }); // loop c }); // loop r + }); // outer lambda - if (select_cpu_or_gpu == RAJA::expt::HOST) { + if (select_cpu_or_gpu == RAJA::ExecPlace::HOST) { host_res.deallocate(Ddat); } -#if defined(RAJA_DEVICE_ACTIVE) - if (select_cpu_or_gpu == RAJA::expt::DEVICE) { +#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) + if (select_cpu_or_gpu == RAJA::ExecPlace::DEVICE) { device_res.deallocate(Ddat); } #endif diff --git a/examples/resource-dynamic-forall.cpp b/examples/resource-dynamic-forall.cpp new file mode 100644 index 0000000000..bb54d558a5 --- /dev/null +++ b/examples/resource-dynamic-forall.cpp @@ -0,0 +1,171 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-22, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include +#include +#include + +#include "memoryManager.hpp" + +#include "RAJA/RAJA.hpp" + +/* + * Vector Addition Example with resource + dynamic policy selection + * + * Computes c = a + b, where a, b, c are vectors of ints using + * a policy selected at run-time + * + * If CUDA is enabled, CUDA unified memory is used. 
+ */ + +// +// Functions for checking and printing results +// +void checkResult(int* res, int len); +void printResult(int* res, int len); + +using policy_list = camp::list + ,RAJA::cuda_exec<512> +#endif + +#if defined(RAJA_ENABLE_HIP) + ,RAJA::hip_exec<256> + ,RAJA::hip_exec<512> +#endif + >; + + +int main(int argc, char *argv[]) +{ + + if(argc != 2) { + RAJA_ABORT_OR_THROW("Usage ./cuda-dynamic-forall N, where N is the index of the policy to run"); + } + + // + // Run time policy section is demonstrated in this example by specifying + // kernel exection space as a command line argument + // Example usage ./dynamic_forall policy N + // + + const int pol = std::stoi(argv[1]); + + RAJA::ExecPlace select_cpu_or_gpu; + if(pol < 2) { + select_cpu_or_gpu = RAJA::ExecPlace::HOST; + }else { + select_cpu_or_gpu = RAJA::ExecPlace::DEVICE; + } + + std::cout << "\n\nRAJA vector addition example...\n"; + std::cout << "Using policy # "<(N); + int *b = memoryManager::allocate(N); + int *c = memoryManager::allocate(N); + + for (int i = 0; i < N; ++i) { + a[i] = -i; + b[i] = i; + } + + +//----------------------------------------------------------------------------// + + std::cout << "\n Running C-style vector addition...\n"; + + // _cstyle_vector_add_start + for (int i = 0; i < N; ++i) { + c[i] = a[i] + b[i]; + } + // _cstyle_vector_add_end + + checkResult(c, N); +//printResult(c, N); + + +//----------------------------------------------------------------------------// +// Example of dynamic policy selection for forall +//----------------------------------------------------------------------------// + + RAJA::resources::Host host_res; +#if defined(RAJA_ENABLE_CUDA) + RAJA::resources::Cuda device_res; +#endif +#if defined(RAJA_ENABLE_HIP) + RAJA::resources::Hip device_res; +#endif + + //Get typed erased resource - it will internally store if we are running on the host or device +#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) + RAJA::resources::Resource res = RAJA::Get_Runtime_Resource(host_res, device_res, select_cpu_or_gpu); +#else + RAJA::resources::Resource res = RAJA::Get_Host_Resource(host_res, select_cpu_or_gpu); +#endif + + RAJA::expt::dynamic_forall + (res, pol, RAJA::RangeSegment(0, N), [=] RAJA_HOST_DEVICE (int i) { + + c[i] = a[i] + b[i]; + + }); + + checkResult(c, N); + //printResult(c, N); + + +//----------------------------------------------------------------------------// +// +// Clean up. +// + memoryManager::deallocate(a); + memoryManager::deallocate(b); + memoryManager::deallocate(c); + + std::cout << "\n DONE!...\n"; + + return 0; +} + +// +// Function to check result and report P/F. +// +void checkResult(int* res, int len) +{ + bool correct = true; + for (int i = 0; i < len; i++) { + if ( res[i] != 0 ) { correct = false; } + } + if ( correct ) { + std::cout << "\n\t result -- PASS\n"; + } else { + std::cout << "\n\t result -- FAIL\n"; + } +} + +// +// Function to print result. 
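A stripped-down, host-only sketch of the run-time policy selection this new example introduces; the two-entry policy_list is a hypothetical stand-in (the example also appends cuda_exec/hip_exec entries when a device back-end is enabled), and the fixed pol index replaces the command-line argument.

#include "RAJA/RAJA.hpp"
#include <cstdio>

// Hypothetical host-only list; index 0 selects loop_exec, index 1 simd_exec.
using policy_list = camp::list<RAJA::loop_exec, RAJA::simd_exec>;

int main()
{
  constexpr int N = 16;
  int* a = new int[N];
  int* b = new int[N];
  int* c = new int[N];
  for (int i = 0; i < N; ++i) { a[i] = -i; b[i] = i; }

  const int pol = 0;  // run-time index into policy_list (argv[1] in the example)

  RAJA::resources::Host host_res;
  RAJA::resources::Resource res =
      RAJA::Get_Host_Resource(host_res, RAJA::ExecPlace::HOST);

  // The execution policy is chosen at run time by indexing into policy_list.
  RAJA::expt::dynamic_forall<policy_list>(
      res, pol, RAJA::RangeSegment(0, N), [=] RAJA_HOST_DEVICE (int i) {
        c[i] = a[i] + b[i];
      });

  printf("c[N-1] = %d (expected 0)\n", c[N - 1]);

  delete[] a; delete[] b; delete[] c;
  return 0;
}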
+// +void printResult(int* res, int len) +{ + std::cout << std::endl; + for (int i = 0; i < len; i++) { + std::cout << "result[" << i << "] = " << res[i] << std::endl; + } + std::cout << std::endl; +} diff --git a/examples/resource-forall.cpp b/examples/resource-forall.cpp index 8b15ab4004..09871eb4fb 100644 --- a/examples/resource-forall.cpp +++ b/examples/resource-forall.cpp @@ -127,7 +127,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::cout << "\n Running RAJA omp_parallel_for_exec vector addition...\n"; - RAJA::forall(host, RAJA::RangeSegment(0, N), [=] RAJA_DEVICE (int i) { + RAJA::forall(host, RAJA::RangeSegment(0, N), + [=] (int i) { c[i] = a[i] + b[i]; }); @@ -139,7 +140,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::cout << "\n Running RAJA omp_parallel_for_static_exec (default chunksize) vector addition...\n"; - RAJA::forall>(host, RAJA::RangeSegment(0, N), [=] (int i) { + RAJA::forall>(host, RAJA::RangeSegment(0, N), + [=] (int i) { c[i] = a[i] + b[i]; }); @@ -151,7 +153,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::cout << "\n Running RAJA omp_for_dynamic_exec (chunksize = 16) vector addition...\n"; - RAJA::forall>(host, RAJA::RangeSegment(0, N), [=] (int i) { + RAJA::forall>(host, RAJA::RangeSegment(0, N), + [=] (int i) { c[i] = a[i] + b[i]; }); @@ -165,7 +168,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::cout << "\n Running RAJA tbb_for_dynamic vector addition...\n"; - RAJA::forall(host, RAJA::RangeSegment(0, N), [=] (int i) { + RAJA::forall(host, RAJA::RangeSegment(0, N), + [=] (int i) { c[i] = a[i] + b[i]; }); @@ -177,7 +181,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::cout << "\n Running RAJA tbb_for_static<8> vector addition...\n"; - RAJA::forall>(host, RAJA::RangeSegment(0, N), [=] (int i) { + RAJA::forall>(host, RAJA::RangeSegment(0, N), + [=] (int i) { c[i] = a[i] + b[i]; }); @@ -383,4 +388,3 @@ void printResult(int* res, int len) } std::cout << std::endl; } - diff --git a/examples/resource-teams.cpp b/examples/resource-launch.cpp similarity index 72% rename from examples/resource-teams.cpp rename to examples/resource-launch.cpp index 05b5430a61..81b2c8488b 100644 --- a/examples/resource-teams.cpp +++ b/examples/resource-launch.cpp @@ -14,7 +14,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) { #if defined(RAJA_ENABLE_CUDA) - std::cout << "\n Running RAJA Resource Teams on Multiple Streams...\n"; + std::cout << "\n Running RAJA Resource Launch on Multiple Streams...\n"; constexpr int N = 10; constexpr int M = 1000000; @@ -28,11 +28,11 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::RangeSegment m_range(0, M); RAJA::RangeSegment n_range(0, N); - using launch_policy = RAJA::expt::LaunchPolicy>; + using launch_policy = RAJA::LaunchPolicy>; - using teams_x = RAJA::expt::LoopPolicy; + using teams_x = RAJA::LoopPolicy; - using threads_x = RAJA::expt::LoopPolicy; + using threads_x = RAJA::LoopPolicy; RAJA::forall(def_host_res, n_range, [=, &def_cuda_res](int i){ @@ -40,13 +40,13 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::resources::Cuda res_cuda; RAJA::resources::Event e = - RAJA::expt::launch(res_cuda, - RAJA::expt::Grid(RAJA::expt::Teams(64), - RAJA::expt::Threads(1), "RAJA Teams kernel"), - [=] RAJA_HOST_DEVICE(RAJA::expt::LaunchContext ctx) { + RAJA::launch(res_cuda, + RAJA::LaunchParams(RAJA::Teams(64), + RAJA::Threads(1)), + [=] 
RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { - RAJA::expt::loop(ctx, m_range, [&] (int j) { - RAJA::expt::loop(ctx, one_range, [&] (int k) { + RAJA::loop(ctx, m_range, [&] (int j) { + RAJA::loop(ctx, one_range, [&] (int k) { d_array[i*M + j] = i * M + j; diff --git a/examples/resource-runtime-teams.cpp b/examples/resource-runtime-launch.cpp similarity index 76% rename from examples/resource-runtime-teams.cpp rename to examples/resource-runtime-launch.cpp index 3344510e0c..45a07bb045 100644 --- a/examples/resource-runtime-teams.cpp +++ b/examples/resource-runtime-launch.cpp @@ -29,28 +29,28 @@ * */ -using host_launch = RAJA::expt::seq_launch_t; +using host_launch = RAJA::seq_launch_t; using host_loop = RAJA::loop_exec; #if defined(RAJA_ENABLE_CUDA) -using device_launch = RAJA::expt::cuda_launch_t; -using device_loop = RAJA::expt::cuda_global_thread_x; +using device_launch = RAJA::cuda_launch_t; +using device_loop = RAJA::cuda_global_thread_x; #elif defined(RAJA_ENABLE_HIP) -using device_launch = RAJA::expt::hip_launch_t; -using device_loop = RAJA::expt::hip_global_thread_x; +using device_launch = RAJA::hip_launch_t; +using device_loop = RAJA::hip_global_thread_x; #endif -using launch_policy = RAJA::expt::LaunchPolicy; + >; -using loop_pol = RAJA::expt::LoopPolicy; + >; #if defined(RAJA_ENABLE_CUDA) using reduce_policy = RAJA::cuda_reduce; @@ -78,11 +78,11 @@ int main(int argc, char *argv[]) return 0; } - RAJA::expt::ExecPlace select_cpu_or_gpu; + RAJA::ExecPlace select_cpu_or_gpu; if(exec_space.compare("host") == 0) - { select_cpu_or_gpu = RAJA::expt::HOST; printf("Running RAJA-Teams reductions example on the host \n"); } + { select_cpu_or_gpu = RAJA::ExecPlace::HOST; printf("Running RAJA-Teams reductions example on the host \n"); } if(exec_space.compare("device") == 0) - { select_cpu_or_gpu = RAJA::expt::DEVICE; printf("Running RAJA-Teams reductions example on the device \n"); } + { select_cpu_or_gpu = RAJA::ExecPlace::DEVICE; printf("Running RAJA-Teams reductions example on the device \n"); } // _reductions_array_init_start // @@ -153,19 +153,18 @@ int main(int argc, char *argv[]) #endif //Get typed erased resource - it will internally store if we are running on the host or device -#if defined(RAJA_DEVICE_ACTIVE) - RAJA::resources::Resource res = RAJA::expt::Get_Runtime_Resource(host_res, device_res, select_cpu_or_gpu); +#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) + RAJA::resources::Resource res = RAJA::Get_Runtime_Resource(host_res, device_res, select_cpu_or_gpu); #else - RAJA::resources::Resource res = RAJA::expt::Get_Host_Resource(host_res, select_cpu_or_gpu); + RAJA::resources::Resource res = RAJA::Get_Host_Resource(host_res, select_cpu_or_gpu); #endif //How the kernel executes now depends on how the resource is constructed (host or device) - RAJA::expt::launch - (res, RAJA::expt::Grid(RAJA::expt::Teams(GRID_SZ), - RAJA::expt::Threads(TEAM_SZ), - "Reduction Kernel"), - [=] RAJA_HOST_DEVICE(RAJA::expt::LaunchContext ctx) { - RAJA::expt::loop(ctx, arange, [&] (int i) { + RAJA::launch + (res, RAJA::LaunchParams(RAJA::Teams(GRID_SZ), + RAJA::Threads(TEAM_SZ)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + RAJA::loop(ctx, arange, [&] (int i) { kernel_sum += a[i]; diff --git a/examples/tut_add-vectors.cpp b/examples/tut_add-vectors.cpp deleted file mode 100644 index 9d77468276..0000000000 --- a/examples/tut_add-vectors.cpp +++ /dev/null @@ -1,284 +0,0 @@ -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2016-22, Lawrence 
Livermore National Security, LLC -// and RAJA project contributors. See the RAJA/LICENSE file for details. -// -// SPDX-License-Identifier: (BSD-3-Clause) -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - -#include -#include -#include - -#include "memoryManager.hpp" - -#include "RAJA/RAJA.hpp" - -/* - * Vector Addition Example - * - * Computes c = a + b, where a, b, c are vectors of ints. - * It illustrates similarities between a C-style for-loop and a RAJA - * forall loop. - * - * RAJA features shown: - * - `forall` loop iteration template method - * - Index range segment - * - Execution policies - * - * If CUDA is enabled, CUDA unified memory is used. - */ - -/* - CUDA_BLOCK_SIZE - specifies the number of threads in a CUDA thread block -*/ -#if defined(RAJA_ENABLE_CUDA) -const int CUDA_BLOCK_SIZE = 256; -#endif - -#if defined(RAJA_ENABLE_HIP) -const int HIP_BLOCK_SIZE = 256; -#endif - -#if defined(RAJA_ENABLE_SYCL) -const int SYCL_BLOCK_SIZE = 256; -#endif - -// -// Functions for checking and printing results -// -void checkResult(int* res, int len); -void printResult(int* res, int len); - - -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) -{ - - std::cout << "\n\nRAJA vector addition example...\n"; - -#if defined(RAJA_ENABLE_SYCL) - memoryManager::sycl_res = new camp::resources::Resource{camp::resources::Sycl()}; - ::RAJA::sycl::detail::setQueue(memoryManager::sycl_res); -#endif - -// -// Define vector length -// - const int N = 1000000; - -// -// Allocate and initialize vector data -// - int *a = memoryManager::allocate(N); - int *b = memoryManager::allocate(N); - int *c = memoryManager::allocate(N); - - for (int i = 0; i < N; ++i) { - a[i] = -i; - b[i] = i; - } - - -//----------------------------------------------------------------------------// - - std::cout << "\n Running C-style vector addition...\n"; - - // _cstyle_vector_add_start - for (int i = 0; i < N; ++i) { - c[i] = a[i] + b[i]; - } - // _cstyle_vector_add_end - - checkResult(c, N); -//printResult(c, N); - - -//----------------------------------------------------------------------------// -// RAJA::seq_exec policy enforces strictly sequential execution.... -//----------------------------------------------------------------------------// - - std::cout << "\n Running RAJA sequential vector addition...\n"; - - // _rajaseq_vector_add_start - RAJA::forall(RAJA::RangeSegment(0, N), [=] (int i) { - c[i] = a[i] + b[i]; - }); - // _rajaseq_vector_add_end - - checkResult(c, N); -//printResult(c, N); - - -//----------------------------------------------------------------------------// -// RAJA::simd_exec policy should force the compiler to generate SIMD -// vectorization optimizations.... -//----------------------------------------------------------------------------// - - std::cout << "\n Running RAJA SIMD vector addition...\n"; - - RAJA::forall(RAJA::RangeSegment(0, N), [=] (int i) { - c[i] = a[i] + b[i]; - }); - - checkResult(c, N); -//printResult(c, N); - - -//----------------------------------------------------------------------------// -// RAJA::loop_exec policy means that the compiler is allowed to generate -// optimizations (e.g., SIMD) if it thinks it is safe to do so... 
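The policy distinction spelled out in the removed tutorial's comments, condensed into a standalone sketch; the vector length and pointer names are illustrative.

#include "RAJA/RAJA.hpp"
#include <cstdio>

int main()
{
  constexpr int N = 8;
  int* a = new int[N];
  int* b = new int[N];
  int* c = new int[N];
  for (int i = 0; i < N; ++i) { a[i] = -i; b[i] = i; }

  // seq_exec: strictly sequential execution.
  RAJA::forall<RAJA::seq_exec>(RAJA::RangeSegment(0, N), [=] (int i) {
    c[i] = a[i] + b[i];
  });

  // simd_exec: asks the compiler to vectorize the loop.
  RAJA::forall<RAJA::simd_exec>(RAJA::RangeSegment(0, N), [=] (int i) {
    c[i] = a[i] + b[i];
  });

  // loop_exec: the compiler may apply optimizations (e.g. SIMD) when it can
  // prove they are safe.
  RAJA::forall<RAJA::loop_exec>(RAJA::RangeSegment(0, N), [=] (int i) {
    c[i] = a[i] + b[i];
  });

  printf("c[N-1] = %d (expected 0)\n", c[N - 1]);
  delete[] a; delete[] b; delete[] c;
  return 0;
}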
-//----------------------------------------------------------------------------// - - std::cout << "\n Running RAJA loop-exec vector addition...\n"; - - RAJA::forall(RAJA::RangeSegment(0, N), [=] (int i) { - c[i] = a[i] + b[i]; - }); - - checkResult(c, N); -//printResult(c, N); - - -//----------------------------------------------------------------------------// - -#if defined(RAJA_ENABLE_OPENMP) - std::cout << "\n Running RAJA OpenMP vector addition...\n"; - - // _rajaomp_vector_add_start - RAJA::forall(RAJA::RangeSegment(0, N), [=] (int i) { - c[i] = a[i] + b[i]; - }); - // _rajaomp_vector_add_end - - checkResult(c, N); -//printResult(c, N); -#endif - - -//----------------------------------------------------------------------------// - -#if defined(RAJA_ENABLE_CUDA) - std::cout << "\n Running RAJA CUDA vector addition...\n"; - - // _rajacuda_vector_add_start - RAJA::forall>(RAJA::RangeSegment(0, N), - [=] RAJA_DEVICE (int i) { - c[i] = a[i] + b[i]; - }); - // _rajacuda_vector_add_end - - checkResult(c, N); -//printResult(c, N); - - const bool Asynchronous = false; - std::cout << "\n Running RAJA CUDA explicit (2 blocks per SM) vector addition...\n"; - - // _rajacuda_explicit_vector_add_start - RAJA::forall>(RAJA::RangeSegment(0, N), - [=] RAJA_DEVICE (int i) { - c[i] = a[i] + b[i]; - }); - // _rajacuda_explicit_vector_add_end - - checkResult(c, N); -//printResult(c, N); -#endif - -//----------------------------------------------------------------------------// - -#if defined(RAJA_ENABLE_HIP) - std::cout << "\n Running RAJA HIP vector addition...\n"; - - int *d_a = memoryManager::allocate_gpu(N); - int *d_b = memoryManager::allocate_gpu(N); - int *d_c = memoryManager::allocate_gpu(N); - - hipErrchk(hipMemcpy( d_a, a, N * sizeof(int), hipMemcpyHostToDevice )); - hipErrchk(hipMemcpy( d_b, b, N * sizeof(int), hipMemcpyHostToDevice )); - - // _rajahip_vector_add_start - RAJA::forall>(RAJA::RangeSegment(0, N), - [=] RAJA_DEVICE (int i) { - d_c[i] = d_a[i] + d_b[i]; - }); - // _rajahip_vector_add_end - - hipErrchk(hipMemcpy( c, d_c, N * sizeof(int), hipMemcpyDeviceToHost )); - - checkResult(c, N); -//printResult(c, N); - - memoryManager::deallocate_gpu(d_a); - memoryManager::deallocate_gpu(d_b); - memoryManager::deallocate_gpu(d_c); -#endif - -//----------------------------------------------------------------------------// - -#if defined(RAJA_ENABLE_SYCL) - std::cout << "\n Running RAJA SYCL vector addition...\n"; - - int *d_a = memoryManager::allocate_gpu(N); - int *d_b = memoryManager::allocate_gpu(N); - int *d_c = memoryManager::allocate_gpu(N); - - memoryManager::sycl_res->memcpy(d_a, a, N * sizeof(int)); - memoryManager::sycl_res->memcpy(d_b, b, N * sizeof(int)); - - // _rajasycl_vector_add_start - RAJA::forall>(RAJA::RangeSegment(0, N), - [=] RAJA_DEVICE (int i) { - d_c[i] = d_a[i] + d_b[i]; - }); - // _rajasycl_vector_add_end - - memoryManager::sycl_res->memcpy(c, d_c, N * sizeof(int)); - - checkResult(c, N); -//printResult(c, N); - - memoryManager::deallocate_gpu(d_a); - memoryManager::deallocate_gpu(d_b); - memoryManager::deallocate_gpu(d_c); -#endif - -//----------------------------------------------------------------------------// -// -// Clean up. -// - memoryManager::deallocate(a); - memoryManager::deallocate(b); - memoryManager::deallocate(c); - - std::cout << "\n DONE!...\n"; - - return 0; -} - -// -// Function to check result and report P/F. 
-// -void checkResult(int* res, int len) -{ - bool correct = true; - for (int i = 0; i < len; i++) { - if ( res[i] != 0 ) { correct = false; } - } - if ( correct ) { - std::cout << "\n\t result -- PASS\n"; - } else { - std::cout << "\n\t result -- FAIL\n"; - } -} - -// -// Function to print result. -// -void printResult(int* res, int len) -{ - std::cout << std::endl; - for (int i = 0; i < len; i++) { - std::cout << "result[" << i << "] = " << res[i] << std::endl; - } - std::cout << std::endl; -} - diff --git a/examples/tut_atomic-histogram.cpp b/examples/tut_atomic-histogram.cpp deleted file mode 100644 index fad8269982..0000000000 --- a/examples/tut_atomic-histogram.cpp +++ /dev/null @@ -1,230 +0,0 @@ -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2016-22, Lawrence Livermore National Security, LLC -// and RAJA project contributors. See the RAJA/LICENSE file for details. -// -// SPDX-License-Identifier: (BSD-3-Clause) -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - -#include -#include -#include -#include - -#include "memoryManager.hpp" - -#include "RAJA/RAJA.hpp" - -/* - * Atomic Histogram Example - * - * Given an array of length N containing integers ranging from [0, M), - * this example uses RAJA atomics to count the number of instances a - * number between 0 and M appear. - * - * RAJA features shown: - * - `forall` loop iteration template method - * - Atomic add - * - * If CUDA is enabled, CUDA unified memory is used. - */ - -/* - CUDA_BLOCK_SIZE - specifies the number of threads in a CUDA thread block -*/ -#if defined(RAJA_ENABLE_CUDA) -const int CUDA_BLOCK_SIZE = 256; -#endif - -#if defined(RAJA_ENABLE_HIP) -const int HIP_BLOCK_SIZE = 256; -#endif - -template -void printBins(T* bins, int M); - -int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) -{ - // - // Define the inital array containing values between 0 and M and - // create the iteration bounds - // - int M = 10; - int N = 30; - // _range_atomic_histogram_start - RAJA::TypedRangeSegment array_range(0, N); - // _range_atomic_histogram_end - - int* array = memoryManager::allocate(N); - int* bins = memoryManager::allocate(M); - - RAJA::forall(array_range, [=](int i) { - - array[i] = rand() % M; - - }); - //----------------------------------------------------------------------------// - - std::cout << "\n\n Running RAJA sequential binning" << std::endl; - std::memset(bins, 0, M * sizeof(int)); - - RAJA::forall(array_range, [=](int i) { - - RAJA::atomicAdd(&bins[array[i]], 1); - - }); - - printBins(bins, M); - -//----------------------------------------------------------------------------// - -#if defined(RAJA_ENABLE_OPENMP) - - std::cout << "\n\n Running RAJA OMP binning" << std::endl; - std::memset(bins, 0, M * sizeof(int)); - - // _rajaomp_atomic_histogram_start - RAJA::forall(array_range, [=](int i) { - - RAJA::atomicAdd(&bins[array[i]], 1); - - }); - // _rajaomp_atomic_histogram_end - - printBins(bins, M); - -//----------------------------------------------------------------------------// - - std::cout << "\n\n Running RAJA OMP binning with auto atomic" << std::endl; - std::memset(bins, 0, M * sizeof(int)); - - RAJA::forall(array_range, [=](int i) { - - RAJA::atomicAdd(&bins[array[i]], 1); - - }); - - printBins(bins, M); - -#endif - -//----------------------------------------------------------------------------// - -#if defined(RAJA_ENABLE_CUDA) - - std::cout << "\n\nRunning RAJA CUDA binning" << std::endl; - std::memset(bins, 
0, M * sizeof(int)); - - // _rajacuda_atomic_histogram_start - RAJA::forall< RAJA::cuda_exec >(array_range, - [=] RAJA_DEVICE(int i) { - - RAJA::atomicAdd(&bins[array[i]], 1); - - }); - // _rajacuda_atomic_histogram_end - - printBins(bins, M); - -//----------------------------------------------------------------------------// - - std::cout << "\n\nRunning RAJA CUDA binning with auto atomic" << std::endl; - std::memset(bins, 0, M * sizeof(int)); - - // _rajacuda_atomicauto_histogram_start - RAJA::forall< RAJA::cuda_exec >(array_range, - [=] RAJA_DEVICE(int i) { - - RAJA::atomicAdd(&bins[array[i]], 1); - - }); - // _rajacuda_atomicauto_histogram_end - - printBins(bins, M); - -#endif - -//----------------------------------------------------------------------------// - -#if defined(RAJA_ENABLE_HIP) - - std::cout << "\n\nRunning RAJA HIP binning" << std::endl; - std::memset(bins, 0, M * sizeof(int)); - - int* d_array = memoryManager::allocate_gpu(N); - int* d_bins = memoryManager::allocate_gpu(M); - hipErrchk(hipMemcpy( d_array, array, N * sizeof(int), hipMemcpyHostToDevice )); - hipErrchk(hipMemcpy( d_bins, bins, M * sizeof(int), hipMemcpyHostToDevice )); - - // _rajahip_atomic_histogram_start - RAJA::forall< RAJA::hip_exec >(array_range, - [=] RAJA_DEVICE(int i) { - - RAJA::atomicAdd(&d_bins[d_array[i]], 1); - - }); - // _rajahip_atomic_histogram_end - - hipErrchk(hipMemcpy( bins, d_bins, M * sizeof(int), hipMemcpyDeviceToHost )); - - printBins(bins, M); - -//----------------------------------------------------------------------------// - - std::cout << "\n\nRunning RAJA HIP binning with auto atomic" << std::endl; - std::memset(bins, 0, M * sizeof(int)); - hipErrchk(hipMemcpy( d_bins, bins, M * sizeof(int), hipMemcpyHostToDevice )); - - // _rajahip_atomicauto_histogram_start - RAJA::forall< RAJA::hip_exec >(array_range, - [=] RAJA_DEVICE(int i) { - - RAJA::atomicAdd(&d_bins[d_array[i]], 1); - - }); - // _rajahip_atomicauto_histogram_end - - hipErrchk(hipMemcpy( bins, d_bins, M * sizeof(int), hipMemcpyDeviceToHost )); - - printBins(bins, M); - - memoryManager::deallocate_gpu(d_array); - memoryManager::deallocate_gpu(d_bins); -#endif - -//----------------------------------------------------------------------------// - - - // - // Clean up dellacate data - // - memoryManager::deallocate(array); - memoryManager::deallocate(bins); - - std::cout << "\n DONE!...\n"; - - return 0; -} - -template -void printBins(T* bins, int M) -{ - - std::cout << "Number of instances |"; - for (int i = 0; i < M; ++i) { - std::cout << bins[i] << " "; - } - std::cout << "" << std::endl; - - std::cout << "---------------------------"; - for (int i = 0; i < M; ++i) { - std::cout << "-" - << ""; - } - std::cout << "" << std::endl; - - std::cout << "Index id |"; - for (int i = 0; i < M; ++i) { - std::cout << i << " "; - } - std::cout << "\n" << std::endl; -} diff --git a/examples/tut_batched-matrix-multiply.cpp b/examples/tut_batched-matrix-multiply.cpp deleted file mode 100644 index 434263ff99..0000000000 --- a/examples/tut_batched-matrix-multiply.cpp +++ /dev/null @@ -1,689 +0,0 @@ -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2016-22, Lawrence Livermore National Security, LLC -// and RAJA project contributors. See the RAJA/LICENSE file for details. 
-// -// SPDX-License-Identifier: (BSD-3-Clause) -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - -#include -#include -#include -#include - -#include "RAJA/RAJA.hpp" -#include "RAJA/util/Timer.hpp" - - -#include "memoryManager.hpp" - -/* - * Batched Matrix Multiply Example - * - * This example performs batched matrix multiplication - * for matrices of dimension 3 x 3 using two different - * data layouts. - * - * Matrices are stored in arrays A and B. Results - * are stored in a third array, C. - * We introduce the notation A^{e}_rc - * to correspond to the matrix entry in the row, r, - * column, c, of matrix, e. Below we describe the two - * layouts for the case of two (N=2) 3 x 3 matrices. - * - * Layout 1: - * Matrix entries are grouped together so that each - * matrix is in a row major ordering. - * i.e. A = [A^{0}_{00}, A^{0}_{01}, A^{0}_{02}, - * A^{0}_{10}, A^{0}_{11}, A^{0}_{12}, - * A^{0}_{20}, A^{0}_{21}, A^{0}_{22}, - * A^{1}_{00}, A^{1}_{01}, A^{1}_{02}, - * A^{1}_{10}, A^{1}_{11}, A^{1}_{12}, - * A^{1}_{20}, A^{1}_{21}, A^{1}_{22}]; - * - * Layout 2: - * Matrix entries are first ordered by matrix number, - * then by column number, and finally by row number. - * i.e. A = [A^{0}_{00}, A^{1}_{00}, A^{0}_{01}, - * A^{1}_{01}, A^{0}_{02}, A^{1}_{02}, - * A^{0}_{10}, A^{1}_{10}, A^{0}_{11}, - * A^{1}_{11}, A^{0}_{12}, A^{1}_{12}, - * A^{0}_{20}, A^{1}_{20}, A^{0}_{21}, - * A^{1}_{21}, A^{0}_{22}, A^{1}_{22}]; - * - * The extension to N > 2 matrices follows by direct - * extension. By exploring different data layouts, - * we can assess which performs best under a given - * execution policy and architecture. - * - * RAJA features shown: - * - `forall` loop iteration template method - * - RAJA View - * - RAJA make_permuted_layout - * - * If CUDA is enabled, CUDA unified memory is used. - */ - -/* - CUDA_BLOCK_SIZE - specifies the number of threads in a CUDA thread block -*/ -#if defined(RAJA_ENABLE_CUDA) -const int CUDA_BLOCK_SIZE = 256; -#endif - -#if defined(RAJA_ENABLE_HIP) -const int HIP_BLOCK_SIZE = 256; -#endif - -// -// By default a RAJA::Index_type -// is a long int -// -using RAJA::Index_type; - -// -//Function for checking results -// -template -void checkResult(T C, Index_type noMat, int nRows, int nCols); - -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) -{ - - std::cout << "\n\nRAJA batched matrix multiplication example...\n"; - -// Dimensions of matrices - const int N_c = 3; - const int N_r = 3; - -// Number of matrices - const Index_type N = 8000000; - -// Number of iterations - const int NITER = 20; - - std::cout << "\n Number of matrices to be multiplied: " << N << " \n \n"; - -// -// Initialize a RAJA timer object -// and variable to store minimum run time -// - auto timer = RAJA::Timer(); - double minRun; - -// -// Allocate space for data in layout 1 -// - double *A = memoryManager::allocate(N_c * N_r * N); - double *B = memoryManager::allocate(N_c * N_r * N); - double *C = memoryManager::allocate(N_c * N_r * N); - -// -// Layout 1 -// -// make_permuted_layout takes the number of entries in each dimension and a -// templated array indicating index arguments with slowest to fastest stride. -// Standard C++ arrays are used to hold the number of entries in each component. -// This example uses double braces to initalize the array and its subobjects. -// The layout object will index into the array as the following C macro would -// #define Aview(e, r, c) A[c + N_c*(r + N_r*e)]. 
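The two data layouts described above are easier to follow with concrete offsets. A small sketch, assuming RAJA::make_permuted_layout and RAJA::Layout behave as in the removed example; the two-matrix sizes and the probed (e, r, c) triple are illustrative.

#include "RAJA/RAJA.hpp"
#include <array>
#include <cstdio>

int main()
{
  // Two 3x3 matrices stored in a single array.
  constexpr int N = 2, N_r = 3, N_c = 3;

  // Layout 1: dimensions {e, r, c} ordered slowest to fastest,
  // equivalent to A[c + N_c*(r + N_r*e)]; the column index has unit stride.
  std::array<RAJA::idx_t, 3> perm1 {{0, 1, 2}};
  auto layout1 = RAJA::make_permuted_layout({{N, N_r, N_c}}, perm1);

  // Layout 2: dimensions {r, c, e} ordered slowest to fastest,
  // equivalent to A[e + N*(c + N_c*r)]; the matrix index has unit stride.
  std::array<RAJA::idx_t, 3> perm2 {{1, 2, 0}};
  auto layout2 = RAJA::make_permuted_layout({{N, N_r, N_c}}, perm2);

  // The same (e, r, c) triple maps to different linear offsets.
  printf("layout1(1,2,0) = %ld\n", (long) layout1(1, 2, 0));  // 3*(2 + 3*1) + 0 = 15
  printf("layout2(1,2,0) = %ld\n", (long) layout2(1, 2, 0));  // 1 + 2*(0 + 3*2) = 13

  // Views bind an array to a layout; the last Layout argument names the
  // unit-stride dimension (2 for layout 1, 0 for layout 2).
  double A[N * N_r * N_c];
  RAJA::View<double, RAJA::Layout<3, RAJA::Index_type, 2>> Aview1(A, layout1);
  RAJA::View<double, RAJA::Layout<3, RAJA::Index_type, 0>> Aview2(A, layout2);
  Aview1(1, 2, 0) = 1.0;  // writes A[15]
  Aview2(1, 2, 0) = 2.0;  // writes A[13]

  return 0;
}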
-// -// RAJA::Layout objects may be templated on dimension, argument type, and -// index with unit stride. Here, the column index has unit stride (argument 2). -// - // _permutedlayout_defviews_start - std::array perm1 {{0, 1, 2}}; - auto layout1 = - RAJA::make_permuted_layout( {{N, N_r, N_c}}, perm1 ); - - RAJA::View> Aview(A, layout1); - RAJA::View> Bview(B, layout1); - RAJA::View> Cview(C, layout1); - // _permutedlayout_defviews_end - -// -// Allocate space for data in layout 2 -// - double *A2 = memoryManager::allocate(N_c * N_r * N); - double *B2 = memoryManager::allocate(N_c * N_r * N); - double *C2 = memoryManager::allocate(N_c * N_r * N); - -// -// Permuted layout - equivalent to indexing using the following macro -// #define Aview2(e, r, c) A2[e + N*(c + N_c*r)] -// In this case the element index has unit stride (argument 0). -// - // _permutedlayout_permviews_start - std::array perm2 {{1, 2, 0}}; - auto layout2 = - RAJA::make_permuted_layout( {{N, N_r, N_c}}, perm2 ); - - RAJA::View> Aview2(A2, layout2); - RAJA::View> Bview2(B2, layout2); - RAJA::View> Cview2(C2, layout2); - // _permutedlayout_permviews_end - -// -// Initialize data -// -#if defined(RAJA_ENABLE_OPENMP) - using INIT_POL = RAJA::omp_parallel_for_exec; -#else - using INIT_POL = RAJA::loop_exec; -#endif - - RAJA::forall(RAJA::RangeSegment(0, N), [=](Index_type e) { - for (Index_type row = 0; row < N_r; ++row) { - for (Index_type col = 0; col < N_c; ++col) { - Aview(e, row, col) = row; - Bview(e, row, col) = col; - Cview(e, row, col) = 0; - - Aview2(e, row, col) = row; - Bview2(e, row, col) = col; - Cview2(e, row, col) = 0; - } - } - }); - -//----------------------------------------------------------------------------// - -#if defined(RAJA_ENABLE_OPENMP) - - std::cout << " \n Performing batched matrix multiplication" - << " with layout 1 (RAJA - omp parallel for) ... 
" << std::endl; - - minRun = std::numeric_limits::max(); - for (int i = 0; i < NITER; ++i) { - - timer.start(); - // _permutedlayout_batchedmatmult_omp_start - RAJA::forall( - RAJA::RangeSegment(0, N), [=](Index_type e) { - - Cview(e, 0, 0) = Aview(e, 0, 0) * Bview(e, 0, 0) - + Aview(e, 0, 1) * Bview(e, 1, 0) - + Aview(e, 0, 2) * Bview(e, 2, 0); - Cview(e, 0, 1) = Aview(e, 0, 0) * Bview(e, 0, 1) - + Aview(e, 0, 1) * Bview(e, 1, 1) - + Aview(e, 0, 2) * Bview(e, 2, 1); - Cview(e, 0, 2) = Aview(e, 0, 0) * Bview(e, 0, 2) - + Aview(e, 0, 1) * Bview(e, 1, 2) - + Aview(e, 0, 2) * Bview(e, 2, 2); - - Cview(e, 1, 0) = Aview(e, 1, 0) * Bview(e, 0, 0) - + Aview(e, 1, 1) * Bview(e, 1, 0) - + Aview(e, 1, 2) * Bview(e, 2, 0); - Cview(e, 1, 1) = Aview(e, 1, 0) * Bview(e, 0, 1) - + Aview(e, 1, 1) * Bview(e, 1, 1) - + Aview(e, 1, 2) * Bview(e, 2, 1); - Cview(e, 1, 2) = Aview(e, 1, 0) * Bview(e, 0, 2) - + Aview(e, 1, 1) * Bview(e, 1, 2) - + Aview(e, 1, 2) * Bview(e, 2, 2); - - Cview(e, 2, 0) = Aview(e, 2, 0) * Bview(e, 0, 0) - + Aview(e, 2, 1) * Bview(e, 1, 0) - + Aview(e, 2, 2) * Bview(e, 2, 0); - Cview(e, 2, 1) = Aview(e, 2, 0) * Bview(e, 0, 1) - + Aview(e, 2, 1) * Bview(e, 1, 1) - + Aview(e, 2, 2) * Bview(e, 2, 1); - Cview(e, 2, 2) = Aview(e, 2, 0) * Bview(e, 0, 2) - + Aview(e, 2, 1) * Bview(e, 1, 2) - + Aview(e, 2, 2) * Bview(e, 2, 2); - - }); - // _permutedlayout_batchedmatmult_omp_end - timer.stop(); - - RAJA::Timer::ElapsedType tMin = timer.elapsed(); - if (tMin < minRun) minRun = tMin; - timer.reset(); - } - - std::cout<< "\trun time : " << minRun << " seconds" << std::endl; - checkResult(Cview, N, N_r, N_c); - -//----------------------------------------------------------------------------// - - std::cout << " \n Performing batched matrix multiplication" - << " with layout 2 (RAJA - omp parallel for) ... 
" << std::endl; - - minRun = std::numeric_limits::max(); - for (int i = 0; i < NITER; ++i) { - - timer.start(); - RAJA::forall( - RAJA::RangeSegment(0, N), [=](Index_type e) { - - Cview2(e, 0, 0) = Aview2(e, 0, 0) * Bview2(e, 0, 0) - + Aview2(e, 0, 1) * Bview2(e, 1, 0) - + Aview2(e, 0, 2) * Bview2(e, 2, 0); - Cview2(e, 0, 1) = Aview2(e, 0, 0) * Bview2(e, 0, 1) - + Aview2(e, 0, 1) * Bview2(e, 1, 1) - + Aview2(e, 0, 2) * Bview2(e, 2, 1); - Cview2(e, 0, 2) = Aview2(e, 0, 0) * Bview2(e, 0, 2) - + Aview2(e, 0, 1) * Bview2(e, 1, 2) - + Aview2(e, 0, 2) * Bview2(e, 2, 2); - - Cview2(e, 1, 0) = Aview2(e, 1, 0) * Bview2(e, 0, 0) - + Aview2(e, 1, 1) * Bview2(e, 1, 0) - + Aview2(e, 1, 2) * Bview2(e, 2, 0); - Cview2(e, 1, 1) = Aview2(e, 1, 0) * Bview2(e, 0, 1) - + Aview2(e, 1, 1) * Bview2(e, 1, 1) - + Aview2(e, 1, 2) * Bview2(e, 2, 1); - Cview2(e, 1, 2) = Aview2(e, 1, 0) * Bview2(e, 0, 2) - + Aview2(e, 1, 1) * Bview2(e, 1, 2) - + Aview2(e, 1, 2) * Bview2(e, 2, 2); - - Cview2(e, 2, 0) = Aview2(e, 2, 0) * Bview2(e, 0, 0) - + Aview2(e, 2, 1) * Bview2(e, 1, 0) - + Aview2(e, 2, 2) * Bview2(e, 2, 0); - Cview2(e, 2, 1) = Aview2(e, 2, 0) * Bview2(e, 0, 1) - + Aview2(e, 2, 1) * Bview2(e, 1, 1) - + Aview2(e, 2, 2) * Bview2(e, 2, 1); - Cview2(e, 2, 2) = Aview2(e, 2, 0) * Bview2(e, 0, 2) - + Aview2(e, 2, 1) * Bview2(e, 1, 2) - + Aview2(e, 2, 2) * Bview2(e, 2, 2); - - }); - timer.stop(); - - RAJA::Timer::ElapsedType tMin = timer.elapsed(); - if (tMin < minRun) minRun = tMin; - timer.reset(); - } - std::cout<< "\trun time : " << minRun << " seconds" << std::endl; - checkResult(Cview2, N, N_r, N_c); - -#endif - -//----------------------------------------------------------------------------// - - std::cout << " \n Performing batched matrix multiplication" - << " with layout 1 (RAJA - sequential) ... " << std::endl; - - minRun = std::numeric_limits::max(); - for (int i = 0; i < NITER; ++i) { - - timer.start(); - RAJA::forall(RAJA::RangeSegment(0, N), [=](Index_type e) { - - Cview(e, 0, 0) = Aview(e, 0, 0) * Bview(e, 0, 0) - + Aview(e, 0, 1) * Bview(e, 1, 0) - + Aview(e, 0, 2) * Bview(e, 2, 0); - Cview(e, 0, 1) = Aview(e, 0, 0) * Bview(e, 0, 1) - + Aview(e, 0, 1) * Bview(e, 1, 1) - + Aview(e, 0, 2) * Bview(e, 2, 1); - Cview(e, 0, 2) = Aview(e, 0, 0) * Bview(e, 0, 2) - + Aview(e, 0, 1) * Bview(e, 1, 2) - + Aview(e, 0, 2) * Bview(e, 2, 2); - - Cview(e, 1, 0) = Aview(e, 1, 0) * Bview(e, 0, 0) - + Aview(e, 1, 1) * Bview(e, 1, 0) - + Aview(e, 1, 2) * Bview(e, 2, 0); - Cview(e, 1, 1) = Aview(e, 1, 0) * Bview(e, 0, 1) - + Aview(e, 1, 1) * Bview(e, 1, 1) - + Aview(e, 1, 2) * Bview(e, 2, 1); - Cview(e, 1, 2) = Aview(e, 1, 0) * Bview(e, 0, 2) - + Aview(e, 1, 1) * Bview(e, 1, 2) - + Aview(e, 1, 2) * Bview(e, 2, 2); - - Cview(e, 2, 0) = Aview(e, 2, 0) * Bview(e, 0, 0) - + Aview(e, 2, 1) * Bview(e, 1, 0) - + Aview(e, 2, 2) * Bview(e, 2, 0); - Cview(e, 2, 1) = Aview(e, 2, 0) * Bview(e, 0, 1) - + Aview(e, 2, 1) * Bview(e, 1, 1) - + Aview(e, 2, 2) * Bview(e, 2, 1); - Cview(e, 2, 2) = Aview(e, 2, 0) * Bview(e, 0, 2) - + Aview(e, 2, 1) * Bview(e, 1, 2) - + Aview(e, 2, 2) * Bview(e, 2, 2); - }); - timer.stop(); - - RAJA::Timer::ElapsedType tMin = timer.elapsed(); - if (tMin < minRun) minRun = tMin; - timer.reset(); - } - - std::cout << "\trun time : " << minRun << " seconds" << std::endl; - checkResult(Cview, N, N_r, N_c); - -//----------------------------------------------------------------------------// - - std::cout << " \n Performing batched matrix multiplication" - << " with layout 2 (RAJA - sequential) ... 
" << std::endl; - - minRun = std::numeric_limits::max(); - for (int i = 0; i < NITER; ++i) { - - timer.start(); - RAJA::forall(RAJA::RangeSegment(0, N), [=](Index_type e) { - - Cview2(e, 0, 0) = Aview2(e, 0, 0) * Bview2(e, 0, 0) - + Aview2(e, 0, 1) * Bview2(e, 1, 0) - + Aview2(e, 0, 2) * Bview2(e, 2, 0); - Cview2(e, 0, 1) = Aview2(e, 0, 0) * Bview2(e, 0, 1) - + Aview2(e, 0, 1) * Bview2(e, 1, 1) - + Aview2(e, 0, 2) * Bview2(e, 2, 1); - Cview2(e, 0, 2) = Aview2(e, 0, 0) * Bview2(e, 0, 2) - + Aview2(e, 0, 1) * Bview2(e, 1, 2) - + Aview2(e, 0, 2) * Bview2(e, 2, 2); - - Cview2(e, 1, 0) = Aview2(e, 1, 0) * Bview2(e, 0, 0) - + Aview2(e, 1, 1) * Bview2(e, 1, 0) - + Aview2(e, 1, 2) * Bview2(e, 2, 0); - Cview2(e, 1, 1) = Aview2(e, 1, 0) * Bview2(e, 0, 1) - + Aview2(e, 1, 1) * Bview2(e, 1, 1) - + Aview2(e, 1, 2) * Bview2(e, 2, 1); - Cview2(e, 1, 2) = Aview2(e, 1, 0) * Bview2(e, 0, 2) - + Aview2(e, 1, 1) * Bview2(e, 1, 2) - + Aview2(e, 1, 2) * Bview2(e, 2, 2); - - Cview2(e, 2, 0) = Aview2(e, 2, 0) * Bview2(e, 0, 0) - + Aview2(e, 2, 1) * Bview2(e, 1, 0) - + Aview2(e, 2, 2) * Bview2(e, 2, 0); - Cview2(e, 2, 1) = Aview2(e, 2, 0) * Bview2(e, 0, 1) - + Aview2(e, 2, 1) * Bview2(e, 1, 1) - + Aview2(e, 2, 2) * Bview2(e, 2, 1); - Cview2(e, 2, 2) = Aview2(e, 2, 0) * Bview2(e, 0, 2) - + Aview2(e, 2, 1) * Bview2(e, 1, 2) - + Aview2(e, 2, 2) * Bview2(e, 2, 2); - - }); - timer.stop(); - - RAJA::Timer::ElapsedType tMin = timer.elapsed(); - if (tMin < minRun) minRun = tMin; - timer.reset(); - } - std::cout<< "\trun time : "<< minRun << " seconds" << std::endl; - checkResult(Cview2, N, N_r, N_c); - -//----------------------------------------------------------------------------// - -#if defined(RAJA_ENABLE_CUDA) - - std::cout << " \n Performing batched matrix multiplication" - << " with layout 1 (RAJA - cuda) ... " << std::endl; - - minRun = std::numeric_limits::max(); - for (int i = 0; i < NITER; ++i) { - - timer.start(); - RAJA::forall>( - RAJA::RangeSegment(0, N), [=] RAJA_DEVICE(Index_type e) { - - Cview(e, 0, 0) = Aview(e, 0, 0) * Bview(e, 0, 0) - + Aview(e, 0, 1) * Bview(e, 1, 0) - + Aview(e, 0, 2) * Bview(e, 2, 0); - Cview(e, 0, 1) = Aview(e, 0, 0) * Bview(e, 0, 1) - + Aview(e, 0, 1) * Bview(e, 1, 1) - + Aview(e, 0, 2) * Bview(e, 2, 1); - Cview(e, 0, 2) = Aview(e, 0, 0) * Bview(e, 0, 2) - + Aview(e, 0, 1) * Bview(e, 1, 2) - + Aview(e, 0, 2) * Bview(e, 2, 2); - - Cview(e, 1, 0) = Aview(e, 1, 0) * Bview(e, 0, 0) - + Aview(e, 1, 1) * Bview(e, 1, 0) - + Aview(e, 1, 2) * Bview(e, 2, 0); - Cview(e, 1, 1) = Aview(e, 1, 0) * Bview(e, 0, 1) - + Aview(e, 1, 1) * Bview(e, 1, 1) - + Aview(e, 1, 2) * Bview(e, 2, 1); - Cview(e, 1, 2) = Aview(e, 1, 0) * Bview(e, 0, 2) - + Aview(e, 1, 1) * Bview(e, 1, 2) - + Aview(e, 1, 2) * Bview(e, 2, 2); - - Cview(e, 2, 0) = Aview(e, 2, 0) * Bview(e, 0, 0) - + Aview(e, 2, 1) * Bview(e, 1, 0) - + Aview(e, 2, 2) * Bview(e, 2, 0); - Cview(e, 2, 1) = Aview(e, 2, 0) * Bview(e, 0, 1) - + Aview(e, 2, 1) * Bview(e, 1, 1) - + Aview(e, 2, 2) * Bview(e, 2, 1); - Cview(e, 2, 2) = Aview(e, 2, 0) * Bview(e, 0, 2) - + Aview(e, 2, 1) * Bview(e, 1, 2) - + Aview(e, 2, 2) * Bview(e, 2, 2); - - }); - timer.stop(); - - RAJA::Timer::ElapsedType tMin = timer.elapsed(); - if (tMin < minRun) minRun = tMin; - timer.reset(); - } - - std::cout<< "\trun time: "<< minRun << " seconds" << std::endl; - checkResult(Cview, N, N_r, N_c); - -//----------------------------------------------------------------------------// - - std::cout << " \n Performing batched matrix multiplication" - << " with layout 2 (RAJA - cuda) ... 
" << std::endl; - - minRun = std::numeric_limits::max(); - for (int i = 0; i < NITER; ++i) { - - timer.start(); - RAJA::forall>( - RAJA::RangeSegment(0, N), [=] RAJA_DEVICE(Index_type e) { - - Cview2(e, 0, 0) = Aview2(e, 0, 0) * Bview2(e, 0, 0) - + Aview2(e, 0, 1) * Bview2(e, 1, 0) - + Aview2(e, 0, 2) * Bview2(e, 2, 0); - Cview2(e, 0, 1) = Aview2(e, 0, 0) * Bview2(e, 0, 1) - + Aview2(e, 0, 1) * Bview2(e, 1, 1) - + Aview2(e, 0, 2) * Bview2(e, 2, 1); - Cview2(e, 0, 2) = Aview2(e, 0, 0) * Bview2(e, 0, 2) - + Aview2(e, 0, 1) * Bview2(e, 1, 2) - + Aview2(e, 0, 2) * Bview2(e, 2, 2); - - Cview2(e, 1, 0) = Aview2(e, 1, 0) * Bview2(e, 0, 0) - + Aview2(e, 1, 1) * Bview2(e, 1, 0) - + Aview2(e, 1, 2) * Bview2(e, 2, 0); - Cview2(e, 1, 1) = Aview2(e, 1, 0) * Bview2(e, 0, 1) - + Aview2(e, 1, 1) * Bview2(e, 1, 1) - + Aview2(e, 1, 2) * Bview2(e, 2, 1); - Cview2(e, 1, 2) = Aview2(e, 1, 0) * Bview2(e, 0, 2) - + Aview2(e, 1, 1) * Bview2(e, 1, 2) - + Aview2(e, 1, 2) * Bview2(e, 2, 2); - - Cview2(e, 2, 0) = Aview2(e, 2, 0) * Bview2(e, 0, 0) - + Aview2(e, 2, 1) * Bview2(e, 1, 0) - + Aview2(e, 2, 2) * Bview2(e, 2, 0); - Cview2(e, 2, 1) = Aview2(e, 2, 0) * Bview2(e, 0, 1) - + Aview2(e, 2, 1) * Bview2(e, 1, 1) - + Aview2(e, 2, 2) * Bview2(e, 2, 1); - Cview2(e, 2, 2) = Aview2(e, 2, 0) * Bview2(e, 0, 2) - + Aview2(e, 2, 1) * Bview2(e, 1, 2) - + Aview2(e, 2, 2) * Bview2(e, 2, 2); - - }); - timer.stop(); - - RAJA::Timer::ElapsedType tMin = timer.elapsed(); - if (tMin < minRun) minRun = tMin; - timer.reset(); - } - std::cout<< "\trun time : "<< minRun << " seconds" << std::endl; - checkResult(Cview2, N, N_r, N_c); -#endif - -//----------------------------------------------------------------------------// - -#if defined(RAJA_ENABLE_HIP) - - std::cout << " \n Performing batched matrix multiplication" - << " with layout 1 (RAJA - hip) ... 
" << std::endl; - - double *d_A = memoryManager::allocate_gpu(N_c * N_r * N); - double *d_B = memoryManager::allocate_gpu(N_c * N_r * N); - double *d_C = memoryManager::allocate_gpu(N_c * N_r * N); - - double *d_A2 = memoryManager::allocate_gpu(N_c * N_r * N); - double *d_B2 = memoryManager::allocate_gpu(N_c * N_r * N); - double *d_C2 = memoryManager::allocate_gpu(N_c * N_r * N); - - RAJA::View> d_Aview(d_A, layout1); - RAJA::View> d_Bview(d_B, layout1); - RAJA::View> d_Cview(d_C, layout1); - - RAJA::View> d_Aview2(d_A2, layout2); - RAJA::View> d_Bview2(d_B2, layout2); - RAJA::View> d_Cview2(d_C2, layout2); - - hipErrchk(hipMemcpy( d_A, A, N_c * N_r * N * sizeof(double), hipMemcpyHostToDevice )); - hipErrchk(hipMemcpy( d_B, B, N_c * N_r * N * sizeof(double), hipMemcpyHostToDevice )); - hipErrchk(hipMemcpy( d_A2, A2, N_c * N_r * N * sizeof(double), hipMemcpyHostToDevice )); - hipErrchk(hipMemcpy( d_B2, B2, N_c * N_r * N * sizeof(double), hipMemcpyHostToDevice )); - - minRun = std::numeric_limits::max(); - for (int i = 0; i < NITER; ++i) { - - timer.start(); - RAJA::forall>( - RAJA::RangeSegment(0, N), [=] RAJA_DEVICE(Index_type e) { - - d_Cview(e, 0, 0) = d_Aview(e, 0, 0) * d_Bview(e, 0, 0) - + d_Aview(e, 0, 1) * d_Bview(e, 1, 0) - + d_Aview(e, 0, 2) * d_Bview(e, 2, 0); - d_Cview(e, 0, 1) = d_Aview(e, 0, 0) * d_Bview(e, 0, 1) - + d_Aview(e, 0, 1) * d_Bview(e, 1, 1) - + d_Aview(e, 0, 2) * d_Bview(e, 2, 1); - d_Cview(e, 0, 2) = d_Aview(e, 0, 0) * d_Bview(e, 0, 2) - + d_Aview(e, 0, 1) * d_Bview(e, 1, 2) - + d_Aview(e, 0, 2) * d_Bview(e, 2, 2); - - d_Cview(e, 1, 0) = d_Aview(e, 1, 0) * d_Bview(e, 0, 0) - + d_Aview(e, 1, 1) * d_Bview(e, 1, 0) - + d_Aview(e, 1, 2) * d_Bview(e, 2, 0); - d_Cview(e, 1, 1) = d_Aview(e, 1, 0) * d_Bview(e, 0, 1) - + d_Aview(e, 1, 1) * d_Bview(e, 1, 1) - + d_Aview(e, 1, 2) * d_Bview(e, 2, 1); - d_Cview(e, 1, 2) = d_Aview(e, 1, 0) * d_Bview(e, 0, 2) - + d_Aview(e, 1, 1) * d_Bview(e, 1, 2) - + d_Aview(e, 1, 2) * d_Bview(e, 2, 2); - - d_Cview(e, 2, 0) = d_Aview(e, 2, 0) * d_Bview(e, 0, 0) - + d_Aview(e, 2, 1) * d_Bview(e, 1, 0) - + d_Aview(e, 2, 2) * d_Bview(e, 2, 0); - d_Cview(e, 2, 1) = d_Aview(e, 2, 0) * d_Bview(e, 0, 1) - + d_Aview(e, 2, 1) * d_Bview(e, 1, 1) - + d_Aview(e, 2, 2) * d_Bview(e, 2, 1); - d_Cview(e, 2, 2) = d_Aview(e, 2, 0) * d_Bview(e, 0, 2) - + d_Aview(e, 2, 1) * d_Bview(e, 1, 2) - + d_Aview(e, 2, 2) * d_Bview(e, 2, 2); - - }); - timer.stop(); - - RAJA::Timer::ElapsedType tMin = timer.elapsed(); - if (tMin < minRun) minRun = tMin; - timer.reset(); - } - - hipErrchk(hipMemcpy( C, d_C, N_c * N_r * N * sizeof(double), hipMemcpyDeviceToHost )); - - std::cout<< "\trun time: "<< minRun << " seconds" << std::endl; - checkResult(Cview, N, N_r, N_c); - -//----------------------------------------------------------------------------// - - std::cout << " \n Performing batched matrix multiplication" - << " with layout 2 (RAJA - hip) ... 
" << std::endl; - - minRun = std::numeric_limits::max(); - for (int i = 0; i < NITER; ++i) { - - timer.start(); - RAJA::forall>( - RAJA::RangeSegment(0, N), [=] RAJA_DEVICE(Index_type e) { - - d_Cview2(e, 0, 0) = d_Aview2(e, 0, 0) * d_Bview2(e, 0, 0) - + d_Aview2(e, 0, 1) * d_Bview2(e, 1, 0) - + d_Aview2(e, 0, 2) * d_Bview2(e, 2, 0); - d_Cview2(e, 0, 1) = d_Aview2(e, 0, 0) * d_Bview2(e, 0, 1) - + d_Aview2(e, 0, 1) * d_Bview2(e, 1, 1) - + d_Aview2(e, 0, 2) * d_Bview2(e, 2, 1); - d_Cview2(e, 0, 2) = d_Aview2(e, 0, 0) * d_Bview2(e, 0, 2) - + d_Aview2(e, 0, 1) * d_Bview2(e, 1, 2) - + d_Aview2(e, 0, 2) * d_Bview2(e, 2, 2); - - d_Cview2(e, 1, 0) = d_Aview2(e, 1, 0) * d_Bview2(e, 0, 0) - + d_Aview2(e, 1, 1) * d_Bview2(e, 1, 0) - + d_Aview2(e, 1, 2) * d_Bview2(e, 2, 0); - d_Cview2(e, 1, 1) = d_Aview2(e, 1, 0) * d_Bview2(e, 0, 1) - + d_Aview2(e, 1, 1) * d_Bview2(e, 1, 1) - + d_Aview2(e, 1, 2) * d_Bview2(e, 2, 1); - d_Cview2(e, 1, 2) = d_Aview2(e, 1, 0) * d_Bview2(e, 0, 2) - + d_Aview2(e, 1, 1) * d_Bview2(e, 1, 2) - + d_Aview2(e, 1, 2) * d_Bview2(e, 2, 2); - - d_Cview2(e, 2, 0) = d_Aview2(e, 2, 0) * d_Bview2(e, 0, 0) - + d_Aview2(e, 2, 1) * d_Bview2(e, 1, 0) - + d_Aview2(e, 2, 2) * d_Bview2(e, 2, 0); - d_Cview2(e, 2, 1) = d_Aview2(e, 2, 0) * d_Bview2(e, 0, 1) - + d_Aview2(e, 2, 1) * d_Bview2(e, 1, 1) - + d_Aview2(e, 2, 2) * d_Bview2(e, 2, 1); - d_Cview2(e, 2, 2) = d_Aview2(e, 2, 0) * d_Bview2(e, 0, 2) - + d_Aview2(e, 2, 1) * d_Bview2(e, 1, 2) - + d_Aview2(e, 2, 2) * d_Bview2(e, 2, 2); - - }); - timer.stop(); - - RAJA::Timer::ElapsedType tMin = timer.elapsed(); - if (tMin < minRun) minRun = tMin; - timer.reset(); - } - - hipErrchk(hipMemcpy( C2, d_C2, N_c * N_r * N * sizeof(double), hipMemcpyDeviceToHost )); - - std::cout<< "\trun time : "<< minRun << " seconds" << std::endl; - checkResult(Cview2, N, N_r, N_c); - - memoryManager::deallocate_gpu(d_A); - memoryManager::deallocate_gpu(d_B); - memoryManager::deallocate_gpu(d_C); - memoryManager::deallocate_gpu(d_A2); - memoryManager::deallocate_gpu(d_B2); - memoryManager::deallocate_gpu(d_C2); -#endif - -//----------------------------------------------------------------------------// - -// -// Clean up. 
-// - memoryManager::deallocate(A); - memoryManager::deallocate(B); - memoryManager::deallocate(C); - memoryManager::deallocate(A2); - memoryManager::deallocate(B2); - memoryManager::deallocate(C2); - - std::cout << "\n DONE!...\n"; - return 0; -} - -// -// check result -// -template -void checkResult(T C, Index_type noMat, int nRows, int nCols) -{ - - bool status = true; - for (int e = 0; e < noMat; ++e) { - for (int row = 0; row < nRows; ++row) { - for (int col = 0; col < nCols; ++col) { - if (std::abs(C(e, row, col) - row * col * nCols) > 10e-12) { - status = false; - } - } - } - } - - if ( status ) { - std::cout << "\tresult -- PASS\n"; - } else { - std::cout << "\tresult -- FAIL\n"; - } -} diff --git a/examples/tut_halo-exchange.cpp b/examples/tut_halo-exchange.cpp index eea8493812..a225161e78 100644 --- a/examples/tut_halo-exchange.cpp +++ b/examples/tut_halo-exchange.cpp @@ -251,7 +251,7 @@ int main(int argc, char **argv) std::vector unpack_index_lists(num_neighbors, nullptr); std::vector unpack_index_list_lengths(num_neighbors, 0); create_unpack_lists(unpack_index_lists, unpack_index_list_lengths, halo_width, grid_dims); - // _halo_exchange_index_lisgeneratete_end + // _halo_exchange_index_list_generate_end // @@ -489,7 +489,8 @@ int main(int argc, char **argv) using workgroup_policy = RAJA::WorkGroupPolicy < RAJA::loop_work, RAJA::ordered, - RAJA::ragged_array_of_objects >; + RAJA::ragged_array_of_objects, + RAJA::indirect_function_call_dispatch >; using workpool = RAJA::WorkPool< workgroup_policy, int, @@ -733,7 +734,8 @@ int main(int argc, char **argv) using workgroup_policy = RAJA::WorkGroupPolicy < RAJA::omp_work, RAJA::ordered, - RAJA::ragged_array_of_objects >; + RAJA::ragged_array_of_objects, + RAJA::indirect_function_call_dispatch >; using workpool = RAJA::WorkPool< workgroup_policy, int, @@ -1047,7 +1049,8 @@ int main(int argc, char **argv) using workgroup_policy = RAJA::WorkGroupPolicy < RAJA::cuda_work_async, RAJA::unordered_cuda_loop_y_block_iter_x_threadblock_average, - RAJA::constant_stride_array_of_objects >; + RAJA::constant_stride_array_of_objects, + RAJA::indirect_function_call_dispatch >; using workpool = RAJA::WorkPool< workgroup_policy, int, @@ -1343,11 +1346,12 @@ int main(int argc, char **argv) } +#if defined(RAJA_ENABLE_HIP_INDIRECT_FUNCTION_CALL) //----------------------------------------------------------------------------// // RAJA::WorkGroup with hip_work allows deferred kernel fusion execution //----------------------------------------------------------------------------// { - std::cout << "\n Running RAJA Hip workgroup halo exchange...\n"; + std::cout << "\n Running RAJA Hip indirect dispatch workgroup halo exchange...\n"; double minCycle = std::numeric_limits::max(); @@ -1380,12 +1384,9 @@ int main(int argc, char **argv) using workgroup_policy = RAJA::WorkGroupPolicy < RAJA::hip_work_async, -#if defined(RAJA_ENABLE_HIP_INDIRECT_FUNCTION_CALL) RAJA::unordered_hip_loop_y_block_iter_x_threadblock_average, -#else - RAJA::ordered, -#endif - RAJA::constant_stride_array_of_objects >; + RAJA::constant_stride_array_of_objects, + RAJA::indirect_function_call_dispatch >; using workpool = RAJA::WorkPool< workgroup_policy, int, @@ -1518,6 +1519,196 @@ int main(int argc, char **argv) } + std::cout<< "\tmin cycle run time : " << minCycle << " seconds" << std::endl; + + // check results against reference copy + checkResult(vars, vars_ref, var_size, num_vars); + //printResult(vars, var_size, num_vars); + } +#endif + 
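The added block that follows is the direct-dispatch counterpart of the indirect-dispatch HIP variant above: rather than resolving each enqueued loop body through a device function pointer, the policy enumerates the (segment, loop-body) pairs the pool may hold, which lets the fused kernel avoid indirect calls at the cost of fixing that set at compile time. As a rough orientation for the hunk, a minimal sketch of the policy and enqueue pattern is given below; it is not part of the patch and assumes the Packer/UnPacker functors and pinned_allocator used in this example and the RAJA WorkGroup API of the v2022.x releases.

    // Sketch only -- not part of the patch.
    using range_segment = RAJA::TypedRangeSegment<int>;

    using workgroup_policy = RAJA::WorkGroupPolicy<
        RAJA::hip_work_async,
        RAJA::unordered_hip_loop_y_block_iter_x_threadblock_average,
        RAJA::constant_stride_array_of_objects,
        RAJA::direct_dispatch<camp::list<range_segment, Packer>,
                              camp::list<range_segment, UnPacker>>>;

    using workpool = RAJA::WorkPool<workgroup_policy, int,
                                    RAJA::xargs<>, pinned_allocator<char>>;

    workpool pool(pinned_allocator<char>{});
    // Only the (segment, loop-body) combinations named in direct_dispatch
    // can be enqueued; other callable types will not compile.
    pool.enqueue(range_segment(0, len), Packer{buffer, var, list});
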
+//----------------------------------------------------------------------------// +// RAJA::WorkGroup with hip_work allows deferred kernel fusion execution +//----------------------------------------------------------------------------// + { + std::cout << "\n Running RAJA Hip direct dispatch workgroup halo exchange...\n"; + + double minCycle = std::numeric_limits::max(); + + + std::vector hip_vars(num_vars, nullptr); + std::vector hip_pack_index_lists(num_neighbors, nullptr); + std::vector hip_unpack_index_lists(num_neighbors, nullptr); + + for (int v = 0; v < num_vars; ++v) { + hip_vars[v] = memoryManager::allocate_gpu(var_size); + } + + for (int l = 0; l < num_neighbors; ++l) { + int pack_len = pack_index_list_lengths[l]; + hip_pack_index_lists[l] = memoryManager::allocate_gpu(pack_len); + hipErrchk(hipMemcpy( hip_pack_index_lists[l], pack_index_lists[l], pack_len * sizeof(int), hipMemcpyHostToDevice )); + + int unpack_len = unpack_index_list_lengths[l]; + hip_unpack_index_lists[l] = memoryManager::allocate_gpu(unpack_len); + hipErrchk(hipMemcpy( hip_unpack_index_lists[l], unpack_index_lists[l], unpack_len * sizeof(int), hipMemcpyHostToDevice )); + } + + std::swap(vars, hip_vars); + std::swap(pack_index_lists, hip_pack_index_lists); + std::swap(unpack_index_lists, hip_unpack_index_lists); + + + using forall_policy = RAJA::hip_exec_async; + + struct Packer { + double* buffer; + double* var; + int* list; + RAJA_DEVICE void operator() (int i) const { + buffer[i] = var[list[i]]; + } + }; + + struct UnPacker { + double* buffer; + double* var; + int* list; + RAJA_DEVICE void operator()(int i) const { + var[list[i]] = buffer[i]; + } + }; + + using workgroup_policy = RAJA::WorkGroupPolicy < + RAJA::hip_work_async, + RAJA::unordered_hip_loop_y_block_iter_x_threadblock_average, + RAJA::constant_stride_array_of_objects, + RAJA::direct_dispatch, + camp::list> + >; + + using workpool = RAJA::WorkPool< workgroup_policy, + int, + RAJA::xargs<>, + pinned_allocator >; + + using workgroup = RAJA::WorkGroup< workgroup_policy, + int, + RAJA::xargs<>, + pinned_allocator >; + + using worksite = RAJA::WorkSite< workgroup_policy, + int, + RAJA::xargs<>, + pinned_allocator >; + + std::vector buffers(num_neighbors, nullptr); + + for (int l = 0; l < num_neighbors; ++l) { + + int buffer_len = num_vars * pack_index_list_lengths[l]; + + buffers[l] = memoryManager::allocate_gpu(buffer_len); + + } + + workpool pool_pack (pinned_allocator{}); + workpool pool_unpack(pinned_allocator{}); + + for (int c = 0; c < num_cycles; ++c ) { + timer.start(); + { + + // set vars + for (int v = 0; v < num_vars; ++v) { + + double* var = vars[v]; + + RAJA::forall(range_segment(0, var_size), [=] RAJA_DEVICE (int i) { + var[i] = i + v; + }); + } + + for (int l = 0; l < num_neighbors; ++l) { + + double* buffer = buffers[l]; + int* list = pack_index_lists[l]; + int len = pack_index_list_lengths[l]; + + // pack + for (int v = 0; v < num_vars; ++v) { + + double* var = vars[v]; + + pool_pack.enqueue(range_segment(0, len), Packer{buffer, var, list}); + + buffer += len; + } + } + + workgroup group_pack = pool_pack.instantiate(); + + worksite site_pack = group_pack.run(); + + hipErrchk(hipDeviceSynchronize()); + + // send all messages + + // recv all messages + + for (int l = 0; l < num_neighbors; ++l) { + + double* buffer = buffers[l]; + int* list = unpack_index_lists[l]; + int len = unpack_index_list_lengths[l]; + + // unpack + for (int v = 0; v < num_vars; ++v) { + + double* var = vars[v]; + + pool_unpack.enqueue(range_segment(0, len), 
UnPacker{buffer, var, list}); + + buffer += len; + } + } + + workgroup group_unpack = pool_unpack.instantiate(); + + worksite site_unpack = group_unpack.run(); + + hipErrchk(hipDeviceSynchronize()); + + } + timer.stop(); + + RAJA::Timer::ElapsedType tCycle = timer.elapsed(); + if (tCycle < minCycle) minCycle = tCycle; + timer.reset(); + } + + for (int l = 0; l < num_neighbors; ++l) { + + memoryManager::deallocate_gpu(buffers[l]); + + } + + + std::swap(vars, hip_vars); + std::swap(pack_index_lists, hip_pack_index_lists); + std::swap(unpack_index_lists, hip_unpack_index_lists); + + for (int v = 0; v < num_vars; ++v) { + hipErrchk(hipMemcpy( vars[v], hip_vars[v], var_size * sizeof(double), hipMemcpyDeviceToHost )); + memoryManager::deallocate_gpu(hip_vars[v]); + } + + for (int l = 0; l < num_neighbors; ++l) { + memoryManager::deallocate_gpu(hip_pack_index_lists[l]); + memoryManager::deallocate_gpu(hip_unpack_index_lists[l]); + } + + std::cout<< "\tmin cycle run time : " << minCycle << " seconds" << std::endl; // check results against reference copy diff --git a/examples/tut_indexset-segments.cpp b/examples/tut_indexset-segments.cpp deleted file mode 100644 index d072a6618e..0000000000 --- a/examples/tut_indexset-segments.cpp +++ /dev/null @@ -1,465 +0,0 @@ -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2016-22, Lawrence Livermore National Security, LLC -// and RAJA project contributors. See the RAJA/LICENSE file for details. -// -// SPDX-License-Identifier: (BSD-3-Clause) -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - -#include -#include -#include -#include -#include - -#include "memoryManager.hpp" - -#include "RAJA/RAJA.hpp" - -#include "camp/resource.hpp" - -/* - * Index sets and Segments Example - * - * This example uses the daxpy kernel from a previous example. It - * illustrates how to use RAJA index sets and segments. This is - * important for applications and algorithms that need to use - * indirection arrays for irregular access. Combining range and - * list segments in a single index set, when possible, can - * increase performance by allowing compilers to optimize for - * specific segment types (e.g., SIMD for range segments). - * - * RAJA features shown: - * - `forall` loop iteration template method - * - Index range segment - * - Index list segment - * - Strided index range segment - * - TypedIndexSet segment container - * - Hierarchical execution policies - * - * If CUDA is enabled, CUDA unified memory is used. 
- */ - -/* - CUDA_BLOCK_SIZE - specifies the number of threads in a CUDA thread block -*/ -#if defined(RAJA_ENABLE_CUDA) -const int CUDA_BLOCK_SIZE = 256; -#endif - -#if defined(RAJA_ENABLE_HIP) -const int HIP_BLOCK_SIZE = 256; -#endif - -//----------------------------------------------------------------------------// -// Define types for ListSegments and indices used in examples -//----------------------------------------------------------------------------// -// _raja_list_segment_type_start -using IdxType = RAJA::Index_type; -using ListSegType = RAJA::TypedListSegment; -// _raja_list_segment_type_end - -// -// Functions to check and print results -// -void checkResult(double* v1, double* v2, IdxType len); -void printResult(double* v, int len); - -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) -{ - - std::cout << "\n\nRAJA index sets and segments example...\n"; - -// -// Define vector length -// - const IdxType N = 1000000; - -// -// Allocate and initialize vector data -// - double* a0 = memoryManager::allocate(N); - double* aref = memoryManager::allocate(N); - - double* a = memoryManager::allocate(N); - double* b = memoryManager::allocate(N); - - double c = 3.14159; - - for (IdxType i = 0; i < N; i++) { - a0[i] = 1.0; - b[i] = 2.0; - } - - -//----------------------------------------------------------------------------// -// -// C-version of the daxpy kernel to set the reference result. -// - std::cout << "\n Running C-version of daxpy to set reference result...\n"; - - std::memcpy( aref, a0, N * sizeof(double) ); - - for (IdxType i = 0; i < N; i++) { - aref[i] += b[i] * c; - } - -//printResult(a, N); - -//----------------------------------------------------------------------------// -// -// In the following, we show RAJA versions of the daxpy operation and -// using different Segment constructs and TypedIndexSets. These are all -// run sequentially. The only thing that changes in these versions is -// the object passed to the 'forall' method that defines the iteration -// space. -// -//----------------------------------------------------------------------------// - - std::cout << "\n Running RAJA range segment daxpy...\n"; - - std::memcpy( a, a0, N * sizeof(double) ); - - // _rajaseq_daxpy_range_start - RAJA::forall(RAJA::RangeSegment(0, N), [=] (IdxType i) { - a[i] += b[i] * c; - }); - // _rajaseq_daxpy_range_end - - checkResult(a, aref, N); -//printResult(a, N); - -//----------------------------------------------------------------------------// -// Resource object used to construct list segment objects with indices -// living in host (CPU) memory. 
- - camp::resources::Resource host_res{camp::resources::Host()}; - - -// -// RAJA list segment version #1 -// - std::cout << "\n Running RAJA list segment daxpy...\n"; - - std::memcpy( a, a0, N * sizeof(double) ); - -// -// Collect indices in a vector to create list segment -// -// - // _rajaseq_daxpy_list_start - std::vector idx; - for (IdxType i = 0; i < N; ++i) { - idx.push_back(i); - } - - ListSegType idx_list( &idx[0], idx.size(), host_res ); - - RAJA::forall(idx_list, [=] (IdxType i) { - a[i] += b[i] * c; - }); - // _rajaseq_daxpy_list_end - - checkResult(a, aref, N); -//printResult(a, N); - -//----------------------------------------------------------------------------// -// -// RAJA list segment version #2 -// - std::cout << "\n Running RAJA list segment daxpy with indices reversed...\n"; - - std::memcpy( a, a0, N * sizeof(double) ); - -// -// Reverse the order of indices in the vector -// - // _raja_list_segment_daxpy_reverse_start - std::reverse( idx.begin(), idx.end() ); - - ListSegType idx_reverse_list( &idx[0], idx.size(), host_res ); - - RAJA::forall(idx_reverse_list, [=] (IdxType i) { - a[i] += b[i] * c; - }); - // _raja_list_segment_daxpy_reverse_end - - checkResult(a, aref, N); -//printResult(a, N); - -//----------------------------------------------------------------------------// -// -// Alternatively, we can also use a RAJA strided range segment to run the -// loop in reverse. -// - std::cout << "\n Running RAJA daxpy with indices reversed via negatively strided range segment...\n"; - - std::memcpy( a, a0, N * sizeof(double) ); - -// -// Reverse the order of indices in the vector -// - // _raja_range_segment_daxpy_negstride_start - RAJA::forall(RAJA::RangeStrideSegment(N-1, -1, -1), - [=] (IdxType i) { - a[i] += b[i] * c; - }); - // _raja_range_segment_daxpy_negstride_end - - checkResult(a, aref, N); -//printResult(a, N); - -//----------------------------------------------------------------------------// - -// -// Sequential index set execution policy used in several of the following -// example implementations. -// - - // _raja_seq_indexset_policy_daxpy_start - using SEQ_ISET_EXECPOL = RAJA::ExecPolicy; - // _raja_seq_indexset_policy_daxpy_end - -//----------------------------------------------------------------------------// - - std::cout << "\n Running RAJA index set (ListSegment) daxpy...\n"; - - std::memcpy( a, a0, N * sizeof(double) ); - - // _raja_indexset_list_daxpy_start - RAJA::TypedIndexSet is1; - - is1.push_back( idx_list ); // use list segment created earlier. 
- - RAJA::forall(is1, [=] (IdxType i) { - a[i] += b[i] * c; - }); - // _raja_indexset_list_daxpy_end - - checkResult(a, aref, N); -//printResult(a, N); - - -//----------------------------------------------------------------------------// - - std::cout << "\n Running RAJA index set (2 RangeSegments) daxpy...\n"; - - std::memcpy( a, a0, N * sizeof(double) ); - - // _raja_indexset_2ranges_daxpy_start - RAJA::TypedIndexSet is2; - is2.push_back( RAJA::RangeSegment(0, N/2) ); - is2.push_back( RAJA::RangeSegment(N/2, N) ); - - RAJA::forall(is2, [=] (IdxType i) { - a[i] += b[i] * c; - }); - // _raja_indexset_2ranges_daxpy_end - - checkResult(a, aref, N); -//printResult(a, N); - - -//----------------------------------------------------------------------------// - - std::cout << "\n Running RAJA index set (2 RangeSegments, 1 ListSegment) daxpy...\n"; - - std::memcpy( a, a0, N * sizeof(double) ); - - // _raja_indexset_2ranges_1list_daxpy_start -// -// Collect indices in a vector to create list segment -// - std::vector idx1; - for (IdxType i = N/3; i < 2*N/3; ++i) { - idx1.push_back(i); - } - - ListSegType idx1_list( &idx1[0], idx1.size(), host_res ); - - RAJA::TypedIndexSet is3; - is3.push_back( RAJA::RangeSegment(0, N/3) ); - is3.push_back( idx1_list ); - is3.push_back( RAJA::RangeSegment(2*N/3, N) ); - - RAJA::forall(is3, [=] (IdxType i) { - a[i] += b[i] * c; - }); - // _raja_indexset_2ranges_1list_daxpy_end - - checkResult(a, aref, N); -//printResult(a, N); - - -//----------------------------------------------------------------------------// - -#if defined(RAJA_ENABLE_OPENMP) -// -// Run the previous version in parallel (2 different ways) just for fun... -// - - std::cout << - "\n Running RAJA index set (2 RangeSegments, 1 ListSegment) daxpy\n" << - " (sequential iteration over segments, OpenMP parallel segment execution)...\n"; - - std::memcpy( a, a0, N * sizeof(double) ); - - // _raja_indexset_ompinnerpolicy_daxpy_start - using OMP_ISET_EXECPOL1 = RAJA::ExecPolicy; - // _raja_indexset_ompinnerpolicy_daxpy_end - - RAJA::forall(is3, [=] (IdxType i) { - a[i] += b[i] * c; - }); - - checkResult(a, aref, N); -//printResult(a, N); - - -//----------------------------------------------------------------------------// - - std::cout << - "\n Running RAJA index set (2 RangeSegments, 1 ListSegment) daxpy\n" << - " (OpenMP parallel iteration over segments, sequential segment execution)...\n"; - - std::memcpy( a, a0, N * sizeof(double) ); - - // _raja_indexset_ompouterpolicy_daxpy_start - using OMP_ISET_EXECPOL2 = RAJA::ExecPolicy; - // _raja_indexset_ompouterpolicy_daxpy_end - - RAJA::forall(is3, [=] (IdxType i) { - a[i] += b[i] * c; - }); - - checkResult(a, aref, N); -//printResult(a, N); -#endif - -//----------------------------------------------------------------------------// - -#if defined(RAJA_ENABLE_CUDA) - -// -// We create a new resource object and index set so that list segment -// indices live in CUDA deviec memory. 
-// - camp::resources::Resource cuda_res{camp::resources::Cuda()}; - - ListSegType idx1_list_cuda( &idx1[0], idx1.size(), cuda_res ); - - RAJA::TypedIndexSet is3_cuda; - is3_cuda.push_back( RAJA::RangeSegment(0, N/3) ); - is3_cuda.push_back( idx1_list_cuda ); - is3_cuda.push_back( RAJA::RangeSegment(2*N/3, N) ); - - - std::cout << - "\n Running RAJA index set (2 RangeSegments, 1 ListSegment) daxpy\n" << - " (sequential iteration over segments, CUDA parallel segment execution)...\n"; - - // _raja_indexset_cudapolicy_daxpy_start - using CUDA_ISET_EXECPOL = RAJA::ExecPolicy>; - // _raja_indexset_cudapolicy_daxpy_end - - std::memcpy( a, a0, N * sizeof(double) ); - - RAJA::forall(is3_cuda, [=] RAJA_DEVICE (IdxType i) { - a[i] += b[i] * c; - }); - - checkResult(a, aref, N); -//printResult(a, N); -#endif - -//----------------------------------------------------------------------------// - -#if defined(RAJA_ENABLE_HIP) - -// -// We create a new resource object and index set so that list segment -// indices live in Hip deviec memory. -// - camp::resources::Resource hip_res{camp::resources::Hip()}; - - ListSegType idx1_list_hip( &idx1[0], idx1.size(), hip_res ); - - RAJA::TypedIndexSet is3_hip; - is3_hip.push_back( RAJA::RangeSegment(0, N/3) ); - is3_hip.push_back( idx1_list_hip ); - is3_hip.push_back( RAJA::RangeSegment(2*N/3, N) ); - - std::cout << - "\n Running RAJA index set (2 RangeSegments, 1 ListSegment) daxpy\n" << - " (sequential iteration over segments, HIP parallel segment execution)...\n"; - - // _raja_indexset_hippolicy_daxpy_start - using HIP_ISET_EXECPOL = RAJA::ExecPolicy>; - // _raja_indexset_hippolicy_daxpy_end - - double* d_a = memoryManager::allocate_gpu(N); - double* d_b = memoryManager::allocate_gpu(N); - - hipErrchk(hipMemcpy( d_a, a0, N * sizeof(double), hipMemcpyHostToDevice )); - hipErrchk(hipMemcpy( d_b, b, N * sizeof(double), hipMemcpyHostToDevice )); - - RAJA::forall(is3_hip, [=] RAJA_DEVICE (IdxType i) { - d_a[i] += d_b[i] * c; - }); - - hipErrchk(hipMemcpy( a, d_a, N * sizeof(double), hipMemcpyDeviceToHost )); - - checkResult(a, aref, N); -//printResult(a, N); - - memoryManager::deallocate_gpu(d_a); - memoryManager::deallocate_gpu(d_b); -#endif - -//----------------------------------------------------------------------------// - -// -// Clean up. -// - memoryManager::deallocate(a); - memoryManager::deallocate(b); - memoryManager::deallocate(a0); - memoryManager::deallocate(aref); - - std::cout << "\n DONE!...\n"; - - return 0; -} - -// -// Function to check result and report P/F. -// -void checkResult(double* v1, double* v2, IdxType len) -{ - bool match = true; - for (IdxType i = 0; i < len; i++) { - if ( v1[i] != v2[i] ) { match = false; } - } - if ( match ) { - std::cout << "\n\t result -- PASS\n"; - } else { - std::cout << "\n\t result -- FAIL\n"; - } -} - -// -// Function to print result. 
-// -void printResult(double* v, IdxType len) -{ - std::cout << std::endl; - for (IdxType i = 0; i < len; i++) { - std::cout << "result[" << i << "] = " << v[i] << std::endl; - } - std::cout << std::endl; -} - diff --git a/examples/tut_teams_basic.cpp b/examples/tut_launch_basic.cpp similarity index 64% rename from examples/tut_teams_basic.cpp rename to examples/tut_launch_basic.cpp index 06b04bd510..2ea0c83ea8 100644 --- a/examples/tut_teams_basic.cpp +++ b/examples/tut_launch_basic.cpp @@ -12,18 +12,18 @@ #include "RAJA/RAJA.hpp" /* - * Developing with RAJA Teams + * Developing with RAJA Launch * * This example serves as a basic overview of - * capabilities with the RAJA Teams API. + * capabilities with the RAJA Launch API. * * RAJA features shown: - * - RAJA::expt::launch - * - RAJA::expt::loop + * - RAJA::launch + * - RAJA::loop */ /* - * The RAJA teams framework enables developers + * The RAJA::Launch framework enables developers * to expressed algorithms in terms of nested * loops within an execution space. RAJA teams * enables run time selection of a host or @@ -34,27 +34,27 @@ */ // __host_launch_start -using host_launch = RAJA::expt::seq_launch_t; +using host_launch = RAJA::seq_launch_t; // __host_launch_end #if defined(RAJA_ENABLE_CUDA) // __device_launch_start -using device_launch = RAJA::expt::cuda_launch_t; +using device_launch = RAJA::cuda_launch_t; // __device_launch_end #elif defined(RAJA_ENABLE_HIP) -using device_launch = RAJA::expt::hip_launch_t; +using device_launch = RAJA::hip_launch_t; #endif -using launch_policy = RAJA::expt::LaunchPolicy< +using launch_policy = RAJA::LaunchPolicy< host_launch -#if defined(RAJA_DEVICE_ACTIVE) +#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) ,device_launch #endif >; /* - * RAJA teams follows a similar thread/block programming model - * as found in CUDA/HIP/SYCL. Loops within an execution + * RAJA launch exposes a thread/block programming model + * as used in CUDA/HIP/SYCL. Loops within an execution * maybe mapped to either threads or teams. Under this * programming model, computation is performed with * a collection of threads which are grouped into teams. @@ -64,7 +64,7 @@ using launch_policy = RAJA::expt::LaunchPolicy< * On the host the loops expands to standard C style for loops. 
*/ -using teams_x = RAJA::expt::LoopPolicy< +using teams_x = RAJA::LoopPolicy< RAJA::loop_exec #if defined(RAJA_ENABLE_CUDA) , @@ -76,7 +76,7 @@ using teams_x = RAJA::expt::LoopPolicy< #endif >; -using teams_y = RAJA::expt::LoopPolicy< +using teams_y = RAJA::LoopPolicy< RAJA::loop_exec #if defined(RAJA_ENABLE_CUDA) , @@ -88,7 +88,7 @@ using teams_y = RAJA::expt::LoopPolicy< #endif >; -using threads_x = RAJA::expt::LoopPolicy; -using threads_y = RAJA::expt::LoopPolicy; -#if defined(RAJA_DEVICE_ACTIVE) +#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) __global__ void gpuKernel() { //Equivalent CUDA/HIP style thread/block mapping @@ -133,29 +133,35 @@ __global__ void gpuKernel() } #endif -int main(int argc, char *argv[]) +#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) +int main(int argc, char* argv[]) +#else +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv)) +#endif { +#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) + if(argc != 2) { - RAJA_ABORT_OR_THROW("Usage ./tut_teams_basic host or ./tut_teams_basic device"); + RAJA_ABORT_OR_THROW("Usage ./tut_launch_basic host or ./tut_launch_basic device"); } // // Run time policy section is demonstrated in this example by specifying // kernel exection space as a command line argument (host or device). -// Example usage ./tut_teams_basic host or ./tut_teams_basic device +// Example usage ./tut_launch_basic host or ./tut_launch_basic device // std::string exec_space = argv[1]; if(!(exec_space.compare("host") == 0 || exec_space.compare("device") == 0 )){ - RAJA_ABORT_OR_THROW("Usage ./tut_teams_basic host or ./tut_teams_basic device"); + RAJA_ABORT_OR_THROW("Usage ./tut_launch_basic host or ./tut_launch_basic device"); return 0; } - RAJA::expt::ExecPlace select_cpu_or_gpu; + RAJA::ExecPlace select_cpu_or_gpu; if(exec_space.compare("host") == 0) - { select_cpu_or_gpu = RAJA::expt::HOST; printf("Running RAJA-Teams on the host \n"); } + { select_cpu_or_gpu = RAJA::ExecPlace::HOST; printf("Running RAJA-Teams on the host \n"); } if(exec_space.compare("device") == 0) - { select_cpu_or_gpu = RAJA::expt::DEVICE; printf("Running RAJA-Teams on the device \n"); } + { select_cpu_or_gpu = RAJA::ExecPlace::DEVICE; printf("Running RAJA-Teams on the device \n"); } // // The following three kernels illustrate loop based parallelism @@ -168,20 +174,22 @@ int main(int argc, char *argv[]) const int Nteams = 2; const int Nthreads = 2; // __compute_grid_end - - RAJA::expt::launch(select_cpu_or_gpu, - RAJA::expt::Grid(RAJA::expt::Teams(Nteams,Nteams), - RAJA::expt::Threads(Nthreads,Nthreads)), - [=] RAJA_HOST_DEVICE (RAJA::expt::LaunchContext ctx) { - // _team_loops_start - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Nteams), [&] (int by) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Nteams), [&] (int bx) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Nthreads), [&] (int ty) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Nthreads), [&] (int tx) { + RAJA::launch(select_cpu_or_gpu, + RAJA::LaunchParams(RAJA::Teams(Nteams,Nteams), + RAJA::Threads(Nthreads,Nthreads)), + + [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { + + // _team_loops_start + RAJA::loop(ctx, RAJA::TypedRangeSegment(0, Nteams), [&] (int by) { + RAJA::loop(ctx, RAJA::TypedRangeSegment(0, Nteams), [&] (int bx) { + + RAJA::loop(ctx, RAJA::TypedRangeSegment(0, Nthreads), [&] (int ty) { + RAJA::loop(ctx, RAJA::TypedRangeSegment(0, Nthreads), [&] (int tx) { + printf("RAJA Teams: threadId_x %d threadId_y %d teamId_x %d teamId_y %d \n", + tx, ty, bx, by); - 
printf("RAJA Teams: threadId_x %d threadId_y %d teamId_x %d teamId_y %d \n", - tx, ty, bx, by); }); }); @@ -193,16 +201,16 @@ int main(int argc, char *argv[]) }); //Equivalent C style loops - if(select_cpu_or_gpu == RAJA::expt::HOST) { + if(select_cpu_or_gpu == RAJA::ExecPlace::HOST) { // _c_style_loops_start - for(int by=0; by>>(); cudaDeviceSynchronize(); #endif #if defined(RAJA_ENABLE_HIP) - if(select_cpu_or_gpu == RAJA::expt::DEVICE) + if(select_cpu_or_gpu == RAJA::ExecPlace::DEVICE) hipLaunchKernelGGL((gpuKernel), dim3(griddim), dim3(blockdim), 0, 0); hipDeviceSynchronize(); #endif +#else + std::cout << "Please build with CUDA or HIP to run this example ...\n"; +#endif + return 0; } diff --git a/examples/tut_matrix-multiply.cpp b/examples/tut_matrix-multiply.cpp index 8d4b4adeab..be9bebbd11 100644 --- a/examples/tut_matrix-multiply.cpp +++ b/examples/tut_matrix-multiply.cpp @@ -150,9 +150,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // row, column, and dot-product loops for RAJA variants // // _matmult_ranges_start - RAJA::RangeSegment row_range(0, N); - RAJA::RangeSegment col_range(0, N); - RAJA::RangeSegment dot_range(0, N); + RAJA::TypedRangeSegment row_range(0, N); + RAJA::TypedRangeSegment col_range(0, N); + RAJA::TypedRangeSegment dot_range(0, N); // _matmult_ranges_end //----------------------------------------------------------------------------// @@ -1012,10 +1012,11 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) Shmem aShared, bShared, cShared; - RAJA::kernel_param(RAJA::make_tuple(RAJA::RangeSegment(0, N), - RAJA::RangeSegment(0, N), - RAJA::RangeSegment(0, N)), - RAJA::make_tuple(aShared, bShared, cShared), + RAJA::kernel_param( + RAJA::make_tuple(RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N)), + RAJA::make_tuple(aShared, bShared, cShared), // Zero out thread local memory for storing dot products [=] RAJA_HOST_DEVICE (int tn, int tp, Shmem &cShared) { diff --git a/examples/tut_nested-loop-reorder.cpp b/examples/tut_nested-loop-reorder.cpp deleted file mode 100644 index 9bcf4c5a5b..0000000000 --- a/examples/tut_nested-loop-reorder.cpp +++ /dev/null @@ -1,141 +0,0 @@ -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2016-22, Lawrence Livermore National Security, LLC -// and RAJA project contributors. See the RAJA/LICENSE file for details. -// -// SPDX-License-Identifier: (BSD-3-Clause) -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - -#include -#include - -#include "RAJA/RAJA.hpp" - -/* - * Nested Loop Reorder Example - * - * This example shows how to reorder RAJA nested loops by reordering - * nested policy arguments. It does no actual computation and just - * prints out the loop indices to show the different orderings. - * - * RAJA features shown: - * - Index range segment - * - 'RAJA::kernel' loop abstractions and execution policies - * - Nested loop reordering - * - Strongly-typed loop indices - */ - -// -// Define three named loop index types used in the triply-nested loop examples. -// These will trigger compilation errors if lambda index argument ordering -// and types do not match the typed range index ordering. 
-// -// _nestedreorder_idxtypes_start -RAJA_INDEX_VALUE(KIDX, "KIDX"); -RAJA_INDEX_VALUE(JIDX, "JIDX"); -RAJA_INDEX_VALUE(IIDX, "IIDX"); -// _nestedreorder_idxtypes_end - - -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) -{ - - std::cout << "\n\nRAJA nested loop reorder example...\n"; - -// -// Typed index ranges -// -// _nestedreorder_ranges_start - RAJA::TypedRangeSegment KRange(2, 4); - RAJA::TypedRangeSegment JRange(1, 3); - RAJA::TypedRangeSegment IRange(0, 2); -// _nestedreorder_ranges_end - -//----------------------------------------------------------------------------// - - std::cout << "\n Running loop reorder example (K-outer, J-middle, I-inner)" - << "...\n\n" << " (I, J, K)\n" << " ---------\n"; - - // _nestedreorder_kji_start - using KJI_EXECPOL = RAJA::KernelPolicy< - RAJA::statement::For<2, RAJA::seq_exec, // k - RAJA::statement::For<1, RAJA::seq_exec, // j - RAJA::statement::For<0, RAJA::seq_exec,// i - RAJA::statement::Lambda<0> - > - > - > - >; - - RAJA::kernel( RAJA::make_tuple(IRange, JRange, KRange), - [=] (IIDX i, JIDX j, KIDX k) { - printf( " (%d, %d, %d) \n", (int)(*i), (int)(*j), (int)(*k)); - }); - // _nestedreorder_kji_end - - -//----------------------------------------------------------------------------// - - std::cout << "\n Running loop reorder example (J-outer, I-middle, K-inner)" - << "...\n\n" << " (I, J, K)\n" << " ---------\n"; - - // _nestedreorder_jik_start - using JIK_EXECPOL = RAJA::KernelPolicy< - RAJA::statement::For<1, RAJA::seq_exec, // j - RAJA::statement::For<0, RAJA::seq_exec, // i - RAJA::statement::For<2, RAJA::seq_exec,// k - RAJA::statement::Lambda<0> - > - > - > - >; - // _nestedreorder_jik_end - - RAJA::kernel( RAJA::make_tuple(IRange, JRange, KRange), - [=] (IIDX i, JIDX j, KIDX k) { - printf( " (%d, %d, %d) \n", (int)(*i), (int)(*j), (int)(*k)); - }); - - -//----------------------------------------------------------------------------// - - std::cout << "\n Running loop reorder example (I-outer, K-middle, J-inner)" - << "...\n\n" << " (I, J, K)\n" << " ---------\n"; - - // _nestedreorder_ikj_start - using IKJ_EXECPOL = RAJA::KernelPolicy< - RAJA::statement::For<0, RAJA::seq_exec, // i - RAJA::statement::For<2, RAJA::seq_exec, // k - RAJA::statement::For<1, RAJA::seq_exec,// j - RAJA::statement::Lambda<0> - > - > - > - >; - // _nestedreorder_ikj_end - - RAJA::kernel( RAJA::make_tuple(IRange, JRange, KRange), - [=] (IIDX i, JIDX j, KIDX k) { - printf( " (%d, %d, %d) \n", (int)(*i), (int)(*j), (int)(*k)); - }); - - -#if 0 -//----------------------------------------------------------------------------// -// The following demonstrates that code will not compile if lambda argument -// types/order do not match the types/order of the For statements. -//----------------------------------------------------------------------------// - - // _nestedreorder_typemismatch_start - RAJA::kernel( RAJA::make_tuple(IRange, JRange, KRange), - [=] (JIDX i, IIDX j, KIDX k) { - printf( " (%d, %d, %d) \n", (int)(*i), (int)(*j), (int)(*k)); - }); - // _nestedreorder_typemismatch_end - -#endif - - std::cout << "\n DONE!...\n"; - - return 0; -} - diff --git a/examples/tut_vertexsum-coloring.cpp b/examples/tut_vertexsum-coloring.cpp deleted file mode 100644 index 10116b0ab9..0000000000 --- a/examples/tut_vertexsum-coloring.cpp +++ /dev/null @@ -1,446 +0,0 @@ -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2016-22, Lawrence Livermore National Security, LLC -// and RAJA project contributors. 
See the RAJA/LICENSE file for details. -// -// SPDX-License-Identifier: (BSD-3-Clause) -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - -#include -#include -#include -#include -#include - -#include "memoryManager.hpp" - -#include "RAJA/RAJA.hpp" - -#include "camp/resource.hpp" - -/* - * Mesh Vertex Sum with Index Coloring Example - * - * Example computes a sum at each vertex on a logically-Cartesian - * 2D mesh. Each sum includes a contribution from each mesh element - * that share a vertex. In many "staggered mesh" applications, such - * operations are written in a way that prevents parallelization due - * to potential data races -- specifically, multiple loop iterates - * over mesh elements writing to the same shared vertex memory location. - * This example illustrates how RAJA contructs can be used to enable one - * to get some parallelism from such operations without fundamentally - * changing how the algorithm looks in source code. - * - * RAJA features shown: - * - `forall` loop iteration template method - * - Index list segment - * - TypedIndexSet segment container - * - Hierarchical execution policies - * - * If CUDA is enabled, CUDA unified memory is used. - */ - -/* - CUDA_BLOCK_SIZE - specifies the number of threads in a CUDA thread block -*/ -#if defined(RAJA_ENABLE_CUDA) -const int CUDA_BLOCK_SIZE = 256; -#endif - -#if defined(RAJA_ENABLE_HIP) -const int HIP_BLOCK_SIZE = 256; -#endif - -// -// Functions to check and print result. -// -void checkResult(double* vol, double* volref, int n); -void printMeshData(double* v, int n, int joff); - - -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) -{ - - std::cout << "\n\nRAJA mesh vertex sum example...\n"; - -// -// 2D mesh has N^2 "interior" vertices, (N+2)^2 total vertices and -// (N+1)^2 elements (including "ghost" elems) -// - const int N = 1000; - const int N_elem = N + 1; - const int N_vert = N + 2; - double* elemvol = memoryManager::allocate(N_elem*N_elem); - double* vertexvol = memoryManager::allocate(N_vert*N_vert); - double* vertexvol_ref = memoryManager::allocate(N_vert*N_vert); - int* elem2vert_map = memoryManager::allocate(4*N_elem*N_elem); - -// -// Some basic mesh parameters (offsets, mesh spacing factor 'h'), -// set up elem to vertex mapping array. -// - int jeoff = N_elem; - - int jvoff = N_vert; - - double h = 0.1; - - for (int j = 0 ; j < N_elem ; ++j) { - for (int i = 0 ; i < N_elem ; ++i) { - int ielem = i + j*jeoff ; - int imap = 4 * ielem ; - elem2vert_map[imap] = ielem + j; - elem2vert_map[imap+1] = ielem + j + 1; - elem2vert_map[imap+2] = ielem + j + jvoff; - elem2vert_map[imap+3] = ielem + j + 1 + jvoff; - } - } - -// -// Initialize hexahedral element volumes so every element volume -// depends on its i,j coordinates. 
-// - std::memset(elemvol, 0, N_elem*N_elem * sizeof(double)); - - for (int j = 0 ; j < N_elem ; ++j) { - for (int i = 0 ; i < N_elem ; ++i) { - int ielem = i + j*jeoff ; - elemvol[ielem] = h*(i+1) * h*(j+1); - } - } - -//std::cout << "\n Element volumes...\n"; -//printMeshData(elemvol, N_elem, jeoff); - -//----------------------------------------------------------------------------// - - std::cout << "\n Running C-version of vertex sum...\n"; - - std::memset(vertexvol_ref, 0, N_vert*N_vert * sizeof(double)); - - // _cstyle_vertexsum_start - for (int j = 0 ; j < N_elem ; ++j) { - for (int i = 0 ; i < N_elem ; ++i) { - int ie = i + j*jeoff ; - int* iv = &(elem2vert_map[4*ie]); - vertexvol_ref[ iv[0] ] += elemvol[ie] / 4.0 ; - vertexvol_ref[ iv[1] ] += elemvol[ie] / 4.0 ; - vertexvol_ref[ iv[2] ] += elemvol[ie] / 4.0 ; - vertexvol_ref[ iv[3] ] += elemvol[ie] / 4.0 ; - } - } - // _cstyle_vertexsum_end - -//std::cout << "\n Vertex volumes (reference)...\n"; -//printMeshData(vertexvol_ref, N_vert, jvoff); - - -//----------------------------------------------------------------------------// - - std::cout << "\n Running RAJA nested sequential version...\n"; - - std::memset(vertexvol, 0, N_vert*N_vert * sizeof(double)); - - // _raja_seq_vertexsum_start - using EXEC_POL1 = - RAJA::KernelPolicy< - RAJA::statement::For<1, RAJA::seq_exec, // j - RAJA::statement::For<0, RAJA::seq_exec, // i - RAJA::statement::Lambda<0> - > - > - >; - - RAJA::kernel( RAJA::make_tuple(RAJA::RangeSegment(0, N_elem), - RAJA::RangeSegment(0, N_elem)), - [=](int i, int j) { - int ie = i + j*jeoff ; - int* iv = &(elem2vert_map[4*ie]); - vertexvol[ iv[0] ] += elemvol[ie] / 4.0 ; - vertexvol[ iv[1] ] += elemvol[ie] / 4.0 ; - vertexvol[ iv[2] ] += elemvol[ie] / 4.0 ; - vertexvol[ iv[3] ] += elemvol[ie] / 4.0 ; - }); - // _raja_seq_vertexsum_end - - checkResult(vertexvol, vertexvol_ref, N_vert); -//std::cout << "\n Vertex volumes...\n"; -//printMeshData(vertexvol, N_vert, jvoff); - -//----------------------------------------------------------------------------// - -// -// Note that the C-style and RAJA versions of the vertex sum calculation -// above cannot safely execute in parallel due to potential data races; -// i.e., multiple loop iterates over mesh elements writing to the same -// shared vertex memory location. -// -// In the following, we partition the element iteration space into four -// subsets (or "colors") indicated by numbers in the figure below. -// -// ----------------- -// | 2 | 3 | 2 | 3 | -// ----------------- -// | 0 | 1 | 0 | 1 | -// ----------------- -// | 2 | 3 | 2 | 3 | -// ----------------- -// | 0 | 1 | 0 | 1 | -// ----------------- -// -// Since none of the elements with the same number share a common vertex, -// we can iterate over each subset ("color") in parallel. -// -// We use RAJA ListSegments and a RAJA TypedIndexSet to define the element -// partitioning. -// - -// -// First, gather the element indices for each color in a vector. 
-// - // _colorvectors_vertexsum_start - std::vector idx0; - std::vector idx1; - std::vector idx2; - std::vector idx3; - - for (int j = 0 ; j < N_elem ; ++j) { - for (int i = 0 ; i < N_elem ; ++i) { - int ie = i + j*jeoff ; - if ( i % 2 == 0 ) { - if ( j % 2 == 0 ) { - idx0.push_back(ie); - } else { - idx2.push_back(ie); - } - } else { - if ( j % 2 == 0 ) { - idx1.push_back(ie); - } else { - idx3.push_back(ie); - } - } - } - } - // _colorvectors_vertexsum_end - -// -// Second, create a RAJA TypedIndexSet with four ListSegments -// -// The TypedIndexSet is a variadic template, where the template arguments -// are the segment types that the TypedIndexSet can hold. -// - // _colorindexset_vertexsum_start - using SegmentType = RAJA::TypedListSegment; - - RAJA::TypedIndexSet colorset; - - camp::resources::Resource host_res{camp::resources::Host()}; - - colorset.push_back( SegmentType(&idx0[0], idx0.size(), host_res) ); - colorset.push_back( SegmentType(&idx1[0], idx1.size(), host_res) ); - colorset.push_back( SegmentType(&idx2[0], idx2.size(), host_res) ); - colorset.push_back( SegmentType(&idx3[0], idx3.size(), host_res) ); - // _colorindexset_vertexsum_end - -//----------------------------------------------------------------------------// - -// -// RAJA vertex volume calculation - sequential TypedIndexSet version -// (sequential iteration over segments, -// sequential iteration of each segment) -// -// NOTE: we do not need i,j indices for this version since the element -// indices are contained in the list segments -// - std::cout << "\n Running RAJA sequential index set version...\n"; - - std::memset(vertexvol, 0, N_vert*N_vert * sizeof(double)); - - // _raja_seq_colorindexset_vertexsum_start - using EXEC_POL2 = RAJA::ExecPolicy; - - RAJA::forall(colorset, [=](int ie) { - int* iv = &(elem2vert_map[4*ie]); - vertexvol[ iv[0] ] += elemvol[ie] / 4.0 ; - vertexvol[ iv[1] ] += elemvol[ie] / 4.0 ; - vertexvol[ iv[2] ] += elemvol[ie] / 4.0 ; - vertexvol[ iv[3] ] += elemvol[ie] / 4.0 ; - }); - // _raja_seq_colorindexset_vertexsum_end - - checkResult(vertexvol, vertexvol_ref, N_vert); -//std::cout << "\n Vertex volumes...\n"; -//printMeshData(vertexvol, N_vert, jvoff); - -//----------------------------------------------------------------------------// - -#if defined(RAJA_ENABLE_OPENMP) -// -// RAJA vertex volume calculation - OpenMP TypedIndexSet version -// (sequential iteration over segments, -// OpenMP parallel iteration of each segment) -// - std::cout << "\n Running RAJA OpenMP index set version...\n"; - - std::memset(vertexvol, 0, N_vert*N_vert * sizeof(double)); - - using EXEC_POL3 = RAJA::ExecPolicy; - - RAJA::forall(colorset, [=](int ie) { - int* iv = &(elem2vert_map[4*ie]); - vertexvol[ iv[0] ] += elemvol[ie] / 4.0 ; - vertexvol[ iv[1] ] += elemvol[ie] / 4.0 ; - vertexvol[ iv[2] ] += elemvol[ie] / 4.0 ; - vertexvol[ iv[3] ] += elemvol[ie] / 4.0 ; - }); - - checkResult(vertexvol, vertexvol_ref, N_vert); -//std::cout << "\n Vertex volumes...\n"; -//printMeshData(vertexvol, N_vert, jvoff); -#endif - -//----------------------------------------------------------------------------// - -#if defined(RAJA_ENABLE_CUDA) -// -// RAJA vertex volume calculation - CUDA TypedIndexSet version -// (sequential iteration over segments, -// CUDA parallel execution of each segment) -// - std::cout << "\n Running RAJA CUDA index set version...\n"; - -// -// We create a RAJA TypedIndexSet with four ListSegments as before, -// but now we use a CUDA resource so the segment indices live in -// device memory. 
-// - RAJA::TypedIndexSet colorset_cuda; - - camp::resources::Resource cuda_res{camp::resources::Cuda()}; - - colorset_cuda.push_back( SegmentType(&idx0[0], idx0.size(), cuda_res) ); - colorset_cuda.push_back( SegmentType(&idx1[0], idx1.size(), cuda_res) ); - colorset_cuda.push_back( SegmentType(&idx2[0], idx2.size(), cuda_res) ); - colorset_cuda.push_back( SegmentType(&idx3[0], idx3.size(), cuda_res) ); - - std::memset(vertexvol, 0, N_vert*N_vert * sizeof(double)); - - // _raja_cuda_colorindexset_vertexsum_start - using EXEC_POL4 = RAJA::ExecPolicy>; - - RAJA::forall(colorset_cuda, [=] RAJA_DEVICE (int ie) { - int* iv = &(elem2vert_map[4*ie]); - vertexvol[ iv[0] ] += elemvol[ie] / 4.0 ; - vertexvol[ iv[1] ] += elemvol[ie] / 4.0 ; - vertexvol[ iv[2] ] += elemvol[ie] / 4.0 ; - vertexvol[ iv[3] ] += elemvol[ie] / 4.0 ; - }); - // _raja_cuda_colorindexset_vertexsum_end - - checkResult(vertexvol, vertexvol_ref, N_vert); -//std::cout << "\n Vertex volumes...\n"; -//printMeshData(vertexvol, N_vert, jvoff); -#endif - -//----------------------------------------------------------------------------// - -#if defined(RAJA_ENABLE_HIP) -// -// RAJA vertex volume calculation - HIP IndexSet version -// (sequential iteration over segments, -// HIP parallel execution of each segment) -// - double* d_elemvol = memoryManager::allocate_gpu(N_elem*N_elem); - double* d_vertexvol = memoryManager::allocate_gpu(N_vert*N_vert); - int* d_elem2vert_map = memoryManager::allocate_gpu(4*N_elem*N_elem); - - hipMemcpy(d_elemvol, elemvol, N_elem*N_elem*sizeof(double), hipMemcpyHostToDevice); - hipMemcpy(d_elem2vert_map, elem2vert_map, 4*N_elem*N_elem*sizeof(int), hipMemcpyHostToDevice); - - std::cout << "\n Running RAJA HIP index set version...\n"; - - std::memset(vertexvol, 0, N_vert*N_vert * sizeof(double)); - hipMemcpy(d_vertexvol, vertexvol, N_vert*N_vert*sizeof(double), hipMemcpyHostToDevice); - -// -// We create a RAJA TypedIndexSet with four ListSegments as before, -// but now we use a Hip resource so the segment indices live in -// device memory. -// - RAJA::TypedIndexSet colorset_hip; - - camp::resources::Resource hip_res{camp::resources::Hip()}; - - colorset_hip.push_back( SegmentType(&idx0[0], idx0.size(), hip_res) ); - colorset_hip.push_back( SegmentType(&idx1[0], idx1.size(), hip_res) ); - colorset_hip.push_back( SegmentType(&idx2[0], idx2.size(), hip_res) ); - colorset_hip.push_back( SegmentType(&idx3[0], idx3.size(), hip_res) ); - - using EXEC_POL4 = RAJA::ExecPolicy>; - - RAJA::forall(colorset_hip, [=] RAJA_DEVICE (int ie) { - int* iv = &(d_elem2vert_map[4*ie]); - d_vertexvol[ iv[0] ] += d_elemvol[ie] / 4.0 ; - d_vertexvol[ iv[1] ] += d_elemvol[ie] / 4.0 ; - d_vertexvol[ iv[2] ] += d_elemvol[ie] / 4.0 ; - d_vertexvol[ iv[3] ] += d_elemvol[ie] / 4.0 ; - }); - - hipMemcpy(vertexvol, d_vertexvol, N_vert*N_vert*sizeof(double), hipMemcpyDeviceToHost); - checkResult(vertexvol, vertexvol_ref, N_vert); - - memoryManager::deallocate_gpu(d_elemvol); - memoryManager::deallocate_gpu(d_vertexvol); - memoryManager::deallocate_gpu(d_elem2vert_map); -//std::cout << "\n Vertex volumes...\n"; -//printMeshData(vertexvol, N_vert, jvoff); -#endif - -//----------------------------------------------------------------------------// - - // Clean up... 
- memoryManager::deallocate(elemvol); - memoryManager::deallocate(vertexvol); - memoryManager::deallocate(vertexvol_ref); - memoryManager::deallocate(elem2vert_map); - - std::cout << "\n DONE!...\n"; - - return 0; -} - -// -// Function to compare result to reference and print result P/F. -// -void checkResult(double* vol, double* volref, int n) -{ - bool match = true; - for (int i = 0; i < n*n; i++) { - if ( std::abs(vol[i] - volref[i]) > 10e-12 ) { match = false; } - } - if ( match ) { - std::cout << "\n\t result -- PASS\n"; - } else { - std::cout << "\n\t result -- FAIL\n"; - } -} - -// -// Function to print mesh data with mesh indices. -// -void printMeshData(double* v, int n, int joff) -{ - std::cout << std::endl; - for (int j = 0 ; j < n ; ++j) { - for (int i = 0 ; i < n ; ++i) { - int ii = i + j*joff ; - std::cout << "v(" << i << "," << j << ") = " - << v[ii] << std::endl; - } - } - std::cout << std::endl; -} diff --git a/exercises/CMakeLists.txt b/exercises/CMakeLists.txt index e6251e0fe3..7289dd9001 100644 --- a/exercises/CMakeLists.txt +++ b/exercises/CMakeLists.txt @@ -5,4 +5,144 @@ # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################### +raja_add_executable( + NAME atomic-histogram + SOURCES atomic-histogram.cpp) +raja_add_executable( + NAME atomic-histogram_solution + SOURCES atomic-histogram_solution.cpp) + +raja_add_executable( + NAME dot-product + SOURCES dot-product.cpp) +raja_add_executable( + NAME dot-product_solution + SOURCES dot-product_solution.cpp) + +raja_add_executable( + NAME kernelintro-execpols + SOURCES kernelintro-execpols.cpp) +raja_add_executable( + NAME kernelintro-execpols_solution + SOURCES kernelintro-execpols_solution.cpp) + +raja_add_executable( + NAME launchintro-execpols + SOURCES launchintro-execpols.cpp) +raja_add_executable( + NAME launchintro-execpols_solution + SOURCES launchintro-execpols_solution.cpp) + +raja_add_executable( + NAME kernelintro-nested-loop-reorder + SOURCES kernelintro-nested-loop-reorder.cpp) +raja_add_executable( + NAME kernelintro-nested-loop-reorder_solution + SOURCES kernelintro-nested-loop-reorder_solution.cpp) + +raja_add_executable( + NAME kernel-matrix-transpose + SOURCES kernel-matrix-transpose.cpp) +raja_add_executable( + NAME kernel-matrix-transpose_solution + SOURCES kernel-matrix-transpose_solution.cpp) + +raja_add_executable( + NAME launch-matrix-transpose + SOURCES launch-matrix-transpose.cpp) +raja_add_executable( + NAME launch-matrix-transpose_solution + SOURCES launch-matrix-transpose_solution.cpp) + +raja_add_executable( + NAME kernel-matrix-transpose-tiled + SOURCES kernel-matrix-transpose-tiled.cpp) +raja_add_executable( + NAME kernel-matrix-transpose-tiled_solution + SOURCES kernel-matrix-transpose-tiled_solution.cpp) + +raja_add_executable( + NAME launch-matrix-transpose-tiled + SOURCES launch-matrix-transpose-tiled.cpp) +raja_add_executable( + NAME launch-matrix-transpose-tiled_solution + SOURCES launch-matrix-transpose-tiled_solution.cpp) + +raja_add_executable( + NAME kernel-matrix-transpose-local-array + SOURCES kernel-matrix-transpose-local-array.cpp) +raja_add_executable( + NAME kernel-matrix-transpose-local-array_solution + SOURCES kernel-matrix-transpose-local-array_solution.cpp) + +raja_add_executable( + NAME launch-matrix-transpose-local-array + SOURCES launch-matrix-transpose-local-array.cpp) +raja_add_executable( + NAME launch-matrix-transpose-local-array_solution + SOURCES launch-matrix-transpose-local-array_solution.cpp) + 
+raja_add_executable( + NAME offset-layout-stencil + SOURCES offset-layout-stencil.cpp) +raja_add_executable( + NAME offset-layout-stencil_solution + SOURCES offset-layout-stencil_solution.cpp) + +raja_add_executable( + NAME permuted-layout-batch-matrix-multiply + SOURCES permuted-layout-batch-matrix-multiply.cpp) +raja_add_executable( + NAME permuted-layout-batch-matrix-multiply_solution + SOURCES permuted-layout-batch-matrix-multiply_solution.cpp) + +raja_add_executable( + NAME reductions + SOURCES reductions.cpp) +raja_add_executable( + NAME reductions_solution + SOURCES reductions_solution.cpp) + +raja_add_executable( + NAME scan + SOURCES scan.cpp) +raja_add_executable( + NAME scan_solution + SOURCES scan_solution.cpp) + +raja_add_executable( + NAME segment-indexset-basics + SOURCES segment-indexset-basics.cpp) +raja_add_executable( + NAME segment-indexset-basics_solution + SOURCES segment-indexset-basics_solution.cpp) + +raja_add_executable( + NAME sort + SOURCES sort.cpp) +raja_add_executable( + NAME sort_solution + SOURCES sort_solution.cpp) + +raja_add_executable( + NAME vector-addition + SOURCES vector-addition.cpp) +raja_add_executable( + NAME vector-addition_solution + SOURCES vector-addition_solution.cpp) + +raja_add_executable( + NAME vertexsum-indexset + SOURCES vertexsum-indexset.cpp) +raja_add_executable( + NAME vertexsum-indexset_solution + SOURCES vertexsum-indexset_solution.cpp) + +raja_add_executable( + NAME view-layout + SOURCES view-layout.cpp) +raja_add_executable( + NAME view-layout_solution + SOURCES view-layout_solution.cpp) + add_subdirectory(tutorial_halfday) diff --git a/exercises/Dockerfile b/exercises/Dockerfile new file mode 100644 index 0000000000..1a72413e41 --- /dev/null +++ b/exercises/Dockerfile @@ -0,0 +1,36 @@ +############################################################################### +# Copyright (c) 2016-22, Lawrence Livermore National Security, LLC +# and RAJA project contributors. See the RAJA/LICENSE file for details. +# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################### + +FROM ghcr.io/rse-ops/cuda-ubuntu-20.04:cuda-11.1.1 AS nvcc11 + +ARG USER=AWSUSER +ENV HOME /home/${USER} + +RUN apt-get update && apt-get install -y supervisor + +RUN useradd --create-home --shell /bin/bash ${USER} +USER ${USER} + +WORKDIR $HOME +RUN git clone --recursive -b task/tut-reorg-aws https://github.com/llnl/raja + +WORKDIR $HOME/raja/build +RUN . /opt/spack/share/spack/setup-env.sh && spack load cuda && \ + cmake -DCMAKE_CXX_COMPILER=g++ -DENABLE_CUDA=On -DCMAKE_CUDA_STANDARD=14 -DCMAKE_CUDA_ARCHITECTURES=70 -DENABLE_OPENMP=On .. 
+ +WORKDIR /opt/archives +RUN curl -L https://github.com/gitpod-io/openvscode-server/releases/download/openvscode-server-v1.69.1/openvscode-server-v1.69.1-linux-x64.tar.gz > \ + /opt/archives/openvscode-server-v1.69.1-linux-x64.tar.gz +RUN tar xzf openvscode-server-v1.69.1-linux-x64.tar.gz && chown -R ${USER}:${USER} openvscode-server-v1.69.1-linux-x64 + +USER root +ADD supervisord.conf /etc/supervisord.conf +RUN sed -i "s/XXX/${USER}/g" /etc/supervisord.conf + +RUN touch /var/log/openvscode-server.log && chown -R ${USER}:${USER} /var/log/openvscode-server.log + +CMD ["/usr/bin/supervisord"] diff --git a/exercises/tutorial_halfday/ex4_atomic-histogram.cpp b/exercises/atomic-histogram.cpp similarity index 73% rename from exercises/tutorial_halfday/ex4_atomic-histogram.cpp rename to exercises/atomic-histogram.cpp index 8ad89a45e3..dac9f9bcdd 100644 --- a/exercises/tutorial_halfday/ex4_atomic-histogram.cpp +++ b/exercises/atomic-histogram.cpp @@ -15,7 +15,7 @@ #include "memoryManager.hpp" /* - * EXERCISE #4: Atomic histogram + * Atomic histogram exercise * * In this exercise, you will use use RAJA atomic operations to compute * an array which represents a histogram of values in another array. @@ -38,14 +38,15 @@ */ /* - CUDA_BLOCK_SIZE - specifies the number of threads in a CUDA thread block - - Uncomment to use when filling in exercises. - + Specifies the number of threads in a GPU thread block +*/ #if defined(RAJA_ENABLE_CUDA) -const int CUDA_BLOCK_SIZE = 256; +//const int CUDA_BLOCK_SIZE = 256; +#endif + +#if defined(RAJA_ENABLE_HIP) +//const int HIP_BLOCK_SIZE = 256; #endif -*/ // // Functions to check and print result. @@ -57,32 +58,35 @@ void printArray(int* v, int len); int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { - std::cout << "\n\nExercise #4: Atomic histogram...\n"; + std::cout << "\n\nExercise: Atomic histogram...\n"; // // Define array bounds and initialize array to compute histogram of values // on. // - int M = 20; - int N = 100000; + + // _array_atomic_histogram_start + constexpr int M = 20; + constexpr int N = 100000; int* array = memoryManager::allocate(N); int* hist = memoryManager::allocate(M); - int* hist_ref = memoryManager::allocate(M); for (int i = 0; i < N; ++i) { array[i] = rand() % M; } + // _array_atomic_histogram_end + int* hist_ref = memoryManager::allocate(M); //----------------------------------------------------------------------------// // C-style sequential variant establishes reference solution to compare with. //----------------------------------------------------------------------------// - std::memset(hist_ref, 0, M * sizeof(int)); - std::cout << "\n\n Running C-style sequential historgram...\n"; + std::memset(hist_ref, 0, M * sizeof(int)); + for (int i = 0; i < N; ++i) { hist_ref[ array[i] ]++; } @@ -120,6 +124,10 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::memset(hist, 0, M * sizeof(int)); + // _range_atomic_histogram_start + //RAJA::TypedRangeSegment array_range(0,N); + // _range_atomic_histogram_end + /// /// TODO... /// @@ -127,7 +135,11 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) /// method with RAJA::seq_exec execution policy type and a /// RAJA::atomicAdd operation with RAJA::seq_atomic policy. /// - + /// You will need to uncomment the range segment definition + /// above to use it in the kernel. 
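A minimal sketch of the sequential variant this TODO asks for, assuming the array_range segment above has been uncommented (it follows the same pattern as the solution file later in this diff; hist and array are the exercise's own arrays):

  RAJA::TypedRangeSegment<int> array_range(0, N);

  // Sequential execution policy paired with a sequential atomic policy.
  RAJA::forall<RAJA::seq_exec>(array_range, [=](int i) {
    // Each iteration increments the histogram bin selected by array[i].
    RAJA::atomicAdd<RAJA::seq_atomic>(&hist[array[i]], 1);
  });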
+ /// + //RAJA::forall(array_range, [=](int i) { + //}); checkResult(hist, hist_ref, M); //printArray(hist, M); @@ -151,7 +163,6 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) /// and a RAJA::atomicAdd operation with RAJA::omp_atomic policy. /// - checkResult(hist, hist_ref, M); //printArray(hist, M); @@ -177,7 +188,6 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) /// and a RAJA::atomicAdd operation with RAJA::auto_atomic policy. /// - checkResult(hist, hist_ref, M); //printArray(hist, M); @@ -201,7 +211,9 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) /// method with RAJA::cuda_exec execution policy type /// and a RAJA::atomicAdd operation with RAJA::cuda_atomic policy. /// - + /// NOTE: You will need to uncomment 'CUDA_BLOCK_SIZE' near the + /// top of the file if you want to use it here. + /// checkResult(hist, hist_ref, M); //printArray(hist, M); @@ -227,6 +239,63 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) /// method with RAJA::cuda_exec execution policy type /// and a RAJA::atomicAdd operation with RAJA::auto_atomic policy. /// + /// NOTE: You will need to uncomment 'CUDA_BLOCK_SIZE' near the + /// top of the file if you want to use it here. + /// + + checkResult(hist, hist_ref, M); +//printArray(hist, M); + +#endif + +//----------------------------------------------------------------------------// +// RAJA hip_atomic policy is used with the RAJA HIP execution policy. +//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_HIP) + + std::cout << "\n Running RAJA HIP atomic histogram...\n"; + + std::memset(hist, 0, M * sizeof(int)); + + /// + /// TODO... + /// + /// EXERCISE: Implement the atomic histogram kernel using a RAJA::forall + /// method with RAJA::hip_exec execution policy type + /// and a RAJA::atomicAdd operation with RAJA::hip_atomic policy. + /// + /// NOTE: You will need to uncomment 'HIP_BLOCK_SIZE' near the + /// top of the file if you want to use it here. + /// + + checkResult(hist, hist_ref, M); +//printArray(hist, M); + +#endif + + +//----------------------------------------------------------------------------// +// RAJA auto_atomic policy can also be used with the RAJA HIP +// execution policy. +//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_HIP) + + std::cout << "\n Running RAJA HIP histogram with auto atomic policy...\n"; + + std::memset(hist, 0, M * sizeof(int)); + + /// + /// TODO... + /// + /// EXERCISE: Implement the atomic histogram kernel using a RAJA::forall + /// method with RAJA::hip_exec execution policy type + /// and a RAJA::atomicAdd operation with RAJA::auto_atomic policy. + /// + /// NOTE: You will need to uncomment 'HIP_BLOCK_SIZE' near the + /// top of the file if you want to use it here. 
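For the GPU variants requested above, a hedged sketch along the same lines as the solution file might look like the following; CUDA_BLOCK_SIZE and HIP_BLOCK_SIZE are the constants the exercise asks you to uncomment near the top of the file:

  #if defined(RAJA_ENABLE_CUDA)
  // CUDA execution policy paired with a CUDA atomic policy; the lambda must be
  // decorated with RAJA_DEVICE so it can run on the device.
  RAJA::forall<RAJA::cuda_exec<CUDA_BLOCK_SIZE>>(array_range, [=] RAJA_DEVICE (int i) {
    RAJA::atomicAdd<RAJA::cuda_atomic>(&hist[array[i]], 1);
  });
  #endif

  #if defined(RAJA_ENABLE_HIP)
  // HIP variant: same pattern with hip_exec; auto_atomic resolves to the
  // appropriate backend atomic inside the device lambda.
  RAJA::forall<RAJA::hip_exec<HIP_BLOCK_SIZE>>(array_range, [=] RAJA_DEVICE (int i) {
    RAJA::atomicAdd<RAJA::auto_atomic>(&hist[array[i]], 1);
  });
  #endif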
+ /// checkResult(hist, hist_ref, M); //printArray(hist, M); diff --git a/exercises/tutorial_halfday/ex4_atomic-histogram_solution.cpp b/exercises/atomic-histogram_solution.cpp similarity index 68% rename from exercises/tutorial_halfday/ex4_atomic-histogram_solution.cpp rename to exercises/atomic-histogram_solution.cpp index 7f937e48d5..924721385f 100644 --- a/exercises/tutorial_halfday/ex4_atomic-histogram_solution.cpp +++ b/exercises/atomic-histogram_solution.cpp @@ -15,7 +15,7 @@ #include "memoryManager.hpp" /* - * EXERCISE #4: Atomic histogram + * Atomic histogram exercise * * In this exercise, you will use use RAJA atomic operations to compute * an array which represents a histogram of values in another array. @@ -38,12 +38,16 @@ */ /* - CUDA_BLOCK_SIZE - specifies the number of threads in a CUDA thread block + Specifies the number of threads in a GPU thread block */ #if defined(RAJA_ENABLE_CUDA) const int CUDA_BLOCK_SIZE = 256; #endif +#if defined(RAJA_ENABLE_HIP) +const int HIP_BLOCK_SIZE = 256; +#endif + // // Functions to check and print result. // @@ -54,23 +58,26 @@ void printArray(int* v, int len); int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { - std::cout << "\n\nExercise #4: Atomic histogram...\n"; + std::cout << "\n\nExercise: Atomic histogram...\n"; // // Define array bounds and initialize array to compute histogram of values // on. // - int M = 20; - int N = 100000; + + // _array_atomic_histogram_start + constexpr int M = 20; + constexpr int N = 100000; int* array = memoryManager::allocate(N); int* hist = memoryManager::allocate(M); - int* hist_ref = memoryManager::allocate(M); for (int i = 0; i < N; ++i) { array[i] = rand() % M; } + // _array_atomic_histogram_end + int* hist_ref = memoryManager::allocate(M); //----------------------------------------------------------------------------// // C-style sequential variant establishes reference solution to compare with. @@ -93,10 +100,10 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) #if defined(RAJA_ENABLE_OPENMP) - std::memset(hist, 0, M * sizeof(int)); - std::cout << "\n\n Running C-style OpenMP historgram...\n"; + std::memset(hist, 0, M * sizeof(int)); + #pragma omp parallel for for (int i = 0; i < N; ++i) { #pragma omp atomic @@ -113,15 +120,18 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) // RAJA::seq_exec policy enforces strictly sequential execution. 
//----------------------------------------------------------------------------// + std::cout << "\n Running RAJA sequential atomic histogram...\n"; + std::memset(hist, 0, M * sizeof(int)); - using EXEC_POL1 = RAJA::seq_exec; - using ATOMIC_POL1 = RAJA::seq_atomic; + // _range_atomic_histogram_start + RAJA::TypedRangeSegment array_range(0,N); + // _range_atomic_histogram_end - std::cout << "\n Running RAJA sequential atomic histogram...\n"; + RAJA::forall(array_range, [=](int i) { + + RAJA::atomicAdd(&hist[array[i]], 1); - RAJA::forall(RAJA::RangeSegment(0, N), [=](int i) { - RAJA::atomicAdd(&hist[array[i]], 1); }); checkResult(hist, hist_ref, M); @@ -134,16 +144,17 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) #if defined(RAJA_ENABLE_OPENMP) + std::cout << "\n Running RAJA OpenMP atomic histogram...\n"; + std::memset(hist, 0, M * sizeof(int)); - using EXEC_POL2 = RAJA::omp_parallel_for_exec; - using ATOMIC_POL2 = RAJA::omp_atomic; + // _rajaomp_atomic_histogram_start + RAJA::forall(array_range, [=](int i) { - std::cout << "\n Running RAJA OpenMP atomic histogram...\n"; + RAJA::atomicAdd(&hist[array[i]], 1); - RAJA::forall(RAJA::RangeSegment(0, N), [=](int i) { - RAJA::atomicAdd(&hist[array[i]], 1); }); + // _rajaomp_atomic_histogram_end checkResult(hist, hist_ref, M); //printArray(hist, M); @@ -158,15 +169,14 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) #if defined(RAJA_ENABLE_OPENMP) + std::cout << "\n Running RAJA OpenMP histogram with auto atomic policy...\n"; + std::memset(hist, 0, M * sizeof(int)); - using EXEC_POL3 = RAJA::omp_parallel_for_exec; - using ATOMIC_POL3 = RAJA::auto_atomic; + RAJA::forall(array_range, [=](int i) { + + RAJA::atomicAdd(&hist[array[i]], 1); - std::cout << "\n Running RAJA OpenMP histogram with auto atomic policy...\n"; - - RAJA::forall(RAJA::RangeSegment(0, N), [=](int i) { - RAJA::atomicAdd(&hist[array[i]], 1); }); checkResult(hist, hist_ref, M); @@ -181,16 +191,17 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) #if defined(RAJA_ENABLE_CUDA) + std::cout << "\n Running RAJA CUDA atomic histogram...\n"; + std::memset(hist, 0, M * sizeof(int)); - std::cout << "\n Running RAJA CUDA atomic histogram...\n"; + // _rajacuda_atomic_histogram_start + RAJA::forall>(array_range, [=] RAJA_DEVICE (int i) { - using EXEC_POL4 = RAJA::cuda_exec; - using ATOMIC_POL4 = RAJA::cuda_atomic; + RAJA::atomicAdd(&hist[array[i]], 1); - RAJA::forall(RAJA::RangeSegment(0, N), [=] RAJA_DEVICE (int i) { - RAJA::atomicAdd(&hist[array[i]], 1); }); + // _rajacuda_atomic_histogram_end checkResult(hist, hist_ref, M); //printArray(hist, M); @@ -205,16 +216,65 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) #if defined(RAJA_ENABLE_CUDA) + std::cout << "\n Running RAJA CUDA histogram with auto atomic policy...\n"; + std::memset(hist, 0, M * sizeof(int)); - using EXEC_POL5 = RAJA::cuda_exec; - using ATOMIC_POL5 = RAJA::auto_atomic; + // _rajacuda_atomicauto_histogram_start + RAJA::forall>(array_range, [=] RAJA_DEVICE (int i) { - std::cout << "\n Running RAJA CUDA histogram with auto atomic policy...\n"; + RAJA::atomicAdd(&hist[array[i]], 1); + + }); + // _rajacuda_atomicauto_histogram_end + + checkResult(hist, hist_ref, M); +//printArray(hist, M); + +#endif + +//----------------------------------------------------------------------------// +// RAJA hip_atomic policy is used with the RAJA HIP execution policy. 
+//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_HIP) + + std::cout << "\n Running RAJA HIP atomic histogram...\n"; + + std::memset(hist, 0, M * sizeof(int)); + + // _rajahip_atomic_histogram_start + RAJA::forall>(array_range, [=] RAJA_DEVICE (int i) { + + RAJA::atomicAdd(&hist[array[i]], 1); + + }); + // _rajahip_atomic_histogram_end + + checkResult(hist, hist_ref, M); +//printArray(hist, M); + +#endif + + +//----------------------------------------------------------------------------// +// RAJA auto_atomic policy can also be used with the RAJA HIP +// execution policy. +//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_HIP) + + std::cout << "\n Running RAJA HIP histogram with auto atomic policy...\n"; - RAJA::forall(RAJA::RangeSegment(0, N), [=] RAJA_DEVICE (int i) { - RAJA::atomicAdd(&hist[array[i]], 1); + std::memset(hist, 0, M * sizeof(int)); + + // _rajahip_atomicauto_histogram_start + RAJA::forall>(array_range, [=] RAJA_DEVICE (int i) { + + RAJA::atomicAdd(&hist[array[i]], 1); + }); + // _rajahip_atomicauto_histogram_end checkResult(hist, hist_ref, M); //printArray(hist, M); diff --git a/exercises/dot-product.cpp b/exercises/dot-product.cpp new file mode 100644 index 0000000000..67ec877f89 --- /dev/null +++ b/exercises/dot-product.cpp @@ -0,0 +1,236 @@ +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include +#include +#include + +#include "memoryManager.hpp" + +#include "RAJA/RAJA.hpp" + +/* + * Vector Dot Product Exercise + * + * Computes dot = (a,b), where a, b are vectors of + * doubles and dot is a scalar double. It illustrates how RAJA + * supports a portable parallel reduction opertion in a way that + * the code looks like it does in a sequential implementation. + * + * RAJA features shown: + * - `forall` loop iteration template method + * - Index range segment + * - Execution policies + * - Reduction types + * + * If CUDA is enabled, CUDA unified memory is used. + */ + +// +// Function to check dot product result. +// +void checkResult(double compdot, double refdot); + +int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +{ + + std::cout << "\n\nExercise: vector dot product...\n"; + +#if defined(RAJA_ENABLE_SYCL) + memoryManager::sycl_res = new camp::resources::Resource{camp::resources::Sycl()}; + ::RAJA::sycl::detail::setQueue(memoryManager::sycl_res); +#endif + +// +// Define vector length +// + constexpr int N = 1000000; + +// +// Allocate and initialize vector data +// + double *a = memoryManager::allocate(N); + double *b = memoryManager::allocate(N); + + for (int i = 0; i < N; ++i) { + a[i] = 1.0; + b[i] = 1.0; + } + +//----------------------------------------------------------------------------// + +// +// C-style dot product operation. +// + std::cout << "\n Running C-version of dot product...\n"; + + // _csytle_dotprod_start + double dot = 0.0; + + for (int i = 0; i < N; ++i) { + dot += a[i] * b[i]; + } + + std::cout << "\t (a, b) = " << dot << std::endl; + // _csytle_dotprod_end + + double dot_ref = dot; + +//----------------------------------------------------------------------------// + + std::cout << "\n Running RAJA sequential dot product...\n"; + + dot = 0.0; + + /// + /// TODO... 
+ /// + /// EXERCISE: Implement the dot product kernel using a RAJA::seq_exec + /// execution policy type and RAJA::seq_reduce. + /// + /// NOTE: We've done this one for you to help you get started... + /// + + RAJA::ReduceSum seqdot(0.0); + + RAJA::forall(RAJA::TypedRangeSegment(0, N), [=] (int i) { + seqdot += a[i] * b[i]; + }); + + dot = seqdot.get(); + + std::cout << "\t (a, b) = " << dot << std::endl; + + checkResult(dot, dot_ref); + + +//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_OPENMP) + std::cout << "\n Running RAJA OpenMP dot product...\n"; + + dot = 0.0; + + /// + /// TODO... + /// + /// EXERCISE: Implement the dot product kernel using a RAJA::omp_parallel_for_exec + /// execution policy type and RAJA::omp_reduce reduction policy type. + /// + + std::cout << "\t (a, b) = " << dot << std::endl; + + checkResult(dot, dot_ref); +#endif + + +//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_CUDA) + +//const int CUDA_BLOCK_SIZE = 256; + + std::cout << "\n Running RAJA CUDA dot product...\n"; + + dot = 0.0; + + /// + /// TODO... + /// + /// EXERCISE: Implement the dot product kernel using a RAJA::cuda_exec + /// execution policy type and RAJA::cuda_reduce reduction policy type. + /// + /// NOTE: You will need to uncomment 'CUDA_BLOCK_SIZE' above. + /// if you want to use it here. + /// + + std::cout << "\t (a, b) = " << dot << std::endl; + + checkResult(dot, dot_ref); +#endif + +//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_HIP) + +//const int HIP_BLOCK_SIZE = 256; + + std::cout << "\n Running RAJA HIP dot product...\n"; + + dot = 0.0; + + int *d_a = memoryManager::allocate_gpu(N); + int *d_b = memoryManager::allocate_gpu(N); + + hipErrchk(hipMemcpy( d_a, a, N * sizeof(int), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy( d_b, b, N * sizeof(int), hipMemcpyHostToDevice )); + + /// + /// TODO... + /// + /// EXERCISE: Implement the dot product kernel using a RAJA::hip_exec + /// execution policy type and RAJA::hip_reduce reduction policy type. + /// + /// NOTE: You will need to uncomment 'HIP_BLOCK_SIZE' above + /// if you want to use it here. + /// + + std::cout << "\t (a, b) = " << dot << std::endl; + + checkResult(dot, dot_ref); + + memoryManager::deallocate_gpu(d_a); + memoryManager::deallocate_gpu(d_b); +#endif + +//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_SYCL) + +//const int SYCL_BLOCK_SIZE = 256; + + std::cout << "\n Running RAJA SYCL dot product...\n"; + + dot = 0.0; + + /// + /// TODO... + /// + /// EXERCISE: Implement the dot product kernel using a RAJA::sycl_exec + /// execution policy type and RAJA::sycl_reduce. + /// + /// NOTE: You will need to uncomment 'SYCL_BLOCK_SIZE' above + /// if you want to use it here. + /// + + std::cout << "\t (a, b) = " << dot << std::endl; + + checkResult(dot, dot_ref); + +#endif + +//----------------------------------------------------------------------------// + + + memoryManager::deallocate(a); + memoryManager::deallocate(b); + + std::cout << "\n DONE!...\n"; + + return 0; +} + +// +// Function to check computed dot product and report P/F. 
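As a rough guide to the OpenMP and CUDA reduction TODOs above, a sketch in the style of the solution file (renamed below from examples/tut_dot-product.cpp) might read as follows; CUDA_BLOCK_SIZE is the block-size constant the exercise asks you to uncomment:

  // OpenMP: omp_parallel_for_exec execution policy + omp_reduce reduction policy.
  RAJA::ReduceSum<RAJA::omp_reduce, double> ompdot(0.0);
  RAJA::forall<RAJA::omp_parallel_for_exec>(RAJA::TypedRangeSegment<int>(0, N),
    [=](int i) {
      ompdot += a[i] * b[i];
  });
  dot = ompdot.get();

  // CUDA: cuda_exec execution policy + cuda_reduce reduction policy.
  // (The HIP variant is analogous with hip_exec/hip_reduce, operating on the
  // d_a/d_b device copies made above.)
  RAJA::ReduceSum<RAJA::cuda_reduce, double> cudot(0.0);
  RAJA::forall<RAJA::cuda_exec<CUDA_BLOCK_SIZE>>(RAJA::TypedRangeSegment<int>(0, N),
    [=] RAJA_DEVICE (int i) {
      cudot += a[i] * b[i];
  });
  dot = cudot.get();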
+// +void checkResult(double compdot, double refdot) +{ + if ( compdot == refdot ) { + std::cout << "\n\t result -- PASS\n"; + } else { + std::cout << "\n\t result -- FAIL\n"; + } +} + diff --git a/examples/tut_dot-product.cpp b/exercises/dot-product_solution.cpp similarity index 87% rename from examples/tut_dot-product.cpp rename to exercises/dot-product_solution.cpp index 00a1691048..9d984fa066 100644 --- a/examples/tut_dot-product.cpp +++ b/exercises/dot-product_solution.cpp @@ -14,7 +14,7 @@ #include "RAJA/RAJA.hpp" /* - * Vector Dot Product Example + * Vector Dot Product Exercise * * Computes dot = (a,b), where a, b are vectors of * doubles and dot is a scalar double. It illustrates how RAJA @@ -30,21 +30,6 @@ * If CUDA is enabled, CUDA unified memory is used. */ -/* - CUDA_BLOCK_SIZE - specifies the number of threads in a CUDA thread block -*/ -#if defined(RAJA_ENABLE_CUDA) -const int CUDA_BLOCK_SIZE = 256; -#endif - -#if defined(RAJA_ENABLE_HIP) -const int HIP_BLOCK_SIZE = 256; -#endif - -#if defined(RAJA_ENABLE_SYCL) -const int SYCL_BLOCK_SIZE = 256; -#endif - // // Function to check dot product result. // @@ -53,7 +38,7 @@ void checkResult(double compdot, double refdot); int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) { - std::cout << "\n\nRAJA vector dot product example...\n"; + std::cout << "\n\nExercise: vector dot product...\n"; #if defined(RAJA_ENABLE_SYCL) memoryManager::sycl_res = new camp::resources::Resource{camp::resources::Sycl()}; @@ -63,7 +48,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Define vector length // - const int N = 1000000; + constexpr int N = 1000000; // // Allocate and initialize vector data @@ -89,9 +74,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) for (int i = 0; i < N; ++i) { dot += a[i] * b[i]; } - // _csytle_dotprod_end std::cout << "\t (a, b) = " << dot << std::endl; + // _csytle_dotprod_end double dot_ref = dot; @@ -99,10 +84,12 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::cout << "\n Running RAJA sequential dot product...\n"; + dot = 0.0; + // _rajaseq_dotprod_start RAJA::ReduceSum seqdot(0.0); - RAJA::forall(RAJA::RangeSegment(0, N), [=] (int i) { + RAJA::forall(RAJA::TypedRangeSegment(0, N), [=] (int i) { seqdot += a[i] * b[i]; }); @@ -119,6 +106,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) #if defined(RAJA_ENABLE_OPENMP) std::cout << "\n Running RAJA OpenMP dot product...\n"; + dot = 0.0; + // _rajaomp_dotprod_start RAJA::ReduceSum ompdot(0.0); @@ -138,8 +127,13 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) + + const int CUDA_BLOCK_SIZE = 256; + std::cout << "\n Running RAJA CUDA dot product...\n"; + dot = 0.0; + // _rajacuda_dotprod_start RAJA::ReduceSum cudot(0.0); @@ -159,13 +153,18 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) + + const int HIP_BLOCK_SIZE = 256; + std::cout << "\n Running RAJA HIP dot product...\n"; - int *d_a = memoryManager::allocate_gpu(N); - int *d_b = memoryManager::allocate_gpu(N); + dot = 0.0; + + double *d_a = memoryManager::allocate_gpu(N); + double *d_b = memoryManager::allocate_gpu(N); - hipErrchk(hipMemcpy( d_a, a, N * sizeof(int), hipMemcpyHostToDevice )); - hipErrchk(hipMemcpy( d_b, b, N * sizeof(int), 
hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy( d_a, a, N * sizeof(double), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy( d_b, b, N * sizeof(double), hipMemcpyHostToDevice )); // _rajahip_dotprod_start RAJA::ReduceSum hpdot(0.0); @@ -189,8 +188,13 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_SYCL) + + const int SYCL_BLOCK_SIZE = 256; + std::cout << "\n Running RAJA SYCL dot product...\n"; + dot = 0.0; + // _rajasycl_dotprod_start RAJA::ReduceSum hpdot(0.0); diff --git a/exercises/kernel-matrix-transpose-local-array.cpp b/exercises/kernel-matrix-transpose-local-array.cpp new file mode 100644 index 0000000000..c9e6dfa062 --- /dev/null +++ b/exercises/kernel-matrix-transpose-local-array.cpp @@ -0,0 +1,689 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-22, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include +#include +#include +#include + +#include "RAJA/RAJA.hpp" +#include "memoryManager.hpp" + +/* + * Matrix Transpose Exercise + * + * In this exercise, an input matrix A of dimension N_r x N_c is + * transposed and returned as a second matrix At of size N_c x N_r. + * + * This operation is carried out using a local memory tiling + * algorithm. The algorithm first loads matrix entries into an + * iteraion shared tile, a two-dimensional array, and then + * reads from the tile with row and column indices swapped for + * the output matrix. + * + * The algorithm is expressed as a collection of ``outer`` + * and ``inner`` for loops. Iterations of the inner loops will load/read + * data into the tile; while outer loops will iterate over the number + * of tiles needed to carry out the transpose. + * + * RAJA variants of the exercise use RAJA local arrays as tile memory. + * Furthermore, the tiling pattern is handled by RAJA's tile statements. + * For CPU execution, RAJA local arrays are used to improve + * performance via cache blocking. For CUDA GPU execution, + * RAJA shared memory is mapped to CUDA shared memory which + * enables threads in the same thread block to share data. + * + * RAJA features shown: + * - Basic usage of 'RAJA::kernel' abstractions for nested loops + * - Multiple lambdas + * - Options for specifying lambda arguments + * - Tile statement + * - ForICount statement + * - RAJA local arrays + * + * If CUDA is enabled, CUDA unified memory is used. 
+ */ + +// +// Define dimensionality of matrices +// +constexpr int DIM = 2; + +// +// Function for checking results +// +template +void checkResult(RAJA::View> Atview, int N_r, int N_c); + +// +// Function for printing results +// +template +void printResult(RAJA::View> Atview, int N_r, int N_c); + + +int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +{ + + std::cout << "\n\nRAJA shared matrix transpose exercise...\n"; + + // + // Define num rows/cols in matrix, tile dimensions, and number of tiles + // + // _mattranspose_localarray_dims_start + constexpr int N_r = 267; + constexpr int N_c = 251; + + constexpr int TILE_DIM = 16; + + constexpr int outer_Dimc = (N_c - 1) / TILE_DIM + 1; + constexpr int outer_Dimr = (N_r - 1) / TILE_DIM + 1; + // _mattranspose_localarray_dims_end + + // + // Allocate matrix data + // + int *A = memoryManager::allocate(N_r * N_c); + int *At = memoryManager::allocate(N_r * N_c); + + // + // In the following implementations of matrix transpose, we + // use RAJA 'View' objects to access the matrix data. A RAJA view + // holds a pointer to a data array and enables multi-dimensional indexing + // into the data. + // + // _mattranspose_localarray_views_start + RAJA::View> Aview(A, N_r, N_c); + RAJA::View> Atview(At, N_c, N_r); + // _mattranspose_localarray_views_end + + // + // Initialize matrix data + // + for (int row = 0; row < N_r; ++row) { + for (int col = 0; col < N_c; ++col) { + Aview(row, col) = col; + } + } + // printResult(Aview, N_r, N_c); + + //----------------------------------------------------------------------------// + std::cout << "\n Running C-version of shared matrix transpose...\n"; + + std::memset(At, 0, N_r * N_c * sizeof(int)); + + // _mattranspose_localarray_cstyle_start + // + // (0) Outer loops to iterate over tiles + // + for (int by = 0; by < outer_Dimr; ++by) { + for (int bx = 0; bx < outer_Dimc; ++bx) { + + // Stack-allocated local array for data on a tile + int Tile[TILE_DIM][TILE_DIM]; + + // + // (1) Inner loops to read input matrix tile data into the array + // + // Note: loops are ordered so that input matrix data access + // is stride-1. + // + for (int ty = 0; ty < TILE_DIM; ++ty) { + for (int tx = 0; tx < TILE_DIM; ++tx) { + + int col = bx * TILE_DIM + tx; // Matrix column index + int row = by * TILE_DIM + ty; // Matrix row index + + // Bounds check + if (row < N_r && col < N_c) { + Tile[ty][tx] = Aview(row, col); + } + } + } + + // + // (2) Inner loops to write array data into output array tile + // + // Note: loop order is swapped from above so that output matrix + // data access is stride-1. + // + for (int tx = 0; tx < TILE_DIM; ++tx) { + for (int ty = 0; ty < TILE_DIM; ++ty) { + + int col = bx * TILE_DIM + tx; // Matrix column index + int row = by * TILE_DIM + ty; // Matrix row index + + // Bounds check + if (row < N_r && col < N_c) { + Atview(col, row) = Tile[ty][tx]; + } + } + } + + } + } + // _mattranspose_localarray_cstyle_end + + checkResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); + + //----------------------------------------------------------------------------// + + // + // The following RAJA variants use the RAJA::Kernel + // method to carryout the transpose + // + + // Here we define a RAJA local array type. 
+ // The array type is templated on + // 1) Data type + // 2) Index permutation + // 3) Dimensions of the array + // + + // _mattranspose_localarray_start + using TILE_MEM = + RAJA::LocalArray, RAJA::SizeList>; + TILE_MEM Tile_Array; + // _mattranspose_localarray_end + + // **NOTE** Although the LocalArray is constructed + // the array memory has not been allocated. + + //--------------------------------------------------------------------------// + std::cout << "\n Running RAJA - sequential matrix transpose exercise ...\n"; + + std::memset(At, 0, N_r * N_c * sizeof(int)); + + // _mattranspose_localarray_raja_start + /// + /// TODO... + /// + /// EXERCISE: Uncomment this code block. + /// + /* + using SEQ_EXEC_POL_I = + RAJA::KernelPolicy< + RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::loop_exec, + RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::loop_exec, + + /// + /// TODO... + /// + /// EXERCISE: Initialize the local memory statement as position 2 + /// in the paramater list. + /// + + RAJA::statement::ForICount<1, RAJA::statement::Param<0>, RAJA::loop_exec, + RAJA::statement::ForICount<0, RAJA::statement::Param<1>, RAJA::loop_exec, + RAJA::statement::Lambda<0> + > + >, + + RAJA::statement::ForICount<0, RAJA::statement::Param<1>, RAJA::loop_exec, + RAJA::statement::ForICount<1, RAJA::statement::Param<0>, RAJA::loop_exec, + RAJA::statement::Lambda<1> + > + > + + > + > + > + >; + + RAJA::kernel_param( + RAJA::make_tuple(RAJA::TypedRangeSegment(0, N_c), + RAJA::TypedRangeSegment(0, N_r)), + + RAJA::make_tuple((int)0, (int)0, Tile_Array), + + [=](int col, int row, int tx, int ty, TILE_MEM &Tile_Array) { + Tile_Array(ty, tx) = Aview(row, col); + }, + + [=](int col, int row, int tx, int ty, TILE_MEM &Tile_Array) { + Atview(col, row) = Tile_Array(ty, tx); + } + + ); + */ + // _mattranspose_localarray_raja_end + + checkResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); + + +#if defined(RAJA_ENABLE_OPENMP) + //--------------------------------------------------------------------------// + std::cout << "\n Running RAJA - OpenMP (parallel outer loop) matrix " + "transpose exercise ...\n"; + + std::memset(At, 0, N_r * N_c * sizeof(int)); + + /// + /// TODO... + /// + /// EXERCISE: Uncomment this code block. + /// + /* + using OPENMP_EXEC_1_POL = + RAJA::KernelPolicy< + // + // (0) Execution policies for outer loops + // These loops iterate over the number of + // tiles needed to carry out the transpose + // + RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::omp_parallel_for_exec, + RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::loop_exec, + // This statement will initalize local array memory inside a + // kernel. The cpu_tile_mem policy specifies that memory should be + // allocated on the stack. The entries in the RAJA::ParamList + // identify RAJA local arrays in the parameter tuple to intialize. + RAJA::statement::InitLocalMem, + // + // (1) Execution policies for the first set of inner + // loops. These loops copy data from the global matrices + // to the local tile. + // + + /// + /// TODO... + /// + /// EXERCISE: Use two ForICount statements with loop_exec to call the first lambda. + /// + + // + // (2) Execution policies for the second set of inner + // loops. These loops copy data from the local tile to + // the global matrix. + // Note: The order of the loops have been + // swapped! This enables us to swap which + // index has unit stride. + // + + /// + /// TODO... + /// + /// EXERCISE: Use two ForICount statements with loop_exec to call the second lambda. 
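  // A hedged sketch of the statements the TODOs above elide, modeled on the
  // sequential policy earlier in this file and the parallel-inner-loop policy
  // that follows: InitLocalMem at parameter position 2, then two ForICount
  // pairs that read into and write out of Tile_Array.
  //
  //   RAJA::statement::InitLocalMem<RAJA::cpu_tile_mem, RAJA::ParamList<2>,
  //
  //     // (1) read the input tile: stride-1 access of Aview
  //     RAJA::statement::ForICount<1, RAJA::statement::Param<0>, RAJA::loop_exec,
  //       RAJA::statement::ForICount<0, RAJA::statement::Param<1>, RAJA::loop_exec,
  //         RAJA::statement::Lambda<0>
  //       >
  //     >,
  //
  //     // (2) write the output tile: loop order swapped so Atview access is stride-1
  //     RAJA::statement::ForICount<0, RAJA::statement::Param<1>, RAJA::loop_exec,
  //       RAJA::statement::ForICount<1, RAJA::statement::Param<0>, RAJA::loop_exec,
  //         RAJA::statement::Lambda<1>
  //       >
  //     >
  //   >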
+ /// + > + > + > + >; + + RAJA::kernel_param( + RAJA::make_tuple(RAJA::TypedRangeSegment(0, N_c), + RAJA::TypedRangeSegment(0, N_r)), + RAJA::make_tuple((int)0, (int)0, Tile_Array), + + [=](int col, int row, int tx, int ty, TILE_MEM &Tile_Array) { + + Tile_Array(ty, tx) = Aview(row, col); + + }, + + [=](int col, int row, int tx, int ty, TILE_MEM &Tile_Array) { + + Atview(col, row) = Tile_Array(ty, tx); + + } + ); + */ + + checkResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); + + //--------------------------------------------------------------------------// + std::cout << "\n Running RAJA - OpenMP (parallel inner loops) matrix " + "transpose exercise ...\n"; + + std::memset(At, 0, N_r * N_c * sizeof(int)); + + using OPENMP_EXEC_2_POL = + RAJA::KernelPolicy< + // + // (0) Execution policies for outer loops + // These loops iterate over the number of + // tiles needed to carry out the transpose + // + RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::loop_exec, + RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::loop_exec, + // This statement will initalize local array memory inside a + // kernel. The cpu_tile_mem policy specifies that memory should be + // allocated on the stack. The entries in the RAJA::ParamList + // identify RAJA local arrays to intialize in the parameter tuple. + RAJA::statement::InitLocalMem, + // + // (1) Execution policies for the first set of inner + // loops. These loops copy data from the global matrices + // to the local tile. + // + RAJA::statement::ForICount<1, RAJA::statement::Param<1>, RAJA::omp_parallel_for_exec, + RAJA::statement::ForICount<0, RAJA::statement::Param<0>, RAJA::loop_exec, + RAJA::statement::Lambda<0> + > + >, + // + // (2) Execution policies for the second set of inner + // loops. These loops copy data from the local tile to + // the global matrix. + // Note: The order of the loops have been + // swapped! This enables us to swap which + // index has unit stride. + // + RAJA::statement::ForICount<0, RAJA::statement::Param<0>, RAJA::loop_exec, + RAJA::statement::ForICount<1, RAJA::statement::Param<1>, RAJA::loop_exec, + RAJA::statement::Lambda<1> + > + > + > + > + > + >; + + RAJA::kernel_param( + RAJA::make_tuple(RAJA::TypedRangeSegment(0, N_c), + RAJA::TypedRangeSegment(0, N_r)), + RAJA::make_tuple((int)0, (int)0, Tile_Array), + + [=](int col, int row, int tx, int ty, TILE_MEM &Tile_Array) { + + Tile_Array(ty, tx) = Aview(row, col); + + }, + + [=](int col, int row, int tx, int ty, TILE_MEM &Tile_Array) { + + Atview(col, row) = Tile_Array(ty, tx); + + } + ); + + checkResult(Atview, N_c, N_r); + // printResult(Atview, N_r, N_c); +#endif + + //--------------------------------------------------------------------------// +#if defined(RAJA_ENABLE_CUDA) + std::cout << "\n Running RAJA - CUDA matrix transpose exercise ...\n"; + + std::memset(At, 0, N_r * N_c * sizeof(int)); + + using CUDA_EXEC_POL = + RAJA::KernelPolicy< + RAJA::statement::CudaKernel< + // + // (0) Execution policies for outer loops + // These loops iterate over the number of + // tiles needed to carry out the transpose + // + RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::cuda_block_y_loop, + RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::cuda_block_x_loop, + // This statement will initalize local array memory inside a + // kernel. The cpu_tile_mem policy specifies that memory should be + // allocated on the stack. The entries in the RAJA::ParamList + // identify RAJA local arrays to intialize in the parameter tuple. 
+ RAJA::statement::InitLocalMem, + // + // (1) Execution policies for the first set of inner + // loops. These loops copy data from the global matrices + // to the local tile. + // + RAJA::statement::ForICount<1, RAJA::statement::Param<0>, RAJA::cuda_thread_y_direct, + RAJA::statement::ForICount<0, RAJA::statement::Param<1>, RAJA::cuda_thread_x_direct, + RAJA::statement::Lambda<0> + > + >, + // Synchronize threads to ensure all loads + // to the local array are complete + RAJA::statement::CudaSyncThreads, + // + // (2) Execution policies for the second set of inner + // loops. These loops copy data from the local tile to + // the global matrix. + // Note: The order of the loops have been + // swapped! This enables us to swap which + // index has unit stride. + // + RAJA::statement::ForICount<0, RAJA::statement::Param<1>, RAJA::cuda_thread_y_direct, + RAJA::statement::ForICount<1, RAJA::statement::Param<0>, RAJA::cuda_thread_x_direct, + RAJA::statement::Lambda<1> + > + >, + // Synchronize threads to ensure all reads + // from the local array are complete + RAJA::statement::CudaSyncThreads + > + > + > + > + >; + + + RAJA::kernel_param( + RAJA::make_tuple(RAJA::TypedRangeSegment(0, N_c), + RAJA::TypedRangeSegment(0, N_r)), + RAJA::make_tuple((int)0, (int)0, Tile_Array), + + [=] RAJA_DEVICE (int col, int row, int tx, int ty, TILE_MEM &Tile_Array) { + + Tile_Array(ty, tx) = Aview(row, col); + + }, + + [=] RAJA_DEVICE(int col, int row, int tx, int ty, TILE_MEM &Tile_Array) { + + Atview(col, row) = Tile_Array(ty, tx); + + } + ); + + checkResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); +#endif + +//--------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_HIP) + //--------------------------------------------------------------------------// + std::cout << "\n Running RAJA - HIP matrix transpose exercise ...\n"; + + int *d_A = memoryManager::allocate_gpu(N_r * N_c); + int *d_At = memoryManager::allocate_gpu(N_r * N_c); + + // + // In the following implementations of matrix transpose, we + // use RAJA 'View' objects to access the matrix data. A RAJA view + // holds a pointer to a data array and enables multi-dimensional indexing + // into the data. + // + RAJA::View> d_Aview(d_A, N_r, N_c); + RAJA::View> d_Atview(d_At, N_c, N_r); + + std::memset(At, 0, N_r * N_c * sizeof(int)); + hipErrchk(hipMemcpy( d_A, A, N_r * N_c * sizeof(int), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy( d_At, At, N_r * N_c * sizeof(int), hipMemcpyHostToDevice )); + + using HIP_EXEC_POL = + RAJA::KernelPolicy< + RAJA::statement::HipKernel< + // + // (0) Execution policies for outer loops + // These loops iterate over the number of + // tiles needed to carry out the transpose + // + RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::hip_block_y_loop, + RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::hip_block_x_loop, + // This statement will initalize local array memory inside a + // kernel. The cpu_tile_mem policy specifies that memory should be + // allocated on the stack. The entries in the RAJA::ParamList + // identify RAJA local arrays to intialize in the parameter tuple. + RAJA::statement::InitLocalMem, + // + // (1) Execution policies for the first set of inner + // loops. These loops copy data from the global matrices + // to the local tile. 
+ // + RAJA::statement::ForICount<1, RAJA::statement::Param<0>, RAJA::hip_thread_y_direct, + RAJA::statement::ForICount<0, RAJA::statement::Param<1>, RAJA::hip_thread_x_direct, + RAJA::statement::Lambda<0> + > + >, + // Synchronize threads to ensure all loads + // to the local array are complete + RAJA::statement::HipSyncThreads, + // + // (2) Execution policies for the second set of inner + // loops. These loops copy data from the local tile to + // the global matrix. + // Note: The order of the loops have been + // swapped! This enables us to swap which + // index has unit stride. + // + RAJA::statement::ForICount<0, RAJA::statement::Param<1>, RAJA::hip_thread_y_direct, + RAJA::statement::ForICount<1, RAJA::statement::Param<0>, RAJA::hip_thread_x_direct, + RAJA::statement::Lambda<1> + > + >, + // Synchronize threads to ensure all reads + // from the local array are complete + RAJA::statement::HipSyncThreads + > + > + > + > + >; + + + RAJA::kernel_param( + RAJA::make_tuple(RAJA::TypedRangeSegment(0, N_c), + RAJA::TypedRangeSegment(0, N_r)), + RAJA::make_tuple((int)0, (int)0, Tile_Array), + + [=] RAJA_DEVICE (int col, int row, int tx, int ty, TILE_MEM &Tile_Array) { + + Tile_Array(ty, tx) = d_Aview(row, col); + + }, + + [=] RAJA_DEVICE(int col, int row, int tx, int ty, TILE_MEM &Tile_Array) { + + d_Atview(col, row) = Tile_Array(ty, tx); + + } + ); + + hipErrchk(hipMemcpy( At, d_At, N_r * N_c * sizeof(int), hipMemcpyDeviceToHost )); + checkResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); +#endif + + + //--------------------------------------------------------------------------// + std::cout << "\n Running RAJA - sequential matrix transpose exercise with args in statement ...\n"; + + std::memset(At, 0, N_r * N_c * sizeof(int)); + + //Alias for convenience + using RAJA::Segs; + using RAJA::Offsets; + using RAJA::Params; + + // _mattranspose_localarray_raja_lambdaargs_start + /// + /// TODO... + /// + /// EXERCISE: Uncomment this code block. + /// + /* + using SEQ_EXEC_POL_II = + RAJA::KernelPolicy< + RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::loop_exec, + RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::loop_exec, + + RAJA::statement::InitLocalMem, + + RAJA::statement::For<1, RAJA::loop_exec, + RAJA::statement::For<0, RAJA::loop_exec, + RAJA::statement::Lambda<0, Segs<0>, Segs<1>, Offsets<0>, Offsets<1>, Params<0> > + > + >, + + RAJA::statement::For<0, RAJA::loop_exec, + RAJA::statement::For<1, RAJA::loop_exec, + RAJA::statement::Lambda<1, Segs<0, 1>, Offsets<0, 1>, Params<0> > + > + > + + > + > + > + >; + + RAJA::kernel_param( + RAJA::make_tuple(RAJA::TypedRangeSegment(0, N_c), + RAJA::TypedRangeSegment(0, N_r)), + + RAJA::make_tuple(Tile_Array), + + [=](int col, int row, int tx, int ty, TILE_MEM &Tile_Array) { + Tile_Array(ty, tx) = Aview(row, col); + }, + + [=](int col, int row, int tx, int ty, TILE_MEM &Tile_Array) { + Atview(col, row) = Tile_Array(ty, tx); + } + ); + */ + // _mattranspose_localarray_raja_lambdaargs_end + + checkResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); + +//--------------------------------------------------------------------------// + + return 0; +} + + +// +// Function to check result and report P/F. 
+// +template +void checkResult(RAJA::View> Atview, int N_r, int N_c) +{ + bool match = true; + for (int row = 0; row < N_r; ++row) { + for (int col = 0; col < N_c; ++col) { + if (Atview(row, col) != row) { + match = false; + } + } + } + if (match) { + std::cout << "\n\t result -- PASS\n"; + } else { + std::cout << "\n\t result -- FAIL\n"; + } +}; + +// +// Function to print result. +// +template +void printResult(RAJA::View> Atview, int N_r, int N_c) +{ + std::cout << std::endl; + for (int row = 0; row < N_r; ++row) { + for (int col = 0; col < N_c; ++col) { + std::cout << "At(" << row << "," << col << ") = " << Atview(row, col) + << std::endl; + } + std::cout << "" << std::endl; + } + std::cout << std::endl; +} diff --git a/examples/tut_matrix-transpose-local-array.cpp b/exercises/kernel-matrix-transpose-local-array_solution.cpp similarity index 87% rename from examples/tut_matrix-transpose-local-array.cpp rename to exercises/kernel-matrix-transpose-local-array_solution.cpp index 1a62446fd5..06841483fa 100644 --- a/examples/tut_matrix-transpose-local-array.cpp +++ b/exercises/kernel-matrix-transpose-local-array_solution.cpp @@ -14,9 +14,9 @@ #include "memoryManager.hpp" /* - * Matrix Transpose Example + * Matrix Transpose Exercise * - * In this example, an input matrix A of dimension N_r x N_c is + * In this exercise, an input matrix A of dimension N_r x N_c is * transposed and returned as a second matrix At of size N_c x N_r. * * This operation is carried out using a local memory tiling @@ -30,7 +30,7 @@ * data into the tile; while outer loops will iterate over the number * of tiles needed to carry out the transpose. * - * RAJA variants of the example use RAJA local arrays as tile memory. + * RAJA variants of the exercise use RAJA local arrays as tile memory. * Furthermore, the tiling pattern is handled by RAJA's tile statements. * For CPU execution, RAJA local arrays are used to improve * performance via cache blocking. For CUDA GPU execution, @@ -51,7 +51,7 @@ // // Define dimensionality of matrices // -const int DIM = 2; +constexpr int DIM = 2; // // Function for checking results @@ -69,19 +69,19 @@ void printResult(RAJA::View> Atview, int N_r, int N_c); int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) { - std::cout << "\n\nRAJA shared matrix transpose example...\n"; + std::cout << "\n\nRAJA shared matrix transpose exercise...\n"; // // Define num rows/cols in matrix, tile dimensions, and number of tiles // // _mattranspose_localarray_dims_start - const int N_r = 267; - const int N_c = 251; + constexpr int N_r = 267; + constexpr int N_c = 251; - const int TILE_DIM = 16; + constexpr int TILE_DIM = 16; - const int outer_Dimc = (N_c - 1) / TILE_DIM + 1; - const int outer_Dimr = (N_r - 1) / TILE_DIM + 1; + constexpr int outer_Dimc = (N_c - 1) / TILE_DIM + 1; + constexpr int outer_Dimr = (N_r - 1) / TILE_DIM + 1; // _mattranspose_localarray_dims_end // @@ -195,7 +195,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // the array memory has not been allocated. 
//--------------------------------------------------------------------------// - std::cout << "\n Running RAJA - sequential matrix transpose example ...\n"; + std::cout << "\n Running RAJA - sequential matrix transpose exercise ...\n"; std::memset(At, 0, N_r * N_c * sizeof(int)); @@ -224,8 +224,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; - RAJA::kernel_param( RAJA::make_tuple(RAJA::RangeSegment(0, N_c), - RAJA::RangeSegment(0, N_r)), + RAJA::kernel_param( + RAJA::make_tuple(RAJA::TypedRangeSegment(0, N_c), + RAJA::TypedRangeSegment(0, N_r)), RAJA::make_tuple((int)0, (int)0, Tile_Array), @@ -235,70 +236,19 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) [=](int col, int row, int tx, int ty, TILE_MEM &Tile_Array) { Atview(col, row) = Tile_Array(ty, tx); + } - }); + ); // _mattranspose_localarray_raja_end checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); - //--------------------------------------------------------------------------// - std::cout << "\n Running RAJA - sequential matrix transpose example with args in statement ...\n"; - - std::memset(At, 0, N_r * N_c * sizeof(int)); - - //Alias for convenience - using RAJA::Segs; - using RAJA::Offsets; - using RAJA::Params; - - // _mattranspose_localarray_raja_lambdaargs_start - using SEQ_EXEC_POL_II = - RAJA::KernelPolicy< - RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::loop_exec, - RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::loop_exec, - - RAJA::statement::InitLocalMem, - - RAJA::statement::For<1, RAJA::loop_exec, - RAJA::statement::For<0, RAJA::loop_exec, - RAJA::statement::Lambda<0, Segs<0>, Segs<1>, Offsets<0>, Offsets<1>, Params<0> > - > - >, - - RAJA::statement::For<0, RAJA::loop_exec, - RAJA::statement::For<1, RAJA::loop_exec, - RAJA::statement::Lambda<1, Segs<0, 1>, Offsets<0, 1>, Params<0> > - > - > - - > - > - > - >; - - RAJA::kernel_param( RAJA::make_tuple(RAJA::RangeSegment(0, N_c), - RAJA::RangeSegment(0, N_r)), - - RAJA::make_tuple(Tile_Array), - - [=](int col, int row, int tx, int ty, TILE_MEM &Tile_Array) { - Tile_Array(ty, tx) = Aview(row, col); - }, - - [=](int col, int row, int tx, int ty, TILE_MEM &Tile_Array) { - Atview(col, row) = Tile_Array(ty, tx); - - }); - // _mattranspose_localarray_raja_lambdaargs_end - - checkResult(Atview, N_c, N_r); - // printResult(Atview, N_c, N_r); #if defined(RAJA_ENABLE_OPENMP) //--------------------------------------------------------------------------// std::cout << "\n Running RAJA - OpenMP (parallel outer loop) matrix " - "transpose example ...\n"; + "transpose exercise ...\n"; std::memset(At, 0, N_r * N_c * sizeof(int)); @@ -345,27 +295,29 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) >; RAJA::kernel_param( - RAJA::make_tuple(RAJA::RangeSegment(0, N_c), RAJA::RangeSegment(0, N_r)), - RAJA::make_tuple((int)0, (int)0, Tile_Array), + RAJA::make_tuple(RAJA::TypedRangeSegment(0, N_c), + RAJA::TypedRangeSegment(0, N_r)), + RAJA::make_tuple((int)0, (int)0, Tile_Array), - [=](int col, int row, int tx, int ty, TILE_MEM &Tile_Array) { + [=](int col, int row, int tx, int ty, TILE_MEM &Tile_Array) { - Tile_Array(ty, tx) = Aview(row, col); + Tile_Array(ty, tx) = Aview(row, col); - }, + }, - [=](int col, int row, int tx, int ty, TILE_MEM &Tile_Array) { + [=](int col, int row, int tx, int ty, TILE_MEM &Tile_Array) { - Atview(col, row) = Tile_Array(ty, tx); + Atview(col, row) = Tile_Array(ty, tx); - }); + } + ); checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); 
//--------------------------------------------------------------------------// std::cout << "\n Running RAJA - OpenMP (parallel inner loops) matrix " - "transpose example ...\n"; + "transpose exercise ...\n"; std::memset(At, 0, N_r * N_c * sizeof(int)); @@ -412,20 +364,22 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) >; RAJA::kernel_param( - RAJA::make_tuple(RAJA::RangeSegment(0, N_c), RAJA::RangeSegment(0, N_r)), - RAJA::make_tuple((int)0, (int)0, Tile_Array), + RAJA::make_tuple(RAJA::TypedRangeSegment(0, N_c), + RAJA::TypedRangeSegment(0, N_r)), + RAJA::make_tuple((int)0, (int)0, Tile_Array), - [=](int col, int row, int tx, int ty, TILE_MEM &Tile_Array) { + [=](int col, int row, int tx, int ty, TILE_MEM &Tile_Array) { - Tile_Array(ty, tx) = Aview(row, col); + Tile_Array(ty, tx) = Aview(row, col); - }, + }, - [=](int col, int row, int tx, int ty, TILE_MEM &Tile_Array) { + [=](int col, int row, int tx, int ty, TILE_MEM &Tile_Array) { - Atview(col, row) = Tile_Array(ty, tx); + Atview(col, row) = Tile_Array(ty, tx); - }); + } + ); checkResult(Atview, N_c, N_r); // printResult(Atview, N_r, N_c); @@ -433,7 +387,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) //--------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) - std::cout << "\n Running RAJA - CUDA matrix transpose example ...\n"; + std::cout << "\n Running RAJA - CUDA matrix transpose exercise ...\n"; std::memset(At, 0, N_r * N_c * sizeof(int)); @@ -489,20 +443,22 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::kernel_param( - RAJA::make_tuple(RAJA::RangeSegment(0, N_c), RAJA::RangeSegment(0, N_r)), - RAJA::make_tuple((int)0, (int)0, Tile_Array), + RAJA::make_tuple(RAJA::TypedRangeSegment(0, N_c), + RAJA::TypedRangeSegment(0, N_r)), + RAJA::make_tuple((int)0, (int)0, Tile_Array), - [=] RAJA_DEVICE (int col, int row, int tx, int ty, TILE_MEM &Tile_Array) { + [=] RAJA_DEVICE (int col, int row, int tx, int ty, TILE_MEM &Tile_Array) { - Tile_Array(ty, tx) = Aview(row, col); + Tile_Array(ty, tx) = Aview(row, col); - }, + }, - [=] RAJA_DEVICE(int col, int row, int tx, int ty, TILE_MEM &Tile_Array) { + [=] RAJA_DEVICE(int col, int row, int tx, int ty, TILE_MEM &Tile_Array) { - Atview(col, row) = Tile_Array(ty, tx); + Atview(col, row) = Tile_Array(ty, tx); - }); + } + ); checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); @@ -512,7 +468,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) #if defined(RAJA_ENABLE_HIP) //--------------------------------------------------------------------------// - std::cout << "\n Running RAJA - HIP matrix transpose example ...\n"; + std::cout << "\n Running RAJA - HIP matrix transpose exercise ...\n"; int *d_A = memoryManager::allocate_gpu(N_r * N_c); int *d_At = memoryManager::allocate_gpu(N_r * N_c); @@ -582,26 +538,82 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::kernel_param( - RAJA::make_tuple(RAJA::RangeSegment(0, N_c), RAJA::RangeSegment(0, N_r)), - RAJA::make_tuple((int)0, (int)0, Tile_Array), + RAJA::make_tuple(RAJA::TypedRangeSegment(0, N_c), + RAJA::TypedRangeSegment(0, N_r)), + RAJA::make_tuple((int)0, (int)0, Tile_Array), - [=] RAJA_DEVICE (int col, int row, int tx, int ty, TILE_MEM &Tile_Array) { + [=] RAJA_DEVICE (int col, int row, int tx, int ty, TILE_MEM &Tile_Array) { - Tile_Array(ty, tx) = d_Aview(row, col); + Tile_Array(ty, tx) = d_Aview(row, col); - }, + }, - [=] RAJA_DEVICE(int col, int row, int tx, 
int ty, TILE_MEM &Tile_Array) { + [=] RAJA_DEVICE(int col, int row, int tx, int ty, TILE_MEM &Tile_Array) { - d_Atview(col, row) = Tile_Array(ty, tx); + d_Atview(col, row) = Tile_Array(ty, tx); - }); + } + ); hipErrchk(hipMemcpy( At, d_At, N_r * N_c * sizeof(int), hipMemcpyDeviceToHost )); checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); #endif + + //--------------------------------------------------------------------------// + std::cout << "\n Running RAJA - sequential matrix transpose exercise with args in statement ...\n"; + + std::memset(At, 0, N_r * N_c * sizeof(int)); + + //Alias for convenience + using RAJA::Segs; + using RAJA::Offsets; + using RAJA::Params; + + // _raja_mattranspose_lambdaargs_start + using SEQ_EXEC_POL_II = + RAJA::KernelPolicy< + RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::loop_exec, + RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::loop_exec, + + RAJA::statement::InitLocalMem, + + RAJA::statement::For<1, RAJA::loop_exec, + RAJA::statement::For<0, RAJA::loop_exec, + RAJA::statement::Lambda<0, Segs<0>, Segs<1>, Offsets<0>, Offsets<1>, Params<0> > + > + >, + + RAJA::statement::For<0, RAJA::loop_exec, + RAJA::statement::For<1, RAJA::loop_exec, + RAJA::statement::Lambda<1, Segs<0, 1>, Offsets<0, 1>, Params<0> > + > + > + + > + > + > + >; + + RAJA::kernel_param( + RAJA::make_tuple(RAJA::TypedRangeSegment(0, N_c), + RAJA::TypedRangeSegment(0, N_r)), + + RAJA::make_tuple(Tile_Array), + + [=](int col, int row, int tx, int ty, TILE_MEM &Tile_Array) { + Tile_Array(ty, tx) = Aview(row, col); + }, + + [=](int col, int row, int tx, int ty, TILE_MEM &Tile_Array) { + Atview(col, row) = Tile_Array(ty, tx); + } + ); + // _raja_mattranspose_lambdaargs_start + + checkResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); //--------------------------------------------------------------------------// return 0; diff --git a/exercises/kernel-matrix-transpose-tiled.cpp b/exercises/kernel-matrix-transpose-tiled.cpp new file mode 100644 index 0000000000..d513f0041b --- /dev/null +++ b/exercises/kernel-matrix-transpose-tiled.cpp @@ -0,0 +1,382 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-22, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include +#include +#include +#include + +#include "memoryManager.hpp" + +#include "RAJA/RAJA.hpp" + +/* + * Tiled Matrix Transpose Exercise + * + * In this exercise, an input matrix A of dimension N_r x N_c is + * transposed and returned as a second matrix At. + * + * This operation is carried out using a tiling algorithm. + * The algorithm iterates over tiles of the matrix A and + * performs a transpose copy without explicitly storing the tile. + * + * The algorithm is expressed as a collection of ``outer`` + * and ``inner`` for loops. Iterations of the inner loop will + * tranpose tile entries; while outer loops will iterate over + * the number of tiles needed to carryout the transpose. + * We do not assume that tiles divide the number of rows and + * and columns of the matrix. + * + * RAJA features shown: + * - Basic usage of 'RAJA::kernel' abstractions for nested loops + * - Tiling statement + * + * If CUDA is enabled, CUDA unified memory is used. 
+ */ + +// +// Define dimensionality of matrices +// +constexpr int DIM = 2; + +// +// Function for checking results +// +template +void checkResult(RAJA::View> Atview, int N_r, int N_c); + +// +// Function for printing results +// +template +void printResult(RAJA::View> Atview, int N_r, int N_c); + + +int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +{ + + std::cout << "\n\nRAJA matrix transpose exercise...\n"; + + // + // Define num rows/cols in matrix, tile dimensions, and number of tiles. + // + // _tiled_mattranspose_dims_start + constexpr int N_r = 56; + constexpr int N_c = 75; + + constexpr int TILE_DIM = 16; + + constexpr int outer_Dimc = (N_c - 1) / TILE_DIM + 1; + constexpr int outer_Dimr = (N_r - 1) / TILE_DIM + 1; + // _tiled_mattranspose_dims_end + + // + // Allocate matrix data + // + int *A = memoryManager::allocate(N_r * N_c); + int *At = memoryManager::allocate(N_r * N_c); + + // + // In the following implementations of tiled matrix transpose, we + // use RAJA 'View' objects to access the matrix data. A RAJA view + // holds a pointer to a data array and enables multi-dimensional indexing + // into the data. + // + // _tiled_mattranspose_views_start + RAJA::View> Aview(A, N_r, N_c); + RAJA::View> Atview(At, N_c, N_r); + // _tiled_mattranspose_views_end + + // + // Initialize matrix data + // + for (int row = 0; row < N_r; ++row) { + for (int col = 0; col < N_c; ++col) { + Aview(row, col) = col; + } + } + //printResult(Aview, N_r, N_c); + + //----------------------------------------------------------------------------// + std::cout << "\n Running C-version of tiled matrix transpose...\n"; + + std::memset(At, 0, N_r * N_c * sizeof(int)); + + // _cstyle_tiled_mattranspose_start + // + // (0) Outer loops to iterate over tiles + // + for (int by = 0; by < outer_Dimr; ++by) { + for (int bx = 0; bx < outer_Dimc; ++bx) { + // + // (1) Loops to iterate over tile entries + // + for (int ty = 0; ty < TILE_DIM; ++ty) { + for (int tx = 0; tx < TILE_DIM; ++tx) { + + int col = bx * TILE_DIM + tx; // Matrix column index + int row = by * TILE_DIM + ty; // Matrix row index + + // Bounds check + if (row < N_r && col < N_c) { + Atview(col, row) = Aview(row, col); + } + } + } + + } + } + // _cstyle_tiled_mattranspose_end + + checkResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); + //----------------------------------------------------------------------------// + + // + // The following RAJA variants use the RAJA::kernel method to carryout the + // transpose. + // + // Here, we define RAJA range segments to establish the iteration spaces. + // Further partioning of the iteration space is carried out in the + // tile_fixed statements. Iterations inside a RAJA loop is given by their + // global iteration number. + // + RAJA::TypedRangeSegment row_Range(0, N_r); + RAJA::TypedRangeSegment col_Range(0, N_c); + + //----------------------------------------------------------------------------// + std::cout << "\n Running sequential tiled matrix transpose ...\n"; + std::memset(At, 0, N_r * N_c * sizeof(int)); + + // + // The following policy carries out the transpose + // using sequential loops. The template parameter inside + // tile_fixed corresponds to the dimension size of the tile. + // + // _raja_tiled_mattranspose_start + + /// + /// TODO... + /// + /// EXERCISE: Implement a sequential RAJA::kernel execution policy for a + /// tiled matrix transpose. + /// + /// NOTE: We have done this first one for you. 
+ /// + + using TILED_KERNEL_EXEC_POL = + RAJA::KernelPolicy< + RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::loop_exec, + RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::loop_exec, + RAJA::statement::For<1, RAJA::loop_exec, + RAJA::statement::For<0, RAJA::loop_exec, + RAJA::statement::Lambda<0> + > + > + > + > + >; + + RAJA::kernel( RAJA::make_tuple(col_Range, row_Range), + [=](int col, int row) { + Atview(col, row) = Aview(row, col); + }); + // _raja_tiled_mattranspose_end + + checkResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); + +//----------------------------------------------------------------------------// +#if defined(RAJA_ENABLE_OPENMP) + std::cout << "\n Running openmp tiled matrix transpose - parallel top inner loop...\n"; + + std::memset(At, 0, N_r * N_c * sizeof(int)); + + // + // This policy loops over tiles sequentially while exposing parallelism on + // one of the inner loops. + // + + /// + /// TODO... + /// + /// EXERCISE: Implement an openmp RAJA::kernel execution policy for a + /// tiled matrix transpose. + /// + + /// + /// TODO... + /// + /// EXERCISE: Uncomment this code block. + /// + /* + RAJA::kernel( RAJA::make_tuple(col_Range, row_Range), + [=](int col, int row) { + Atview(col, row) = Aview(row, col); + }); + */ + + checkResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); + //----------------------------------------------------------------------------// + + std::cout << "\n Running openmp tiled matrix transpose - collapsed inner loops...\n"; + + std::memset(At, 0, N_r * N_c * sizeof(int)); + + // + // This policy loops over tiles sequentially while collapsing inner loops + // into a single OpenMP parallel for loop enabling parallel loads/reads + // to/from the tile. + // + using TILED_KERNEL_EXEC_POL_OMP2 = + RAJA::KernelPolicy< + RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::loop_exec, + RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::loop_exec, + RAJA::statement::Collapse, + RAJA::statement::Lambda<0> + > //closes collapse + > // closes Tile 0 + > // closes Tile 1 + >; // closes policy list + + RAJA::kernel( RAJA::make_tuple(col_Range, row_Range), + [=](int col, int row) { + Atview(col, row) = Aview(row, col); + }); + + checkResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); + +#endif + +//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_CUDA) + std::cout << "\n Running cuda tiled matrix transpose ...\n"; + + std::memset(At, 0, N_r * N_c * sizeof(int)); + + // _raja_mattranspose_cuda_start + + /// + /// TODO... + /// + /// EXERCISE: Implement a CUDA RAJA::kernel execution policy for a + /// tiled matrix transpose. + /// + + /// + /// TODO... + /// + /// EXERCISE: Uncomment this code block. 
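The Collapse statement in the OpenMP variant above is missing its execution-policy and ArgList template arguments. A hedged reconstruction of the collapsed-inner-loop tiled policy, assuming a RAJA build with OpenMP enabled and the two inner loops (tuple arguments 0 and 1) being the ones collapsed:

#if defined(RAJA_ENABLE_OPENMP)
using TILED_KERNEL_EXEC_POL_OMP2 = RAJA::KernelPolicy<
  RAJA::statement::Tile<1, RAJA::tile_fixed<TILE_DIM>, RAJA::loop_exec,
    RAJA::statement::Tile<0, RAJA::tile_fixed<TILE_DIM>, RAJA::loop_exec,
      // Collapse both loops over tile entries into one OpenMP parallel loop.
      RAJA::statement::Collapse<RAJA::omp_parallel_collapse_exec,
                                RAJA::ArgList<0, 1>,
        RAJA::statement::Lambda<0>
      >  // closes Collapse
    >    // closes Tile 0
  >      // closes Tile 1
>;       // closes policy list

// Same iteration spaces and lambda as the sequential tiled variant:
//   RAJA::kernel<TILED_KERNEL_EXEC_POL_OMP2>(
//     RAJA::make_tuple(col_Range, row_Range),
//     [=](int col, int row) { Atview(col, row) = Aview(row, col); });
#endif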
+ /// + /* + RAJA::kernel( RAJA::make_tuple(col_Range, row_Range), + [=] RAJA_DEVICE (int col, int row) { + Atview(col, row) = Aview(row, col); + }); + */ + // _raja_mattranspose_cuda_end + + checkResult(Atview, N_c, N_r); + //printResult(Atview, N_c, N_r); +#endif + +//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_HIP) + std::cout << "\n Running hip tiled matrix transpose ...\n"; + + int* d_A = memoryManager::allocate_gpu(N_r * N_c); + int* d_At = memoryManager::allocate_gpu(N_r * N_c); + + RAJA::View> d_Aview(d_A, N_r, N_c); + RAJA::View> d_Atview(d_At, N_c, N_r); + + std::memset(At, 0, N_r * N_c * sizeof(int)); + hipErrchk(hipMemcpy( d_A, A, N_r * N_c * sizeof(int), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy( d_At, At, N_r * N_c * sizeof(int), hipMemcpyHostToDevice )); + + using TILED_KERNEL_EXEC_POL_HIP = + RAJA::KernelPolicy< + RAJA::statement::HipKernel< + RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::hip_block_y_loop, + RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::hip_block_x_loop, + RAJA::statement::For<1, RAJA::hip_thread_x_direct, + RAJA::statement::For<0, RAJA::hip_thread_y_direct, + RAJA::statement::Lambda<0> + > + > + > + > + > + >; + + RAJA::kernel( RAJA::make_tuple(col_Range, row_Range), + [=] RAJA_DEVICE (int col, int row) { + d_Atview(col, row) = d_Aview(row, col); + }); + + hipErrchk(hipMemcpy( At, d_At, N_r * N_c * sizeof(int), hipMemcpyDeviceToHost )); + checkResult(Atview, N_c, N_r); + //printResult(Atview, N_c, N_r); +#endif + + //----------------------------------------------------------------------------// + + + // + // Clean up. + // + memoryManager::deallocate(A); + memoryManager::deallocate(At); + + std::cout << "\n DONE!...\n"; + + return 0; +} + +// +// Function to check result and report P/F. +// +template +void checkResult(RAJA::View> Atview, int N_r, int N_c) +{ + bool match = true; + for (int row = 0; row < N_r; ++row) { + for (int col = 0; col < N_c; ++col) { + if (Atview(row, col) != row) { + match = false; + } + } + } + if (match) { + std::cout << "\n\t result -- PASS\n"; + } else { + std::cout << "\n\t result -- FAIL\n"; + } +}; + +// +// Function to print result. +// +template +void printResult(RAJA::View> Atview, int N_r, int N_c) +{ + std::cout << std::endl; + for (int row = 0; row < N_r; ++row) { + for (int col = 0; col < N_c; ++col) { + // std::cout << "At(" << row << "," << col << ") = " << Atview(row, col) + // << std::endl; + std::cout<> Atview, int N_r, int N_c); int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) { - std::cout << "\n\nRAJA tiled matrix transpose example...\n"; + std::cout << "\n\nRAJA matrix transpose exercise...\n"; // // Define num rows/cols in matrix, tile dimensions, and number of tiles. 
// // _tiled_mattranspose_dims_start - const int N_r = 56; - const int N_c = 75; + constexpr int N_r = 56; + constexpr int N_c = 75; - const int TILE_DIM = 16; + constexpr int TILE_DIM = 16; - const int outer_Dimc = (N_c - 1) / TILE_DIM + 1; - const int outer_Dimr = (N_r - 1) / TILE_DIM + 1; + constexpr int outer_Dimc = (N_c - 1) / TILE_DIM + 1; + constexpr int outer_Dimr = (N_r - 1) / TILE_DIM + 1; // _tiled_mattranspose_dims_end // @@ -101,7 +101,6 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } //printResult(Aview, N_r, N_c); - //----------------------------------------------------------------------------// std::cout << "\n Running C-version of tiled matrix transpose...\n"; @@ -146,8 +145,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // tile_fixed statements. Iterations inside a RAJA loop is given by their // global iteration number. // - RAJA::RangeSegment row_Range(0, N_r); - RAJA::RangeSegment col_Range(0, N_c); + RAJA::TypedRangeSegment row_Range(0, N_r); + RAJA::TypedRangeSegment col_Range(0, N_c); //----------------------------------------------------------------------------// std::cout << "\n Running sequential tiled matrix transpose ...\n"; @@ -159,12 +158,12 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // tile_fixed corresponds to the dimension size of the tile. // // _raja_tiled_mattranspose_start - using KERNEL_EXEC_POL = + using TILED_KERNEL_EXEC_POL = RAJA::KernelPolicy< - RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::seq_exec, - RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::seq_exec, - RAJA::statement::For<1, RAJA::seq_exec, - RAJA::statement::For<0, RAJA::seq_exec, + RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::loop_exec, + RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::loop_exec, + RAJA::statement::For<1, RAJA::loop_exec, + RAJA::statement::For<0, RAJA::loop_exec, RAJA::statement::Lambda<0> > > @@ -172,7 +171,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; - RAJA::kernel( RAJA::make_tuple(col_Range, row_Range), + RAJA::kernel( RAJA::make_tuple(col_Range, row_Range), [=](int col, int row) { Atview(col, row) = Aview(row, col); }); @@ -191,7 +190,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // This policy loops over tiles sequentially while exposing parallelism on // one of the inner loops. // - using KERNEL_EXEC_POL_OMP = + using TILED_KERNEL_EXEC_POL_OMP = RAJA::KernelPolicy< RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::omp_parallel_for_exec, RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::loop_exec, @@ -204,12 +203,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; - RAJA::kernel( - RAJA::make_tuple(col_Range, row_Range), - [=](int col, int row) { - - Atview(col, row) = Aview(row, col); - + RAJA::kernel( RAJA::make_tuple(col_Range, row_Range), + [=](int col, int row) { + Atview(col, row) = Aview(row, col); }); checkResult(Atview, N_c, N_r); @@ -225,10 +221,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // into a single OpenMP parallel for loop enabling parallel loads/reads // to/from the tile. 
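The CUDA tiled policy in this hunk also loses its tile-size and policy template arguments. A reconstruction under the same assumptions (TILE_DIM-sized tiles, CUDA-enabled build), mapping one thread block per tile and one thread per tile entry:

#if defined(RAJA_ENABLE_CUDA)
using TILED_KERNEL_EXEC_POL_CUDA = RAJA::KernelPolicy<
  RAJA::statement::CudaKernel<
    // Blocks sweep the tiles ...
    RAJA::statement::Tile<1, RAJA::tile_fixed<TILE_DIM>, RAJA::cuda_block_y_loop,
      RAJA::statement::Tile<0, RAJA::tile_fixed<TILE_DIM>, RAJA::cuda_block_x_loop,
        // ... threads sweep the entries within each tile.
        RAJA::statement::For<1, RAJA::cuda_thread_x_direct,
          RAJA::statement::For<0, RAJA::cuda_thread_y_direct,
            RAJA::statement::Lambda<0>
          >
        >
      >
    >
  >
>;

//   RAJA::kernel<TILED_KERNEL_EXEC_POL_CUDA>(
//     RAJA::make_tuple(col_Range, row_Range),
//     [=] RAJA_DEVICE (int col, int row) {
//       Atview(col, row) = Aview(row, col);
//     });
#endif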
// - using KERNEL_EXEC_POL_OMP2 = + using TILED_KERNEL_EXEC_POL_OMP2 = RAJA::KernelPolicy< - RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::seq_exec, - RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::seq_exec, + RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::loop_exec, + RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::loop_exec, RAJA::statement::Collapse, RAJA::statement::Lambda<0> @@ -237,12 +233,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > // closes Tile 1 >; // closes policy list - RAJA::kernel( - RAJA::make_tuple(col_Range, row_Range), - [=](int col, int row) { - - Atview(col, row) = Aview(row, col); - + RAJA::kernel( RAJA::make_tuple(col_Range, row_Range), + [=](int col, int row) { + Atview(col, row) = Aview(row, col); }); checkResult(Atview, N_c, N_r); @@ -257,14 +250,15 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::memset(At, 0, N_r * N_c * sizeof(int)); - using KERNEL_EXEC_POL_CUDA = + // _raja_mattranspose_cuda_start + using TILED_KERNEL_EXEC_POL_CUDA = RAJA::KernelPolicy< RAJA::statement::CudaKernel< RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::cuda_block_y_loop, RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::cuda_block_x_loop, RAJA::statement::For<1, RAJA::cuda_thread_x_direct, RAJA::statement::For<0, RAJA::cuda_thread_y_direct, - RAJA::statement::Lambda<0> + RAJA::statement::Lambda<0> > > > @@ -272,13 +266,11 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; - RAJA::kernel( - RAJA::make_tuple(col_Range, row_Range), - [=] RAJA_DEVICE (int col, int row) { - - Atview(col, row) = Aview(row, col); - + RAJA::kernel( RAJA::make_tuple(col_Range, row_Range), + [=] RAJA_DEVICE (int col, int row) { + Atview(col, row) = Aview(row, col); }); + // _raja_mattranspose_cuda_end checkResult(Atview, N_c, N_r); //printResult(Atview, N_c, N_r); @@ -299,14 +291,14 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) hipErrchk(hipMemcpy( d_A, A, N_r * N_c * sizeof(int), hipMemcpyHostToDevice )); hipErrchk(hipMemcpy( d_At, At, N_r * N_c * sizeof(int), hipMemcpyHostToDevice )); - using KERNEL_EXEC_POL_HIP = + using TILED_KERNEL_EXEC_POL_HIP = RAJA::KernelPolicy< RAJA::statement::HipKernel< RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::hip_block_y_loop, RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::hip_block_x_loop, RAJA::statement::For<1, RAJA::hip_thread_x_direct, RAJA::statement::For<0, RAJA::hip_thread_y_direct, - RAJA::statement::Lambda<0> + RAJA::statement::Lambda<0> > > > @@ -314,12 +306,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; - RAJA::kernel( - RAJA::make_tuple(col_Range, row_Range), - [=] RAJA_DEVICE (int col, int row) { - - d_Atview(col, row) = d_Aview(row, col); - + RAJA::kernel( RAJA::make_tuple(col_Range, row_Range), + [=] RAJA_DEVICE (int col, int row) { + d_Atview(col, row) = d_Aview(row, col); }); hipErrchk(hipMemcpy( At, d_At, N_r * N_c * sizeof(int), hipMemcpyDeviceToHost )); diff --git a/exercises/kernel-matrix-transpose.cpp b/exercises/kernel-matrix-transpose.cpp new file mode 100644 index 0000000000..01b0ff5b78 --- /dev/null +++ b/exercises/kernel-matrix-transpose.cpp @@ -0,0 +1,272 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-22, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include +#include +#include +#include + +#include "memoryManager.hpp" + +#include "RAJA/RAJA.hpp" + +/* + * Tiled Matrix Transpose Exercise + * + * In this exercise, an input matrix A of dimension N_r x N_c is + * transposed and returned as a second matrix At. + * + * RAJA features shown: + * - Basic usage of 'RAJA::kernel' abstractions for nested loops + * + * If CUDA is enabled, CUDA unified memory is used. + */ + +// +// Define dimensionality of matrices +// +constexpr int DIM = 2; + +// +// Function for checking results +// +template +void checkResult(RAJA::View> Atview, int N_r, int N_c); + +// +// Function for printing results +// +template +void printResult(RAJA::View> Atview, int N_r, int N_c); + + +int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +{ + + std::cout << "\n\nRAJA matrix transpose exercise...\n"; + + // + // Define num rows/cols in matrix. + // + // _mattranspose_dims_start + constexpr int N_r = 56; + constexpr int N_c = 75; + // _mattranspose_dims_end + + // + // Allocate matrix data + // + int *A = memoryManager::allocate(N_r * N_c); + int *At = memoryManager::allocate(N_r * N_c); + + // + // In the following implementations of matrix transpose, we + // use RAJA 'View' objects to access the matrix data. A RAJA view + // holds a pointer to a data array and enables multi-dimensional indexing + // into the data. + // + // _mattranspose_views_start + RAJA::View> Aview(A, N_r, N_c); + RAJA::View> Atview(At, N_c, N_r); + // _mattranspose_views_end + + // + // Initialize matrix data + // + for (int row = 0; row < N_r; ++row) { + for (int col = 0; col < N_c; ++col) { + Aview(row, col) = col; + } + } + //printResult(Aview, N_r, N_c); + + //----------------------------------------------------------------------------// + std::cout << "\n Running C-version of matrix transpose...\n"; + + std::memset(At, 0, N_r * N_c * sizeof(int)); + + // _cstyle_mattranspose_start + for (int row = 0; row < N_r; ++row) { + for (int col = 0; col < N_c; ++col) { + Atview(col, row) = Aview(row, col); + } + } + // _cstyle_mattranspose_end + + checkResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); + + //----------------------------------------------------------------------------// + + // + // The following RAJA variants use the RAJA::kernel method to carryout the + // transpose. + // + // Here, we define RAJA range segments to establish the iteration spaces. + // Iterations inside a RAJA loop is given by their global iteration number. + // +//RAJA::TypedRangeSegment row_Range(0, N_r); +//RAJA::TypedRangeSegment col_Range(0, N_c); + + //----------------------------------------------------------------------------// + std::cout << "\n Running sequential matrix transpose ...\n"; + std::memset(At, 0, N_r * N_c * sizeof(int)); + + // + // The following policy carries out the transpose + // using sequential loops. + // + // _raja_mattranspose_start + + /// + /// TODO... + /// + /// EXERCISE: Implement a sequential RAJA::kernel execution policy for a + /// basic matrix transpose. + /// + /// Uncomment 'row_Range' and 'col_Range' objects above so they + /// can be used in the kernel. + /// + + /// + /// TODO... + /// + /// EXERCISE: Uncomment this code block. 
+ /// + /* + RAJA::kernel( RAJA::make_tuple(col_Range, row_Range), + [=](int col, int row) { + Atview(col, row) = Aview(row, col); + }); + */ + // _raja_mattranspose_end + + checkResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); +//----------------------------------------------------------------------------// +#if defined(RAJA_ENABLE_OPENMP) + std::cout << "\n Running openmp matrix transpose - parallel top inner loop...\n"; + + std::memset(At, 0, N_r * N_c * sizeof(int)); + + // + // This policy loops sequentially while exposing parallelism on + // one of the inner loops. + // + + /// + /// TODO... + /// + /// EXERCISE: Implement an openmp RAJA::kernel execution policy for a + /// basic matrix transpose. + /// + /// Uncomment 'row_Range' and 'col_Range' objects above so they + /// can be used in the kernel. + /// + + /// + /// TODO... + /// + /// EXERCISE: Uncomment this code block. + /// + /* + RAJA::kernel( RAJA::make_tuple(col_Range, row_Range), + [=](int col, int row) { + Atview(col, row) = Aview(row, col); + }); + */ + + checkResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); +#endif +//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_CUDA) + std::cout << "\n Running cuda matrix transpose ...\n"; + + std::memset(At, 0, N_r * N_c * sizeof(int)); + + // _raja_mattranspose_cuda_start + + /// + /// TODO... + /// + /// EXERCISE: Implement a CUDA RAJA::kernel execution policy for a + /// basic matrix transpose. + /// + /// Uncomment 'row_Range' and 'col_Range' objects above so they + /// can be used in the kernel. + /// + + /// + /// TODO... + /// + /// EXERCISE: Uncomment this code block. + /// + /* + RAJA::kernel( RAJA::make_tuple(col_Range, row_Range), + [=] RAJA_DEVICE (int col, int row) { + Atview(col, row) = Aview(row, col); + }); + */ + // _raja_mattranspose_cuda_end + + checkResult(Atview, N_c, N_r); + //printResult(Atview, N_c, N_r); +#endif + +//----------------------------------------------------------------------------// + + // + // Clean up. + // + memoryManager::deallocate(A); + memoryManager::deallocate(At); + + std::cout << "\n DONE!...\n"; + + return 0; +} + +// +// Function to check result and report P/F. +// +template +void checkResult(RAJA::View> Atview, int N_r, int N_c) +{ + bool match = true; + for (int row = 0; row < N_r; ++row) { + for (int col = 0; col < N_c; ++col) { + if (Atview(row, col) != row) { + match = false; + } + } + } + if (match) { + std::cout << "\n\t result -- PASS\n"; + } else { + std::cout << "\n\t result -- FAIL\n"; + } +}; + +// +// Function to print result. +// +template +void printResult(RAJA::View> Atview, int N_r, int N_c) +{ + std::cout << std::endl; + for (int row = 0; row < N_r; ++row) { + for (int col = 0; col < N_c; ++col) { + // std::cout << "At(" << row << "," << col << ") = " << Atview(row, col) + // << std::endl; + std::cout< +#include +#include +#include + +#include "memoryManager.hpp" + +#include "RAJA/RAJA.hpp" + +/* + * Tiled Matrix Transpose Exercise + * + * In this exercise, an input matrix A of dimension N_r x N_c is + * transposed and returned as a second matrix At. + * + * RAJA features shown: + * - Basic usage of 'RAJA::kernel' abstractions for nested loops + * + * If CUDA is enabled, CUDA unified memory is used. 
+ */ + +// +// Define dimensionality of matrices +// +constexpr int DIM = 2; + +// +// Function for checking results +// +template +void checkResult(RAJA::View> Atview, int N_r, int N_c); + +// +// Function for printing results +// +template +void printResult(RAJA::View> Atview, int N_r, int N_c); + + +int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +{ + + std::cout << "\n\nRAJA matrix transpose exercise...\n"; + + // + // Define num rows/cols in matrix. + // + // _mattranspose_dims_start + constexpr int N_r = 56; + constexpr int N_c = 75; + // _mattranspose_dims_end + + // + // Allocate matrix data + // + int *A = memoryManager::allocate(N_r * N_c); + int *At = memoryManager::allocate(N_r * N_c); + + // + // In the following implementations of matrix transpose, we + // use RAJA 'View' objects to access the matrix data. A RAJA view + // holds a pointer to a data array and enables multi-dimensional indexing + // into the data. + // + // _mattranspose_views_start + RAJA::View> Aview(A, N_r, N_c); + RAJA::View> Atview(At, N_c, N_r); + // _mattranspose_views_end + + // + // Initialize matrix data + // + for (int row = 0; row < N_r; ++row) { + for (int col = 0; col < N_c; ++col) { + Aview(row, col) = col; + } + } + //printResult(Aview, N_r, N_c); + + //----------------------------------------------------------------------------// + std::cout << "\n Running C-version of matrix transpose...\n"; + + std::memset(At, 0, N_r * N_c * sizeof(int)); + + // _cstyle_mattranspose_start + for (int row = 0; row < N_r; ++row) { + for (int col = 0; col < N_c; ++col) { + Atview(col, row) = Aview(row, col); + } + } + // _cstyle_mattranspose_end + + checkResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); + + //--------------------------------------------------------------------------// + + // + // The following RAJA variants use the RAJA::kernel method to carryout the + // transpose. + // + // Here, we define RAJA range segments to establish the iteration spaces. + // Iterations inside a RAJA loop is given by their global iteration number. + // + RAJA::TypedRangeSegment row_Range(0, N_r); + RAJA::TypedRangeSegment col_Range(0, N_c); + + //----------------------------------------------------------------------------// + std::cout << "\n Running sequential matrix transpose ...\n"; + std::memset(At, 0, N_r * N_c * sizeof(int)); + + // + // The following policy carries out the transpose + // using sequential loops. + // + // _raja_mattranspose_start + using KERNEL_EXEC_POL = + RAJA::KernelPolicy< + RAJA::statement::For<1, RAJA::loop_exec, + RAJA::statement::For<0, RAJA::loop_exec, + RAJA::statement::Lambda<0> + > + > + >; + + RAJA::kernel( RAJA::make_tuple(col_Range, row_Range), + [=](int col, int row) { + Atview(col, row) = Aview(row, col); + }); + // _raja_mattranspose_end + + checkResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); +//----------------------------------------------------------------------------// +#if defined(RAJA_ENABLE_OPENMP) + std::cout << "\n Running openmp matrix transpose - parallel top inner loop...\n"; + + std::memset(At, 0, N_r * N_c * sizeof(int)); + + // + // This policy loops sequentially while exposing parallelism on + // one of the inner loops. 
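In the solution variant above, the range segments and the kernel launch take template arguments that are not visible here (the index element type and the execution policy). A sketch of how those lines typically read, assuming int indices:

RAJA::TypedRangeSegment<int> row_Range(0, N_r);
RAJA::TypedRangeSegment<int> col_Range(0, N_c);

// The policy type is passed explicitly to RAJA::kernel; tuple position 0 is
// the column range and position 1 the row range, so the lambda sees (col, row).
RAJA::kernel<KERNEL_EXEC_POL>(
    RAJA::make_tuple(col_Range, row_Range),
    [=](int col, int row) {
      Atview(col, row) = Aview(row, col);
    });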
+ // + using KERNEL_EXEC_POL_OMP = + RAJA::KernelPolicy< + RAJA::statement::For<1, RAJA::omp_parallel_for_exec, + RAJA::statement::For<0, RAJA::loop_exec, + RAJA::statement::Lambda<0> + > + > + >; + + RAJA::kernel( RAJA::make_tuple(col_Range, row_Range), + [=](int col, int row) { + Atview(col, row) = Aview(row, col); + }); + + checkResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); +#endif +//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_CUDA) + std::cout << "\n Running cuda matrix transpose ...\n"; + + std::memset(At, 0, N_r * N_c * sizeof(int)); + + // _raja_mattranspose_cuda_start + using KERNEL_EXEC_POL_CUDA = + RAJA::KernelPolicy< + RAJA::statement::CudaKernel< + RAJA::statement::For<1, RAJA::cuda_thread_x_loop, + RAJA::statement::For<0, RAJA::cuda_thread_y_loop, + RAJA::statement::Lambda<0> + > + > + > + >; + + RAJA::kernel( RAJA::make_tuple(col_Range, row_Range), + [=] RAJA_DEVICE (int col, int row) { + Atview(col, row) = Aview(row, col); + }); + // _raja_mattranspose_cuda_end + + checkResult(Atview, N_c, N_r); + //printResult(Atview, N_c, N_r); +#endif + +//----------------------------------------------------------------------------// + + // + // Clean up. + // + memoryManager::deallocate(A); + memoryManager::deallocate(At); + + std::cout << "\n DONE!...\n"; + + return 0; +} + +// +// Function to check result and report P/F. +// +template +void checkResult(RAJA::View> Atview, int N_r, int N_c) +{ + bool match = true; + for (int row = 0; row < N_r; ++row) { + for (int col = 0; col < N_c; ++col) { + if (Atview(row, col) != row) { + match = false; + } + } + } + if (match) { + std::cout << "\n\t result -- PASS\n"; + } else { + std::cout << "\n\t result -- FAIL\n"; + } +}; + +// +// Function to print result. +// +template +void printResult(RAJA::View> Atview, int N_r, int N_c) +{ + std::cout << std::endl; + for (int row = 0; row < N_r; ++row) { + for (int col = 0; col < N_c; ++col) { + // std::cout << "At(" << row << "," << col << ") = " << Atview(row, col) + // << std::endl; + std::cout< +#include +#include +#include +#include + +#include "RAJA/RAJA.hpp" + +#include "camp/resource.hpp" + +#include "memoryManager.hpp" + +/* + * RAJA::kernel execution policies + * + * In this exercise, you will use a variety of nested-loop execution + * policies to initalize entries in a three-dimensional tensor. The + * goal of the exercise is to gain familiarity with RAJA::kernel + * execution policies for various RAJA execution back-ends. + * + * RAJA features you will use: + * - `RAJA::kernel` kernel execution template method and exec policies + * - Simple RAJA View/Layout + * - RAJA Range segment + * + * If CUDA is enabled, CUDA unified memory is used. + * If HIP is enabled, HIP global device memory is used, with explicit + * host-device mem copy operations. + */ + +#if defined(RAJA_ENABLE_CUDA) +// _cuda_tensorinit_kernel_start +template< int i_block_size, int j_block_size, int k_block_size > +__launch_bounds__(i_block_size*j_block_size*k_block_size) +__global__ void nested_init(double* a, double c, int N) +{ + int i = blockIdx.x * i_block_size + threadIdx.x; + int j = blockIdx.y * j_block_size + threadIdx.y; + int k = blockIdx.z; + + if ( i < N && j < N && k < N ) { + a[i+N*(j+N*k)] = c * i * j * k ; + } +} +// _cuda_tensorinit_kernel_end +#endif + +// +// Function to check result. 
+// +void checkResult(double* a, double* aref, const int n); + + +int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +{ + + std::cout << "\n\nExercise: RAJA::kernel execution policies tensor init...\n"; + +// _init_define_start +// +// 3D tensor has N^3 entries +// + constexpr int N = 100; + constexpr int N_tot = N * N * N; + constexpr double c = 0.0001; + double* a = memoryManager::allocate(N_tot); + double* a_ref = memoryManager::allocate(N_tot); +// _init_define_end + +//----------------------------------------------------------------------------// +// C-style sequential variant establishes reference solution to compare with. +//----------------------------------------------------------------------------// + + std::cout << "\n Running C-style sequential tensor init: create reference solution ...\n"; + +// _cstyle_tensorinit_seq_start + for (int k = 0; k < N; ++k ) { + for (int j = 0; j < N; ++j ) { + for (int i = 0; i < N; ++i ) { + a_ref[i+N*(j+N*k)] = c * i * j * k ; + } + } + } +// _cstyle_tensorinit_seq_end + + +//----------------------------------------------------------------------------// +// We introduce a RAJA View to wrap the tensor data pointer and simplify +// multi-dimensional indexing. +// We use this in the rest of the examples in this file. +//----------------------------------------------------------------------------// + + std::cout << "\n Running C-style sequential tensor init...\n"; + +// _3D_raja_view_start + RAJA::View< double, RAJA::Layout<3, int> > aView(a, N, N, N); +// _3D_raja_view_end + +// _cstyle_tensorinit_view_seq_start + for (int k = 0; k < N; ++k ) { + for (int j = 0; j < N; ++j ) { + for (int i = 0; i < N; ++i ) { + aView(i, j, k) = c * i * j * k ; + } + } + } +// _cstyle_tensorinit_view_seq_end + + checkResult(a, a_ref, N_tot); + +//----------------------------------------------------------------------------// + + std::cout << "\n Running RAJA sequential tensor init...\n"; + + // set tensor data to zero to ensure we initializing it correctly. + std::memset(a, 0, N_tot * sizeof(double)); + + /// + /// TODO... + /// + /// EXERCISE: Implement a sequential RAJA::kernel based version of the + /// the tensor initialization kernel. Hint: recall the + /// kernelintro-nested-loop-reorder.cpp exercise file used in + /// the previous tutorial section. + /// + + checkResult(a, a_ref, N_tot); + +#if defined(RAJA_ENABLE_OPENMP) + +//----------------------------------------------------------------------------// +// C-style and RAJA OpenMP multithreading variants. +//----------------------------------------------------------------------------// + + std::cout << "\n Running C-style OpenMP tensor init...\n"; + + // set tensor data to zero to ensure we initializing it correctly. + std::memset(a, 0, N_tot * sizeof(double)); + + // _cstyle_tensorinit_omp_outer_start + #pragma omp parallel for + for (int k = 0; k < N; ++k ) { + for (int j = 0; j < N; ++j ) { + for (int i = 0; i < N; ++i ) { + aView(i, j, k) = c * i * j * k ; + } + } + } +// _cstyle_tensorinit_omp_outer_end + + checkResult(a, a_ref, N_tot); + +//----------------------------------------------------------------------------// + + std::cout << "\n Running RAJA OpenMP tensor init...\n"; + + // set tensor data to zero to ensure we initializing it correctly. 
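The sequential tensor-init TODO above is answered by the solution file later in this diff with a three-deep nest of sequential For statements. A sketch with the template arguments written out (the EXEC_POL1 name and int segments mirror that solution):

using EXEC_POL1 = RAJA::KernelPolicy<
  RAJA::statement::For<2, RAJA::loop_exec,       // k
    RAJA::statement::For<1, RAJA::loop_exec,     // j
      RAJA::statement::For<0, RAJA::loop_exec,   // i
        RAJA::statement::Lambda<0>
      >
    >
  >
>;

//   RAJA::kernel<EXEC_POL1>(
//     RAJA::make_tuple(RAJA::TypedRangeSegment<int>(0, N),
//                      RAJA::TypedRangeSegment<int>(0, N),
//                      RAJA::TypedRangeSegment<int>(0, N)),
//     [=](int i, int j, int k) { aView(i, j, k) = c * i * j * k; });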
+ std::memset(a, 0, N_tot * sizeof(double)); + +// _raja_tensorinit_omp_outer_start + using EXEC_POL2 = + RAJA::KernelPolicy< + RAJA::statement::For<2, RAJA::omp_parallel_for_exec, // k + RAJA::statement::For<1, RAJA::loop_exec, // j + RAJA::statement::For<0, RAJA::loop_exec, // i + RAJA::statement::Lambda<0> + > + > + > + >; + + RAJA::kernel( + RAJA::make_tuple( RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N) ), + + [=]( int i, int j, int k) { + aView(i, j, k) = c * i * j * k ; + } + ); +// _raja_tensorinit_omp_outer_end + + checkResult(a, a_ref, N_tot); + +//----------------------------------------------------------------------------// + + std::cout << "\n Running C-style OpenMP collapse (3) tensor init...\n"; + + // set tensor data to zero to ensure we initializing it correctly. + std::memset(a, 0, N_tot * sizeof(double)); + + // _cstyle_tensorinit_omp_collapse_start + #pragma omp parallel for collapse(3) + for (int k = 0; k < N; ++k ) { + for (int j = 0; j < N; ++j ) { + for (int i = 0; i < N; ++i ) { + aView(i, j, k) = c * i * j * k ; + } + } + } +// _cstyle_tensorinit_omp_collapse_end + + checkResult(a, a_ref, N_tot); + +//----------------------------------------------------------------------------// + + std::cout << "\n Running RAJA OpenMP collapse(3) tensor init...\n"; + + // set tensor data to zero to ensure we initializing it correctly. + std::memset(a, 0, N_tot * sizeof(double)); + +// _raja_tensorinit_omp_collapse_start + using EXEC_POL3 = + RAJA::KernelPolicy< + RAJA::statement::Collapse, // k, j, i + RAJA::statement::Lambda<0> + > + >; + + RAJA::kernel( + RAJA::make_tuple( RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N) ), + + [=]( int i, int j, int k) { + aView(i, j, k) = c * i * j * k ; + } + ); +// _raja_tensorinit_omp_collapse_end + + checkResult(a, a_ref, N_tot); + +//----------------------------------------------------------------------------// + + std::cout << "\n Running RAJA OpenMP collapse(2) tensor init...\n"; + + // set tensor data to zero to ensure we initializing it correctly. + std::memset(a, 0, N_tot * sizeof(double)); + + /// + /// TODO... + /// + /// EXERCISE: Implement an OpenMP RAJA::kernel based version of the + /// kernel that collapses the outer two (k, j) loops and + /// runs the inner 'i' loop sequentially. Hint: adjust the + /// entries in the 'ArgList' above and insert a 'For' statement + /// statement to execute the inner loop. + /// + + checkResult(a, a_ref, N_tot); + +#endif // if defined(RAJA_ENABLE_OPENMP) + + +#if defined(RAJA_ENABLE_CUDA) + +//----------------------------------------------------------------------------// +// C-style and RAJA CUDA GPU variants. +//----------------------------------------------------------------------------// + + std::cout << "\n Running RAJA CUDA tensor init...\n"; + + // set tensor data to zero to ensure we initializing it correctly. 
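The collapse(2) TODO above asks for the outer k and j loops to be fused; the solution file later in this diff defines EXEC_POL4 for exactly this. A reconstruction with the Collapse template arguments restored (the ArgList order is an assumption matching k outer, j inner):

#if defined(RAJA_ENABLE_OPENMP)
using EXEC_POL4 = RAJA::KernelPolicy<
  // Fuse the k (argument 2) and j (argument 1) loops into one OpenMP loop,
  // then run the i (argument 0) loop sequentially inside each iteration.
  RAJA::statement::Collapse<RAJA::omp_parallel_collapse_exec,
                            RAJA::ArgList<2, 1>,
    RAJA::statement::For<0, RAJA::loop_exec,
      RAJA::statement::Lambda<0>
    >
  >
>;
#endif

// Launched exactly like the collapse(3) variant above; only the policy changes.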
+ std::memset(a, 0, N_tot * sizeof(double)); + +// _raja_tensorinit_cuda_start + using EXEC_POL5 = + RAJA::KernelPolicy< + RAJA::statement::CudaKernel< + RAJA::statement::For<2, RAJA::cuda_thread_z_loop, // k + RAJA::statement::For<1, RAJA::cuda_thread_y_loop, // j + RAJA::statement::For<0, RAJA::cuda_thread_x_loop, // i + RAJA::statement::Lambda<0> + > + > + > + > + >; + + RAJA::kernel( + RAJA::make_tuple( RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N) ), + + [=] __device__ ( int i, int j, int k) { + aView(i, j, k) = c * i * j * k ; + } + ); +// _raja_tensorinit_cuda_end + + checkResult(a, a_ref, N_tot); + +//----------------------------------------------------------------------------// + + std::cout << "\n Running RAJA CUDA tensor init tiled-direct...\n"; + + // set tensor data to zero to ensure we initializing it correctly. + std::memset(a, 0, N_tot * sizeof(double)); + + // + // Define total thread-block size and size of each block dimension + // +// _cuda_blockdim_start + constexpr int block_size = 256; + constexpr int i_block_sz = 32; + constexpr int j_block_sz = block_size / i_block_sz; + constexpr int k_block_sz = 1; +// _cuda_blockdim_end + +// _raja_tensorinit_cuda_tiled_direct_start + using EXEC_POL6 = + RAJA::KernelPolicy< + RAJA::statement::CudaKernelFixed< i_block_sz * j_block_sz * k_block_sz, + RAJA::statement::Tile<1, RAJA::tile_fixed, + RAJA::cuda_block_y_direct, + RAJA::statement::Tile<0, RAJA::tile_fixed, + RAJA::cuda_block_x_direct, + RAJA::statement::For<2, RAJA::cuda_block_z_direct, // k + RAJA::statement::For<1, RAJA::cuda_thread_y_direct, // j + RAJA::statement::For<0, RAJA::cuda_thread_x_direct, // i + RAJA::statement::Lambda<0> + > + > + > + > + > + > + >; + + RAJA::kernel( + RAJA::make_tuple( RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N) ), + + [=] __device__ ( int i, int j, int k) { + aView(i, j, k) = c * i * j * k ; + } + ); +// _raja_tensorinit_cuda_tiled_direct_end + + checkResult(a, a_ref, N_tot); + +//----------------------------------------------------------------------------// + + std::cout << "\n Running CUDA tensor init tiled-direct...\n"; + + // set tensor data to zero to ensure we initializing it correctly. + std::memset(a, 0, N_tot * sizeof(double)); + +// _cuda_tensorinit_tiled_direct_start + dim3 nthreads_per_block(i_block_sz, j_block_sz, k_block_sz); + static_assert(i_block_sz*j_block_sz*k_block_sz == block_size, + "Invalid block_size"); + + dim3 nblocks(static_cast(RAJA_DIVIDE_CEILING_INT(N, i_block_sz)), + static_cast(RAJA_DIVIDE_CEILING_INT(N, j_block_sz)), + static_cast(RAJA_DIVIDE_CEILING_INT(N, k_block_sz))); + + nested_init + <<>>(a, c, N); + cudaErrchk( cudaGetLastError() ); + cudaErrchk(cudaDeviceSynchronize()); +// _cuda_tensorinit_tiled_direct_end + + checkResult(a, a_ref, N_tot); + +#endif // if defined(RAJA_ENABLE_CUDA) + + +#if defined(RAJA_ENABLE_HIP) + +//----------------------------------------------------------------------------// +// RAJA HIP GPU variants. +//----------------------------------------------------------------------------// + + std::cout << "\n Running RAJA HIP tensor init...\n"; + + // set tensor data to zero to ensure we initializing it correctly. 
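In the raw CUDA comparison above, the kernel's block-size template arguments and the <<<grid, block>>> launch configuration do not survive in the text. A reconstruction, assuming the block-size constants defined just before it (the cast target type in the grid computation is an assumption):

#if defined(RAJA_ENABLE_CUDA)
  dim3 nthreads_per_block(i_block_sz, j_block_sz, k_block_sz);
  static_assert(i_block_sz * j_block_sz * k_block_sz == block_size,
                "Invalid block_size");

  // One grid cell per block-sized chunk of the N x N x N index space.
  dim3 nblocks(static_cast<unsigned int>(RAJA_DIVIDE_CEILING_INT(N, i_block_sz)),
               static_cast<unsigned int>(RAJA_DIVIDE_CEILING_INT(N, j_block_sz)),
               static_cast<unsigned int>(RAJA_DIVIDE_CEILING_INT(N, k_block_sz)));

  nested_init<i_block_sz, j_block_sz, k_block_sz>
      <<<nblocks, nthreads_per_block>>>(a, c, N);
  cudaErrchk(cudaGetLastError());
  cudaErrchk(cudaDeviceSynchronize());
#endif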
+ std::memset(a, 0, N_tot * sizeof(double)); + double *d_a = memoryManager::allocate_gpu(N_tot); + +// _3D_raja_device_view_start + RAJA::View< double, RAJA::Layout<3, int> > d_aView(d_a, N, N, N); +// _3D_raja_device_view_end + + hipErrchk(hipMemcpy( d_a, a, N_tot * sizeof(double), hipMemcpyHostToDevice )); + +// _raja_tensorinit_hip_start + using EXEC_POL7 = + RAJA::KernelPolicy< + RAJA::statement::HipKernel< + RAJA::statement::For<2, RAJA::hip_thread_z_loop, // k + RAJA::statement::For<1, RAJA::hip_thread_y_loop, // j + RAJA::statement::For<0, RAJA::hip_thread_x_loop, // i + RAJA::statement::Lambda<0> + > + > + > + > + >; + + RAJA::kernel( + RAJA::make_tuple( RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N) ), + + [=] __device__ ( int i, int j, int k) { + d_aView(i, j, k) = c * i * j * k ; + } + ); +// _raja_tensorinit_hip_end + + hipErrchk(hipMemcpy( a, d_a, N_tot * sizeof(double), hipMemcpyDeviceToHost )); + checkResult(a, a_ref, N_tot); + +//----------------------------------------------------------------------------// + + std::cout << "\n Running RAJA HIP tensor init tiled-direct...\n"; + + // + // Define total thread-block size and size of each block dimension + // + constexpr int block_size = 256; + constexpr int i_block_sz = 32; + constexpr int j_block_sz = block_size / i_block_sz; + constexpr int k_block_sz = 1; + + // set tensor data to zero to ensure we initializing it correctly. + std::memset(a, 0, N_tot * sizeof(double)); + hipErrchk(hipMemcpy( d_a, a, N_tot * sizeof(double), hipMemcpyHostToDevice )); + +// _raja_tensorinit_hip_tiled_direct_start + using EXEC_POL8 = + RAJA::KernelPolicy< + RAJA::statement::HipKernelFixed< i_block_sz * j_block_sz * k_block_sz, + RAJA::statement::Tile<1, RAJA::tile_fixed, + RAJA::hip_block_y_direct, + RAJA::statement::Tile<0, RAJA::tile_fixed, + RAJA::hip_block_x_direct, + RAJA::statement::For<2, RAJA::hip_block_z_direct, // k + RAJA::statement::For<1, RAJA::hip_thread_y_direct, // j + RAJA::statement::For<0, RAJA::hip_thread_x_direct, // i + RAJA::statement::Lambda<0> + > + > + > + > + > + > + >; + + RAJA::kernel( + RAJA::make_tuple( RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N) ), + + [=] __device__ ( int i, int j, int k) { + d_aView(i, j, k) = c * i * j * k ; + } + ); +// _raja_tensorinit_hip_tiled_direct_end + + hipErrchk(hipMemcpy( a, d_a, N_tot * sizeof(double), hipMemcpyDeviceToHost )); + checkResult(a, a_ref, N_tot); + + memoryManager::deallocate_gpu(d_a); + +#endif // if defined(RAJA_ENABLE_HIP) + +//----------------------------------------------------------------------------// + + // Clean up... + memoryManager::deallocate(a); + memoryManager::deallocate(a_ref); + + std::cout << "\n DONE!...\n"; + + return 0; +} + +// +// Function to compare result to reference and print result P/F. 
+// +void checkResult(double* a, double* aref, const int n) +{ + bool correct = true; + + int i = 0; + while ( correct && (i < n) ) { + correct = std::abs(a[i] - aref[i]) < 10e-12; + i++; + } + + if ( correct ) { + std::cout << "\n\t result -- PASS\n"; + } else { + std::cout << "\n\t result -- FAIL\n"; + } +} diff --git a/exercises/kernelintro-execpols_solution.cpp b/exercises/kernelintro-execpols_solution.cpp new file mode 100644 index 0000000000..50c360dde6 --- /dev/null +++ b/exercises/kernelintro-execpols_solution.cpp @@ -0,0 +1,533 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-22, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include +#include +#include +#include +#include + +#include "RAJA/RAJA.hpp" + +#include "camp/resource.hpp" + +#include "memoryManager.hpp" + +/* + * RAJA::kernel execution policies + * + * In this exercise, you will use a variety of nested-loop execution + * policies to initalize entries in a three-dimensional tensor. The + * goal of the exercise is to gain familiarity with RAJA::kernel + * execution policies for various RAJA execution back-ends. + * + * RAJA features you will use: + * - `RAJA::kernel` kernel execution template method and exec policies + * - Simple RAJA View/Layout + * - RAJA Range segment + * + * If CUDA is enabled, CUDA unified memory is used. + * If HIP is enabled, HIP global device memory is used, with explicit + * host-device mem copy operations. + */ + +#if defined(RAJA_ENABLE_CUDA) +// _cuda_tensorinit_kernel_start +template< int i_block_size, int j_block_size, int k_block_size > +__launch_bounds__(i_block_size*j_block_size*k_block_size) +__global__ void nested_init(double* a, double c, int N) +{ + int i = blockIdx.x * i_block_size + threadIdx.x; + int j = blockIdx.y * j_block_size + threadIdx.y; + int k = blockIdx.z; + + if ( i < N && j < N && k < N ) { + a[i+N*(j+N*k)] = c * i * j * k ; + } +} +// _cuda_tensorinit_kernel_end +#endif + +// +// Function to check result. +// +void checkResult(double* a, double* aref, const int n); + + +int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +{ + + std::cout << "\n\nExercise: RAJA::kernel execution policies tensor init...\n"; + +// _init_define_start +// +// 3D tensor has N^3 entries +// + constexpr int N = 100; + constexpr int N_tot = N * N * N; + constexpr double c = 0.0001; + double* a = memoryManager::allocate(N_tot); + double* a_ref = memoryManager::allocate(N_tot); +// _init_define_end + +//----------------------------------------------------------------------------// +// C-style sequential variant establishes reference solution to compare with. +//----------------------------------------------------------------------------// + + std::cout << "\n Running C-style sequential tensor init: create reference solution ...\n"; + +// _cstyle_tensorinit_seq_start + for (int k = 0; k < N; ++k ) { + for (int j = 0; j < N; ++j ) { + for (int i = 0; i < N; ++i ) { + a_ref[i+N*(j+N*k)] = c * i * j * k ; + } + } + } +// _cstyle_tensorinit_seq_end + + +//----------------------------------------------------------------------------// +// We introduce a RAJA View to wrap the tensor data pointer and simplify +// multi-dimensional indexing. +// We use this in the rest of the examples in this file. 
+//----------------------------------------------------------------------------// + + std::cout << "\n Running C-style sequential tensor init...\n"; + +// _3D_raja_view_start + RAJA::View< double, RAJA::Layout<3, int> > aView(a, N, N, N); +// _3D_raja_view_end + +// _cstyle_tensorinit_view_seq_start + for (int k = 0; k < N; ++k ) { + for (int j = 0; j < N; ++j ) { + for (int i = 0; i < N; ++i ) { + aView(i, j, k) = c * i * j * k ; + } + } + } +// _cstyle_tensorinit_view_seq_end + + checkResult(a, a_ref, N_tot); + +//----------------------------------------------------------------------------// + + std::cout << "\n Running RAJA sequential tensor init...\n"; + + // set tensor data to zero to ensure we initializing it correctly. + std::memset(a, 0, N_tot * sizeof(double)); + +// _raja_tensorinit_seq_start + using EXEC_POL1 = + RAJA::KernelPolicy< + RAJA::statement::For<2, RAJA::loop_exec, // k + RAJA::statement::For<1, RAJA::loop_exec, // j + RAJA::statement::For<0, RAJA::loop_exec,// i + RAJA::statement::Lambda<0> + > + > + > + >; + + RAJA::kernel( + RAJA::make_tuple( RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N) ), + + [=]( int i, int j, int k) { + aView(i, j, k) = c * i * j * k ; + } + ); +// _raja_tensorinit_seq_end + + checkResult(a, a_ref, N_tot); + +#if defined(RAJA_ENABLE_OPENMP) + +//----------------------------------------------------------------------------// +// C-style and RAJA OpenMP multithreading variants. +//----------------------------------------------------------------------------// + + std::cout << "\n Running C-style OpenMP tensor init...\n"; + + // set tensor data to zero to ensure we initializing it correctly. + std::memset(a, 0, N_tot * sizeof(double)); + + // _cstyle_tensorinit_omp_outer_start + #pragma omp parallel for + for (int k = 0; k < N; ++k ) { + for (int j = 0; j < N; ++j ) { + for (int i = 0; i < N; ++i ) { + aView(i, j, k) = c * i * j * k ; + } + } + } +// _cstyle_tensorinit_omp_outer_end + + checkResult(a, a_ref, N_tot); + +//----------------------------------------------------------------------------// + + std::cout << "\n Running RAJA OpenMP tensor init...\n"; + + // set tensor data to zero to ensure we initializing it correctly. + std::memset(a, 0, N_tot * sizeof(double)); + +// _raja_tensorinit_omp_outer_start + using EXEC_POL2 = + RAJA::KernelPolicy< + RAJA::statement::For<2, RAJA::omp_parallel_for_exec, // k + RAJA::statement::For<1, RAJA::loop_exec, // j + RAJA::statement::For<0, RAJA::loop_exec, // i + RAJA::statement::Lambda<0> + > + > + > + >; + + RAJA::kernel( + RAJA::make_tuple( RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N) ), + + [=]( int i, int j, int k) { + aView(i, j, k) = c * i * j * k ; + } + ); +// _raja_tensorinit_omp_outer_end + + checkResult(a, a_ref, N_tot); + +//----------------------------------------------------------------------------// + + std::cout << "\n Running C-style OpenMP collapse (3) tensor init...\n"; + + // set tensor data to zero to ensure we initializing it correctly. 
+ std::memset(a, 0, N_tot * sizeof(double)); + + // _cstyle_tensorinit_omp_collapse_start + #pragma omp parallel for collapse(3) + for (int k = 0; k < N; ++k ) { + for (int j = 0; j < N; ++j ) { + for (int i = 0; i < N; ++i ) { + aView(i, j, k) = c * i * j * k ; + } + } + } +// _cstyle_tensorinit_omp_collapse_end + + checkResult(a, a_ref, N_tot); + +//----------------------------------------------------------------------------// + + std::cout << "\n Running RAJA OpenMP collapse(3) tensor init...\n"; + + // set tensor data to zero to ensure we initializing it correctly. + std::memset(a, 0, N_tot * sizeof(double)); + +// _raja_tensorinit_omp_collapse_start + using EXEC_POL3 = + RAJA::KernelPolicy< + RAJA::statement::Collapse, // k, j, i + RAJA::statement::Lambda<0> + > + >; + + RAJA::kernel( + RAJA::make_tuple( RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N) ), + + [=]( int i, int j, int k) { + aView(i, j, k) = c * i * j * k ; + } + ); +// _raja_tensorinit_omp_collapse_end + + checkResult(a, a_ref, N_tot); + +//----------------------------------------------------------------------------// + + std::cout << "\n Running RAJA OpenMP collapse(2) tensor init...\n"; + + // set tensor data to zero to ensure we initializing it correctly. + std::memset(a, 0, N_tot * sizeof(double)); + +// _raja_tensorinit_omp_collapse_start + using EXEC_POL4 = + RAJA::KernelPolicy< + RAJA::statement::Collapse, // k, j + RAJA::statement::For<0, RAJA::loop_exec, // i + RAJA::statement::Lambda<0> + > + > + >; + + RAJA::kernel( + RAJA::make_tuple( RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N) ), + + [=]( int i, int j, int k) { + aView(i, j, k) = c * i * j * k ; + } + ); +// _raja_tensorinit_omp_collapse_end + + checkResult(a, a_ref, N_tot); + +#endif // if defined(RAJA_ENABLE_OPENMP) + + +#if defined(RAJA_ENABLE_CUDA) + +//----------------------------------------------------------------------------// +// C-style and RAJA CUDA GPU variants. +//----------------------------------------------------------------------------// + + std::cout << "\n Running RAJA CUDA tensor init...\n"; + + // set tensor data to zero to ensure we initializing it correctly. + std::memset(a, 0, N_tot * sizeof(double)); + +// _raja_tensorinit_cuda_start + using EXEC_POL5 = + RAJA::KernelPolicy< + RAJA::statement::CudaKernel< + RAJA::statement::For<2, RAJA::cuda_thread_z_loop, // k + RAJA::statement::For<1, RAJA::cuda_thread_y_loop, // j + RAJA::statement::For<0, RAJA::cuda_thread_x_loop, // i + RAJA::statement::Lambda<0> + > + > + > + > + >; + + RAJA::kernel( + RAJA::make_tuple( RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N) ), + + [=] __device__ ( int i, int j, int k) { + aView(i, j, k) = c * i * j * k ; + } + ); +// _raja_tensorinit_cuda_end + + checkResult(a, a_ref, N_tot); + +//----------------------------------------------------------------------------// + + std::cout << "\n Running RAJA CUDA tensor init tiled-direct...\n"; + + // set tensor data to zero to ensure we initializing it correctly. 
+ std::memset(a, 0, N_tot * sizeof(double)); + + // + // Define total thread-block size and size of each block dimension + // +// _cuda_blockdim_start + constexpr int block_size = 256; + constexpr int i_block_sz = 32; + constexpr int j_block_sz = block_size / i_block_sz; + constexpr int k_block_sz = 1; +// _cuda_blockdim_end + +// _raja_tensorinit_cuda_tiled_direct_start + using EXEC_POL6 = + RAJA::KernelPolicy< + RAJA::statement::CudaKernelFixed< i_block_sz * j_block_sz * k_block_sz, + RAJA::statement::Tile<1, RAJA::tile_fixed, + RAJA::cuda_block_y_direct, + RAJA::statement::Tile<0, RAJA::tile_fixed, + RAJA::cuda_block_x_direct, + RAJA::statement::For<2, RAJA::cuda_block_z_direct, // k + RAJA::statement::For<1, RAJA::cuda_thread_y_direct, // j + RAJA::statement::For<0, RAJA::cuda_thread_x_direct, // i + RAJA::statement::Lambda<0> + > + > + > + > + > + > + >; + + RAJA::kernel( + RAJA::make_tuple( RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N) ), + + [=] __device__ ( int i, int j, int k) { + aView(i, j, k) = c * i * j * k ; + } + ); +// _raja_tensorinit_cuda_tiled_direct_end + + checkResult(a, a_ref, N_tot); + +//----------------------------------------------------------------------------// + + std::cout << "\n Running CUDA tensor init tiled-direct...\n"; + + // set tensor data to zero to ensure we initializing it correctly. + std::memset(a, 0, N_tot * sizeof(double)); + +// _cuda_tensorinit_tiled_direct_start + dim3 nthreads_per_block(i_block_sz, j_block_sz, k_block_sz); + static_assert(i_block_sz*j_block_sz*k_block_sz == block_size, + "Invalid block_size"); + + dim3 nblocks(static_cast(RAJA_DIVIDE_CEILING_INT(N, i_block_sz)), + static_cast(RAJA_DIVIDE_CEILING_INT(N, j_block_sz)), + static_cast(RAJA_DIVIDE_CEILING_INT(N, k_block_sz))); + + nested_init + <<>>(a, c, N); + cudaErrchk( cudaGetLastError() ); + cudaErrchk(cudaDeviceSynchronize()); +// _cuda_tensorinit_tiled_direct_end + + checkResult(a, a_ref, N_tot); + +#endif // if defined(RAJA_ENABLE_CUDA) + + +#if defined(RAJA_ENABLE_HIP) + +//----------------------------------------------------------------------------// +// RAJA HIP GPU variants. +//----------------------------------------------------------------------------// + + std::cout << "\n Running RAJA HIP tensor init...\n"; + + // set tensor data to zero to ensure we initializing it correctly. 
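The HIP variants that follow stage the tensor through explicit device memory; the element-type template argument on the device allocation is missing in the text. A sketch of the round trip, assuming the memoryManager helpers used elsewhere in these exercises:

#if defined(RAJA_ENABLE_HIP)
  // Device copy of the tensor plus a View over it.
  double* d_a = memoryManager::allocate_gpu<double>(N_tot);
  RAJA::View<double, RAJA::Layout<3, int>> d_aView(d_a, N, N, N);

  hipErrchk(hipMemcpy(d_a, a, N_tot * sizeof(double), hipMemcpyHostToDevice));

  // ... run a HIP RAJA::kernel that writes d_aView(i, j, k) ...

  hipErrchk(hipMemcpy(a, d_a, N_tot * sizeof(double), hipMemcpyDeviceToHost));
  checkResult(a, a_ref, N_tot);

  memoryManager::deallocate_gpu(d_a);
#endif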
+ std::memset(a, 0, N_tot * sizeof(double)); + double *d_a = memoryManager::allocate_gpu(N_tot); + +// _3D_raja_device_view_start + RAJA::View< double, RAJA::Layout<3, int> > d_aView(d_a, N, N, N); +// _3D_raja_device_view_end + + hipErrchk(hipMemcpy( d_a, a, N_tot * sizeof(double), hipMemcpyHostToDevice )); + +// _raja_tensorinit_hip_start + using EXEC_POL7 = + RAJA::KernelPolicy< + RAJA::statement::HipKernel< + RAJA::statement::For<2, RAJA::hip_thread_z_loop, // k + RAJA::statement::For<1, RAJA::hip_thread_y_loop, // j + RAJA::statement::For<0, RAJA::hip_thread_x_loop, // i + RAJA::statement::Lambda<0> + > + > + > + > + >; + + RAJA::kernel( + RAJA::make_tuple( RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N) ), + + [=] __device__ ( int i, int j, int k) { + d_aView(i, j, k) = c * i * j * k ; + } + ); +// _raja_tensorinit_hip_end + + hipErrchk(hipMemcpy( a, d_a, N_tot * sizeof(double), hipMemcpyDeviceToHost )); + checkResult(a, a_ref, N_tot); + +//----------------------------------------------------------------------------// + + std::cout << "\n Running RAJA HIP tensor init tiled-direct...\n"; + + // + // Define total thread-block size and size of each block dimension + // + constexpr int block_size = 256; + constexpr int i_block_sz = 32; + constexpr int j_block_sz = block_size / i_block_sz; + constexpr int k_block_sz = 1; + + // set tensor data to zero to ensure we initializing it correctly. + std::memset(a, 0, N_tot * sizeof(double)); + hipErrchk(hipMemcpy( d_a, a, N_tot * sizeof(double), hipMemcpyHostToDevice )); + +// _raja_tensorinit_hip_tiled_direct_start + using EXEC_POL8 = + RAJA::KernelPolicy< + RAJA::statement::HipKernelFixed< i_block_sz * j_block_sz * k_block_sz, + RAJA::statement::Tile<1, RAJA::tile_fixed, + RAJA::hip_block_y_direct, + RAJA::statement::Tile<0, RAJA::tile_fixed, + RAJA::hip_block_x_direct, + RAJA::statement::For<2, RAJA::hip_block_z_direct, // k + RAJA::statement::For<1, RAJA::hip_thread_y_direct, // j + RAJA::statement::For<0, RAJA::hip_thread_x_direct, // i + RAJA::statement::Lambda<0> + > + > + > + > + > + > + >; + + RAJA::kernel( + RAJA::make_tuple( RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N) ), + + [=] __device__ ( int i, int j, int k) { + d_aView(i, j, k) = c * i * j * k ; + } + ); +// _raja_tensorinit_hip_tiled_direct_end + + hipErrchk(hipMemcpy( a, d_a, N_tot * sizeof(double), hipMemcpyDeviceToHost )); + checkResult(a, a_ref, N_tot); + + memoryManager::deallocate_gpu(d_a); + +#endif // if defined(RAJA_ENABLE_HIP) + +//----------------------------------------------------------------------------// + + // Clean up... + memoryManager::deallocate(a); + memoryManager::deallocate(a_ref); + + std::cout << "\n DONE!...\n"; + + return 0; +} + +// +// Function to compare result to reference and print result P/F. 
+// +void checkResult(double* a, double* aref, const int n) +{ + bool correct = true; + + int i = 0; + while ( correct && (i < n) ) { + correct = std::abs(a[i] - aref[i]) < 10e-12; + i++; + } + + if ( correct ) { + std::cout << "\n\t result -- PASS\n"; + } else { + std::cout << "\n\t result -- FAIL\n"; + } +} diff --git a/exercises/kernelintro-nested-loop-reorder.cpp b/exercises/kernelintro-nested-loop-reorder.cpp new file mode 100644 index 0000000000..c9327ecc56 --- /dev/null +++ b/exercises/kernelintro-nested-loop-reorder.cpp @@ -0,0 +1,184 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-22, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include +#include + +#include "RAJA/RAJA.hpp" + +/* + * Nested Loop Basics and Loop Reordering (RAJA::kernel) + * + * In this exercise, we introduce basic RAJA::kernel mechanics for executing + * nested loop kernels, including using execution policies to permute the + * order of loops in a loop nest. The exercise performs no actual + * computation and just prints out loop indices to show different + * loop ordering. Also, to avoid difficulty in interpreting parallel + * output, the execution policies use sequential execution. + * + * RAJA features shown: + * - 'RAJA::kernel' loop abstractions and execution policies + * - 'RAJA::TypedRangeSegment' iteration spaces + * - Strongly-typed loop indices + */ + +// +// Define three named loop index integer types used in the triply-nested loops. +// These will trigger compilation errors if lambda index argument ordering +// and types do not match the typed range index ordering. See final +// example in this file. +// +// _raja_typed_indices_start +RAJA_INDEX_VALUE_T(KIDX, int, "KIDX"); +RAJA_INDEX_VALUE_T(JIDX, int, "JIDX"); +RAJA_INDEX_VALUE_T(IIDX, int, "IIDX"); +// _raja_typed_indices_end + + +int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +{ + + // _range_min_max_start + constexpr int imin = 0; + constexpr int imax = 2; + constexpr int jmin = 1; + constexpr int jmax = 3; + constexpr int kmin = 2; + constexpr int kmax = 4; + // _range_min_max_end + +// +// The RAJA variants of the loop nest use the following typed range segments +// based on the typed indices defined above, outside of main(). 
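The typed range segments declared next take the strongly-typed index types (KIDX, JIDX, IIDX) as template arguments, which are not visible in the text. A likely reconstruction, together with how the kernel lambda consumes them:

RAJA::TypedRangeSegment<KIDX> KRange(kmin, kmax);
RAJA::TypedRangeSegment<JIDX> JRange(jmin, jmax);
RAJA::TypedRangeSegment<IIDX> IRange(imin, imax);

// The lambda takes the matching typed indices in tuple order and
// dereferences them to recover the underlying ints:
//   RAJA::kernel<KJI_EXECPOL>(RAJA::make_tuple(IRange, JRange, KRange),
//     [=](IIDX i, JIDX j, KIDX k) {
//       printf(" (%d, %d, %d) \n", (int)(*i), (int)(*j), (int)(*k));
//     });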
+// + // _raja_typed_index_ranges_start + RAJA::TypedRangeSegment KRange(kmin, kmax); + RAJA::TypedRangeSegment JRange(jmin, jmax); + RAJA::TypedRangeSegment IRange(imin, imax); + // _raja_typed_index_ranges_end + + + std::cout << "\n\nRAJA::kernel nested loop reorder example...\n"; + +//----------------------------------------------------------------------------// +//----------------------------------------------------------------------------// + + std::cout << "\n Running C-style nested loop order: K-outer, J-middle, I-inner" + << "...\n\n" << " (I, J, K)\n" << " ---------\n"; + + // _cstyle_kji_loops_start + for (int k = kmin; k < kmax; ++k) { + for (int j = jmin; j < jmax; ++j) { + for (int i = imin; i < imax; ++i) { + printf( " (%d, %d, %d) \n", i, j, k); + } + } + } + // _cstyle_kji_loops_end + +//----------------------------------------------------------------------------// + + std::cout << "\n\n Running RAJA nested loop order (K-outer, J-middle, I-inner)" + << "...\n\n" << " (I, J, K)\n" << " ---------\n"; + + // _raja_kji_loops_start + using KJI_EXECPOL = RAJA::KernelPolicy< + RAJA::statement::For<2, RAJA::seq_exec, // k + RAJA::statement::For<1, RAJA::seq_exec, // j + RAJA::statement::For<0, RAJA::seq_exec,// i + RAJA::statement::Lambda<0> + > + > + > + >; + + RAJA::kernel( RAJA::make_tuple(IRange, JRange, KRange), + [=] (IIDX i, JIDX j, KIDX k) { + printf( " (%d, %d, %d) \n", (int)(*i), (int)(*j), (int)(*k)); + }); + // _raja_kji_loops_end + +//----------------------------------------------------------------------------// +//----------------------------------------------------------------------------// + + std::cout << "\n Running C-style nested loop order: J-outer, I-middle, K-inner" + << "...\n\n" << " (I, J, K)\n" << " ---------\n"; + + // _cstyle_jik_loops_start + for (int j = jmin; j < jmax; ++j) { + for (int i = imin; i < imax; ++i) { + for (int k = kmin; k < kmax; ++k) { + printf( " (%d, %d, %d) \n", i, j, k); + } + } + } + // _cstyle_jik_loops_end + +//----------------------------------------------------------------------------// + std::cout << "\n Running RAJA nested loop order (J-outer, I-middle, K-inner)" + << "...\n\n" << " (I, J, K)\n" << " ---------\n"; + + /// + /// TODO... + /// + /// EXERCISE: Make a RAJA version of the kernel with j on outer loop, + /// i on middle loop, and k on inner loop + /// + +//----------------------------------------------------------------------------// +//----------------------------------------------------------------------------// + + std::cout << "\n Running C-style nested loop order: I-outer, K-middle, J-inner" + << "...\n\n" << " (I, J, K)\n" << " ---------\n"; + + // _cstyle_ikj_loops_start + for (int i = imin; i < imax; ++i) { + for (int k = kmin; k < kmax; ++k) { + for (int j = jmin; j < jmax; ++j) { + printf( " (%d, %d, %d) \n", i, j, k); + } + } + } + // _cstyle_ikj_loops_end + +//----------------------------------------------------------------------------// + + std::cout << "\n Running RAJA nested loop order (I-outer, K-middle, J-inner)" + << "...\n\n" << " (I, J, K)\n" << " ---------\n"; + + /// + /// TODO... + /// + /// EXERCISE: Make a RAJA version of the kernel with i on outer loop, + /// k on middle loop, and j on inner loop + /// + + +//----------------------------------------------------------------------------// +//----------------------------------------------------------------------------// + +#if 0 // Enable this code block to generate compiler error. 
+//----------------------------------------------------------------------------// +// The following demonstrates that code will not compile if lambda argument +// types/order do not match the types/order For statements in the execution +// policy. To see this, enable this code section and try to compile this file. +//----------------------------------------------------------------------------// + + // _raja_compile_error_start + RAJA::kernel( RAJA::make_tuple(IRange, JRange, KRange), + [=] (JIDX i, IIDX j, KIDX k) { + printf( " (%d, %d, %d) \n", (int)(*i), (int)(*j), (int)(*k)); + }); + // _raja_compile_error_end + +#endif + + std::cout << "\n DONE!...\n"; + + return 0; +} + diff --git a/exercises/tutorial_halfday/ex7_nested-loop-reorder_solution.cpp b/exercises/kernelintro-nested-loop-reorder_solution.cpp similarity index 52% rename from exercises/tutorial_halfday/ex7_nested-loop-reorder_solution.cpp rename to exercises/kernelintro-nested-loop-reorder_solution.cpp index 2461e0e40a..14ef279f73 100644 --- a/exercises/tutorial_halfday/ex7_nested-loop-reorder_solution.cpp +++ b/exercises/kernelintro-nested-loop-reorder_solution.cpp @@ -11,64 +11,81 @@ #include "RAJA/RAJA.hpp" /* - * EXERCISE #6: Nested Loop Reordering + * Nested Loop Basics and Loop Reordering (RAJA::kernel) * - * In this exercise, you will use RAJA::kernel execution policies - * to permute the order of loops in a triple loop nest. In particular, - * you will reorder loop statements in execution policies. The exercise - * does no actual computation and just prints out the loop indices to show - * the different orderings. - * - * To avoid the complexity of interpreting parallel output, the execution - * policies you will write will use sequential execution. + * In this exercise, we introduce basic RAJA::kernel mechanics for executing + * nested loop kernels, including using execution policies to permute the + * order of loops in a loop nest. The exercise performs no actual + * computation and just prints out loop indices to show different + * loop ordering. Also, to avoid difficulty in interpreting parallel + * output, the execution policies use sequential execution. * * RAJA features shown: - * - Index range segment * - 'RAJA::kernel' loop abstractions and execution policies - * - Nested loop reordering + * - 'RAJA::TypedRangeSegment' iteration spaces * - Strongly-typed loop indices */ // -// Define three named loop index types used in the triply-nested loops. +// Define three named loop index integer types used in the triply-nested loops. // These will trigger compilation errors if lambda index argument ordering // and types do not match the typed range index ordering. See final // example in this file. // -RAJA_INDEX_VALUE(KIDX, "KIDX"); -RAJA_INDEX_VALUE(JIDX, "JIDX"); -RAJA_INDEX_VALUE(IIDX, "IIDX"); +// _raja_typed_indices_start +RAJA_INDEX_VALUE_T(KIDX, int, "KIDX"); +RAJA_INDEX_VALUE_T(JIDX, int, "JIDX"); +RAJA_INDEX_VALUE_T(IIDX, int, "IIDX"); +// _raja_typed_indices_end int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) { - std::cout << "\n\nExercise #7: RAJA nested loop reorder example...\n"; + // _range_min_max_start + constexpr int imin = 0; + constexpr int imax = 2; + constexpr int jmin = 1; + constexpr int jmax = 3; + constexpr int kmin = 2; + constexpr int kmax = 4; + // _range_min_max_end + +// +// The RAJA variants of the loop nest use the following typed range segments +// based on the typed indices defined above, outside of main(). 
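Those typed indices are also what make the #if 0 example above a compile error rather than a silent bug: RAJA_INDEX_VALUE_T generates a distinct wrapper type per index name, so a lambda whose argument types are permuted no longer matches the segment tuple. A quick hedged way to confirm the distinctness at compile time (assuming <type_traits> is available; this check is not part of the exercise files):

    #include <type_traits>

    // IIDX, JIDX, and KIDX all wrap an int but are unrelated types, so they
    // cannot be interchanged in a lambda argument list.
    static_assert(!std::is_same<IIDX, JIDX>::value, "typed indices are distinct");
    static_assert(!std::is_same<JIDX, KIDX>::value, "typed indices are distinct");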
+// + // _raja_typed_index_ranges_start + RAJA::TypedRangeSegment KRange(kmin, kmax); + RAJA::TypedRangeSegment JRange(jmin, jmax); + RAJA::TypedRangeSegment IRange(imin, imax); + // _raja_typed_index_ranges_end + + + std::cout << "\n\nRAJA::kernel nested loop reorder example...\n"; + +//----------------------------------------------------------------------------// +//----------------------------------------------------------------------------// - std::cout << "\n Running C-style loop nest with loop ordering: K-outer, J-middle, I-inner" + std::cout << "\n Running C-style nested loop order: K-outer, J-middle, I-inner" << "...\n\n" << " (I, J, K)\n" << " ---------\n"; - for (int k = 2; k < 4; ++k) { - for (int j = 1; j < 3; ++j) { - for (int i = 0; i < 2; ++i) { + // _cstyle_kji_loops_start + for (int k = kmin; k < kmax; ++k) { + for (int j = jmin; j < jmax; ++j) { + for (int i = imin; i < imax; ++i) { printf( " (%d, %d, %d) \n", i, j, k); } } } + // _cstyle_kji_loops_end -// -// The RAJA variants of the loop nest used following typed range segments -// based on the typed indices defined above, outside of main(). -// - RAJA::TypedRangeSegment KRange(2, 4); - RAJA::TypedRangeSegment JRange(1, 3); - RAJA::TypedRangeSegment IRange(0, 2); - //----------------------------------------------------------------------------// - std::cout << "\n\n Running RAJA nested loop example (K-outer, J-middle, I-inner)" + std::cout << "\n\n Running RAJA nested loop order (K-outer, J-middle, I-inner)" << "...\n\n" << " (I, J, K)\n" << " ---------\n"; + // _raja_kji_loops_start using KJI_EXECPOL = RAJA::KernelPolicy< RAJA::statement::For<2, RAJA::seq_exec, // k RAJA::statement::For<1, RAJA::seq_exec, // j @@ -83,13 +100,29 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) [=] (IIDX i, JIDX j, KIDX k) { printf( " (%d, %d, %d) \n", (int)(*i), (int)(*j), (int)(*k)); }); + // _raja_kji_loops_end - +//----------------------------------------------------------------------------// //----------------------------------------------------------------------------// - std::cout << "\n Running RAJA nested loop example (J-outer, I-middle, K-inner)" + std::cout << "\n Running C-style nested loop order: J-outer, I-middle, K-inner" + << "...\n\n" << " (I, J, K)\n" << " ---------\n"; + + // _cstyle_jik_loops_start + for (int j = jmin; j < jmax; ++j) { + for (int i = imin; i < imax; ++i) { + for (int k = kmin; k < kmax; ++k) { + printf( " (%d, %d, %d) \n", i, j, k); + } + } + } + // _cstyle_jik_loops_end + +//----------------------------------------------------------------------------// + std::cout << "\n Running RAJA nested loop order (J-outer, I-middle, K-inner)" << "...\n\n" << " (I, J, K)\n" << " ---------\n"; + // _raja_jik_loops_start using JIK_EXECPOL = RAJA::KernelPolicy< RAJA::statement::For<1, RAJA::seq_exec, // j RAJA::statement::For<0, RAJA::seq_exec, // i @@ -104,13 +137,30 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) [=] (IIDX i, JIDX j, KIDX k) { printf( " (%d, %d, %d) \n", (int)(*i), (int)(*j), (int)(*k)); }); + // _raja_jik_loops_end + +//----------------------------------------------------------------------------// +//----------------------------------------------------------------------------// + + std::cout << "\n Running C-style nested loop order: I-outer, K-middle, J-inner" + << "...\n\n" << " (I, J, K)\n" << " ---------\n"; + // _cstyle_ikj_loops_start + for (int i = imin; i < imax; ++i) { + for (int k = kmin; k < kmax; ++k) { + for (int j = jmin; j < jmax; ++j) { + 
printf( " (%d, %d, %d) \n", i, j, k); + } + } + } + // _cstyle_ikj_loops_end //----------------------------------------------------------------------------// - std::cout << "\n Running RAJA nested loop example (I-outer, K-middle, J-inner)" + std::cout << "\n Running RAJA nested loop order (I-outer, K-middle, J-inner)" << "...\n\n" << " (I, J, K)\n" << " ---------\n"; + // _raja_ikj_loops_start using IKJ_EXECPOL = RAJA::KernelPolicy< RAJA::statement::For<0, RAJA::seq_exec, // i RAJA::statement::For<2, RAJA::seq_exec, // k @@ -125,19 +175,25 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) [=] (IIDX i, JIDX j, KIDX k) { printf( " (%d, %d, %d) \n", (int)(*i), (int)(*j), (int)(*k)); }); + // _raja_ikj_loops_end -#if 0 +//----------------------------------------------------------------------------// +//----------------------------------------------------------------------------// + +#if 0 // Enable this code block to generate compiler error. //----------------------------------------------------------------------------// // The following demonstrates that code will not compile if lambda argument // types/order do not match the types/order For statements in the execution // policy. To see this, enable this code section and try to compile this file. //----------------------------------------------------------------------------// + // _raja_compile_error_start RAJA::kernel( RAJA::make_tuple(IRange, JRange, KRange), [=] (JIDX i, IIDX j, KIDX k) { printf( " (%d, %d, %d) \n", (int)(*i), (int)(*j), (int)(*k)); }); + // _raja_compile_error_end #endif diff --git a/exercises/launch-matrix-transpose-local-array.cpp b/exercises/launch-matrix-transpose-local-array.cpp new file mode 100644 index 0000000000..06fe36d53a --- /dev/null +++ b/exercises/launch-matrix-transpose-local-array.cpp @@ -0,0 +1,442 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-22, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include +#include +#include +#include + +#include "RAJA/RAJA.hpp" +#include "memoryManager.hpp" + +/* + * Matrix Transpose Example + * + * In this example, an input matrix A of dimension N_r x N_c is + * transposed and returned as a second matrix At of size N_c x N_r. + * + * This operation is carried out using a local memory tiling + * algorithm. The algorithm first loads matrix entries into an + * iteraion shared tile, a two-dimensional array, and then + * reads from the tile with row and column indices swapped for + * the output matrix. + * + * The algorithm is expressed as a collection of ``outer`` + * and ``inner`` for loops. Iterations of the inner loops will load/read + * data into the tile; while outer loops will iterate over the number + * of tiles needed to carry out the transpose. + * + * RAJA variants of the example use RAJA_TEAM_SHARED as tile memory. + * Furthermore, the tiling pattern is handled by RAJA's tile methods. + * For CPU execution, RAJA_TEAM_SHARED are used to improve + * performance via cache blocking. For CUDA GPU execution, + * RAJA shared memory is mapped to CUDA shared memory which + * enables threads in the same thread block to share data. 
+ * + * RAJA features shown: + * - Basic usage of 'RAJA::launch' abstractions for nested loops + * - tile methods + * - loop_icount methods + * - RAJA_TEAM_SHARED + * + * If CUDA is enabled, CUDA unified memory is used. + */ + +// +// Define dimensionality of matrices and tile size +// +const int DIM = 2; +#define TILE_DIM (16) // #define to appease msvc + +// +// Function for checking results +// +template +void checkResult(RAJA::View> Atview, int N_r, int N_c); + +// +// Function for printing results +// +template +void printResult(RAJA::View> Atview, int N_r, int N_c); + + +int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +{ + + std::cout << "\n\nRAJA shared matrix transpose example...\n"; + + // + // Define num rows/cols in matrix, tile dimensions, and number of tiles + // + // _mattranspose_localarray_dims_start + constexpr int N_r = 267; + constexpr int N_c = 251; + + constexpr int outer_Dimc = (N_c - 1) / TILE_DIM + 1; + constexpr int outer_Dimr = (N_r - 1) / TILE_DIM + 1; + // _mattranspose_localarray_dims_end + + // + // Allocate matrix data + // + int *A = memoryManager::allocate(N_r * N_c); + int *At = memoryManager::allocate(N_r * N_c); + + // + // In the following implementations of matrix transpose, we + // use RAJA 'View' objects to access the matrix data. A RAJA view + // holds a pointer to a data array and enables multi-dimensional indexing + // into the data. + // + // _mattranspose_localarray_views_start + RAJA::View> Aview(A, N_r, N_c); + RAJA::View> Atview(At, N_c, N_r); + // _mattranspose_localarray_views_end + + // + // Initialize matrix data + // + for (int row = 0; row < N_r; ++row) { + for (int col = 0; col < N_c; ++col) { + Aview(row, col) = col; + } + } + // printResult(Aview, N_r, N_c); + + //----------------------------------------------------------------------------// + std::cout << "\n Running C-version of shared matrix transpose...\n"; + + std::memset(At, 0, N_r * N_c * sizeof(int)); + + // _mattranspose_localarray_cstyle_start + // + // (0) Outer loops to iterate over tiles + // + for (int by = 0; by < outer_Dimr; ++by) { + for (int bx = 0; bx < outer_Dimc; ++bx) { + + // Stack-allocated local array for data on a tile + int Tile[TILE_DIM][TILE_DIM]; + + // + // (1) Inner loops to read input matrix tile data into the array + // + // Note: loops are ordered so that input matrix data access + // is stride-1. + // + for (int ty = 0; ty < TILE_DIM; ++ty) { + for (int tx = 0; tx < TILE_DIM; ++tx) { + + int col = bx * TILE_DIM + tx; // Matrix column index + int row = by * TILE_DIM + ty; // Matrix row index + + // Bounds check + if (row < N_r && col < N_c) { + Tile[ty][tx] = Aview(row, col); + } + } + } + + // + // (2) Inner loops to write array data into output array tile + // + // Note: loop order is swapped from above so that output matrix + // data access is stride-1. 
+ // + for (int tx = 0; tx < TILE_DIM; ++tx) { + for (int ty = 0; ty < TILE_DIM; ++ty) { + + int col = bx * TILE_DIM + tx; // Matrix column index + int row = by * TILE_DIM + ty; // Matrix row index + + // Bounds check + if (row < N_r && col < N_c) { + Atview(col, row) = Tile[ty][tx]; + } + } + } + + } + } + // _mattranspose_localarray_cstyle_end + + checkResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); + + //----------------------------------------------------------------------------// + + std::cout << "\n Running RAJA - sequential matrix transpose example ...\n"; + + std::memset(At, 0, N_r * N_c * sizeof(int)); + + // _mattranspose_localarray_raja_start + using loop_pol_1 = RAJA::LoopPolicy; + using launch_policy_1 = RAJA::LaunchPolicy; + + RAJA::launch( + RAJA::LaunchParams(), //LaunchParams may be empty when only running on the cpu + [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { + + RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_r), [&] (RAJA::TypedRangeSegment const &row_tile) { + + RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_c), [&] (RAJA::TypedRangeSegment const &col_tile) { + + RAJA_TEAM_SHARED double Tile_Array[TILE_DIM][TILE_DIM]; + + /// + /// TODO ... + /// + /// Exercise Implement loop_icount methods to load tiles of the + /// input matrix into the RAJA_TEAM_SHARED memory array + /// + + RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx) { + RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) { + + Atview(col, row) = Tile_Array[ty][tx]; + + }); + }); + + }); + }); + + }); + // _mattranspose_localarray_raja_end + + checkResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); + +#if defined(RAJA_ENABLE_OPENMP) + //--------------------------------------------------------------------------// + std::cout << "\n Running RAJA - OpenMP (parallel outer loop) matrix " + "transpose example ...\n"; + + std::memset(At, 0, N_r * N_c * sizeof(int)); + + // + // This policy loops over tiles sequentially while exposing parallelism on + // one of the inner loops. + // + + /// + /// TODO... + /// + /// EXERCISE: Implement an omp_pol_2 type that will distribute loop iterations + /// within the omp parallel region. 
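The first TODO above (in the sequential RAJA variant of this file) asks for the tile-load phase that mirrors the write-out phase already given. A sketch of one way to fill it in; the loop policy template argument (loop_pol_1 here) does not survive the diff rendering above, but loop_icount takes it as a template parameter, and the solution file below follows the same pattern:

          RAJA::loop_icount<loop_pol_1>(ctx, row_tile, [&] (int row, int ty) {
            RAJA::loop_icount<loop_pol_1>(ctx, col_tile, [&] (int col, int tx) {

              // loop_icount provides both the global index (row, col) and the
              // tile-local offset (ty, tx), so the shared tile is indexed
              // locally while the matrix view is indexed globally.
              Tile_Array[ty][tx] = Aview(row, col);

            });
          });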
+ /// + + //using loop_pol_2 = RAJA::LoopPolicy; + using launch_policy_2 = RAJA::LaunchPolicy; + + RAJA::launch( + RAJA::LaunchParams(), //LaunchParams may be empty when only running on the cpu + [=] RAJA_HOST_DEVICE (RAJA::LaunchContext /*ctx*/) { + + /* + RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_r), [&] (RAJA::TypedRangeSegment const &row_tile) { + RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_c), [&] (RAJA::TypedRangeSegment const &col_tile) { + + RAJA_TEAM_SHARED double Tile_Array[TILE_DIM][TILE_DIM]; + + RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) { + RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx) { + + Tile_Array[ty][tx] = Aview(row, col); + + }); + }); + + RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx) { + RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) { + + Atview(col, row) = Tile_Array[ty][tx]; + + }); + }); + + }); + }); + */ + }); + + checkResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); +#endif + + //--------------------------------------------------------------------------// +#if defined(RAJA_ENABLE_CUDA) + std::cout << "\n Running RAJA - CUDA matrix transpose example ...\n"; + + std::memset(At, 0, N_r * N_c * sizeof(int)); + + constexpr int c_block_sz = TILE_DIM; + constexpr int r_block_sz = TILE_DIM; + const int n_blocks_c = RAJA_DIVIDE_CEILING_INT(N_c, c_block_sz); + const int n_blocks_r = RAJA_DIVIDE_CEILING_INT(N_r, r_block_sz); + + /// TODO... + /// + /// EXERCISE: Define loop policies to mapp loop iterations to blocks, threads directly + /// + + const bool cuda_async = false; + using cuda_launch_policy = RAJA::LaunchPolicy>; + + RAJA::launch( + RAJA::LaunchParams(RAJA::Teams(n_blocks_c, n_blocks_r), + RAJA::Threads(c_block_sz, r_block_sz)), + [=] RAJA_HOST_DEVICE (RAJA::LaunchContext /*ctx*/) { + /* + RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_r), [&] (RAJA::TypedRangeSegment const &row_tile) { + + RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_c), [&] (RAJA::TypedRangeSegment const &col_tile) { + + RAJA_TEAM_SHARED double Tile_Array[TILE_DIM][TILE_DIM]; + + RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) { + RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx) { + + Tile_Array[ty][tx] = Aview(row, col); + + }); + }); + + RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx) { + RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) { + + Atview(col, row) = Tile_Array[ty][tx]; + + }); + }); + + }); + }); + */ + }); + + checkResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); +#endif + +//--------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_HIP) + //--------------------------------------------------------------------------// + std::cout << "\n Running RAJA - HIP matrix transpose example ...\n"; + + int *d_A = memoryManager::allocate_gpu(N_r * N_c); + int *d_At = memoryManager::allocate_gpu(N_r * N_c); + + // + // In the following implementations of matrix transpose, we + // use RAJA 'View' objects to access the matrix data. A RAJA view + // holds a pointer to a data array and enables multi-dimensional indexing + // into the data. 
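The two remaining TODOs in this exercise ask for the OpenMP loop policy and the CUDA loop policies. Hedged sketches of the usual choices follow; the exact policy names are assumptions (the angle-bracket arguments are stripped in this rendering), but the solution file below is organized the same way:

    // OpenMP: distribute one of the tile loops across the threads of the
    // launch's parallel region; keep the other loops sequential.
    using omp_pol_2  = RAJA::LoopPolicy<RAJA::omp_for_exec>;
    using loop_pol_2 = RAJA::LoopPolicy<RAJA::seq_exec>;

    // CUDA: map row/col tiles directly to blocks and tile-local iterations
    // directly to threads (the "_direct" policies add no extra striding).
    using cuda_teams_y   = RAJA::LoopPolicy<RAJA::cuda_block_y_direct>;
    using cuda_teams_x   = RAJA::LoopPolicy<RAJA::cuda_block_x_direct>;
    using cuda_threads_y = RAJA::LoopPolicy<RAJA::cuda_thread_y_direct>;
    using cuda_threads_x = RAJA::LoopPolicy<RAJA::cuda_thread_x_direct>;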
+ // + RAJA::View> d_Aview(d_A, N_r, N_c); + RAJA::View> d_Atview(d_At, N_c, N_r); + + std::memset(At, 0, N_r * N_c * sizeof(int)); + hipErrchk(hipMemcpy( d_A, A, N_r * N_c * sizeof(int), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy( d_At, At, N_r * N_c * sizeof(int), hipMemcpyHostToDevice )); + + constexpr int c_block_sz = TILE_DIM; + constexpr int r_block_sz = TILE_DIM; + const int n_blocks_c = RAJA_DIVIDE_CEILING_INT(N_c, c_block_sz); + const int n_blocks_r = RAJA_DIVIDE_CEILING_INT(N_r, r_block_sz); + + using hip_teams_y = RAJA::LoopPolicy; + using hip_teams_x = RAJA::LoopPolicy; + + using hip_threads_y = RAJA::LoopPolicy; + using hip_threads_x = RAJA::LoopPolicy; + + const bool hip_async = false; + using hip_launch_policy = RAJA::LaunchPolicy>; + + RAJA::launch + (RAJA::LaunchParams(RAJA::Teams(n_blocks_c, n_blocks_r), + RAJA::Threads(c_block_sz, r_block_sz)), + [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { + + RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_r), [&] (RAJA::TypedRangeSegment const &row_tile) { + + RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_c), [&] (RAJA::TypedRangeSegment const &col_tile) { + + RAJA_TEAM_SHARED double Tile_Array[TILE_DIM][TILE_DIM]; + + RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) { + RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx) { + + Tile_Array[ty][tx] = d_Aview(row, col); + + }); + }); + + RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx) { + RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) { + + d_Atview(col, row) = Tile_Array[ty][tx]; + + }); + }); + + }); + }); + + }); + + hipErrchk(hipMemcpy( At, d_At, N_r * N_c * sizeof(int), hipMemcpyDeviceToHost )); + checkResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); +#endif + +//--------------------------------------------------------------------------// + + return 0; +} + + +// +// Function to check result and report P/F. +// +template +void checkResult(RAJA::View> Atview, int N_r, int N_c) +{ + bool match = true; + for (int row = 0; row < N_r; ++row) { + for (int col = 0; col < N_c; ++col) { + if (Atview(row, col) != row) { + match = false; + } + } + } + if (match) { + std::cout << "\n\t result -- PASS\n"; + } else { + std::cout << "\n\t result -- FAIL\n"; + } +}; + +// +// Function to print result. +// +template +void printResult(RAJA::View> Atview, int N_r, int N_c) +{ + std::cout << std::endl; + for (int row = 0; row < N_r; ++row) { + for (int col = 0; col < N_c; ++col) { + std::cout << "At(" << row << "," << col << ") = " << Atview(row, col) + << std::endl; + } + std::cout << "" << std::endl; + } + std::cout << std::endl; +} diff --git a/exercises/launch-matrix-transpose-local-array_solution.cpp b/exercises/launch-matrix-transpose-local-array_solution.cpp new file mode 100644 index 0000000000..536d21bfbe --- /dev/null +++ b/exercises/launch-matrix-transpose-local-array_solution.cpp @@ -0,0 +1,437 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-22, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include +#include +#include +#include + +#include "RAJA/RAJA.hpp" +#include "memoryManager.hpp" + +/* + * Matrix Transpose Example + * + * In this example, an input matrix A of dimension N_r x N_c is + * transposed and returned as a second matrix At of size N_c x N_r. 
+ * + * This operation is carried out using a local memory tiling + * algorithm. The algorithm first loads matrix entries into an + * iteraion shared tile, a two-dimensional array, and then + * reads from the tile with row and column indices swapped for + * the output matrix. + * + * The algorithm is expressed as a collection of ``outer`` + * and ``inner`` for loops. Iterations of the inner loops will load/read + * data into the tile; while outer loops will iterate over the number + * of tiles needed to carry out the transpose. + * + * RAJA variants of the example use RAJA_TEAM_SHARED as tile memory. + * Furthermore, the tiling pattern is handled by RAJA's tile methods. + * For CPU execution, RAJA_TEAM_SHARED are used to improve + * performance via cache blocking. For CUDA GPU execution, + * RAJA shared memory is mapped to CUDA shared memory which + * enables threads in the same thread block to share data. + * + * RAJA features shown: + * - Basic usage of 'RAJA::launch' abstractions for nested loops + * - tile methods + * - loop_icount methods + * - RAJA_TEAM_SHARED + * + * If CUDA is enabled, CUDA unified memory is used. + */ + +// +// Define dimensionality of matrices and tile size +// +const int DIM = 2; +#define TILE_DIM (16) // #define to appease msvc + +// +// Function for checking results +// +template +void checkResult(RAJA::View> Atview, int N_r, int N_c); + +// +// Function for printing results +// +template +void printResult(RAJA::View> Atview, int N_r, int N_c); + + +int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +{ + + std::cout << "\n\nRAJA shared matrix transpose example...\n"; + + // + // Define num rows/cols in matrix, tile dimensions, and number of tiles + // + // _mattranspose_localarray_dims_start + constexpr int N_r = 267; + constexpr int N_c = 251; + + constexpr int outer_Dimc = (N_c - 1) / TILE_DIM + 1; + constexpr int outer_Dimr = (N_r - 1) / TILE_DIM + 1; + // _mattranspose_localarray_dims_end + + // + // Allocate matrix data + // + int *A = memoryManager::allocate(N_r * N_c); + int *At = memoryManager::allocate(N_r * N_c); + + // + // In the following implementations of matrix transpose, we + // use RAJA 'View' objects to access the matrix data. A RAJA view + // holds a pointer to a data array and enables multi-dimensional indexing + // into the data. + // + // _mattranspose_localarray_views_start + RAJA::View> Aview(A, N_r, N_c); + RAJA::View> Atview(At, N_c, N_r); + // _mattranspose_localarray_views_end + + // + // Initialize matrix data + // + for (int row = 0; row < N_r; ++row) { + for (int col = 0; col < N_c; ++col) { + Aview(row, col) = col; + } + } + // printResult(Aview, N_r, N_c); + + //----------------------------------------------------------------------------// + std::cout << "\n Running C-version of shared matrix transpose...\n"; + + std::memset(At, 0, N_r * N_c * sizeof(int)); + + // _mattranspose_localarray_cstyle_start + // + // (0) Outer loops to iterate over tiles + // + for (int by = 0; by < outer_Dimr; ++by) { + for (int bx = 0; bx < outer_Dimc; ++bx) { + + // Stack-allocated local array for data on a tile + int Tile[TILE_DIM][TILE_DIM]; + + // + // (1) Inner loops to read input matrix tile data into the array + // + // Note: loops are ordered so that input matrix data access + // is stride-1. 
+ // + for (int ty = 0; ty < TILE_DIM; ++ty) { + for (int tx = 0; tx < TILE_DIM; ++tx) { + + int col = bx * TILE_DIM + tx; // Matrix column index + int row = by * TILE_DIM + ty; // Matrix row index + + // Bounds check + if (row < N_r && col < N_c) { + Tile[ty][tx] = Aview(row, col); + } + } + } + + // + // (2) Inner loops to write array data into output array tile + // + // Note: loop order is swapped from above so that output matrix + // data access is stride-1. + // + for (int tx = 0; tx < TILE_DIM; ++tx) { + for (int ty = 0; ty < TILE_DIM; ++ty) { + + int col = bx * TILE_DIM + tx; // Matrix column index + int row = by * TILE_DIM + ty; // Matrix row index + + // Bounds check + if (row < N_r && col < N_c) { + Atview(col, row) = Tile[ty][tx]; + } + } + } + + } + } + // _mattranspose_localarray_cstyle_end + + checkResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); + + //----------------------------------------------------------------------------// + + std::cout << "\n Running RAJA - sequential matrix transpose example ...\n"; + + std::memset(At, 0, N_r * N_c * sizeof(int)); + + // _mattranspose_localarray_raja_start + using loop_pol_1 = RAJA::LoopPolicy; + using launch_policy_1 = RAJA::LaunchPolicy; + + RAJA::launch( + RAJA::LaunchParams(), //LaunchParams may be empty when only running on the cpu + [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { + + RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_r), [&] (RAJA::TypedRangeSegment const &row_tile) { + + RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_c), [&] (RAJA::TypedRangeSegment const &col_tile) { + + RAJA_TEAM_SHARED double Tile_Array[TILE_DIM][TILE_DIM]; + + RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) { + RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx) { + + Tile_Array[ty][tx] = Aview(row, col); + + }); + }); + + RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx) { + RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) { + + Atview(col, row) = Tile_Array[ty][tx]; + + }); + }); + + }); + }); + + }); + // _mattranspose_localarray_raja_end + + checkResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); + +#if defined(RAJA_ENABLE_OPENMP) + //--------------------------------------------------------------------------// + std::cout << "\n Running RAJA - OpenMP (parallel outer loop) matrix " + "transpose example ...\n"; + + std::memset(At, 0, N_r * N_c * sizeof(int)); + + // + // This policy loops over tiles sequentially while exposing parallelism on + // one of the inner loops. 
+ // + using omp_pol_2 = RAJA::LoopPolicy; + using loop_pol_2 = RAJA::LoopPolicy; + using launch_policy_2 = RAJA::LaunchPolicy; + + RAJA::launch( + RAJA::LaunchParams(), //LaunchParams may be empty when only running on the cpu + [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { + + RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_r), [&] (RAJA::TypedRangeSegment const &row_tile) { + + RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_c), [&] (RAJA::TypedRangeSegment const &col_tile) { + + RAJA_TEAM_SHARED double Tile_Array[TILE_DIM][TILE_DIM]; + + RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) { + RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx) { + + Tile_Array[ty][tx] = Aview(row, col); + + }); + }); + + RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx) { + RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) { + + Atview(col, row) = Tile_Array[ty][tx]; + + }); + }); + + }); + }); + + }); + + checkResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); +#endif + + //--------------------------------------------------------------------------// +#if defined(RAJA_ENABLE_CUDA) + std::cout << "\n Running RAJA - CUDA matrix transpose example ...\n"; + + std::memset(At, 0, N_r * N_c * sizeof(int)); + + constexpr int c_block_sz = TILE_DIM; + constexpr int r_block_sz = TILE_DIM; + const int n_blocks_c = RAJA_DIVIDE_CEILING_INT(N_c, c_block_sz); + const int n_blocks_r = RAJA_DIVIDE_CEILING_INT(N_r, r_block_sz); + + using cuda_teams_y = RAJA::LoopPolicy; + using cuda_teams_x = RAJA::LoopPolicy; + + using cuda_threads_y = RAJA::LoopPolicy; + using cuda_threads_x = RAJA::LoopPolicy; + + const bool cuda_async = false; + using cuda_launch_policy = RAJA::LaunchPolicy>; + + RAJA::launch( + RAJA::LaunchParams(RAJA::Teams(n_blocks_c, n_blocks_r), + RAJA::Threads(c_block_sz, r_block_sz)), + [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { + + RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_r), [&] (RAJA::TypedRangeSegment const &row_tile) { + + RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_c), [&] (RAJA::TypedRangeSegment const &col_tile) { + + RAJA_TEAM_SHARED double Tile_Array[TILE_DIM][TILE_DIM]; + + RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) { + RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx) { + + Tile_Array[ty][tx] = Aview(row, col); + + }); + }); + + RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx) { + RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) { + + Atview(col, row) = Tile_Array[ty][tx]; + + }); + }); + + }); + }); + + }); + + checkResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); +#endif + +//--------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_HIP) + //--------------------------------------------------------------------------// + std::cout << "\n Running RAJA - HIP matrix transpose example ...\n"; + + int *d_A = memoryManager::allocate_gpu(N_r * N_c); + int *d_At = memoryManager::allocate_gpu(N_r * N_c); + + // + // In the following implementations of matrix transpose, we + // use RAJA 'View' objects to access the matrix data. A RAJA view + // holds a pointer to a data array and enables multi-dimensional indexing + // into the data. 
+ // + RAJA::View> d_Aview(d_A, N_r, N_c); + RAJA::View> d_Atview(d_At, N_c, N_r); + + std::memset(At, 0, N_r * N_c * sizeof(int)); + hipErrchk(hipMemcpy( d_A, A, N_r * N_c * sizeof(int), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy( d_At, At, N_r * N_c * sizeof(int), hipMemcpyHostToDevice )); + + constexpr int c_block_sz = TILE_DIM; + constexpr int r_block_sz = TILE_DIM; + const int n_blocks_c = RAJA_DIVIDE_CEILING_INT(N_c, c_block_sz); + const int n_blocks_r = RAJA_DIVIDE_CEILING_INT(N_r, r_block_sz); + + using hip_teams_y = RAJA::LoopPolicy; + using hip_teams_x = RAJA::LoopPolicy; + + using hip_threads_y = RAJA::LoopPolicy; + using hip_threads_x = RAJA::LoopPolicy; + + const bool hip_async = false; + using hip_launch_policy = RAJA::LaunchPolicy>; + + RAJA::launch + (RAJA::LaunchParams(RAJA::Teams(n_blocks_c, n_blocks_r), + RAJA::Threads(c_block_sz, r_block_sz)), + [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { + + RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_r), [&] (RAJA::TypedRangeSegment const &row_tile) { + + RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_c), [&] (RAJA::TypedRangeSegment const &col_tile) { + + RAJA_TEAM_SHARED double Tile_Array[TILE_DIM][TILE_DIM]; + + RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) { + RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx) { + + Tile_Array[ty][tx] = d_Aview(row, col); + + }); + }); + + RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx) { + RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) { + + d_Atview(col, row) = Tile_Array[ty][tx]; + + }); + }); + + }); + }); + + }); + + hipErrchk(hipMemcpy( At, d_At, N_r * N_c * sizeof(int), hipMemcpyDeviceToHost )); + checkResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); +#endif + +//--------------------------------------------------------------------------// + + return 0; +} + + +// +// Function to check result and report P/F. +// +template +void checkResult(RAJA::View> Atview, int N_r, int N_c) +{ + bool match = true; + for (int row = 0; row < N_r; ++row) { + for (int col = 0; col < N_c; ++col) { + if (Atview(row, col) != row) { + match = false; + } + } + } + if (match) { + std::cout << "\n\t result -- PASS\n"; + } else { + std::cout << "\n\t result -- FAIL\n"; + } +}; + +// +// Function to print result. +// +template +void printResult(RAJA::View> Atview, int N_r, int N_c) +{ + std::cout << std::endl; + for (int row = 0; row < N_r; ++row) { + for (int col = 0; col < N_c; ++col) { + std::cout << "At(" << row << "," << col << ") = " << Atview(row, col) + << std::endl; + } + std::cout << "" << std::endl; + } + std::cout << std::endl; +} diff --git a/exercises/launch-matrix-transpose-tiled.cpp b/exercises/launch-matrix-transpose-tiled.cpp new file mode 100644 index 0000000000..86a88413b7 --- /dev/null +++ b/exercises/launch-matrix-transpose-tiled.cpp @@ -0,0 +1,422 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-22, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include +#include +#include +#include + +#include "memoryManager.hpp" + +#include "RAJA/RAJA.hpp" + +/* + * Tiled Matrix Transpose Example + * + * In this example, an input matrix A of dimension N_r x N_c is + * transposed and returned as a second matrix At. + * + * This operation is carried out using a tiling algorithm. 
+ * The algorithm iterates over tiles of the matrix A and + * performs a transpose copy without explicitly storing the tile. + * + * The algorithm is expressed as a collection of ``outer`` + * and ``inner`` for loops. Iterations of the inner loop will + * tranpose tile entries; while outer loops will iterate over + * the number of tiles needed to carryout the transpose. + * We do not assume that tiles divide the number of rows and + * and columns of the matrix. + * + * RAJA features shown: + * - Basic usage of 'RAJA::launch' abstractions for nested loops + * - tiling method + * + * If CUDA is enabled, CUDA unified memory is used. + */ + +// +// Define dimensionality of matrices +// +constexpr int DIM = 2; + +// +// Function for checking results +// +template +void checkResult(RAJA::View> Atview, int N_r, int N_c); + +// +// Function for printing results +// +template +void printResult(RAJA::View> Atview, int N_r, int N_c); + + +int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +{ + + std::cout << "\n\nRAJA tiled matrix transpose example...\n"; + + // + // Define num rows/cols in matrix, tile dimensions, and number of tiles. + // + // _tiled_mattranspose_dims_start + constexpr int N_r = 56; + constexpr int N_c = 75; + + constexpr int TILE_DIM = 16; + + constexpr int outer_Dimc = (N_c - 1) / TILE_DIM + 1; + constexpr int outer_Dimr = (N_r - 1) / TILE_DIM + 1; + // _tiled_mattranspose_dims_end + + // + // Allocate matrix data + // + int *A = memoryManager::allocate(N_r * N_c); + int *At = memoryManager::allocate(N_r * N_c); + + // + // In the following implementations of tiled matrix transpose, we + // use RAJA 'View' objects to access the matrix data. A RAJA view + // holds a pointer to a data array and enables multi-dimensional indexing + // into the data. + // + // _tiled_mattranspose_views_start + RAJA::View> Aview(A, N_r, N_c); + RAJA::View> Atview(At, N_c, N_r); + // _tiled_mattranspose_views_end + + // + // Initialize matrix data + // + for (int row = 0; row < N_r; ++row) { + for (int col = 0; col < N_c; ++col) { + Aview(row, col) = col; + } + } + //printResult(Aview, N_r, N_c); + + + //----------------------------------------------------------------------------// + std::cout << "\n Running C-version of tiled matrix transpose...\n"; + + std::memset(At, 0, N_r * N_c * sizeof(int)); + + // _cstyle_tiled_mattranspose_start + // + // (0) Outer loops to iterate over tiles + // + for (int by = 0; by < outer_Dimr; ++by) { + for (int bx = 0; bx < outer_Dimc; ++bx) { + // + // (1) Loops to iterate over tile entries + // + for (int ty = 0; ty < TILE_DIM; ++ty) { + for (int tx = 0; tx < TILE_DIM; ++tx) { + + int col = bx * TILE_DIM + tx; // Matrix column index + int row = by * TILE_DIM + ty; // Matrix row index + + // Bounds check + if (row < N_r && col < N_c) { + Atview(col, row) = Aview(row, col); + } + } + } + + } + } + // _cstyle_tiled_mattranspose_end + + checkResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); + //----------------------------------------------------------------------------// + + // + // The following RAJA variants use the RAJA::kernel method to carryout the + // transpose. + // + // Here, we define RAJA range segments to establish the iteration spaces. + // Further partioning of the iteration space is carried out in the + // tile_fixed statements. Iterations inside a RAJA loop is given by their + // global iteration number. + // + +/// +/// TODO: Uncomment these range segments so you can use them in the +/// non-HIP exercises in this file. 
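The TODO above asks you to uncomment the row/column range segments before working the exercises in this file. With the stripped angle-bracket argument restored (an assumption of this sketch), they would read:

    RAJA::TypedRangeSegment<int> row_Range(0, N_r);
    RAJA::TypedRangeSegment<int> col_Range(0, N_c);

RAJA::tile then partitions each segment into TILE_DIM-sized sub-segments and clamps the final one to the segment end; that is why the RAJA variants below carry no explicit bounds check even though 56 and 75 are not multiples of 16, while the C-style version above must test row < N_r and col < N_c.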
+/* + RAJA::TypedRangeSegment row_Range(0, N_r); + RAJA::TypedRangeSegment col_Range(0, N_c); +*/ + + //----------------------------------------------------------------------------// + std::cout << "\n Running sequential tiled matrix transpose ...\n"; + std::memset(At, 0, N_r * N_c * sizeof(int)); + + // + // The following policy carries out the transpose + // using sequential loops. The template parameter inside + // tile_fixed corresponds to the dimension size of the tile. + // + // _raja_tiled_mattranspose_start + //using loop_pol_1 = RAJA::LoopPolicy; + using launch_policy_1 = RAJA::LaunchPolicy; + + RAJA::launch( + RAJA::LaunchParams(), //LaunchParams may be empty when running on the cpu + [=] RAJA_HOST_DEVICE (RAJA::LaunchContext /*ctx*/) { + + /* + RAJA::tile(ctx, TILE_DIM, row_Range, [&] (RAJA::TypedRangeSegment const &row_tile) { + + RAJA::tile(ctx, TILE_DIM, col_Range, [&] (RAJA::TypedRangeSegment const &col_tile) { + + RAJA::loop(ctx, row_tile, [&] (int row) { + + /// + /// TODO... + /// + /// EXERCISE: Implement a loop method that takes a col_tile and + /// returns the global index to the column iteration + /// + /// Uncomment the statement below to run the kernel and check the + /// result. + /// + + //Atview(col, row) = Aview(row, col); + + }); + + }); + }); + */ + }); + // _raja_tiled_mattranspose_end + + checkResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); + +//----------------------------------------------------------------------------// +#if defined(RAJA_ENABLE_OPENMP) + std::cout << "\n Running openmp tiled matrix transpose - parallel top inner loop...\n"; + + std::memset(At, 0, N_r * N_c * sizeof(int)); + + // + // This policy loops over tiles sequentially while exposing parallelism on + // one of the inner loops. + // + //using omp_for_pol_2 = RAJA::LoopPolicy; + //using loop_pol_2 = RAJA::LoopPolicy; + + /// + /// TODO... + /// + /// EXERCISE: Create a launch_policy_2 that will create an omp parallel region + /// + /// Uncomment the kernel below to run it and check the result. + /// + /// + + /* + RAJA::launch( + RAJA::LaunchParams(), //LaunchParams may be empty when running on the cpu + [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { + + RAJA::tile(ctx, TILE_DIM, row_Range, [&] (RAJA::TypedRangeSegment const &row_tile) { + + RAJA::tile(ctx, TILE_DIM, col_Range, [&] (RAJA::TypedRangeSegment const &col_tile) { + + RAJA::loop(ctx, row_tile, [&] (int row) { + RAJA::loop(ctx, col_tile, [&] (int col) { + + Atview(col, row) = Aview(row, col); + + }); + }); + + }); + }); + + }); +*/ + + checkResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); + +#endif + +//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_CUDA) + std::cout << "\n Running cuda tiled matrix transpose ...\n"; + + std::memset(At, 0, N_r * N_c * sizeof(int)); + + /* + constexpr int c_block_sz = TILE_DIM; + constexpr int r_block_sz = TILE_DIM; + const int n_blocks_c = RAJA_DIVIDE_CEILING_INT(N_c, c_block_sz); + const int n_blocks_r = RAJA_DIVIDE_CEILING_INT(N_r, r_block_sz); + + using cuda_teams_y = RAJA::LoopPolicy; + using cuda_teams_x = RAJA::LoopPolicy; + + using cuda_threads_y = RAJA::LoopPolicy; + using cuda_threads_x = RAJA::LoopPolicy; + */ + + /// TODO... + /// + /// EXERCISE: Implement the cuda launch policy to dispatch the kernel below + /// on the GPU + /// + /// When you uncomment kernel code below, you will also need to + /// uncomment variables above that are used within it. 
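Hedged sketches for the three TODOs above; they mirror the solution file later in this diff, and the policy names are assumptions where the rendering has stripped the template arguments:

    // (1) Sequential variant: the missing inner loop yields the global column
    //     index for each entry of the current column tile.
    RAJA::loop<loop_pol_1>(ctx, col_tile, [&] (int col) {
      Atview(col, row) = Aview(row, col);
    });

    // (2) OpenMP variant: the launch policy opens an OpenMP parallel region,
    //     and the commented-out omp_for_pol_2 above distributes the row tiles.
    using launch_policy_2 = RAJA::LaunchPolicy<RAJA::omp_launch_t>;

    // (3) CUDA variant: a GPU launch policy; the boolean selects a synchronous
    //     (false) or asynchronous (true) kernel launch.
    const bool cuda_async = false;
    using cuda_launch_policy = RAJA::LaunchPolicy<RAJA::cuda_launch_t<cuda_async>>;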
+ /// + +/* + RAJA::launch( + RAJA::LaunchParams(RAJA::Teams(n_blocks_c, n_blocks_r), + RAJA::Threads(c_block_sz, r_block_sz)), + [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { + + RAJA::tile(ctx, TILE_DIM, row_Range, [&] (RAJA::TypedRangeSegment const &row_tile) { + + RAJA::tile(ctx, TILE_DIM, col_Range, [&] (RAJA::TypedRangeSegment const &col_tile) { + + RAJA::loop(ctx, row_tile, [&] (int row) { + RAJA::loop(ctx, col_tile, [&] (int col) { + + Atview(col, row) = Aview(row, col); + + }); + }); + + }); + }); + + }); +*/ + + checkResult(Atview, N_c, N_r); + //printResult(Atview, N_c, N_r); +#endif + +//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_HIP) + std::cout << "\n Running hip tiled matrix transpose ...\n"; + + RAJA::TypedRangeSegment row_Range2(0, N_r); + RAJA::TypedRangeSegment col_Range2(0, N_c); + + int* d_A = memoryManager::allocate_gpu(N_r * N_c); + int* d_At = memoryManager::allocate_gpu(N_r * N_c); + + RAJA::View> d_Aview(d_A, N_r, N_c); + RAJA::View> d_Atview(d_At, N_c, N_r); + + std::memset(At, 0, N_r * N_c * sizeof(int)); + hipErrchk(hipMemcpy( d_A, A, N_r * N_c * sizeof(int), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy( d_At, At, N_r * N_c * sizeof(int), hipMemcpyHostToDevice )); + + constexpr int c_block_sz = TILE_DIM; + constexpr int r_block_sz = TILE_DIM; + const int n_blocks_c = RAJA_DIVIDE_CEILING_INT(N_c, c_block_sz); + const int n_blocks_r = RAJA_DIVIDE_CEILING_INT(N_r, r_block_sz); + + using hip_teams_y = RAJA::LoopPolicy; + using hip_teams_x = RAJA::LoopPolicy; + + using hip_threads_y = RAJA::LoopPolicy; + using hip_threads_x = RAJA::LoopPolicy; + + const bool hip_async = false; + using hip_launch_policy = RAJA::LaunchPolicy>; + + RAJA::launch + (RAJA::LaunchParams(RAJA::Teams(n_blocks_c, n_blocks_r), + RAJA::Threads(c_block_sz, r_block_sz)), + [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { + + RAJA::tile(ctx, TILE_DIM, row_Range2, [&] (RAJA::TypedRangeSegment const &row_tile) { + + RAJA::tile(ctx, TILE_DIM, col_Range2, [&] (RAJA::TypedRangeSegment const &col_tile) { + + RAJA::loop(ctx, row_tile, [&] (int row) { + RAJA::loop(ctx, col_tile, [&] (int col) { + + Atview(col, row) = Aview(row, col); + + }); + }); + + }); + }); + + }); + + hipErrchk(hipMemcpy( At, d_At, N_r * N_c * sizeof(int), hipMemcpyDeviceToHost )); + checkResult(Atview, N_c, N_r); + //printResult(Atview, N_c, N_r); +#endif + + //----------------------------------------------------------------------------// + + + // + // Clean up. + // + memoryManager::deallocate(A); + memoryManager::deallocate(At); + + std::cout << "\n DONE!...\n"; + + return 0; +} + +// +// Function to check result and report P/F. +// +template +void checkResult(RAJA::View> Atview, int N_r, int N_c) +{ + bool match = true; + for (int row = 0; row < N_r; ++row) { + for (int col = 0; col < N_c; ++col) { + if (Atview(row, col) != row) { + match = false; + } + } + } + if (match) { + std::cout << "\n\t result -- PASS\n"; + } else { + std::cout << "\n\t result -- FAIL\n"; + } +}; + +// +// Function to print result. 
+// +template +void printResult(RAJA::View> Atview, int N_r, int N_c) +{ + std::cout << std::endl; + for (int row = 0; row < N_r; ++row) { + for (int col = 0; col < N_c; ++col) { + // std::cout << "At(" << row << "," << col << ") = " << Atview(row, col) + // << std::endl; + std::cout< +#include +#include +#include + +#include "memoryManager.hpp" + +#include "RAJA/RAJA.hpp" + +/* + * Tiled Matrix Transpose Example + * + * In this example, an input matrix A of dimension N_r x N_c is + * transposed and returned as a second matrix At. + * + * This operation is carried out using a tiling algorithm. + * The algorithm iterates over tiles of the matrix A and + * performs a transpose copy without explicitly storing the tile. + * + * The algorithm is expressed as a collection of ``outer`` + * and ``inner`` for loops. Iterations of the inner loop will + * tranpose tile entries; while outer loops will iterate over + * the number of tiles needed to carryout the transpose. + * We do not assume that tiles divide the number of rows and + * and columns of the matrix. + * + * RAJA features shown: + * - Basic usage of 'RAJA::launch' abstractions for nested loops + * - tiling method + * + * If CUDA is enabled, CUDA unified memory is used. + */ + +// +// Define dimensionality of matrices +// +constexpr int DIM = 2; + +// +// Function for checking results +// +template +void checkResult(RAJA::View> Atview, int N_r, int N_c); + +// +// Function for printing results +// +template +void printResult(RAJA::View> Atview, int N_r, int N_c); + + +int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +{ + + std::cout << "\n\nRAJA tiled matrix transpose example...\n"; + + // + // Define num rows/cols in matrix, tile dimensions, and number of tiles. + // + // _tiled_mattranspose_dims_start + constexpr int N_r = 56; + constexpr int N_c = 75; + + constexpr int TILE_DIM = 16; + + constexpr int outer_Dimc = (N_c - 1) / TILE_DIM + 1; + constexpr int outer_Dimr = (N_r - 1) / TILE_DIM + 1; + // _tiled_mattranspose_dims_end + + // + // Allocate matrix data + // + int *A = memoryManager::allocate(N_r * N_c); + int *At = memoryManager::allocate(N_r * N_c); + + // + // In the following implementations of tiled matrix transpose, we + // use RAJA 'View' objects to access the matrix data. A RAJA view + // holds a pointer to a data array and enables multi-dimensional indexing + // into the data. 
+ // + // _tiled_mattranspose_views_start + RAJA::View> Aview(A, N_r, N_c); + RAJA::View> Atview(At, N_c, N_r); + // _tiled_mattranspose_views_end + + // + // Initialize matrix data + // + for (int row = 0; row < N_r; ++row) { + for (int col = 0; col < N_c; ++col) { + Aview(row, col) = col; + } + } + //printResult(Aview, N_r, N_c); + + + //----------------------------------------------------------------------------// + std::cout << "\n Running C-version of tiled matrix transpose...\n"; + + std::memset(At, 0, N_r * N_c * sizeof(int)); + + // _cstyle_tiled_mattranspose_start + // + // (0) Outer loops to iterate over tiles + // + for (int by = 0; by < outer_Dimr; ++by) { + for (int bx = 0; bx < outer_Dimc; ++bx) { + // + // (1) Loops to iterate over tile entries + // + for (int ty = 0; ty < TILE_DIM; ++ty) { + for (int tx = 0; tx < TILE_DIM; ++tx) { + + int col = bx * TILE_DIM + tx; // Matrix column index + int row = by * TILE_DIM + ty; // Matrix row index + + // Bounds check + if (row < N_r && col < N_c) { + Atview(col, row) = Aview(row, col); + } + } + } + + } + } + // _cstyle_tiled_mattranspose_end + + checkResult(Atview, N_c, N_r); + //printResult(Atview, N_c, N_r); + //----------------------------------------------------------------------------// + + // + // The following RAJA variants use the RAJA::kernel method to carryout the + // transpose. + // + // Here, we define RAJA range segments to establish the iteration spaces. + // Further partioning of the iteration space is carried out in the + // tile_fixed statements. Iterations inside a RAJA loop is given by their + // global iteration number. + // + RAJA::TypedRangeSegment row_Range(0, N_r); + RAJA::TypedRangeSegment col_Range(0, N_c); + + //----------------------------------------------------------------------------// + std::cout << "\n Running sequential tiled matrix transpose ...\n"; + std::memset(At, 0, N_r * N_c * sizeof(int)); + + // + // The following policy carries out the transpose + // using sequential loops. The template parameter inside + // tile_fixed corresponds to the dimension size of the tile. + // + // _raja_tiled_mattranspose_start + using loop_pol_1 = RAJA::LoopPolicy; + using launch_policy_1 = RAJA::LaunchPolicy; + + RAJA::launch(RAJA::LaunchParams(), //LaunchParams may be empty when running on the cpu + [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { + + RAJA::tile(ctx, TILE_DIM, row_Range, [&] (RAJA::TypedRangeSegment const &row_tile) { + + RAJA::tile(ctx, TILE_DIM, col_Range, [&] (RAJA::TypedRangeSegment const &col_tile) { + + RAJA::loop(ctx, row_tile, [&] (int row) { + RAJA::loop(ctx, col_tile, [&] (int col) { + + Atview(col, row) = Aview(row, col); + + }); + }); + + }); + }); + + }); + // _raja_tiled_mattranspose_end + + checkResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); + +//----------------------------------------------------------------------------// +#if defined(RAJA_ENABLE_OPENMP) + std::cout << "\n Running openmp tiled matrix transpose - parallel top inner loop...\n"; + + std::memset(At, 0, N_r * N_c * sizeof(int)); + + // + // This policy loops over tiles sequentially while exposing parallelism on + // one of the inner loops. 
+ // + using omp_for_pol_2 = RAJA::LoopPolicy; + using loop_pol_2 = RAJA::LoopPolicy; + using launch_policy_2 = RAJA::LaunchPolicy; + + RAJA::launch( + RAJA::LaunchParams(), //LaunchParams may be empty when running on the cpu + [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { + + RAJA::tile(ctx, TILE_DIM, row_Range, [&] (RAJA::TypedRangeSegment const &row_tile) { + + RAJA::tile(ctx, TILE_DIM, col_Range, [&] (RAJA::TypedRangeSegment const &col_tile) { + + RAJA::loop(ctx, row_tile, [&] (int row) { + RAJA::loop(ctx, col_tile, [&] (int col) { + + Atview(col, row) = Aview(row, col); + + }); + }); + + }); + }); + + }); + + checkResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); + +#endif + +//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_CUDA) + std::cout << "\n Running cuda tiled matrix transpose ...\n"; + + std::memset(At, 0, N_r * N_c * sizeof(int)); + + constexpr int c_block_sz = TILE_DIM; + constexpr int r_block_sz = TILE_DIM; + const int n_blocks_c = RAJA_DIVIDE_CEILING_INT(N_c, c_block_sz); + const int n_blocks_r = RAJA_DIVIDE_CEILING_INT(N_r, r_block_sz); + + // _raja_mattranspose_cuda_start + using cuda_teams_y = RAJA::LoopPolicy; + using cuda_teams_x = RAJA::LoopPolicy; + + using cuda_threads_y = RAJA::LoopPolicy; + using cuda_threads_x = RAJA::LoopPolicy; + + const bool cuda_async = false; + using cuda_launch_policy = RAJA::LaunchPolicy>; + + RAJA::launch( + RAJA::LaunchParams(RAJA::Teams(n_blocks_c, n_blocks_r), + RAJA::Threads(c_block_sz, r_block_sz)), + [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { + + RAJA::tile(ctx, TILE_DIM, row_Range, [&] (RAJA::TypedRangeSegment const &row_tile) { + + RAJA::tile(ctx, TILE_DIM, col_Range, [&] (RAJA::TypedRangeSegment const &col_tile) { + + RAJA::loop(ctx, row_tile, [&] (int row) { + RAJA::loop(ctx, col_tile, [&] (int col) { + + Atview(col, row) = Aview(row, col); + + }); + }); + + }); + }); + + }); + // _raja_mattranspose_cuda_end + + checkResult(Atview, N_c, N_r); + //printResult(Atview, N_c, N_r); +#endif + +//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_HIP) + std::cout << "\n Running hip tiled matrix transpose ...\n"; + + int* d_A = memoryManager::allocate_gpu(N_r * N_c); + int* d_At = memoryManager::allocate_gpu(N_r * N_c); + + RAJA::View> d_Aview(d_A, N_r, N_c); + RAJA::View> d_Atview(d_At, N_c, N_r); + + std::memset(At, 0, N_r * N_c * sizeof(int)); + hipErrchk(hipMemcpy( d_A, A, N_r * N_c * sizeof(int), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy( d_At, At, N_r * N_c * sizeof(int), hipMemcpyHostToDevice )); + + constexpr int c_block_sz = TILE_DIM; + constexpr int r_block_sz = TILE_DIM; + const int n_blocks_c = RAJA_DIVIDE_CEILING_INT(N_c, c_block_sz); + const int n_blocks_r = RAJA_DIVIDE_CEILING_INT(N_r, r_block_sz); + + using hip_teams_y = RAJA::LoopPolicy; + using hip_teams_x = RAJA::LoopPolicy; + + using hip_threads_y = RAJA::LoopPolicy; + using hip_threads_x = RAJA::LoopPolicy; + + const bool hip_async = false; + using hip_launch_policy = RAJA::LaunchPolicy>; + + RAJA::launch( + RAJA::LaunchParams(RAJA::Teams(n_blocks_c, n_blocks_r), + RAJA::Threads(c_block_sz, r_block_sz)), + [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { + + RAJA::tile(ctx, TILE_DIM, row_Range, [&] (RAJA::TypedRangeSegment const &row_tile) { + + RAJA::tile (ctx, TILE_DIM, col_Range, [&] (RAJA::TypedRangeSegment const &col_tile) { + + RAJA::loop(ctx, row_tile, [&] (int row) { + RAJA::loop(ctx, col_tile, [&] (int col) { + + 
d_Atview(col, row) = d_Aview(row, col); + + }); + }); + + }); + }); + + }); + + hipErrchk(hipMemcpy( At, d_At, N_r * N_c * sizeof(int), hipMemcpyDeviceToHost )); + checkResult(Atview, N_c, N_r); + //printResult(Atview, N_c, N_r); +#endif + + //----------------------------------------------------------------------------// + + + // + // Clean up. + // + memoryManager::deallocate(A); + memoryManager::deallocate(At); + + std::cout << "\n DONE!...\n"; + + return 0; +} + +// +// Function to check result and report P/F. +// +template +void checkResult(RAJA::View> Atview, int N_r, int N_c) +{ + bool match = true; + for (int row = 0; row < N_r; ++row) { + for (int col = 0; col < N_c; ++col) { + if (Atview(row, col) != row) { + match = false; + } + } + } + if (match) { + std::cout << "\n\t result -- PASS\n"; + } else { + std::cout << "\n\t result -- FAIL\n"; + } +}; + +// +// Function to print result. +// +template +void printResult(RAJA::View> Atview, int N_r, int N_c) +{ + std::cout << std::endl; + for (int row = 0; row < N_r; ++row) { + for (int col = 0; col < N_c; ++col) { + // std::cout << "At(" << row << "," << col << ") = " << Atview(row, col) + // << std::endl; + std::cout< +#include +#include +#include + +#include "memoryManager.hpp" + +#include "RAJA/RAJA.hpp" + +/* + * Tiled Matrix Transpose Exercise + * + * In this exercise, an input matrix A of dimension N_r x N_c is + * transposed and returned as a second matrix At. + * + * RAJA features shown: + * - Basic usage of 'RAJA::launch' abstractions for nested loops + * + * If CUDA is enabled, CUDA unified memory is used. + */ + +// +// Define dimensionality of matrices +// +constexpr int DIM = 2; + +// +// Function for checking results +// +template +void checkResult(RAJA::View> Atview, int N_r, int N_c); + +// +// Function for printing results +// +template +void printResult(RAJA::View> Atview, int N_r, int N_c); + + +int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +{ + + std::cout << "\n\nRAJA matrix transpose exercise...\n"; + + // + // Define num rows/cols in matrix. + // + // _mattranspose_dims_start + constexpr int N_r = 56; + constexpr int N_c = 75; + // _mattranspose_dims_end + + // + // Allocate matrix data + // + int *A = memoryManager::allocate(N_r * N_c); + int *At = memoryManager::allocate(N_r * N_c); + + // + // In the following implementations of matrix transpose, we + // use RAJA 'View' objects to access the matrix data. A RAJA view + // holds a pointer to a data array and enables multi-dimensional indexing + // into the data. + // + // _mattranspose_views_start + RAJA::View> Aview(A, N_r, N_c); + RAJA::View> Atview(At, N_c, N_r); + // _mattranspose_views_end + + // + // Initialize matrix data + // + for (int row = 0; row < N_r; ++row) { + for (int col = 0; col < N_c; ++col) { + Aview(row, col) = col; + } + } + //printResult(Aview, N_r, N_c); + + //----------------------------------------------------------------------------// + std::cout << "\n Running C-version of matrix transpose...\n"; + + std::memset(At, 0, N_r * N_c * sizeof(int)); + + // _cstyle_mattranspose_start + for (int row = 0; row < N_r; ++row) { + for (int col = 0; col < N_c; ++col) { + Atview(col, row) = Aview(row, col); + } + } + // _cstyle_mattranspose_end + + checkResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); + + //----------------------------------------------------------------------------// + + // + // The following RAJA variants use the RAJA::kernel method to carryout the + // transpose. 
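Since the View comments in these files are brief, a short sketch of what the two-dimensional views provide; the Layout template argument shown is an assumption (the rendering strips angle brackets), and the default layout is row-major, so Aview(row, col) addresses element row * N_c + col of the underlying buffer:

    RAJA::View<int, RAJA::Layout<DIM>> Aview(A, N_r, N_c);

    Aview(1, 2) = 42;          // writes A[1 * N_c + 2]
    int val = Aview(1, 2);     // reads the same element back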
+ // + // Here, we define RAJA range segments to establish the iteration spaces. + // Iterations inside a RAJA loop is given by their global iteration number. + // + RAJA::TypedRangeSegment row_Range(0, N_r); + RAJA::TypedRangeSegment col_Range(0, N_c); + + //----------------------------------------------------------------------------// + std::cout << "\n Running sequential matrix transpose ...\n"; + std::memset(At, 0, N_r * N_c * sizeof(int)); + + // + // The following policy carries out the transpose + // using sequential loops. + // + // _raja_mattranspose_start + using loop_policy_seq = RAJA::LoopPolicy; + using launch_policy_seq = RAJA::LaunchPolicy; + + RAJA::launch + (RAJA::LaunchParams(), //LaunchParams may be empty when running on the host + [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { + + RAJA::loop(ctx, row_Range, [&] (int /*row*/) { + RAJA::loop(ctx, col_Range, [&] (int /*col*/) { + + /// TODO... + /// + /// EXERCISE: Implement the kernel body for the transpose operation + /// + + }); + }); + + }); + // _raja_mattranspose_end + + checkResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); +//----------------------------------------------------------------------------// +#if defined(RAJA_ENABLE_OPENMP) + std::cout << "\n Running openmp matrix transpose - parallel top inner loop...\n"; + + std::memset(At, 0, N_r * N_c * sizeof(int)); + + // + // This policy loops sequentially while exposing parallelism on + // one of the inner loops. + + //uncomment to use in example below + //using loop_policy_omp = RAJA::LoopPolicy; + using launch_policy_omp = RAJA::LaunchPolicy; + + RAJA::launch(RAJA::LaunchParams(), //LaunchParams may be empty when running on the host + [=] RAJA_HOST_DEVICE (RAJA::LaunchContext /*ctx*/) { + + + /// TODO... + /// + /// EXERCISE: Implement the loops to apply omp parallism and sequential + /// execution on the column and row loops respectively + /// + + //Atview(col, row) = Aview(row, col); + + + }); + + checkResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); +#endif +//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_CUDA) + std::cout << "\n Running cuda matrix transpose ...\n"; + + std::memset(At, 0, N_r * N_c * sizeof(int)); + + // _raja_mattranspose_cuda_start + using cuda_thread_x = RAJA::LoopPolicy; + using cuda_thread_y = RAJA::LoopPolicy; + + const bool async = false; //execute asynchronously + using launch_policy_cuda = RAJA::LaunchPolicy>; + + RAJA::launch( + RAJA::LaunchParams(RAJA::Teams(1), RAJA::Threads(16,16)), + [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { + + RAJA::loop(ctx, row_Range, [&] (int row) { + RAJA::loop(ctx, col_Range, [&] (int col) { + + Atview(col, row) = Aview(row, col); + + }); + }); + + }); + // _raja_mattranspose_cuda_end + + checkResult(Atview, N_c, N_r); + //printResult(Atview, N_c, N_r); +#endif + +//----------------------------------------------------------------------------// + + // + // Clean up. + // + memoryManager::deallocate(A); + memoryManager::deallocate(At); + + std::cout << "\n DONE!...\n"; + + return 0; +} + +// +// Function to check result and report P/F. 
+// +template +void checkResult(RAJA::View> Atview, int N_r, int N_c) +{ + bool match = true; + for (int row = 0; row < N_r; ++row) { + for (int col = 0; col < N_c; ++col) { + if (Atview(row, col) != row) { + match = false; + } + } + } + if (match) { + std::cout << "\n\t result -- PASS\n"; + } else { + std::cout << "\n\t result -- FAIL\n"; + } +}; + +// +// Function to print result. +// +template +void printResult(RAJA::View> Atview, int N_r, int N_c) +{ + std::cout << std::endl; + for (int row = 0; row < N_r; ++row) { + for (int col = 0; col < N_c; ++col) { + // std::cout << "At(" << row << "," << col << ") = " << Atview(row, col) + // << std::endl; + std::cout< +#include +#include +#include + +#include "memoryManager.hpp" + +#include "RAJA/RAJA.hpp" + +/* + * Tiled Matrix Transpose Exercise + * + * In this exercise, an input matrix A of dimension N_r x N_c is + * transposed and returned as a second matrix At. + * + * RAJA features shown: + * - Basic usage of 'RAJA::launch' abstractions for nested loops + * + * If CUDA is enabled, CUDA unified memory is used. + */ + +// +// Define dimensionality of matrices +// +constexpr int DIM = 2; + +// +// Function for checking results +// +template +void checkResult(RAJA::View> Atview, int N_r, int N_c); + +// +// Function for printing results +// +template +void printResult(RAJA::View> Atview, int N_r, int N_c); + + +int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +{ + + std::cout << "\n\nRAJA matrix transpose exercise...\n"; + + // + // Define num rows/cols in matrix. + // + // _mattranspose_dims_start + constexpr int N_r = 56; + constexpr int N_c = 75; + // _mattranspose_dims_end + + // + // Allocate matrix data + // + int *A = memoryManager::allocate(N_r * N_c); + int *At = memoryManager::allocate(N_r * N_c); + + // + // In the following implementations of matrix transpose, we + // use RAJA 'View' objects to access the matrix data. A RAJA view + // holds a pointer to a data array and enables multi-dimensional indexing + // into the data. + // + // _mattranspose_views_start + RAJA::View> Aview(A, N_r, N_c); + RAJA::View> Atview(At, N_c, N_r); + // _mattranspose_views_end + + // + // Initialize matrix data + // + for (int row = 0; row < N_r; ++row) { + for (int col = 0; col < N_c; ++col) { + Aview(row, col) = col; + } + } + //printResult(Aview, N_r, N_c); + + //----------------------------------------------------------------------------// + std::cout << "\n Running C-version of matrix transpose...\n"; + + std::memset(At, 0, N_r * N_c * sizeof(int)); + + // _cstyle_mattranspose_start + for (int row = 0; row < N_r; ++row) { + for (int col = 0; col < N_c; ++col) { + Atview(col, row) = Aview(row, col); + } + } + // _cstyle_mattranspose_end + + checkResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); + + //----------------------------------------------------------------------------// + + // + // The following RAJA variants use the RAJA::kernel method to carryout the + // transpose. + // + // Here, we define RAJA range segments to establish the iteration spaces. + // Iterations inside a RAJA loop is given by their global iteration number. + // + RAJA::TypedRangeSegment row_Range(0, N_r); + RAJA::TypedRangeSegment col_Range(0, N_c); + + //----------------------------------------------------------------------------// + std::cout << "\n Running sequential matrix transpose ...\n"; + std::memset(At, 0, N_r * N_c * sizeof(int)); + + // + // The following policy carries out the transpose + // using sequential loops. 
+ // + // _raja_mattranspose_start + using loop_policy_seq = RAJA::LoopPolicy; + using launch_policy_seq = RAJA::LaunchPolicy; + + RAJA::launch( + RAJA::LaunchParams(), //LaunchParams may be empty when running on the host + [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { + + RAJA::loop(ctx, row_Range, [&] (int row) { + RAJA::loop(ctx, col_Range, [&] (int col) { + + Atview(col, row) = Aview(row, col); + + }); + }); + + }); + // _raja_mattranspose_end + + checkResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); +//----------------------------------------------------------------------------// +#if defined(RAJA_ENABLE_OPENMP) + std::cout << "\n Running openmp matrix transpose - parallel top inner loop...\n"; + + std::memset(At, 0, N_r * N_c * sizeof(int)); + + // + // This policy loops sequentially while exposing parallelism on + // one of the inner loops. + // + using loop_policy_omp = RAJA::LoopPolicy; + using launch_policy_omp = RAJA::LaunchPolicy; + + RAJA::launch( + RAJA::LaunchParams(), //LaunchParams may be empty when running on the host + [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { + + RAJA::loop(ctx, row_Range, [&] (int row) { + RAJA::loop(ctx, col_Range, [&] (int col) { + + Atview(col, row) = Aview(row, col); + + }); + }); + + }); + + checkResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); +#endif +//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_CUDA) + std::cout << "\n Running cuda matrix transpose ...\n"; + + std::memset(At, 0, N_r * N_c * sizeof(int)); + + // _raja_mattranspose_cuda_start + using cuda_thread_x = RAJA::LoopPolicy; + using cuda_thread_y = RAJA::LoopPolicy; + + const bool async = false; //execute asynchronously + using launch_policy_cuda = RAJA::LaunchPolicy>; + + RAJA::launch + (RAJA::LaunchParams(RAJA::Teams(1), RAJA::Threads(16,16)), + [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { + + RAJA::loop(ctx, row_Range, [&] (int row) { + RAJA::loop(ctx, col_Range, [&] (int col) { + + Atview(col, row) = Aview(row, col); + + }); + }); + + }); + // _raja_mattranspose_cuda_end + + checkResult(Atview, N_c, N_r); + //printResult(Atview, N_c, N_r); +#endif + +//----------------------------------------------------------------------------// + + // + // Clean up. + // + memoryManager::deallocate(A); + memoryManager::deallocate(At); + + std::cout << "\n DONE!...\n"; + + return 0; +} + +// +// Function to check result and report P/F. +// +template +void checkResult(RAJA::View> Atview, int N_r, int N_c) +{ + bool match = true; + for (int row = 0; row < N_r; ++row) { + for (int col = 0; col < N_c; ++col) { + if (Atview(row, col) != row) { + match = false; + } + } + } + if (match) { + std::cout << "\n\t result -- PASS\n"; + } else { + std::cout << "\n\t result -- FAIL\n"; + } +}; + +// +// Function to print result. +// +template +void printResult(RAJA::View> Atview, int N_r, int N_c) +{ + std::cout << std::endl; + for (int row = 0; row < N_r; ++row) { + for (int col = 0; col < N_c; ++col) { + // std::cout << "At(" << row << "," << col << ") = " << Atview(row, col) + // << std::endl; + std::cout< +#include +#include +#include +#include + +#include "RAJA/RAJA.hpp" + +#include "camp/resource.hpp" + +#include "memoryManager.hpp" + +/* + * RAJA::Launch execution policies + * + * In this exercise, you will use a variety of nested-loop execution + * policies to initalize entries in a three-dimensional tensor. 
The + * goal of the exercise is to gain familiarity with RAJA::Launch + * execution policies for various RAJA execution back-ends. + * + * RAJA features you will use: + * - `RAJA::Launch` kernel execution template method and exec policies + * - Simple RAJA View/Layout + * - RAJA Range segment + * + * If CUDA is enabled, CUDA unified memory is used. + * If HIP is enabled, HIP global device memory is used, with explicit + * host-device mem copy operations. + */ + +#if defined(RAJA_ENABLE_CUDA) +// _cuda_tensorinit_kernel_start +template< int i_block_size, int j_block_size, int k_block_size > +__launch_bounds__(i_block_size*j_block_size*k_block_size) +__global__ void nested_init(double* a, double c, int N) +{ + int i = blockIdx.x * i_block_size + threadIdx.x; + int j = blockIdx.y * j_block_size + threadIdx.y; + int k = blockIdx.z; + + if ( i < N && j < N && k < N ) { + a[i+N*(j+N*k)] = c * i * j * k ; + } +} +// _cuda_tensorinit_kernel_end +#endif + +// +// Function to check result. +// +void checkResult(double* a, double* aref, const int n); + + +int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +{ + + std::cout << "\n\nExercise: RAJA::kernel execution policies tensor init...\n"; + +// _init_define_start +// +// 3D tensor has N^3 entries +// + constexpr int N = 100; + constexpr int N_tot = N * N * N; + constexpr double c = 0.0001; + double* a = memoryManager::allocate(N_tot); + double* a_ref = memoryManager::allocate(N_tot); +// _init_define_end + +//----------------------------------------------------------------------------// +// C-style sequential variant establishes reference solution to compare with. +//----------------------------------------------------------------------------// + + std::cout << "\n Running C-style sequential tensor init: create reference solution ...\n"; + +// _cstyle_tensorinit_seq_start + for (int k = 0; k < N; ++k ) { + for (int j = 0; j < N; ++j ) { + for (int i = 0; i < N; ++i ) { + a_ref[i+N*(j+N*k)] = c * i * j * k ; + } + } + } +// _cstyle_tensorinit_seq_end + + +//----------------------------------------------------------------------------// +// We introduce a RAJA View to wrap the tensor data pointer and simplify +// multi-dimensional indexing. +// We use this in the rest of the examples in this file. +//----------------------------------------------------------------------------// + + std::cout << "\n Running C-style sequential tensor init...\n"; + +// _3D_raja_view_start + RAJA::View< double, RAJA::Layout<3, int> > aView(a, N, N, N); +// _3D_raja_view_end + +// _cstyle_tensorinit_view_seq_start + for (int k = 0; k < N; ++k ) { + for (int j = 0; j < N; ++j ) { + for (int i = 0; i < N; ++i ) { + aView(i, j, k) = c * i * j * k ; + } + } + } +// _cstyle_tensorinit_view_seq_end + + checkResult(a, a_ref, N_tot); + +//----------------------------------------------------------------------------// + + std::cout << "\n Running RAJA sequential tensor init...\n"; + + // set tensor data to zero to ensure we initializing it correctly. + std::memset(a, 0, N_tot * sizeof(double)); + + /// + /// TODO... + /// + /// EXERCISE: Complete sequential RAJA::launch based version of the + /// the tensor initialization kernel. 
+ /// + +// _raja_tensorinit_seq_start + //using loop_policy_1 = RAJA::LoopPolicy; + using launch_policy_1 = RAJA::LaunchPolicy; + + RAJA::launch + (RAJA::LaunchParams(), //LaunchParams may be empty when running on the host + [=] RAJA_HOST_DEVICE (RAJA::LaunchContext /*ctx*/) { + /* + RAJA::loop(ctx, RAJA::TypedRangeSegment(0, N), [&] (int k) { + + //Add additional loop methods to complete the kernel + + }); + */ + }); +// _raja_tensorinit_seq_end + + checkResult(a, a_ref, N_tot); + +#if defined(RAJA_ENABLE_OPENMP) + +//----------------------------------------------------------------------------// +// C-style and RAJA OpenMP multithreading variants. +//----------------------------------------------------------------------------// + + std::cout << "\n Running C-style OpenMP tensor init...\n"; + + // set tensor data to zero to ensure we initializing it correctly. + std::memset(a, 0, N_tot * sizeof(double)); + + // _cstyle_tensorinit_omp_outer_start + #pragma omp parallel for + for (int k = 0; k < N; ++k ) { + for (int j = 0; j < N; ++j ) { + for (int i = 0; i < N; ++i ) { + aView(i, j, k) = c * i * j * k ; + } + } + } +// _cstyle_tensorinit_omp_outer_end + + checkResult(a, a_ref, N_tot); + +//----------------------------------------------------------------------------// + + std::cout << "\n Running RAJA OpenMP tensor init...\n"; + + // set tensor data to zero to ensure we initializing it correctly. + std::memset(a, 0, N_tot * sizeof(double)); + + /// + /// TODO... + /// + /// EXERCISE: Complete an OpenMP RAJA::launch based version of the + /// kernel that creates a parallel outer loop. + /// + +// _raja_tensorinit_omp_outer_start + /* + using omp_policy_2 = RAJA::LoopPolicy; + using loop_policy_2 = RAJA::LoopPolicy; + */ + using launch_policy_2 = RAJA::LaunchPolicy; + + RAJA::launch + (RAJA::LaunchParams(), //LaunchParams may be empty when running on the host + [=] RAJA_HOST_DEVICE (RAJA::LaunchContext /*ctx*/) { + + //TODO: Use the omp_policy_2 to distribute loop iterations + //in a RAJA::loop method + /* + RAJA::loop(ctx, RAJA::TypedRangeSegment(0, N), [&] (int j) { + RAJA::loop(ctx, RAJA::TypedRangeSegment(0, N), [&] (int i) { + + + }); + }); + */ + + }); +// _raja_tensorinit_omp_outer_end + + checkResult(a, a_ref, N_tot); +#endif +//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_CUDA) + + // + // Define total thread-block size and size of each block dimension + // +// _cuda_blockdim_start + constexpr int block_size = 256; + constexpr int i_block_sz = 32; + constexpr int j_block_sz = block_size / i_block_sz; + constexpr int k_block_sz = 1; + + const int n_blocks_i = RAJA_DIVIDE_CEILING_INT(N,i_block_sz); + const int n_blocks_j = RAJA_DIVIDE_CEILING_INT(N,j_block_sz); + const int n_blocks_k = RAJA_DIVIDE_CEILING_INT(N,k_block_sz); +// _cuda_blockdim_end + +//----------------------------------------------------------------------------// +// C-style and RAJA CUDA GPU variants. +//----------------------------------------------------------------------------// + + std::cout << "\n Running RAJA CUDA tensor init...\n"; + + // set tensor data to zero to ensure we initializing it correctly. 
+ std::memset(a, 0, N_tot * sizeof(double)); + +// _raja_tensorinit_cuda_start + using cuda_teams_z_3 = RAJA::LoopPolicy; + using cuda_global_thread_y_3 = RAJA::LoopPolicy; + using cuda_global_thread_x_3 = RAJA::LoopPolicy; + + const bool async_3 = false; + using launch_policy_3 = RAJA::LaunchPolicy>; + + RAJA::launch + (RAJA::LaunchParams(RAJA::Teams(n_blocks_i ,n_blocks_j, n_blocks_k), + RAJA::Threads(i_block_sz, j_block_sz, k_block_sz)), + [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { + + RAJA::loop(ctx, RAJA::TypedRangeSegment(0, N), [&] (int k) { + RAJA::loop(ctx, RAJA::TypedRangeSegment(0, N), [&] (int j) { + RAJA::loop(ctx, RAJA::TypedRangeSegment(0, N), [&] (int i) { + + aView(i, j, k) = c * i * j * k ; + + }); + }); + }); + }); + +// _raja_tensorinit_cuda_end + + checkResult(a, a_ref, N_tot); + +//----------------------------------------------------------------------------// + + std::cout << "\n Running RAJA CUDA tensor init tiled-direct...\n"; + + // set tensor data to zero to ensure we initializing it correctly. + std::memset(a, 0, N_tot * sizeof(double)); + +// _raja_tensorinit_cuda_tiled_direct_start + using cuda_teams_z_4 = RAJA::LoopPolicy; + using cuda_teams_y_4 = RAJA::LoopPolicy; + using cuda_teams_x_4 = RAJA::LoopPolicy; + + using cuda_threads_y_4 = RAJA::LoopPolicy; + using cuda_threads_x_4 = RAJA::LoopPolicy; + + const bool async_4 = false; + using launch_policy_4 = RAJA::LaunchPolicy>; + + RAJA::launch + (RAJA::LaunchParams(RAJA::Teams(n_blocks_i, n_blocks_j, n_blocks_k), + RAJA::Threads(i_block_sz, j_block_sz, k_block_sz)), + [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { + + RAJA::loop(ctx, RAJA::TypedRangeSegment(0, N), [&] (int k) { + + RAJA::tile + (ctx, j_block_sz, RAJA::TypedRangeSegment(0, N), [&] (RAJA::TypedRangeSegment const &j_tile) { + + RAJA::tile + (ctx, i_block_sz, RAJA::TypedRangeSegment(0, N), [&] (RAJA::TypedRangeSegment const &i_tile) { + + RAJA::loop(ctx, j_tile, [&] (int j) { + RAJA::loop(ctx, i_tile, [&] (int i) { + + aView(i, j, k) = c * i * j * k ; + + }); + }); + + }); + }); + + }); + }); +// _raja_tensorinit_cuda_tiled_direct_end + + checkResult(a, a_ref, N_tot); + +//----------------------------------------------------------------------------// + + std::cout << "\n Running CUDA tensor init tiled-direct...\n"; + + // set tensor data to zero to ensure we initializing it correctly. 
+ std::memset(a, 0, N_tot * sizeof(double)); + +// _cuda_tensorinit_tiled_direct_start + dim3 nthreads_per_block(i_block_sz, j_block_sz, k_block_sz); + static_assert(i_block_sz*j_block_sz*k_block_sz == block_size, + "Invalid block_size"); + + dim3 nblocks(static_cast(RAJA_DIVIDE_CEILING_INT(N, i_block_sz)), + static_cast(RAJA_DIVIDE_CEILING_INT(N, j_block_sz)), + static_cast(RAJA_DIVIDE_CEILING_INT(N, k_block_sz))); + + nested_init + <<>>(a, c, N); + cudaErrchk( cudaGetLastError() ); + cudaErrchk(cudaDeviceSynchronize()); +// _cuda_tensorinit_tiled_direct_end + + checkResult(a, a_ref, N_tot); + +#endif // if defined(RAJA_ENABLE_CUDA) + + +#if defined(RAJA_ENABLE_HIP) + + // + // Define total thread-block size and size of each block dimension + // + constexpr int block_size = 256; + constexpr int i_block_sz = 32; + constexpr int j_block_sz = block_size / i_block_sz; + constexpr int k_block_sz = 1; + + const int n_blocks_i = RAJA_DIVIDE_CEILING_INT(N,i_block_sz); + const int n_blocks_j = RAJA_DIVIDE_CEILING_INT(N,j_block_sz); + const int n_blocks_k = RAJA_DIVIDE_CEILING_INT(N,k_block_sz); + +//----------------------------------------------------------------------------// +// RAJA HIP GPU variants. +//----------------------------------------------------------------------------// + + std::cout << "\n Running RAJA HIP tensor init...\n"; + + // set tensor data to zero to ensure we initializing it correctly. + std::memset(a, 0, N_tot * sizeof(double)); + double *d_a = memoryManager::allocate_gpu(N_tot); + +// _3D_raja_device_view_start + RAJA::View< double, RAJA::Layout<3, int> > d_aView(d_a, N, N, N); +// _3D_raja_deviceview_end + + hipErrchk(hipMemcpy( d_a, a, N_tot * sizeof(double), hipMemcpyHostToDevice )); + +// _raja_tensorinit_hip_start + using hip_teams_z_5 = RAJA::LoopPolicy; + using hip_global_thread_y_5 = RAJA::LoopPolicy; + using hip_global_thread_x_5 = RAJA::LoopPolicy; + + const bool async_5 = false; + using launch_policy_5 = RAJA::LaunchPolicy>; + + RAJA::launch + (RAJA::LaunchParams(RAJA::Teams(n_blocks_i, n_blocks_j, n_blocks_k), + RAJA::Threads(i_block_sz, j_block_sz, k_block_sz)), + [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { + + RAJA::loop(ctx, RAJA::TypedRangeSegment(0, N), [&] (int k) { + RAJA::loop(ctx, RAJA::TypedRangeSegment(0, N), [&] (int j) { + RAJA::loop(ctx, RAJA::TypedRangeSegment(0, N), [&] (int i) { + + d_aView(i, j, k) = c * i * j * k ; + + }); + }); + }); + + }); +// _raja_tensorinit_hip_end + + hipErrchk(hipMemcpy( a, d_a, N_tot * sizeof(double), hipMemcpyDeviceToHost )); + checkResult(a, a_ref, N_tot); + +//----------------------------------------------------------------------------// + + std::cout << "\n Running RAJA HIP tensor init tiled-direct...\n"; + + // set tensor data to zero to ensure we initializing it correctly. 
+ std::memset(a, 0, N_tot * sizeof(double)); + hipErrchk(hipMemcpy( d_a, a, N_tot * sizeof(double), hipMemcpyHostToDevice )); + +// _raja_tensorinit_hip_tiled_direct_start + using hip_teams_z_6 = RAJA::LoopPolicy; + using hip_teams_y_6 = RAJA::LoopPolicy; + using hip_teams_x_6 = RAJA::LoopPolicy; + + using hip_threads_y_6 = RAJA::LoopPolicy; + using hip_threads_x_6 = RAJA::LoopPolicy; + + const bool async_6 = false; + using launch_policy_6 = RAJA::LaunchPolicy>; + + RAJA::launch + (RAJA::LaunchParams(RAJA::Teams(n_blocks_i, n_blocks_j, n_blocks_k), + RAJA::Threads(i_block_sz, j_block_sz, k_block_sz)), + [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { + + RAJA::loop(ctx, RAJA::TypedRangeSegment(0, N), [&] (int k) { + + RAJA::tile + (ctx, j_block_sz, RAJA::TypedRangeSegment(0, N), [&] (RAJA::TypedRangeSegment const &j_tile) { + + RAJA::tile + (ctx, i_block_sz, RAJA::TypedRangeSegment(0, N), [&] (RAJA::TypedRangeSegment const &i_tile) { + + RAJA::loop(ctx, j_tile, [&] (int j) { + RAJA::loop(ctx, i_tile, [&] (int i) { + + d_aView(i, j, k) = c * i * j * k ; + + }); + }); + + }); + }); + + }); + }); +// _raja_tensorinit_hip_tiled_direct_end + + hipErrchk(hipMemcpy( a, d_a, N_tot * sizeof(double), hipMemcpyDeviceToHost )); + checkResult(a, a_ref, N_tot); + + memoryManager::deallocate_gpu(d_a); + +#endif // if defined(RAJA_ENABLE_HIP) + +//----------------------------------------------------------------------------// + + // Clean up... + memoryManager::deallocate(a); + memoryManager::deallocate(a_ref); + + std::cout << "\n DONE!...\n"; + + return 0; +} + +// +// Function to compare result to reference and print result P/F. +// +void checkResult(double* a, double* aref, const int n) +{ + bool correct = true; + + int i = 0; + while ( correct && (i < n) ) { + correct = std::abs(a[i] - aref[i]) < 10e-12; + i++; + } + + if ( correct ) { + std::cout << "\n\t result -- PASS\n"; + } else { + std::cout << "\n\t result -- FAIL\n"; + } +} diff --git a/exercises/launchintro-execpols_solution.cpp b/exercises/launchintro-execpols_solution.cpp new file mode 100644 index 0000000000..0dfda9f9f0 --- /dev/null +++ b/exercises/launchintro-execpols_solution.cpp @@ -0,0 +1,473 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-22, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include +#include +#include +#include +#include + +#include "RAJA/RAJA.hpp" + +#include "camp/resource.hpp" + +#include "memoryManager.hpp" + +/* + * RAJA::Launch execution policies + * + * In this exercise, you will use a variety of nested-loop execution + * policies to initalize entries in a three-dimensional tensor. The + * goal of the exercise is to gain familiarity with RAJA::Launch + * execution policies for various RAJA execution back-ends. + * + * RAJA features you will use: + * - `RAJA::Launch` kernel execution template method and exec policies + * - Simple RAJA View/Layout + * - RAJA Range segment + * + * If CUDA is enabled, CUDA unified memory is used. + * If HIP is enabled, HIP global device memory is used, with explicit + * host-device mem copy operations. 
+ */ + +#if defined(RAJA_ENABLE_CUDA) +// _cuda_tensorinit_kernel_start +template< int i_block_size, int j_block_size, int k_block_size > +__launch_bounds__(i_block_size*j_block_size*k_block_size) +__global__ void nested_init(double* a, double c, int N) +{ + int i = blockIdx.x * i_block_size + threadIdx.x; + int j = blockIdx.y * j_block_size + threadIdx.y; + int k = blockIdx.z; + + if ( i < N && j < N && k < N ) { + a[i+N*(j+N*k)] = c * i * j * k ; + } +} +// _cuda_tensorinit_kernel_end +#endif + +// +// Function to check result. +// +void checkResult(double* a, double* aref, const int n); + + +int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +{ + + std::cout << "\n\nExercise: RAJA::kernel execution policies tensor init...\n"; + +// _init_define_start +// +// 3D tensor has N^3 entries +// + constexpr int N = 100; + constexpr int N_tot = N * N * N; + constexpr double c = 0.0001; + double* a = memoryManager::allocate(N_tot); + double* a_ref = memoryManager::allocate(N_tot); +// _init_define_end + +//----------------------------------------------------------------------------// +// C-style sequential variant establishes reference solution to compare with. +//----------------------------------------------------------------------------// + + std::cout << "\n Running C-style sequential tensor init: create reference solution ...\n"; + +// _cstyle_tensorinit_seq_start + for (int k = 0; k < N; ++k ) { + for (int j = 0; j < N; ++j ) { + for (int i = 0; i < N; ++i ) { + a_ref[i+N*(j+N*k)] = c * i * j * k ; + } + } + } +// _cstyle_tensorinit_seq_end + + +//----------------------------------------------------------------------------// +// We introduce a RAJA View to wrap the tensor data pointer and simplify +// multi-dimensional indexing. +// We use this in the rest of the examples in this file. +//----------------------------------------------------------------------------// + + std::cout << "\n Running C-style sequential tensor init...\n"; + +// _3D_raja_view_start + RAJA::View< double, RAJA::Layout<3, int> > aView(a, N, N, N); +// _3D_raja_view_end + +// _cstyle_tensorinit_view_seq_start + for (int k = 0; k < N; ++k ) { + for (int j = 0; j < N; ++j ) { + for (int i = 0; i < N; ++i ) { + aView(i, j, k) = c * i * j * k ; + } + } + } +// _cstyle_tensorinit_view_seq_end + + checkResult(a, a_ref, N_tot); + +//----------------------------------------------------------------------------// + + std::cout << "\n Running RAJA sequential tensor init...\n"; + + // set tensor data to zero to ensure we initializing it correctly. + std::memset(a, 0, N_tot * sizeof(double)); + +// _raja_tensorinit_seq_start + using loop_policy_1 = RAJA::LoopPolicy; + using launch_policy_1 = RAJA::LaunchPolicy; + + RAJA::launch + (RAJA::LaunchParams(), //LaunchParams may be empty when running on the host + [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { + + RAJA::loop(ctx, RAJA::TypedRangeSegment(0, N), [&] (int k) { + RAJA::loop(ctx, RAJA::TypedRangeSegment(0, N), [&] (int j) { + RAJA::loop(ctx, RAJA::TypedRangeSegment(0, N), [&] (int i) { + + aView(i, j, k) = c * i * j * k ; + + }); + }); + }); + }); +// _raja_tensorinit_seq_end + + checkResult(a, a_ref, N_tot); + +#if defined(RAJA_ENABLE_OPENMP) + +//----------------------------------------------------------------------------// +// C-style and RAJA OpenMP multithreading variants. 
+//----------------------------------------------------------------------------// + + std::cout << "\n Running C-style OpenMP tensor init...\n"; + + // set tensor data to zero to ensure we initializing it correctly. + std::memset(a, 0, N_tot * sizeof(double)); + + // _cstyle_tensorinit_omp_outer_start + #pragma omp parallel for + for (int k = 0; k < N; ++k ) { + for (int j = 0; j < N; ++j ) { + for (int i = 0; i < N; ++i ) { + aView(i, j, k) = c * i * j * k ; + } + } + } +// _cstyle_tensorinit_omp_outer_end + + checkResult(a, a_ref, N_tot); + +//----------------------------------------------------------------------------// + + std::cout << "\n Running RAJA OpenMP tensor init...\n"; + + // set tensor data to zero to ensure we initializing it correctly. + std::memset(a, 0, N_tot * sizeof(double)); + +// _raja_tensorinit_omp_outer_start + using omp_policy_2 = RAJA::LoopPolicy; + using loop_policy_2 = RAJA::LoopPolicy; + using launch_policy_2 = RAJA::LaunchPolicy; + + RAJA::launch + (RAJA::LaunchParams(), //LaunchParams may be empty when running on the host + [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { + + RAJA::loop(ctx, RAJA::TypedRangeSegment(0, N), [&] (int k) { + RAJA::loop(ctx, RAJA::TypedRangeSegment(0, N), [&] (int j) { + RAJA::loop(ctx, RAJA::TypedRangeSegment(0, N), [&] (int i) { + + aView(i, j, k) = c * i * j * k ; + + }); + }); + }); + }); +// _raja_tensorinit_omp_outer_end + + checkResult(a, a_ref, N_tot); +#endif +//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_CUDA) + + // + // Define total thread-block size and size of each block dimension + // +// _cuda_blockdim_start + constexpr int block_size = 256; + constexpr int i_block_sz = 32; + constexpr int j_block_sz = block_size / i_block_sz; + constexpr int k_block_sz = 1; + + const int n_blocks_i = RAJA_DIVIDE_CEILING_INT(N,i_block_sz); + const int n_blocks_j = RAJA_DIVIDE_CEILING_INT(N,j_block_sz); + const int n_blocks_k = RAJA_DIVIDE_CEILING_INT(N,k_block_sz); +// _cuda_blockdim_end + +//----------------------------------------------------------------------------// +// C-style and RAJA CUDA GPU variants. +//----------------------------------------------------------------------------// + + std::cout << "\n Running RAJA CUDA tensor init...\n"; + + // set tensor data to zero to ensure we initializing it correctly. + std::memset(a, 0, N_tot * sizeof(double)); + +// _raja_tensorinit_cuda_start + using cuda_teams_z_3 = RAJA::LoopPolicy; + using cuda_global_thread_y_3 = RAJA::LoopPolicy; + using cuda_global_thread_x_3 = RAJA::LoopPolicy; + + const bool async_3 = false; + using launch_policy_3 = RAJA::LaunchPolicy>; + + RAJA::launch + (RAJA::LaunchParams(RAJA::Teams(n_blocks_i ,n_blocks_j, n_blocks_k), + RAJA::Threads(i_block_sz, j_block_sz, k_block_sz)), + [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { + + RAJA::loop(ctx, RAJA::TypedRangeSegment(0, N), [&] (int k) { + RAJA::loop(ctx, RAJA::TypedRangeSegment(0, N), [&] (int j) { + RAJA::loop(ctx, RAJA::TypedRangeSegment(0, N), [&] (int i) { + + aView(i, j, k) = c * i * j * k ; + + }); + }); + }); + }); + +// _raja_tensorinit_cuda_end + + checkResult(a, a_ref, N_tot); + +//----------------------------------------------------------------------------// + + std::cout << "\n Running RAJA CUDA tensor init tiled-direct...\n"; + + // set tensor data to zero to ensure we initializing it correctly. 
+ std::memset(a, 0, N_tot * sizeof(double)); + +// _raja_tensorinit_cuda_tiled_direct_start + using cuda_teams_z_4 = RAJA::LoopPolicy; + using cuda_teams_y_4 = RAJA::LoopPolicy; + using cuda_teams_x_4 = RAJA::LoopPolicy; + + using cuda_threads_y_4 = RAJA::LoopPolicy; + using cuda_threads_x_4 = RAJA::LoopPolicy; + + const bool async_4 = false; + using launch_policy_4 = RAJA::LaunchPolicy>; + + RAJA::launch + (RAJA::LaunchParams(RAJA::Teams(n_blocks_i, n_blocks_j, n_blocks_k), + RAJA::Threads(i_block_sz, j_block_sz, k_block_sz)), + [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { + + RAJA::loop(ctx, RAJA::TypedRangeSegment(0, N), [&] (int k) { + + RAJA::tile + (ctx, j_block_sz, RAJA::TypedRangeSegment(0, N), [&] (RAJA::TypedRangeSegment const &j_tile) { + + RAJA::tile + (ctx, i_block_sz, RAJA::TypedRangeSegment(0, N), [&] (RAJA::TypedRangeSegment const &i_tile) { + + RAJA::loop(ctx, j_tile, [&] (int j) { + RAJA::loop(ctx, i_tile, [&] (int i) { + + aView(i, j, k) = c * i * j * k ; + + }); + }); + + }); + }); + + }); + }); +// _raja_tensorinit_cuda_tiled_direct_end + + checkResult(a, a_ref, N_tot); + +//----------------------------------------------------------------------------// + + std::cout << "\n Running CUDA tensor init tiled-direct...\n"; + + // set tensor data to zero to ensure we initializing it correctly. + std::memset(a, 0, N_tot * sizeof(double)); + +// _cuda_tensorinit_tiled_direct_start + dim3 nthreads_per_block(i_block_sz, j_block_sz, k_block_sz); + static_assert(i_block_sz*j_block_sz*k_block_sz == block_size, + "Invalid block_size"); + + dim3 nblocks(static_cast(RAJA_DIVIDE_CEILING_INT(N, i_block_sz)), + static_cast(RAJA_DIVIDE_CEILING_INT(N, j_block_sz)), + static_cast(RAJA_DIVIDE_CEILING_INT(N, k_block_sz))); + + nested_init + <<>>(a, c, N); + cudaErrchk( cudaGetLastError() ); + cudaErrchk(cudaDeviceSynchronize()); +// _cuda_tensorinit_tiled_direct_end + + checkResult(a, a_ref, N_tot); + +#endif // if defined(RAJA_ENABLE_CUDA) + + +#if defined(RAJA_ENABLE_HIP) + + // + // Define total thread-block size and size of each block dimension + // + constexpr int block_size = 256; + constexpr int i_block_sz = 32; + constexpr int j_block_sz = block_size / i_block_sz; + constexpr int k_block_sz = 1; + + const int n_blocks_i = RAJA_DIVIDE_CEILING_INT(N,i_block_sz); + const int n_blocks_j = RAJA_DIVIDE_CEILING_INT(N,j_block_sz); + const int n_blocks_k = RAJA_DIVIDE_CEILING_INT(N,k_block_sz); + +//----------------------------------------------------------------------------// +// RAJA HIP GPU variants. +//----------------------------------------------------------------------------// + + std::cout << "\n Running RAJA HIP tensor init...\n"; + + // set tensor data to zero to ensure we initializing it correctly. 
+ std::memset(a, 0, N_tot * sizeof(double)); + double *d_a = memoryManager::allocate_gpu(N_tot); + +// _3D_raja_device_view_start + RAJA::View< double, RAJA::Layout<3, int> > d_aView(d_a, N, N, N); +// _3D_raja_deviceview_end + + hipErrchk(hipMemcpy( d_a, a, N_tot * sizeof(double), hipMemcpyHostToDevice )); + +// _raja_tensorinit_hip_start + using hip_teams_z_5 = RAJA::LoopPolicy; + using hip_global_thread_y_5 = RAJA::LoopPolicy; + using hip_global_thread_x_5 = RAJA::LoopPolicy; + + const bool async_5 = false; + using launch_policy_5 = RAJA::LaunchPolicy>; + + RAJA::launch + (RAJA::LaunchParams(RAJA::Teams(n_blocks_i, n_blocks_j, n_blocks_k), + RAJA::Threads(i_block_sz, j_block_sz, k_block_sz)), + [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { + + RAJA::loop(ctx, RAJA::TypedRangeSegment(0, N), [&] (int k) { + RAJA::loop(ctx, RAJA::TypedRangeSegment(0, N), [&] (int j) { + RAJA::loop(ctx, RAJA::TypedRangeSegment(0, N), [&] (int i) { + + d_aView(i, j, k) = c * i * j * k ; + + }); + }); + }); + + }); +// _raja_tensorinit_hip_end + + hipErrchk(hipMemcpy( a, d_a, N_tot * sizeof(double), hipMemcpyDeviceToHost )); + checkResult(a, a_ref, N_tot); + +//----------------------------------------------------------------------------// + + std::cout << "\n Running RAJA HIP tensor init tiled-direct...\n"; + + // set tensor data to zero to ensure we initializing it correctly. + std::memset(a, 0, N_tot * sizeof(double)); + hipErrchk(hipMemcpy( d_a, a, N_tot * sizeof(double), hipMemcpyHostToDevice )); + +// _raja_tensorinit_hip_tiled_direct_start + using hip_teams_z_6 = RAJA::LoopPolicy; + using hip_teams_y_6 = RAJA::LoopPolicy; + using hip_teams_x_6 = RAJA::LoopPolicy; + + using hip_threads_y_6 = RAJA::LoopPolicy; + using hip_threads_x_6 = RAJA::LoopPolicy; + + const bool async_6 = false; + using launch_policy_6 = RAJA::LaunchPolicy>; + + RAJA::launch + (RAJA::LaunchParams(RAJA::Teams(n_blocks_i, n_blocks_j, n_blocks_k), + RAJA::Threads(i_block_sz, j_block_sz, k_block_sz)), + [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { + + RAJA::loop(ctx, RAJA::TypedRangeSegment(0, N), [&] (int k) { + + RAJA::tile + (ctx, j_block_sz, RAJA::TypedRangeSegment(0, N), [&] (RAJA::TypedRangeSegment const &j_tile) { + + RAJA::tile + (ctx, i_block_sz, RAJA::TypedRangeSegment(0, N), [&] (RAJA::TypedRangeSegment const &i_tile) { + + RAJA::loop(ctx, j_tile, [&] (int j) { + RAJA::loop(ctx, i_tile, [&] (int i) { + + d_aView(i, j, k) = c * i * j * k ; + + }); + }); + + }); + }); + + }); + }); +// _raja_tensorinit_hip_tiled_direct_end + + hipErrchk(hipMemcpy( a, d_a, N_tot * sizeof(double), hipMemcpyDeviceToHost )); + checkResult(a, a_ref, N_tot); + + memoryManager::deallocate_gpu(d_a); + +#endif // if defined(RAJA_ENABLE_HIP) + +//----------------------------------------------------------------------------// + + // Clean up... + memoryManager::deallocate(a); + memoryManager::deallocate(a_ref); + + std::cout << "\n DONE!...\n"; + + return 0; +} + +// +// Function to compare result to reference and print result P/F. 
+// +void checkResult(double* a, double* aref, const int n) +{ + bool correct = true; + + int i = 0; + while ( correct && (i < n) ) { + correct = std::abs(a[i] - aref[i]) < 10e-12; + i++; + } + + if ( correct ) { + std::cout << "\n\t result -- PASS\n"; + } else { + std::cout << "\n\t result -- FAIL\n"; + } +} diff --git a/exercises/memoryManager.hpp b/exercises/memoryManager.hpp new file mode 100644 index 0000000000..ef0a430f1b --- /dev/null +++ b/exercises/memoryManager.hpp @@ -0,0 +1,101 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-22, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#ifndef EXAMPLES_MEMORYMANAGER_HPP +#define EXAMPLES_MEMORYMANAGER_HPP + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_CUDA) +#include "RAJA/policy/cuda/raja_cudaerrchk.hpp" +#endif + +#if defined(RAJA_ENABLE_HIP) +#include "RAJA/policy/hip/raja_hiperrchk.hpp" +#endif + +/* + As RAJA does not manage memory we include a general purpose memory + manager which may be used to perform c++ style allocation/deallocation + or allocate/deallocate CUDA unified memory. The type of memory allocated + is dependent on how RAJA was configured. +*/ +namespace memoryManager +{ + +#if defined(RAJA_ENABLE_SYCL) + static camp::resources::Resource* sycl_res; +#endif + +template +T *allocate(RAJA::Index_type size) +{ + T *ptr; +#if defined(RAJA_ENABLE_CUDA) + cudaErrchk( + cudaMallocManaged((void **)&ptr, sizeof(T) * size, cudaMemAttachGlobal)); +#elif defined(RAJA_ENABLE_HIP) + hipErrchk(hipMalloc((void **)&ptr, sizeof(T) * size)); +#elif defined(RAJA_ENABLE_SYCL) + ptr = sycl_res->allocate(size, camp::resources::MemoryAccess::Managed); +#else + ptr = new T[size]; +#endif + return ptr; +} + +template +void deallocate(T *&ptr) +{ + if (ptr) { +#if defined(RAJA_ENABLE_CUDA) + cudaErrchk(cudaFree(ptr)); +#elif defined(RAJA_ENABLE_HIP) + hipErrchk(hipFree(ptr)); +#elif defined(RAJA_ENABLE_SYCL) + sycl_res->deallocate(ptr); +#else + delete[] ptr; +#endif + ptr = nullptr; + } +} + +#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) || defined(RAJA_ENABLE_SYCL) + template + T *allocate_gpu(RAJA::Index_type size) + { + T *ptr; +#if defined(RAJA_ENABLE_CUDA) + cudaErrchk(cudaMalloc((void **)&ptr, sizeof(T) * size)); +#elif defined(RAJA_ENABLE_HIP) + hipErrchk(hipMalloc((void **)&ptr, sizeof(T) * size)); +#elif defined(RAJA_ENABLE_SYCL) + auto qu = sycl_res->get().get_queue(); + ptr = cl::sycl::malloc_device(size, *qu); +#endif + return ptr; + } + + template + void deallocate_gpu(T *&ptr) + { + if (ptr) { +#if defined(RAJA_ENABLE_CUDA) + cudaErrchk(cudaFree(ptr)); +#elif defined(RAJA_ENABLE_HIP) + hipErrchk(hipFree(ptr)); +#elif defined(RAJA_ENABLE_SYCL) + sycl_res->deallocate(ptr); +#endif + ptr = nullptr; + } + } +#endif + +}; // namespace memoryManager +#endif diff --git a/exercises/offset-layout-stencil.cpp b/exercises/offset-layout-stencil.cpp new file mode 100644 index 0000000000..e12fc0a268 --- /dev/null +++ b/exercises/offset-layout-stencil.cpp @@ -0,0 +1,399 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-22, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include +#include +#include +#include + +#include "RAJA/RAJA.hpp" +#include "memoryManager.hpp" + +/* + * Offset Layout Stencil Exercise + * + * This exercise applies a five-point stencil to the interior cells of a + * lattice and stores the resulting sums in a second lattice of equal size. + * You can think of the lattice as representing the centers of cells on a + * two-dimensional Cartesian mesh. + * + * The five-point stencil accumulates values of a cell and its four neighbors. + * Assuming the cells of a lattice may be accessed through a row/col fashion, + * the stencil may be expressed as the following sum: + * + * output(row, col) = input(row, col) + + * input(row - 1, col) + input(row + 1, col) + + * input(row, col - 1) + input(row, col + 1) + * + * We assume a lattice has N_r x N_c interior nodes and a padded edge of zeros + * for a lattice of size (N_r + 2) x (N_c + 2). + * + * In the case of N_r = N_c = 3, the input lattice values are: + * + * --------------------- + * | 0 | 0 | 0 | 0 | 0 | + * --------------------- + * | 0 | 1 | 1 | 1 | 0 | + * --------------------- + * | 0 | 1 | 1 | 1 | 0 | + * --------------------- + * | 0 | 1 | 1 | 1 | 0 | + * --------------------- + * | 0 | 0 | 0 | 0 | 0 | + * --------------------- + * + * after the computation, we expect the output lattice to have values: + * + * --------------------- + * | 0 | 0 | 0 | 0 | 0 | + * --------------------- + * | 0 | 3 | 4 | 3 | 0 | + * --------------------- + * | 0 | 4 | 5 | 4 | 0 | + * --------------------- + * | 0 | 3 | 4 | 3 | 0 | + * --------------------- + * | 0 | 0 | 0 | 0 | 0 | + * --------------------- + * + * In this exercise, we use RAJA::OffsetLayout and RAJA::View objects to + * simplify the indexing to perform the stencil calculation. For the + * purposes of discussion, we enumerate the lattice in the following manner: + * + * -------------------------------------------------- + * | (-1, 3) | (0, 3) | (1, 3) | (2, 3) | (3, 3) | + * -------------------------------------------------- + * | (-1, 2) | (0, 2) | (1, 2) | (2, 2) | (3, 2) | + * -------------------------------------------------- + * | (-1, 1) | (0, 1) | (1, 1) | (2, 1) | (3, 1) | + * -------------------------------------------------- + * | (-1, 0) | (0, 0) | (1, 0) | (2, 0) | (3, 0) | + * --------------------------------------------------- + * | (-1,-1) | (0, -1) | (1, -1) | (2, -1) | (3, -1) | + * --------------------------------------------------- + * + * Notably (0, 0) corresponds to the bottom left corner of the stencil + * interior region to which we apply stencil. + * + * RAJA features shown: + * - RAJA::kernel kernel execution method and execution policies + * - RAJA::View + * - RAJA::Layout + * + * For the CUDA implementation, we use unified memory to hold the lattice data. + * For HIP, we use explicit host-device memory and manually copy data between + * the two. 
+ */ + +/* + * Define number of threads in x and y dimensions of a GPU thread block + */ +#if defined(RAJA_ENABLE_CUDA) +#define CUDA_BLOCK_SIZE 16 +#endif + +#if defined(RAJA_ENABLE_HIP) +#define HIP_BLOCK_SIZE 16 +#endif + +// +// Functions for printing and checking results +// +void printLattice(int* lattice, int N_r, int N_c); +void checkResult(int* compLattice, int* refLattice, int totCells); + +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) +{ + + std::cout << "\n\nFive-point stencil example...\n"; + +// _stencil_define_start +// +// Define num of interior cells in row/cols in a lattice +// + constexpr int N_r = 5; + constexpr int N_c = 4; + +// +// Define total num of cells in rows/cols in a lattice +// + constexpr int totCellsInRow = N_r + 2; + constexpr int totCellsInCol = N_c + 2; + +// +// Define total num of cells in a lattice +// + constexpr int totCells = totCellsInRow * totCellsInCol; +// _stencil_define_end + +// +// Allocate and initialize lattice +// + int* input = memoryManager::allocate(totCells * sizeof(int)); + int* output = memoryManager::allocate(totCells * sizeof(int)); + int* output_ref = memoryManager::allocate(totCells * sizeof(int)); + + std::memset(input, 0, totCells * sizeof(int)); + std::memset(output, 0, totCells * sizeof(int)); + std::memset(output_ref, 0, totCells * sizeof(int)); + +// +// C-Style intialization +// +// _stencil_input_init_start + for (int row = 1; row <= N_r; ++row) { + for (int col = 1; col <= N_c; ++col) { + int id = col + totCellsInCol * row; + input[id] = 1; + } + } +// _stencil_input_init_end + + std::cout << "\ninput lattice:\n"; + printLattice(input, totCellsInRow, totCellsInCol); + +// +// Generate reference solution +// +// _stencil_output_ref_start + for (int row = 1; row <= N_r; ++row) { + for (int col = 1; col <= N_c; ++col) { + + int id = col + totCellsInCol * row; + output_ref[id] = input[id] + input[id + 1] + + input[id - 1] + + input[id + totCellsInCol] + + input[id - totCellsInCol]; + } + } +// _stencil_output_ref_end + + std::cout << "\noutput reference lattice:\n"; + printLattice(output_ref, totCellsInRow, totCellsInCol); + +//----------------------------------------------------------------------------// + +// +// The following code illustrates pairing an offset layout and a RAJA view +// object to simplify multidimensional indexing. +// An offset layout is constructed by using the make_offset_layout method. +// The first argument of the layout is an array object with the coordinates of +// the bottom left corner of the lattice, and the second argument is an array +// object of the coordinates of the top right corner plus 1. +// The example uses double braces to initiate the array object and its +// subobjects. 
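The layout and view declarations that follow appear in this listing with their template arguments stripped. Restored with the arguments one would normally write, they look roughly like the sketch below; the <DIM> rank and the int element type are assumptions based on how the views are used in this file.

    const int DIM = 2;

    RAJA::OffsetLayout<DIM> layout =
        RAJA::make_offset_layout<DIM>({{-1, -1}}, {{N_r+1, N_c+1}});

    RAJA::View<int, RAJA::OffsetLayout<DIM>> inputView(input, layout);
    RAJA::View<int, RAJA::OffsetLayout<DIM>> outputView(output, layout);

With this offset layout, inputView(0, 0) refers to the bottom-left interior cell and inputView(-1, -1) to the halo corner, so the stencil can reference its four neighbors directly without manual offset arithmetic.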
+// + // _offsetlayout_views_start + const int DIM = 2; + + RAJA::OffsetLayout layout = + RAJA::make_offset_layout({{-1, -1}}, {{N_r+1, N_c+1}}); + + RAJA::View> inputView(input, layout); + RAJA::View> outputView(output, layout); + // _offsetlayout_views_end + +// +// Create range segments used in kernels +// + // _offsetlayout_ranges_start + RAJA::TypedRangeSegment col_range(0, N_c); + RAJA::TypedRangeSegment row_range(0, N_r); + // _offsetlayout_ranges_end + +//----------------------------------------------------------------------------// + + std::cout << "\n Running five-point stencil (RAJA-Kernel sequential)...\n"; + + std::memset(output, 0, totCells * sizeof(int)); + + // _offsetlayout_rajaseq_start + using NESTED_EXEC_POL1 = + RAJA::KernelPolicy< + RAJA::statement::For<1, RAJA::loop_exec, // row + RAJA::statement::For<0, RAJA::loop_exec, // col + RAJA::statement::Lambda<0> + > + > + >; + + RAJA::kernel(RAJA::make_tuple(col_range, row_range), + [=](int col, int row) { + + outputView(row, col) = + inputView(row, col) + + inputView(row - 1, col) + + inputView(row + 1, col) + + inputView(row, col - 1) + + inputView(row, col + 1); + + }); + // _offsetlayout_rajaseq_end + + std::cout << "\noutput lattice:\n"; + printLattice(output, totCellsInRow, totCellsInCol); + checkResult(output, output_ref, totCells); + +//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_OPENMP) + + std::cout << "\n Running five-point stencil (RAJA-Kernel OpenMP)...\n"; + + std::memset(output, 0, totCells * sizeof(int)); + + /// + /// TODO... + /// + /// EXERCISE: Implement an OpenMP RAJA::kernel based version of the + /// the stencil operation where you collapse both loops to + /// parallelize the entire computation. Hint: recall the + /// kernelintro-execpols.cpp exercise file used in an + /// earlier tutorial section. 
+ /// + + std::cout << "\noutput lattice:\n"; + printLattice(output, totCellsInRow, totCellsInCol); + checkResult(output, output_ref, totCells); +#endif + +//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_CUDA) + + std::cout << "\n Running five-point stencil (RAJA-Kernel CUDA)...\n"; + + std::memset(output, 0, totCells * sizeof(int)); + + // _offsetlayout_rajacuda_start + using NESTED_EXEC_POL3 = + RAJA::KernelPolicy< + RAJA::statement::CudaKernel< + RAJA::statement::For<1, RAJA::cuda_block_x_loop, //row + RAJA::statement::For<0, RAJA::cuda_thread_x_loop, //col + RAJA::statement::Lambda<0> + > + > + > + >; + + RAJA::kernel(RAJA::make_tuple(col_range, row_range), + [=] RAJA_DEVICE(int col, int row) { + + outputView(row, col) = + inputView(row, col) + + inputView(row - 1, col) + + inputView(row + 1, col) + + inputView(row, col - 1) + + inputView(row, col + 1); + + }); + // _offsetlayout_rajacuda_end + + std::cout << "\noutput lattice:\n"; + printLattice(output, totCellsInRow, totCellsInCol); + checkResult(output, output_ref, totCells); +#endif + +//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_HIP) + + std::cout << "\n Running five-point stencil (RAJA-Kernel - " + "hip)...\n"; + + int* d_input = memoryManager::allocate_gpu(totCells * sizeof(int)); + int* d_output = memoryManager::allocate_gpu(totCells * sizeof(int)); + + hipErrchk(hipMemcpy( d_input, input, totCells * sizeof(int), hipMemcpyHostToDevice )); + + RAJA::View> d_inputView (d_input, layout); + RAJA::View> d_outputView(d_output, layout); + + // _offsetlayout_rajahip_start + using NESTED_EXEC_POL4 = + RAJA::KernelPolicy< + RAJA::statement::HipKernel< + RAJA::statement::For<1, RAJA::hip_block_x_loop, //row + RAJA::statement::For<0, RAJA::hip_thread_x_loop, //col + RAJA::statement::Lambda<0> + > + > + > + >; + + RAJA::kernel(RAJA::make_tuple(col_range, row_range), + [=] RAJA_DEVICE(int col, int row) { + + d_outputView(row, col) = + d_inputView(row, col) + + d_inputView(row - 1, col) + + d_inputView(row + 1, col) + + d_inputView(row, col - 1) + + d_inputView(row, col + 1); + }); + // _offsetlayout_rajahip_end + + hipErrchk(hipMemcpy( output, d_output, totCells * sizeof(int), hipMemcpyDeviceToHost )); + + std::cout << "\noutput lattice:\n"; + printLattice(output, totCellsInRow, totCellsInCol); + checkResult(output, output_ref, totCells); + + memoryManager::deallocate_gpu(d_input); + memoryManager::deallocate_gpu(d_output); +#endif + +//----------------------------------------------------------------------------// + +// +// Clean up. 
+// + memoryManager::deallocate(input); + memoryManager::deallocate(output); + memoryManager::deallocate(output_ref); + + std::cout << "\n DONE!...\n"; + return 0; +} + +// +// Print Lattice +// +void printLattice(int* lattice, int totCellsInRow, int totCellsInCol) +{ + std::cout << std::endl; + for (int row = 0; row < totCellsInRow; ++row) { + for (int col = 0; col < totCellsInCol; ++col) { + + const int id = col + totCellsInCol * row; + std::cout << lattice[id] << " "; + } + std::cout << " " << std::endl; + } + std::cout << std::endl; +} + +// +// Check Result +// +void checkResult(int* compLattice, int* refLattice, int totCells) +{ + bool correct = true; + + int i = 0; + while ( correct && (i < totCells) ) { + correct = (compLattice[i] == refLattice[i]); + i++; + } + + if ( correct ) { + std::cout << "\n\t result -- PASS\n"; + } else { + std::cout << "\n\t result -- FAIL\n"; + } +} diff --git a/examples/tut_offset-layout.cpp b/exercises/offset-layout-stencil_solution.cpp similarity index 67% rename from examples/tut_offset-layout.cpp rename to exercises/offset-layout-stencil_solution.cpp index f5d4befa3a..a9abc8bcb1 100644 --- a/examples/tut_offset-layout.cpp +++ b/exercises/offset-layout-stencil_solution.cpp @@ -14,27 +14,25 @@ #include "memoryManager.hpp" /* - * Offset Layout example + * Offset Layout Stencil Exercise * - * This example applies a five-cell stencil to the - * interior cells of a lattice and stores the - * resulting sums in a second lattice of equal size. + * This exercise applies a five-point stencil to the interior cells of a + * lattice and stores the resulting sums in a second lattice of equal size. + * You can think of the lattice as representing the centers of cells on a + * two-dimensional Cartesian mesh. * - * The five-cell stencil accumulates values of a cell - * and its four neighbors. Assuming the cells of a - * lattice may be accessed through a row/col fashion, - * the stencil may be expressed as the following sum + * The five-point stencil accumulates values of a cell and its four neighbors. + * Assuming the cells of a lattice may be accessed through a row/col fashion, + * the stencil may be expressed as the following sum: * * output(row, col) = input(row, col) + * input(row - 1, col) + input(row + 1, col) + * input(row, col - 1) + input(row, col + 1) * - * We assume a lattice has N x N interior nodes - * and a padded edge of zeros for a lattice - * of size (N_r + 2) x (N_c + 2). + * We assume a lattice has N_r x N_c interior nodes and a padded edge of zeros + * for a lattice of size (N_r + 2) x (N_c + 2). * - * In the case of N = 3, the input lattice generated - * takes the form + * In the case of N_r = N_c = 3, the input lattice values are: * * --------------------- * | 0 | 0 | 0 | 0 | 0 | @@ -48,8 +46,7 @@ * | 0 | 0 | 0 | 0 | 0 | * --------------------- * - * after the computation, we expect the output - * lattice to take the form + * after the computation, we expect the output lattice to have values: * * --------------------- * | 0 | 0 | 0 | 0 | 0 | @@ -63,13 +60,9 @@ * | 0 | 0 | 0 | 0 | 0 | * --------------------- * - * In this example, we use RAJA's make_offset_layout - * method and view object to simplify applying - * the stencil to interior cells. - * The make_offset_layout method enables developers - * to create layouts which offset - * the enumeration of values in an array. 
Here we - * choose to enumerate the lattice in the following manner: + * In this exercise, we use RAJA::OffsetLayout and RAJA::View objects to + * simplify the indexing to perform the stencil calculation. For the + * purposes of discussion, we enumerate the lattice in the following manner: * * -------------------------------------------------- * | (-1, 3) | (0, 3) | (1, 3) | (2, 3) | (3, 3) | @@ -83,18 +76,22 @@ * | (-1,-1) | (0, -1) | (1, -1) | (2, -1) | (3, -1) | * --------------------------------------------------- * - * Notably (0, 0) corresponds to the bottom left - * corner of the region to which we wish to apply stencil. + * Notably (0, 0) corresponds to the bottom left corner of the stencil + * interior region to which we apply stencil. * * RAJA features shown: - * - `forall` loop iteration template method - * - Offset-layouts for RAJA Views - * - Index range segment - * - Execution policies + * - RAJA::kernel kernel execution method and execution policies + * - RAJA::View + * - RAJA::OffsetLayout + * - RAJA::make_offset_layout method + * + * For the CUDA implementation, we use unified memory to hold the lattice data. + * For HIP, we use explicit host-device memory and manually copy data between + * the two. */ /* - * Define number of threads in x and y dimensions of a CUDA thread block + * Define number of threads in x and y dimensions of a GPU thread block */ #if defined(RAJA_ENABLE_CUDA) #define CUDA_BLOCK_SIZE 16 @@ -113,24 +110,26 @@ void checkResult(int* compLattice, int* refLattice, int totCells); int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { - std::cout << "\n\nRAJA five-cell stencil example...\n"; + std::cout << "\n\nFive-point stencil example...\n"; +// _stencil_define_start // // Define num of interior cells in row/cols in a lattice // - const int N_r = 3; - const int N_c = 3; + constexpr int N_r = 5; + constexpr int N_c = 4; // // Define total num of cells in rows/cols in a lattice // - const int totCellsInRow = N_r + 2; - const int totCellsInCol = N_c + 2; + constexpr int totCellsInRow = N_r + 2; + constexpr int totCellsInCol = N_c + 2; // // Define total num of cells in a lattice // - const int totCells = totCellsInRow * totCellsInCol; + constexpr int totCells = totCellsInRow * totCellsInCol; +// _stencil_define_end // // Allocate and initialize lattice @@ -146,17 +145,22 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) // // C-Style intialization // +// _stencil_input_init_start for (int row = 1; row <= N_r; ++row) { for (int col = 1; col <= N_c; ++col) { int id = col + totCellsInCol * row; input[id] = 1; } } -// printLattice(input, totCellsInRow, totCellsInCol); +// _stencil_input_init_end + + std::cout << "\ninput lattice:\n"; + printLattice(input, totCellsInRow, totCellsInCol); // // Generate reference solution // +// _stencil_output_ref_start for (int row = 1; row <= N_r; ++row) { for (int col = 1; col <= N_c; ++col) { @@ -167,7 +171,10 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) + input[id - totCellsInCol]; } } -// printLattice(output_ref, totCellsInRow, totCellsInCol); +// _stencil_output_ref_end + + std::cout << "\noutput reference lattice:\n"; + printLattice(output_ref, totCellsInRow, totCellsInCol); //----------------------------------------------------------------------------// @@ -184,31 +191,32 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) // _offsetlayout_views_start const int DIM = 2; - RAJA::OffsetLayout layout = - RAJA::make_offset_layout({{-1, -1}}, 
{{N_r+1, N_c+1}}); + RAJA::OffsetLayout layout = + RAJA::make_offset_layout({{-1, -1}}, {{N_r+1, N_c+1}}); - RAJA::View> inputView(input, layout); - RAJA::View> outputView(output, layout); + RAJA::View> inputView(input, layout); + RAJA::View> outputView(output, layout); // _offsetlayout_views_end // // Create range segments used in kernels // // _offsetlayout_ranges_start - RAJA::RangeSegment col_range(0, N_r); - RAJA::RangeSegment row_range(0, N_c); + RAJA::TypedRangeSegment col_range(0, N_c); + RAJA::TypedRangeSegment row_range(0, N_r); // _offsetlayout_ranges_end //----------------------------------------------------------------------------// - std::cout << "\n Running five-cell stencil (RAJA-Kernel - " - "sequential)...\n"; + std::cout << "\n Running five-point stencil (RAJA-Kernel sequential)...\n"; + + std::memset(output, 0, totCells * sizeof(int)); // _offsetlayout_rajaseq_start using NESTED_EXEC_POL1 = RAJA::KernelPolicy< - RAJA::statement::For<1, RAJA::seq_exec, // row - RAJA::statement::For<0, RAJA::seq_exec, // col + RAJA::statement::For<1, RAJA::loop_exec, // row + RAJA::statement::For<0, RAJA::loop_exec, // col RAJA::statement::Lambda<0> > > @@ -223,25 +231,28 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) + inputView(row + 1, col) + inputView(row, col - 1) + inputView(row, col + 1); + }); // _offsetlayout_rajaseq_end - //printLattice(output_ref, totCellsInRow, totCellsInCol); + std::cout << "\noutput lattice:\n"; + printLattice(output, totCellsInRow, totCellsInCol); checkResult(output, output_ref, totCells); //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) - std::cout << "\n Running five-cell stencil (RAJA-Kernel - omp " - "parallel for)...\n"; + std::cout << "\n Running five-point stencil (RAJA-Kernel OpenMP)...\n"; + + std::memset(output, 0, totCells * sizeof(int)); + // _offsetlayout_rajaomp_start using NESTED_EXEC_POL2 = RAJA::KernelPolicy< - RAJA::statement::For<1, RAJA::omp_parallel_for_exec, // row - RAJA::statement::For<0, RAJA::seq_exec, // col - RAJA::statement::Lambda<0> - > + RAJA::statement::Collapse, // row, col + RAJA::statement::Lambda<0> > >; @@ -254,9 +265,12 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) + inputView(row + 1, col) + inputView(row, col - 1) + inputView(row, col + 1); + }); + // _offsetlayout_rajaomp_end - //printLattice(output_ref, totCellsInRow, totCellsInCol); + std::cout << "\noutput lattice:\n"; + printLattice(output, totCellsInRow, totCellsInCol); checkResult(output, output_ref, totCells); #endif @@ -264,9 +278,11 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) #if defined(RAJA_ENABLE_CUDA) - std::cout << "\n Running five-cell stencil (RAJA-Kernel - " - "cuda)...\n"; + std::cout << "\n Running five-point stencil (RAJA-Kernel CUDA)...\n"; + + std::memset(output, 0, totCells * sizeof(int)); + // _offsetlayout_rajacuda_start using NESTED_EXEC_POL3 = RAJA::KernelPolicy< RAJA::statement::CudaKernel< @@ -287,9 +303,12 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) + inputView(row + 1, col) + inputView(row, col - 1) + inputView(row, col + 1); + }); + // _offsetlayout_rajacuda_end - //printLattice(output, totCellsInRow, totCellsInCol); + std::cout << "\noutput lattice:\n"; + printLattice(output, totCellsInRow, totCellsInCol); checkResult(output, output_ref, totCells); #endif @@ -297,18 +316,22 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) #if 
defined(RAJA_ENABLE_HIP) - std::cout << "\n Running five-cell stencil (RAJA-Kernel - " + std::cout << "\n Running five-point stencil (RAJA-Kernel - " "hip)...\n"; - int* d_input = memoryManager::allocate_gpu(totCells * sizeof(int)); - int* d_output = memoryManager::allocate_gpu(totCells * sizeof(int)); + std::memset(output, 0, totCells * sizeof(int)); + + int* d_input = memoryManager::allocate_gpu(totCells); + int* d_output = memoryManager::allocate_gpu(totCells); hipErrchk(hipMemcpy( d_input, input, totCells * sizeof(int), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy( d_output, output, totCells * sizeof(int), hipMemcpyHostToDevice )); - RAJA::View> d_inputView (d_input, layout); - RAJA::View> d_outputView(d_output, layout); + RAJA::View> d_inputView (d_input, layout); + RAJA::View> d_outputView(d_output, layout); - using NESTED_EXEC_POL3 = + // _offsetlayout_rajahip_start + using NESTED_EXEC_POL4 = RAJA::KernelPolicy< RAJA::statement::HipKernel< RAJA::statement::For<1, RAJA::hip_block_x_loop, //row @@ -319,7 +342,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) > >; - RAJA::kernel(RAJA::make_tuple(col_range, row_range), + RAJA::kernel(RAJA::make_tuple(col_range, row_range), [=] RAJA_DEVICE(int col, int row) { d_outputView(row, col) = @@ -329,10 +352,12 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) + d_inputView(row, col - 1) + d_inputView(row, col + 1); }); + // _offsetlayout_rajahip_end hipErrchk(hipMemcpy( output, d_output, totCells * sizeof(int), hipMemcpyDeviceToHost )); - //printLattice(output, totCellsInRow, totCellsInCol); + std::cout << "\noutput lattice:\n"; + printLattice(output, totCellsInRow, totCellsInCol); checkResult(output, output_ref, totCells); memoryManager::deallocate_gpu(d_input); @@ -374,14 +399,15 @@ void printLattice(int* lattice, int totCellsInRow, int totCellsInCol) // void checkResult(int* compLattice, int* refLattice, int totCells) { + bool correct = true; - bool pass = true; - - for (int i = 0; i < totCells; ++i) { - if (compLattice[i] != refLattice[i]) pass = false; + int i = 0; + while ( correct && (i < totCells) ) { + correct = (compLattice[i] == refLattice[i]); + i++; } - if (pass) { + if ( correct ) { std::cout << "\n\t result -- PASS\n"; } else { std::cout << "\n\t result -- FAIL\n"; diff --git a/exercises/permuted-layout-batch-matrix-multiply.cpp b/exercises/permuted-layout-batch-matrix-multiply.cpp new file mode 100644 index 0000000000..a1805c3e35 --- /dev/null +++ b/exercises/permuted-layout-batch-matrix-multiply.cpp @@ -0,0 +1,737 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-22, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include +#include +#include +#include + +#include "RAJA/RAJA.hpp" +#include "RAJA/util/Timer.hpp" + +#include "memoryManager.hpp" + +/* + * Batched Matrix Multiply Example + * + * This example performs batched matrix multiplication + * for matrices of dimension 3 x 3 using two different + * data layouts. + * + * Matrices are stored in arrays A and B. Results + * are stored in a third array, C. + * We introduce the notation A^{e}_rc + * to correspond to the matrix entry in the row, r, + * column, c, of matrix, e. Below we describe the two + * layouts for the case of two (N=2) 3 x 3 matrices. 
+ * + * Layout 1: + * Matrix entries are grouped together so that each + * matrix is in a row major ordering. + * i.e. A = [A^{0}_{00}, A^{0}_{01}, A^{0}_{02}, + * A^{0}_{10}, A^{0}_{11}, A^{0}_{12}, + * A^{0}_{20}, A^{0}_{21}, A^{0}_{22}, + * A^{1}_{00}, A^{1}_{01}, A^{1}_{02}, + * A^{1}_{10}, A^{1}_{11}, A^{1}_{12}, + * A^{1}_{20}, A^{1}_{21}, A^{1}_{22}]; + * + * Layout 2: + * Matrix entries are first ordered by matrix number, + * then by column number, and finally by row number. + * i.e. A = [A^{0}_{00}, A^{1}_{00}, A^{0}_{01}, + * A^{1}_{01}, A^{0}_{02}, A^{1}_{02}, + * A^{0}_{10}, A^{1}_{10}, A^{0}_{11}, + * A^{1}_{11}, A^{0}_{12}, A^{1}_{12}, + * A^{0}_{20}, A^{1}_{20}, A^{0}_{21}, + * A^{1}_{21}, A^{0}_{22}, A^{1}_{22}]; + * + * The extension to N > 2 matrices follows by direct + * extension. By exploring different data layouts, + * we can assess which performs best under a given + * execution policy and architecture. + * + * RAJA features shown: + * - RAJA::forall kernel execution method + * - RAJA::View + * - RAJA::Layout + * - RAJA::make_permuted_layout method + * + * If CUDA is enabled, CUDA unified memory is used. + */ + +/* + * Define number of threads in a GPU thread block + */ +#if defined(RAJA_ENABLE_CUDA) +constexpr int CUDA_BLOCK_SIZE = 256; +#endif + +#if defined(RAJA_ENABLE_HIP) +constexpr int HIP_BLOCK_SIZE = 256; +#endif + +// +//Function for checking results +// +template +void checkResult(T C, int nMat, int nRows, int nCols); + +int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +{ + + std::cout << "\n\nRAJA batched matrix multiplication exercise...\n"; + +// Dimensions of matrices + constexpr int N_c = 3; + constexpr int N_r = 3; + +// Number of matrices + constexpr int N = 8000000; + +// Number of iterations + constexpr int NITER = 20; + + std::cout << "\n Number of matrices to be multiplied: " << N << " \n \n"; + +// +// Initialize a RAJA timer object +// and variable to store minimum run time +// + auto timer = RAJA::Timer(); + double minRun = std::numeric_limits::max(); + +// +// Allocate space for data in layout 1 +// + double *A = memoryManager::allocate(N_c * N_r * N); + double *B = memoryManager::allocate(N_c * N_r * N); + double *C = memoryManager::allocate(N_c * N_r * N); + +// +// Layout 1 +// +// make_permuted_layout takes the number of entries in each dimension and a +// templated array indicating index arguments with slowest to fastest stride. +// Standard C++ arrays are used to hold the number of entries in each component. +// This example uses double braces to initalize the array and its subobjects. +// The layout object will index into the array as the following C macro would +// #define Aview(e, r, c) A[c + N_c*(r + N_r*e)]. +// +// RAJA::Layout objects may be templated on dimension, argument type, and +// index with unit stride. Here, the column index has unit stride (argument 2). 
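+//
+// (Illustrative aside: with perm {0, 1, 2} below, the strides for the
+// (e, r, c) indices are {N_r*N_c, N_c, 1}, so the layout can be checked
+// directly against the macro above, e.g.
+//
+//   assert( layout1(e, r, c) == c + N_c * (r + N_r * e) );
+//
+// for any in-range e, r, c. This is a sketch only, not part of the exercise
+// code, and would require including <cassert>.)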
+//
+  // _permutedlayout_defviews_start
+  std::array<RAJA::idx_t, 3> perm1 {{0, 1, 2}};
+  auto layout1 =
+      RAJA::make_permuted_layout( {{N, N_r, N_c}}, perm1 );
+
+  RAJA::View<double, RAJA::Layout<3, int, 2>> Aview(A, layout1);
+  RAJA::View<double, RAJA::Layout<3, int, 2>> Bview(B, layout1);
+  RAJA::View<double, RAJA::Layout<3, int, 2>> Cview(C, layout1);
+  // _permutedlayout_defviews_end
+
+//
+// Allocate space for data in layout 2
+//
+  double *A2 = memoryManager::allocate<double>(N_c * N_r * N);
+  double *B2 = memoryManager::allocate<double>(N_c * N_r * N);
+  double *C2 = memoryManager::allocate<double>(N_c * N_r * N);
+
+//
+// Permuted layout - equivalent to indexing using the following macro
+// #define Aview2(e, r, c) A2[e + N*(c + N_c*r)]
+// In this case the element index has unit stride (argument 0).
+//
+
+  ///
+  /// TODO...
+  ///
+  /// EXERCISE: Define a permuted layout object (layout2) with the appropriate
+  ///           permutation so that the matrix index has unit stride,
+  ///           the column index has stride N (the number of matrices),
+  ///           and the row index has stride N * N_c.
+  ///
+  ///           Then, create views for the A2, B2, C2 arrays using the
+  ///           layout object; i.e., Aview2, Bview2, and Cview2.
+  ///
+  ///           Hint: You will use the same indexing to access the array data
+  ///           via these Views as for the Views above, which are created
+  ///           using layout1 (see kernels in the code below).
+  ///
+  ///           When you are done with the Views, test them out by
+  ///           uncommenting the kernels in the code below that use the
+  ///           Aview2, Bview2, and Cview2 views.
+  ///
+
+//
+// Initialize data for layout 1 and layout 2 arrays/views.
+//
+// When OpenMP is enabled, we use an OpenMP exec policy for
+// "first touch" initialization.
+//
+#if defined(RAJA_ENABLE_OPENMP)
+  using INIT_POL = RAJA::omp_parallel_for_exec;
+#else
+  using INIT_POL = RAJA::loop_exec;
+#endif
+
+  RAJA::forall<INIT_POL>(RAJA::TypedRangeSegment<int>(0, N), [=](int e) {
+    for (int row = 0; row < N_r; ++row) {
+      for (int col = 0; col < N_c; ++col) {
+        Aview(e, row, col) = row;
+        Bview(e, row, col) = col;
+        Cview(e, row, col) = 0;
+
+//        Aview2(e, row, col) = row;
+//        Bview2(e, row, col) = col;
+//        Cview2(e, row, col) = 0;
+      }
+    }
+  });
+
+
+//----------------------------------------------------------------------------//
+
+  std::cout << " \n Running batched matrix multiplication"
+            << " with layout 1 (RAJA - sequential) ... 
" << std::endl; + + minRun = std::numeric_limits::max(); + for (int i = 0; i < NITER; ++i) { + + timer.start(); + // _permutedlayout_batchedmatmult_loop_start + RAJA::forall(RAJA::TypedRangeSegment(0, N), + [=](int e) { + + Cview(e, 0, 0) = Aview(e, 0, 0) * Bview(e, 0, 0) + + Aview(e, 0, 1) * Bview(e, 1, 0) + + Aview(e, 0, 2) * Bview(e, 2, 0); + Cview(e, 0, 1) = Aview(e, 0, 0) * Bview(e, 0, 1) + + Aview(e, 0, 1) * Bview(e, 1, 1) + + Aview(e, 0, 2) * Bview(e, 2, 1); + Cview(e, 0, 2) = Aview(e, 0, 0) * Bview(e, 0, 2) + + Aview(e, 0, 1) * Bview(e, 1, 2) + + Aview(e, 0, 2) * Bview(e, 2, 2); + + Cview(e, 1, 0) = Aview(e, 1, 0) * Bview(e, 0, 0) + + Aview(e, 1, 1) * Bview(e, 1, 0) + + Aview(e, 1, 2) * Bview(e, 2, 0); + Cview(e, 1, 1) = Aview(e, 1, 0) * Bview(e, 0, 1) + + Aview(e, 1, 1) * Bview(e, 1, 1) + + Aview(e, 1, 2) * Bview(e, 2, 1); + Cview(e, 1, 2) = Aview(e, 1, 0) * Bview(e, 0, 2) + + Aview(e, 1, 1) * Bview(e, 1, 2) + + Aview(e, 1, 2) * Bview(e, 2, 2); + + Cview(e, 2, 0) = Aview(e, 2, 0) * Bview(e, 0, 0) + + Aview(e, 2, 1) * Bview(e, 1, 0) + + Aview(e, 2, 2) * Bview(e, 2, 0); + Cview(e, 2, 1) = Aview(e, 2, 0) * Bview(e, 0, 1) + + Aview(e, 2, 1) * Bview(e, 1, 1) + + Aview(e, 2, 2) * Bview(e, 2, 1); + Cview(e, 2, 2) = Aview(e, 2, 0) * Bview(e, 0, 2) + + Aview(e, 2, 1) * Bview(e, 1, 2) + + Aview(e, 2, 2) * Bview(e, 2, 2); + } + ); + // _permutedlayout_batchedmatmult_loop_end + timer.stop(); + + RAJA::Timer::ElapsedType tMin = timer.elapsed(); + if (tMin < minRun) minRun = tMin; + timer.reset(); + } + + std::cout << "\trun time : " << minRun << " seconds" << std::endl; + checkResult(Cview, N, N_r, N_c); + +//----------------------------------------------------------------------------// + + std::cout << " \n Running batched matrix multiplication" + << " with layout 2 (RAJA - sequential) ... 
" << std::endl; + +/* + timer.start(); + minRun = std::numeric_limits::max(); + for (int i = 0; i < NITER; ++i) { + + // _permutedlayout2_batchedmatmult_loop_start + RAJA::forall(RAJA::TypedRangeSegment(0, N), + [=](int e) { + + Cview2(e, 0, 0) = Aview2(e, 0, 0) * Bview2(e, 0, 0) + + Aview2(e, 0, 1) * Bview2(e, 1, 0) + + Aview2(e, 0, 2) * Bview2(e, 2, 0); + Cview2(e, 0, 1) = Aview2(e, 0, 0) * Bview2(e, 0, 1) + + Aview2(e, 0, 1) * Bview2(e, 1, 1) + + Aview2(e, 0, 2) * Bview2(e, 2, 1); + Cview2(e, 0, 2) = Aview2(e, 0, 0) * Bview2(e, 0, 2) + + Aview2(e, 0, 1) * Bview2(e, 1, 2) + + Aview2(e, 0, 2) * Bview2(e, 2, 2); + + Cview2(e, 1, 0) = Aview2(e, 1, 0) * Bview2(e, 0, 0) + + Aview2(e, 1, 1) * Bview2(e, 1, 0) + + Aview2(e, 1, 2) * Bview2(e, 2, 0); + Cview2(e, 1, 1) = Aview2(e, 1, 0) * Bview2(e, 0, 1) + + Aview2(e, 1, 1) * Bview2(e, 1, 1) + + Aview2(e, 1, 2) * Bview2(e, 2, 1); + Cview2(e, 1, 2) = Aview2(e, 1, 0) * Bview2(e, 0, 2) + + Aview2(e, 1, 1) * Bview2(e, 1, 2) + + Aview2(e, 1, 2) * Bview2(e, 2, 2); + + Cview2(e, 2, 0) = Aview2(e, 2, 0) * Bview2(e, 0, 0) + + Aview2(e, 2, 1) * Bview2(e, 1, 0) + + Aview2(e, 2, 2) * Bview2(e, 2, 0); + Cview2(e, 2, 1) = Aview2(e, 2, 0) * Bview2(e, 0, 1) + + Aview2(e, 2, 1) * Bview2(e, 1, 1) + + Aview2(e, 2, 2) * Bview2(e, 2, 1); + Cview2(e, 2, 2) = Aview2(e, 2, 0) * Bview2(e, 0, 2) + + Aview2(e, 2, 1) * Bview2(e, 1, 2) + + Aview2(e, 2, 2) * Bview2(e, 2, 2); + + } + ); + // _permutedlayout2_batchedmatmult_loop_end + timer.stop(); + + RAJA::Timer::ElapsedType tMin = timer.elapsed(); + if (tMin < minRun) minRun = tMin; + timer.reset(); + } + std::cout<< "\trun time : "<< minRun << " seconds" << std::endl; + checkResult(Cview2, N, N_r, N_c); +*/ + +//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_OPENMP) + + std::cout << " \n Running batched matrix multiplication" + << " with layout 1 (RAJA - omp parallel for) ... 
" << std::endl; + + std::memset(C, 0, N_c * N_r * N * sizeof(double)); + + minRun = std::numeric_limits::max(); + for (int i = 0; i < NITER; ++i) { + + timer.start(); + // _permutedlayout_batchedmatmult_omp_start + RAJA::forall(RAJA::TypedRangeSegment(0, N), + [=](int e) { + + Cview(e, 0, 0) = Aview(e, 0, 0) * Bview(e, 0, 0) + + Aview(e, 0, 1) * Bview(e, 1, 0) + + Aview(e, 0, 2) * Bview(e, 2, 0); + Cview(e, 0, 1) = Aview(e, 0, 0) * Bview(e, 0, 1) + + Aview(e, 0, 1) * Bview(e, 1, 1) + + Aview(e, 0, 2) * Bview(e, 2, 1); + Cview(e, 0, 2) = Aview(e, 0, 0) * Bview(e, 0, 2) + + Aview(e, 0, 1) * Bview(e, 1, 2) + + Aview(e, 0, 2) * Bview(e, 2, 2); + + Cview(e, 1, 0) = Aview(e, 1, 0) * Bview(e, 0, 0) + + Aview(e, 1, 1) * Bview(e, 1, 0) + + Aview(e, 1, 2) * Bview(e, 2, 0); + Cview(e, 1, 1) = Aview(e, 1, 0) * Bview(e, 0, 1) + + Aview(e, 1, 1) * Bview(e, 1, 1) + + Aview(e, 1, 2) * Bview(e, 2, 1); + Cview(e, 1, 2) = Aview(e, 1, 0) * Bview(e, 0, 2) + + Aview(e, 1, 1) * Bview(e, 1, 2) + + Aview(e, 1, 2) * Bview(e, 2, 2); + + Cview(e, 2, 0) = Aview(e, 2, 0) * Bview(e, 0, 0) + + Aview(e, 2, 1) * Bview(e, 1, 0) + + Aview(e, 2, 2) * Bview(e, 2, 0); + Cview(e, 2, 1) = Aview(e, 2, 0) * Bview(e, 0, 1) + + Aview(e, 2, 1) * Bview(e, 1, 1) + + Aview(e, 2, 2) * Bview(e, 2, 1); + Cview(e, 2, 2) = Aview(e, 2, 0) * Bview(e, 0, 2) + + Aview(e, 2, 1) * Bview(e, 1, 2) + + Aview(e, 2, 2) * Bview(e, 2, 2); + + } + ); + // _permutedlayout_batchedmatmult_omp_end + timer.stop(); + + RAJA::Timer::ElapsedType tMin = timer.elapsed(); + if (tMin < minRun) minRun = tMin; + timer.reset(); + } + + std::cout<< "\trun time : " << minRun << " seconds" << std::endl; + checkResult(Cview, N, N_r, N_c); + +//----------------------------------------------------------------------------// + + std::cout << " \n Running batched matrix multiplication" + << " with layout 2 (RAJA - omp parallel for) ... 
" << std::endl; + + std::memset(C2, 0, N_c * N_r * N * sizeof(double)); + +/* + minRun = std::numeric_limits::max(); + for (int i = 0; i < NITER; ++i) { + + timer.start(); + RAJA::forall(RAJA::TypedRangeSegment(0, N), + [=](int e) { + + Cview2(e, 0, 0) = Aview2(e, 0, 0) * Bview2(e, 0, 0) + + Aview2(e, 0, 1) * Bview2(e, 1, 0) + + Aview2(e, 0, 2) * Bview2(e, 2, 0); + Cview2(e, 0, 1) = Aview2(e, 0, 0) * Bview2(e, 0, 1) + + Aview2(e, 0, 1) * Bview2(e, 1, 1) + + Aview2(e, 0, 2) * Bview2(e, 2, 1); + Cview2(e, 0, 2) = Aview2(e, 0, 0) * Bview2(e, 0, 2) + + Aview2(e, 0, 1) * Bview2(e, 1, 2) + + Aview2(e, 0, 2) * Bview2(e, 2, 2); + + Cview2(e, 1, 0) = Aview2(e, 1, 0) * Bview2(e, 0, 0) + + Aview2(e, 1, 1) * Bview2(e, 1, 0) + + Aview2(e, 1, 2) * Bview2(e, 2, 0); + Cview2(e, 1, 1) = Aview2(e, 1, 0) * Bview2(e, 0, 1) + + Aview2(e, 1, 1) * Bview2(e, 1, 1) + + Aview2(e, 1, 2) * Bview2(e, 2, 1); + Cview2(e, 1, 2) = Aview2(e, 1, 0) * Bview2(e, 0, 2) + + Aview2(e, 1, 1) * Bview2(e, 1, 2) + + Aview2(e, 1, 2) * Bview2(e, 2, 2); + + Cview2(e, 2, 0) = Aview2(e, 2, 0) * Bview2(e, 0, 0) + + Aview2(e, 2, 1) * Bview2(e, 1, 0) + + Aview2(e, 2, 2) * Bview2(e, 2, 0); + Cview2(e, 2, 1) = Aview2(e, 2, 0) * Bview2(e, 0, 1) + + Aview2(e, 2, 1) * Bview2(e, 1, 1) + + Aview2(e, 2, 2) * Bview2(e, 2, 1); + Cview2(e, 2, 2) = Aview2(e, 2, 0) * Bview2(e, 0, 2) + + Aview2(e, 2, 1) * Bview2(e, 1, 2) + + Aview2(e, 2, 2) * Bview2(e, 2, 2); + + } + ); + timer.stop(); + + RAJA::Timer::ElapsedType tMin = timer.elapsed(); + if (tMin < minRun) minRun = tMin; + timer.reset(); + } + std::cout<< "\trun time : " << minRun << " seconds" << std::endl; + checkResult(Cview2, N, N_r, N_c); +*/ + +#endif + + +//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_CUDA) + + std::cout << " \n Running batched matrix multiplication" + << " with layout 1 (RAJA - cuda) ... 
" << std::endl; + + std::memset(C, 0, N_c * N_r * N * sizeof(double)); + + minRun = std::numeric_limits::max(); + for (int i = 0; i < NITER; ++i) { + + timer.start(); + RAJA::forall>(RAJA::TypedRangeSegment(0, N), + [=] RAJA_DEVICE(int e) { + + Cview(e, 0, 0) = Aview(e, 0, 0) * Bview(e, 0, 0) + + Aview(e, 0, 1) * Bview(e, 1, 0) + + Aview(e, 0, 2) * Bview(e, 2, 0); + Cview(e, 0, 1) = Aview(e, 0, 0) * Bview(e, 0, 1) + + Aview(e, 0, 1) * Bview(e, 1, 1) + + Aview(e, 0, 2) * Bview(e, 2, 1); + Cview(e, 0, 2) = Aview(e, 0, 0) * Bview(e, 0, 2) + + Aview(e, 0, 1) * Bview(e, 1, 2) + + Aview(e, 0, 2) * Bview(e, 2, 2); + + Cview(e, 1, 0) = Aview(e, 1, 0) * Bview(e, 0, 0) + + Aview(e, 1, 1) * Bview(e, 1, 0) + + Aview(e, 1, 2) * Bview(e, 2, 0); + Cview(e, 1, 1) = Aview(e, 1, 0) * Bview(e, 0, 1) + + Aview(e, 1, 1) * Bview(e, 1, 1) + + Aview(e, 1, 2) * Bview(e, 2, 1); + Cview(e, 1, 2) = Aview(e, 1, 0) * Bview(e, 0, 2) + + Aview(e, 1, 1) * Bview(e, 1, 2) + + Aview(e, 1, 2) * Bview(e, 2, 2); + + Cview(e, 2, 0) = Aview(e, 2, 0) * Bview(e, 0, 0) + + Aview(e, 2, 1) * Bview(e, 1, 0) + + Aview(e, 2, 2) * Bview(e, 2, 0); + Cview(e, 2, 1) = Aview(e, 2, 0) * Bview(e, 0, 1) + + Aview(e, 2, 1) * Bview(e, 1, 1) + + Aview(e, 2, 2) * Bview(e, 2, 1); + Cview(e, 2, 2) = Aview(e, 2, 0) * Bview(e, 0, 2) + + Aview(e, 2, 1) * Bview(e, 1, 2) + + Aview(e, 2, 2) * Bview(e, 2, 2); + + } + ); + timer.stop(); + + RAJA::Timer::ElapsedType tMin = timer.elapsed(); + if (tMin < minRun) minRun = tMin; + timer.reset(); + } + + std::cout<< "\trun time: "<< minRun << " seconds" << std::endl; + checkResult(Cview, N, N_r, N_c); + +//----------------------------------------------------------------------------// + + std::cout << " \n Running batched matrix multiplication" + << " with layout 2 (RAJA - cuda) ... 
" << std::endl; + + std::memset(C2, 0, N_c * N_r * N * sizeof(double)); + +/* + minRun = std::numeric_limits::max(); + for (int i = 0; i < NITER; ++i) { + + timer.start(); + RAJA::forall>(RAJA::TypedRangeSegment(0, N), + [=] RAJA_DEVICE(int e) { + + Cview2(e, 0, 0) = Aview2(e, 0, 0) * Bview2(e, 0, 0) + + Aview2(e, 0, 1) * Bview2(e, 1, 0) + + Aview2(e, 0, 2) * Bview2(e, 2, 0); + Cview2(e, 0, 1) = Aview2(e, 0, 0) * Bview2(e, 0, 1) + + Aview2(e, 0, 1) * Bview2(e, 1, 1) + + Aview2(e, 0, 2) * Bview2(e, 2, 1); + Cview2(e, 0, 2) = Aview2(e, 0, 0) * Bview2(e, 0, 2) + + Aview2(e, 0, 1) * Bview2(e, 1, 2) + + Aview2(e, 0, 2) * Bview2(e, 2, 2); + + Cview2(e, 1, 0) = Aview2(e, 1, 0) * Bview2(e, 0, 0) + + Aview2(e, 1, 1) * Bview2(e, 1, 0) + + Aview2(e, 1, 2) * Bview2(e, 2, 0); + Cview2(e, 1, 1) = Aview2(e, 1, 0) * Bview2(e, 0, 1) + + Aview2(e, 1, 1) * Bview2(e, 1, 1) + + Aview2(e, 1, 2) * Bview2(e, 2, 1); + Cview2(e, 1, 2) = Aview2(e, 1, 0) * Bview2(e, 0, 2) + + Aview2(e, 1, 1) * Bview2(e, 1, 2) + + Aview2(e, 1, 2) * Bview2(e, 2, 2); + + Cview2(e, 2, 0) = Aview2(e, 2, 0) * Bview2(e, 0, 0) + + Aview2(e, 2, 1) * Bview2(e, 1, 0) + + Aview2(e, 2, 2) * Bview2(e, 2, 0); + Cview2(e, 2, 1) = Aview2(e, 2, 0) * Bview2(e, 0, 1) + + Aview2(e, 2, 1) * Bview2(e, 1, 1) + + Aview2(e, 2, 2) * Bview2(e, 2, 1); + Cview2(e, 2, 2) = Aview2(e, 2, 0) * Bview2(e, 0, 2) + + Aview2(e, 2, 1) * Bview2(e, 1, 2) + + Aview2(e, 2, 2) * Bview2(e, 2, 2); + + } + ); + timer.stop(); + + RAJA::Timer::ElapsedType tMin = timer.elapsed(); + if (tMin < minRun) minRun = tMin; + timer.reset(); + } + std::cout<< "\trun time : "<< minRun << " seconds" << std::endl; + checkResult(Cview2, N, N_r, N_c); +*/ +#endif + +//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_HIP) + + std::cout << " \n Running batched matrix multiplication" + << " with layout 1 (RAJA - hip) ... 
" << std::endl; + + double *d_A = memoryManager::allocate_gpu(N_c * N_r * N); + double *d_B = memoryManager::allocate_gpu(N_c * N_r * N); + double *d_C = memoryManager::allocate_gpu(N_c * N_r * N); + + RAJA::View> d_Aview(d_A, layout1); + RAJA::View> d_Bview(d_B, layout1); + RAJA::View> d_Cview(d_C, layout1); + + hipErrchk(hipMemcpy( d_A, A, N_c * N_r * N * sizeof(double), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy( d_B, B, N_c * N_r * N * sizeof(double), hipMemcpyHostToDevice )); + + minRun = std::numeric_limits::max(); + for (int i = 0; i < NITER; ++i) { + + timer.start(); + RAJA::forall>(RAJA::TypedRangeSegment(0, N), + [=] RAJA_DEVICE(int e) { + + d_Cview(e, 0, 0) = d_Aview(e, 0, 0) * d_Bview(e, 0, 0) + + d_Aview(e, 0, 1) * d_Bview(e, 1, 0) + + d_Aview(e, 0, 2) * d_Bview(e, 2, 0); + d_Cview(e, 0, 1) = d_Aview(e, 0, 0) * d_Bview(e, 0, 1) + + d_Aview(e, 0, 1) * d_Bview(e, 1, 1) + + d_Aview(e, 0, 2) * d_Bview(e, 2, 1); + d_Cview(e, 0, 2) = d_Aview(e, 0, 0) * d_Bview(e, 0, 2) + + d_Aview(e, 0, 1) * d_Bview(e, 1, 2) + + d_Aview(e, 0, 2) * d_Bview(e, 2, 2); + + d_Cview(e, 1, 0) = d_Aview(e, 1, 0) * d_Bview(e, 0, 0) + + d_Aview(e, 1, 1) * d_Bview(e, 1, 0) + + d_Aview(e, 1, 2) * d_Bview(e, 2, 0); + d_Cview(e, 1, 1) = d_Aview(e, 1, 0) * d_Bview(e, 0, 1) + + d_Aview(e, 1, 1) * d_Bview(e, 1, 1) + + d_Aview(e, 1, 2) * d_Bview(e, 2, 1); + d_Cview(e, 1, 2) = d_Aview(e, 1, 0) * d_Bview(e, 0, 2) + + d_Aview(e, 1, 1) * d_Bview(e, 1, 2) + + d_Aview(e, 1, 2) * d_Bview(e, 2, 2); + + d_Cview(e, 2, 0) = d_Aview(e, 2, 0) * d_Bview(e, 0, 0) + + d_Aview(e, 2, 1) * d_Bview(e, 1, 0) + + d_Aview(e, 2, 2) * d_Bview(e, 2, 0); + d_Cview(e, 2, 1) = d_Aview(e, 2, 0) * d_Bview(e, 0, 1) + + d_Aview(e, 2, 1) * d_Bview(e, 1, 1) + + d_Aview(e, 2, 2) * d_Bview(e, 2, 1); + d_Cview(e, 2, 2) = d_Aview(e, 2, 0) * d_Bview(e, 0, 2) + + d_Aview(e, 2, 1) * d_Bview(e, 1, 2) + + d_Aview(e, 2, 2) * d_Bview(e, 2, 2); + + } + ); + timer.stop(); + + RAJA::Timer::ElapsedType tMin = timer.elapsed(); + if (tMin < minRun) minRun = tMin; + timer.reset(); + } + + hipErrchk(hipMemcpy( C, d_C, N_c * N_r * N * sizeof(double), hipMemcpyDeviceToHost )); + + std::cout<< "\trun time: "<< minRun << " seconds" << std::endl; + checkResult(Cview, N, N_r, N_c); + +// +// Clean up. +// + memoryManager::deallocate_gpu(d_A); + memoryManager::deallocate_gpu(d_B); + memoryManager::deallocate_gpu(d_C); + +//----------------------------------------------------------------------------// + + std::cout << " \n Running batched matrix multiplication" + << " with layout 2 (RAJA - hip) ... 
" << std::endl; + +/* + double *d_A2 = memoryManager::allocate_gpu(N_c * N_r * N); + double *d_B2 = memoryManager::allocate_gpu(N_c * N_r * N); + double *d_C2 = memoryManager::allocate_gpu(N_c * N_r * N); + + RAJA::View> d_Aview2(d_A2, layout2); + RAJA::View> d_Bview2(d_B2, layout2); + RAJA::View> d_Cview2(d_C2, layout2); + + hipErrchk(hipMemcpy( d_A2, A2, N_c * N_r * N * sizeof(double), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy( d_B2, B2, N_c * N_r * N * sizeof(double), hipMemcpyHostToDevice )); + + minRun = std::numeric_limits::max(); + for (int i = 0; i < NITER; ++i) { + + timer.start(); + RAJA::forall>(RAJA::TypedRangeSegment(0, N), + [=] RAJA_DEVICE(int e) { + + d_Cview2(e, 0, 0) = d_Aview2(e, 0, 0) * d_Bview2(e, 0, 0) + + d_Aview2(e, 0, 1) * d_Bview2(e, 1, 0) + + d_Aview2(e, 0, 2) * d_Bview2(e, 2, 0); + d_Cview2(e, 0, 1) = d_Aview2(e, 0, 0) * d_Bview2(e, 0, 1) + + d_Aview2(e, 0, 1) * d_Bview2(e, 1, 1) + + d_Aview2(e, 0, 2) * d_Bview2(e, 2, 1); + d_Cview2(e, 0, 2) = d_Aview2(e, 0, 0) * d_Bview2(e, 0, 2) + + d_Aview2(e, 0, 1) * d_Bview2(e, 1, 2) + + d_Aview2(e, 0, 2) * d_Bview2(e, 2, 2); + + d_Cview2(e, 1, 0) = d_Aview2(e, 1, 0) * d_Bview2(e, 0, 0) + + d_Aview2(e, 1, 1) * d_Bview2(e, 1, 0) + + d_Aview2(e, 1, 2) * d_Bview2(e, 2, 0); + d_Cview2(e, 1, 1) = d_Aview2(e, 1, 0) * d_Bview2(e, 0, 1) + + d_Aview2(e, 1, 1) * d_Bview2(e, 1, 1) + + d_Aview2(e, 1, 2) * d_Bview2(e, 2, 1); + d_Cview2(e, 1, 2) = d_Aview2(e, 1, 0) * d_Bview2(e, 0, 2) + + d_Aview2(e, 1, 1) * d_Bview2(e, 1, 2) + + d_Aview2(e, 1, 2) * d_Bview2(e, 2, 2); + + d_Cview2(e, 2, 0) = d_Aview2(e, 2, 0) * d_Bview2(e, 0, 0) + + d_Aview2(e, 2, 1) * d_Bview2(e, 1, 0) + + d_Aview2(e, 2, 2) * d_Bview2(e, 2, 0); + d_Cview2(e, 2, 1) = d_Aview2(e, 2, 0) * d_Bview2(e, 0, 1) + + d_Aview2(e, 2, 1) * d_Bview2(e, 1, 1) + + d_Aview2(e, 2, 2) * d_Bview2(e, 2, 1); + d_Cview2(e, 2, 2) = d_Aview2(e, 2, 0) * d_Bview2(e, 0, 2) + + d_Aview2(e, 2, 1) * d_Bview2(e, 1, 2) + + d_Aview2(e, 2, 2) * d_Bview2(e, 2, 2); + + } + ); + timer.stop(); + + RAJA::Timer::ElapsedType tMin = timer.elapsed(); + if (tMin < minRun) minRun = tMin; + timer.reset(); + } + + hipErrchk(hipMemcpy( C2, d_C2, N_c * N_r * N * sizeof(double), hipMemcpyDeviceToHost )); + + std::cout<< "\trun time : "<< minRun << " seconds" << std::endl; + checkResult(Cview2, N, N_r, N_c); + +// +// Clean up. +// + memoryManager::deallocate_gpu(d_A2); + memoryManager::deallocate_gpu(d_B2); + memoryManager::deallocate_gpu(d_C2); +*/ +#endif + +//----------------------------------------------------------------------------// + +// +// Clean up. 
+// + memoryManager::deallocate(A); + memoryManager::deallocate(B); + memoryManager::deallocate(C); + memoryManager::deallocate(A2); + memoryManager::deallocate(B2); + memoryManager::deallocate(C2); + + std::cout << "\n DONE!...\n"; + return 0; +} + +// +// check result +// +template +void checkResult(T C, int nMat, int nRows, int nCols) +{ + + bool status = true; + for (int e = 0; e < nMat; ++e) { + for (int row = 0; row < nRows; ++row) { + for (int col = 0; col < nCols; ++col) { + if (std::abs(C(e, row, col) - row * col * nCols) > 10e-12) { + status = false; + } + } + } + } + + if ( status ) { + std::cout << "\tresult -- PASS\n"; + } else { + std::cout << "\tresult -- FAIL\n"; + } +} diff --git a/exercises/permuted-layout-batch-matrix-multiply_solution.cpp b/exercises/permuted-layout-batch-matrix-multiply_solution.cpp new file mode 100644 index 0000000000..0cdb06d1b0 --- /dev/null +++ b/exercises/permuted-layout-batch-matrix-multiply_solution.cpp @@ -0,0 +1,711 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-22, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include +#include +#include +#include + +#include "RAJA/RAJA.hpp" +#include "RAJA/util/Timer.hpp" + + +#include "memoryManager.hpp" + +/* + * Batched Matrix Multiply Example + * + * This example performs batched matrix multiplication + * for matrices of dimension 3 x 3 using two different + * data layouts. + * + * Matrices are stored in arrays A and B. Results + * are stored in a third array, C. + * We introduce the notation A^{e}_rc + * to correspond to the matrix entry in the row, r, + * column, c, of matrix, e. Below we describe the two + * layouts for the case of two (N=2) 3 x 3 matrices. + * + * Layout 1: + * Matrix entries are grouped together so that each + * matrix is in a row major ordering. + * i.e. A = [A^{0}_{00}, A^{0}_{01}, A^{0}_{02}, + * A^{0}_{10}, A^{0}_{11}, A^{0}_{12}, + * A^{0}_{20}, A^{0}_{21}, A^{0}_{22}, + * A^{1}_{00}, A^{1}_{01}, A^{1}_{02}, + * A^{1}_{10}, A^{1}_{11}, A^{1}_{12}, + * A^{1}_{20}, A^{1}_{21}, A^{1}_{22}]; + * + * Layout 2: + * Matrix entries are first ordered by matrix number, + * then by column number, and finally by row number. + * i.e. A = [A^{0}_{00}, A^{1}_{00}, A^{0}_{01}, + * A^{1}_{01}, A^{0}_{02}, A^{1}_{02}, + * A^{0}_{10}, A^{1}_{10}, A^{0}_{11}, + * A^{1}_{11}, A^{0}_{12}, A^{1}_{12}, + * A^{0}_{20}, A^{1}_{20}, A^{0}_{21}, + * A^{1}_{21}, A^{0}_{22}, A^{1}_{22}]; + * + * The extension to N > 2 matrices follows by direct + * extension. By exploring different data layouts, + * we can assess which performs best under a given + * execution policy and architecture. + * + * RAJA features shown: + * - RAJA::forall kernel execution method + * - RAJA::View + * - RAJA::Layout + * - RAJA::make_permuted_layout method + * + * If CUDA is enabled, CUDA unified memory is used. 
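+ *
+ * For Layout 2 the permutation used below is {1, 2, 0}, which gives the
+ * matrix (element) index unit stride. As a sketch (with the template
+ * arguments written out), the layout and one of its views look like:
+ *
+ *   std::array<RAJA::idx_t, 3> perm2 {{1, 2, 0}};
+ *   auto layout2 = RAJA::make_permuted_layout( {{N, N_r, N_c}}, perm2 );
+ *   RAJA::View<double, RAJA::Layout<3, int, 0>> Aview2(A2, layout2);
+ *
+ * which indexes the data as A2[e + N*(c + N_c*r)].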
+ */ + +/* + * Define number of threads in a GPU thread block + */ +#if defined(RAJA_ENABLE_CUDA) +constexpr int CUDA_BLOCK_SIZE = 256; +#endif + +#if defined(RAJA_ENABLE_HIP) +constexpr int HIP_BLOCK_SIZE = 256; +#endif + +// +//Function for checking results +// +template +void checkResult(T C, int nMat, int nRows, int nCols); + +int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +{ + + std::cout << "\n\nRAJA batched matrix multiplication exercise...\n"; + +// Dimensions of matrices + constexpr int N_c = 3; + constexpr int N_r = 3; + +// Number of matrices + constexpr int N = 8000000; + +// Number of iterations + constexpr int NITER = 20; + + std::cout << "\n Number of matrices to be multiplied: " << N << " \n \n"; + +// +// Initialize a RAJA timer object +// and variable to store minimum run time +// + auto timer = RAJA::Timer(); + double minRun = std::numeric_limits::max(); + +// +// Allocate space for data in layout 1 +// + double *A = memoryManager::allocate(N_c * N_r * N); + double *B = memoryManager::allocate(N_c * N_r * N); + double *C = memoryManager::allocate(N_c * N_r * N); + +// +// Layout 1 +// +// make_permuted_layout takes the number of entries in each dimension and a +// templated array indicating index arguments with slowest to fastest stride. +// Standard C++ arrays are used to hold the number of entries in each component. +// This example uses double braces to initalize the array and its subobjects. +// The layout object will index into the array as the following C macro would +// #define Aview(e, r, c) A[c + N_c*(r + N_r*e)]. +// +// RAJA::Layout objects may be templated on dimension, argument type, and +// index with unit stride. Here, the column index has unit stride (argument 2). +// + // _permutedlayout_defviews_start + std::array perm1 {{0, 1, 2}}; + auto layout1 = + RAJA::make_permuted_layout( {{N, N_r, N_c}}, perm1 ); + + RAJA::View> Aview(A, layout1); + RAJA::View> Bview(B, layout1); + RAJA::View> Cview(C, layout1); + // _permutedlayout_defviews_end + +// +// Allocate space for data in layout 2 +// + double *A2 = memoryManager::allocate(N_c * N_r * N); + double *B2 = memoryManager::allocate(N_c * N_r * N); + double *C2 = memoryManager::allocate(N_c * N_r * N); + +// +// Permuted layout - equivalent to indexing using the following macro +// #define Aview2(e, r, c) A2[e + N*(c + N_c*r)] +// In this case the element index has unit stride (argument 0). +// + // _permutedlayout_permviews_start + std::array perm2 {{1, 2, 0}}; + auto layout2 = + RAJA::make_permuted_layout( {{N, N_r, N_c}}, perm2 ); + + RAJA::View> Aview2(A2, layout2); + RAJA::View> Bview2(B2, layout2); + RAJA::View> Cview2(C2, layout2); + // _permutedlayout_permviews_end + +// +// Initialize data for layout 1 and layout 2 arrays/views. +// +// When OpenMP is enabled, we use an OpenMP exec policy for +// "first touch" initialization. +// +#if defined(RAJA_ENABLE_OPENMP) + using INIT_POL = RAJA::omp_parallel_for_exec; +#else + using INIT_POL = RAJA::loop_exec; +#endif + + RAJA::forall(RAJA::TypedRangeSegment(0, N), [=](int e) { + for (int row = 0; row < N_r; ++row) { + for (int col = 0; col < N_c; ++col) { + Aview(e, row, col) = row; + Bview(e, row, col) = col; + Cview(e, row, col) = 0; + + Aview2(e, row, col) = row; + Bview2(e, row, col) = col; + Cview2(e, row, col) = 0; + } + } + }); + + +//----------------------------------------------------------------------------// + + std::cout << " \n Running batched matrix multiplication" + << " with layout 1 (RAJA - sequential) ... 
" << std::endl; + + minRun = std::numeric_limits::max(); + for (int i = 0; i < NITER; ++i) { + + timer.start(); + // _permutedlayout_batchedmatmult_loop_start + RAJA::forall(RAJA::TypedRangeSegment(0, N), + [=](int e) { + + Cview(e, 0, 0) = Aview(e, 0, 0) * Bview(e, 0, 0) + + Aview(e, 0, 1) * Bview(e, 1, 0) + + Aview(e, 0, 2) * Bview(e, 2, 0); + Cview(e, 0, 1) = Aview(e, 0, 0) * Bview(e, 0, 1) + + Aview(e, 0, 1) * Bview(e, 1, 1) + + Aview(e, 0, 2) * Bview(e, 2, 1); + Cview(e, 0, 2) = Aview(e, 0, 0) * Bview(e, 0, 2) + + Aview(e, 0, 1) * Bview(e, 1, 2) + + Aview(e, 0, 2) * Bview(e, 2, 2); + + Cview(e, 1, 0) = Aview(e, 1, 0) * Bview(e, 0, 0) + + Aview(e, 1, 1) * Bview(e, 1, 0) + + Aview(e, 1, 2) * Bview(e, 2, 0); + Cview(e, 1, 1) = Aview(e, 1, 0) * Bview(e, 0, 1) + + Aview(e, 1, 1) * Bview(e, 1, 1) + + Aview(e, 1, 2) * Bview(e, 2, 1); + Cview(e, 1, 2) = Aview(e, 1, 0) * Bview(e, 0, 2) + + Aview(e, 1, 1) * Bview(e, 1, 2) + + Aview(e, 1, 2) * Bview(e, 2, 2); + + Cview(e, 2, 0) = Aview(e, 2, 0) * Bview(e, 0, 0) + + Aview(e, 2, 1) * Bview(e, 1, 0) + + Aview(e, 2, 2) * Bview(e, 2, 0); + Cview(e, 2, 1) = Aview(e, 2, 0) * Bview(e, 0, 1) + + Aview(e, 2, 1) * Bview(e, 1, 1) + + Aview(e, 2, 2) * Bview(e, 2, 1); + Cview(e, 2, 2) = Aview(e, 2, 0) * Bview(e, 0, 2) + + Aview(e, 2, 1) * Bview(e, 1, 2) + + Aview(e, 2, 2) * Bview(e, 2, 2); + } + ); + // _permutedlayout_batchedmatmult_loop_end + timer.stop(); + + RAJA::Timer::ElapsedType tMin = timer.elapsed(); + if (tMin < minRun) minRun = tMin; + timer.reset(); + } + + std::cout << "\trun time : " << minRun << " seconds" << std::endl; + checkResult(Cview, N, N_r, N_c); + +//----------------------------------------------------------------------------// + + std::cout << " \n Running batched matrix multiplication" + << " with layout 2 (RAJA - sequential) ... 
" << std::endl; + + minRun = std::numeric_limits::max(); + for (int i = 0; i < NITER; ++i) { + + timer.start(); + // _permutedlayout2_batchedmatmult_loop_start + RAJA::forall(RAJA::TypedRangeSegment(0, N), + [=](int e) { + + Cview2(e, 0, 0) = Aview2(e, 0, 0) * Bview2(e, 0, 0) + + Aview2(e, 0, 1) * Bview2(e, 1, 0) + + Aview2(e, 0, 2) * Bview2(e, 2, 0); + Cview2(e, 0, 1) = Aview2(e, 0, 0) * Bview2(e, 0, 1) + + Aview2(e, 0, 1) * Bview2(e, 1, 1) + + Aview2(e, 0, 2) * Bview2(e, 2, 1); + Cview2(e, 0, 2) = Aview2(e, 0, 0) * Bview2(e, 0, 2) + + Aview2(e, 0, 1) * Bview2(e, 1, 2) + + Aview2(e, 0, 2) * Bview2(e, 2, 2); + + Cview2(e, 1, 0) = Aview2(e, 1, 0) * Bview2(e, 0, 0) + + Aview2(e, 1, 1) * Bview2(e, 1, 0) + + Aview2(e, 1, 2) * Bview2(e, 2, 0); + Cview2(e, 1, 1) = Aview2(e, 1, 0) * Bview2(e, 0, 1) + + Aview2(e, 1, 1) * Bview2(e, 1, 1) + + Aview2(e, 1, 2) * Bview2(e, 2, 1); + Cview2(e, 1, 2) = Aview2(e, 1, 0) * Bview2(e, 0, 2) + + Aview2(e, 1, 1) * Bview2(e, 1, 2) + + Aview2(e, 1, 2) * Bview2(e, 2, 2); + + Cview2(e, 2, 0) = Aview2(e, 2, 0) * Bview2(e, 0, 0) + + Aview2(e, 2, 1) * Bview2(e, 1, 0) + + Aview2(e, 2, 2) * Bview2(e, 2, 0); + Cview2(e, 2, 1) = Aview2(e, 2, 0) * Bview2(e, 0, 1) + + Aview2(e, 2, 1) * Bview2(e, 1, 1) + + Aview2(e, 2, 2) * Bview2(e, 2, 1); + Cview2(e, 2, 2) = Aview2(e, 2, 0) * Bview2(e, 0, 2) + + Aview2(e, 2, 1) * Bview2(e, 1, 2) + + Aview2(e, 2, 2) * Bview2(e, 2, 2); + + } + ); + // _permutedlayout2_batchedmatmult_loop_end + timer.stop(); + + RAJA::Timer::ElapsedType tMin = timer.elapsed(); + if (tMin < minRun) minRun = tMin; + timer.reset(); + } + std::cout<< "\trun time : "<< minRun << " seconds" << std::endl; + checkResult(Cview2, N, N_r, N_c); + +//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_OPENMP) + + std::cout << " \n Running batched matrix multiplication" + << " with layout 1 (RAJA - omp parallel for) ... 
" << std::endl; + + std::memset(C, 0, N_c * N_r * N * sizeof(double)); + + minRun = std::numeric_limits::max(); + for (int i = 0; i < NITER; ++i) { + + timer.start(); + // _permutedlayout_batchedmatmult_omp_start + RAJA::forall(RAJA::TypedRangeSegment(0, N), + [=](int e) { + + Cview(e, 0, 0) = Aview(e, 0, 0) * Bview(e, 0, 0) + + Aview(e, 0, 1) * Bview(e, 1, 0) + + Aview(e, 0, 2) * Bview(e, 2, 0); + Cview(e, 0, 1) = Aview(e, 0, 0) * Bview(e, 0, 1) + + Aview(e, 0, 1) * Bview(e, 1, 1) + + Aview(e, 0, 2) * Bview(e, 2, 1); + Cview(e, 0, 2) = Aview(e, 0, 0) * Bview(e, 0, 2) + + Aview(e, 0, 1) * Bview(e, 1, 2) + + Aview(e, 0, 2) * Bview(e, 2, 2); + + Cview(e, 1, 0) = Aview(e, 1, 0) * Bview(e, 0, 0) + + Aview(e, 1, 1) * Bview(e, 1, 0) + + Aview(e, 1, 2) * Bview(e, 2, 0); + Cview(e, 1, 1) = Aview(e, 1, 0) * Bview(e, 0, 1) + + Aview(e, 1, 1) * Bview(e, 1, 1) + + Aview(e, 1, 2) * Bview(e, 2, 1); + Cview(e, 1, 2) = Aview(e, 1, 0) * Bview(e, 0, 2) + + Aview(e, 1, 1) * Bview(e, 1, 2) + + Aview(e, 1, 2) * Bview(e, 2, 2); + + Cview(e, 2, 0) = Aview(e, 2, 0) * Bview(e, 0, 0) + + Aview(e, 2, 1) * Bview(e, 1, 0) + + Aview(e, 2, 2) * Bview(e, 2, 0); + Cview(e, 2, 1) = Aview(e, 2, 0) * Bview(e, 0, 1) + + Aview(e, 2, 1) * Bview(e, 1, 1) + + Aview(e, 2, 2) * Bview(e, 2, 1); + Cview(e, 2, 2) = Aview(e, 2, 0) * Bview(e, 0, 2) + + Aview(e, 2, 1) * Bview(e, 1, 2) + + Aview(e, 2, 2) * Bview(e, 2, 2); + + } + ); + // _permutedlayout_batchedmatmult_omp_end + timer.stop(); + + RAJA::Timer::ElapsedType tMin = timer.elapsed(); + if (tMin < minRun) minRun = tMin; + timer.reset(); + } + + std::cout<< "\trun time : " << minRun << " seconds" << std::endl; + checkResult(Cview, N, N_r, N_c); + +//----------------------------------------------------------------------------// + + std::cout << " \n Running batched matrix multiplication" + << " with layout 2 (RAJA - omp parallel for) ... 
" << std::endl; + + std::memset(C2, 0, N_c * N_r * N * sizeof(double)); + + minRun = std::numeric_limits::max(); + for (int i = 0; i < NITER; ++i) { + + timer.start(); + RAJA::forall(RAJA::TypedRangeSegment(0, N), + [=](int e) { + + Cview2(e, 0, 0) = Aview2(e, 0, 0) * Bview2(e, 0, 0) + + Aview2(e, 0, 1) * Bview2(e, 1, 0) + + Aview2(e, 0, 2) * Bview2(e, 2, 0); + Cview2(e, 0, 1) = Aview2(e, 0, 0) * Bview2(e, 0, 1) + + Aview2(e, 0, 1) * Bview2(e, 1, 1) + + Aview2(e, 0, 2) * Bview2(e, 2, 1); + Cview2(e, 0, 2) = Aview2(e, 0, 0) * Bview2(e, 0, 2) + + Aview2(e, 0, 1) * Bview2(e, 1, 2) + + Aview2(e, 0, 2) * Bview2(e, 2, 2); + + Cview2(e, 1, 0) = Aview2(e, 1, 0) * Bview2(e, 0, 0) + + Aview2(e, 1, 1) * Bview2(e, 1, 0) + + Aview2(e, 1, 2) * Bview2(e, 2, 0); + Cview2(e, 1, 1) = Aview2(e, 1, 0) * Bview2(e, 0, 1) + + Aview2(e, 1, 1) * Bview2(e, 1, 1) + + Aview2(e, 1, 2) * Bview2(e, 2, 1); + Cview2(e, 1, 2) = Aview2(e, 1, 0) * Bview2(e, 0, 2) + + Aview2(e, 1, 1) * Bview2(e, 1, 2) + + Aview2(e, 1, 2) * Bview2(e, 2, 2); + + Cview2(e, 2, 0) = Aview2(e, 2, 0) * Bview2(e, 0, 0) + + Aview2(e, 2, 1) * Bview2(e, 1, 0) + + Aview2(e, 2, 2) * Bview2(e, 2, 0); + Cview2(e, 2, 1) = Aview2(e, 2, 0) * Bview2(e, 0, 1) + + Aview2(e, 2, 1) * Bview2(e, 1, 1) + + Aview2(e, 2, 2) * Bview2(e, 2, 1); + Cview2(e, 2, 2) = Aview2(e, 2, 0) * Bview2(e, 0, 2) + + Aview2(e, 2, 1) * Bview2(e, 1, 2) + + Aview2(e, 2, 2) * Bview2(e, 2, 2); + + } + ); + timer.stop(); + + RAJA::Timer::ElapsedType tMin = timer.elapsed(); + if (tMin < minRun) minRun = tMin; + timer.reset(); + } + std::cout<< "\trun time : " << minRun << " seconds" << std::endl; + checkResult(Cview2, N, N_r, N_c); + +#endif + + +//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_CUDA) + + std::cout << " \n Running batched matrix multiplication" + << " with layout 1 (RAJA - cuda) ... 
" << std::endl; + + std::memset(C, 0, N_c * N_r * N * sizeof(double)); + + minRun = std::numeric_limits::max(); + for (int i = 0; i < NITER; ++i) { + + timer.start(); + RAJA::forall>(RAJA::TypedRangeSegment(0, N), + [=] RAJA_DEVICE(int e) { + + Cview(e, 0, 0) = Aview(e, 0, 0) * Bview(e, 0, 0) + + Aview(e, 0, 1) * Bview(e, 1, 0) + + Aview(e, 0, 2) * Bview(e, 2, 0); + Cview(e, 0, 1) = Aview(e, 0, 0) * Bview(e, 0, 1) + + Aview(e, 0, 1) * Bview(e, 1, 1) + + Aview(e, 0, 2) * Bview(e, 2, 1); + Cview(e, 0, 2) = Aview(e, 0, 0) * Bview(e, 0, 2) + + Aview(e, 0, 1) * Bview(e, 1, 2) + + Aview(e, 0, 2) * Bview(e, 2, 2); + + Cview(e, 1, 0) = Aview(e, 1, 0) * Bview(e, 0, 0) + + Aview(e, 1, 1) * Bview(e, 1, 0) + + Aview(e, 1, 2) * Bview(e, 2, 0); + Cview(e, 1, 1) = Aview(e, 1, 0) * Bview(e, 0, 1) + + Aview(e, 1, 1) * Bview(e, 1, 1) + + Aview(e, 1, 2) * Bview(e, 2, 1); + Cview(e, 1, 2) = Aview(e, 1, 0) * Bview(e, 0, 2) + + Aview(e, 1, 1) * Bview(e, 1, 2) + + Aview(e, 1, 2) * Bview(e, 2, 2); + + Cview(e, 2, 0) = Aview(e, 2, 0) * Bview(e, 0, 0) + + Aview(e, 2, 1) * Bview(e, 1, 0) + + Aview(e, 2, 2) * Bview(e, 2, 0); + Cview(e, 2, 1) = Aview(e, 2, 0) * Bview(e, 0, 1) + + Aview(e, 2, 1) * Bview(e, 1, 1) + + Aview(e, 2, 2) * Bview(e, 2, 1); + Cview(e, 2, 2) = Aview(e, 2, 0) * Bview(e, 0, 2) + + Aview(e, 2, 1) * Bview(e, 1, 2) + + Aview(e, 2, 2) * Bview(e, 2, 2); + + } + ); + timer.stop(); + + RAJA::Timer::ElapsedType tMin = timer.elapsed(); + if (tMin < minRun) minRun = tMin; + timer.reset(); + } + + std::cout<< "\trun time: "<< minRun << " seconds" << std::endl; + checkResult(Cview, N, N_r, N_c); + +//----------------------------------------------------------------------------// + + std::cout << " \n Running batched matrix multiplication" + << " with layout 2 (RAJA - cuda) ... 
" << std::endl; + + std::memset(C2, 0, N_c * N_r * N * sizeof(double)); + + minRun = std::numeric_limits::max(); + for (int i = 0; i < NITER; ++i) { + + timer.start(); + RAJA::forall>(RAJA::TypedRangeSegment(0, N), + [=] RAJA_DEVICE(int e) { + + Cview2(e, 0, 0) = Aview2(e, 0, 0) * Bview2(e, 0, 0) + + Aview2(e, 0, 1) * Bview2(e, 1, 0) + + Aview2(e, 0, 2) * Bview2(e, 2, 0); + Cview2(e, 0, 1) = Aview2(e, 0, 0) * Bview2(e, 0, 1) + + Aview2(e, 0, 1) * Bview2(e, 1, 1) + + Aview2(e, 0, 2) * Bview2(e, 2, 1); + Cview2(e, 0, 2) = Aview2(e, 0, 0) * Bview2(e, 0, 2) + + Aview2(e, 0, 1) * Bview2(e, 1, 2) + + Aview2(e, 0, 2) * Bview2(e, 2, 2); + + Cview2(e, 1, 0) = Aview2(e, 1, 0) * Bview2(e, 0, 0) + + Aview2(e, 1, 1) * Bview2(e, 1, 0) + + Aview2(e, 1, 2) * Bview2(e, 2, 0); + Cview2(e, 1, 1) = Aview2(e, 1, 0) * Bview2(e, 0, 1) + + Aview2(e, 1, 1) * Bview2(e, 1, 1) + + Aview2(e, 1, 2) * Bview2(e, 2, 1); + Cview2(e, 1, 2) = Aview2(e, 1, 0) * Bview2(e, 0, 2) + + Aview2(e, 1, 1) * Bview2(e, 1, 2) + + Aview2(e, 1, 2) * Bview2(e, 2, 2); + + Cview2(e, 2, 0) = Aview2(e, 2, 0) * Bview2(e, 0, 0) + + Aview2(e, 2, 1) * Bview2(e, 1, 0) + + Aview2(e, 2, 2) * Bview2(e, 2, 0); + Cview2(e, 2, 1) = Aview2(e, 2, 0) * Bview2(e, 0, 1) + + Aview2(e, 2, 1) * Bview2(e, 1, 1) + + Aview2(e, 2, 2) * Bview2(e, 2, 1); + Cview2(e, 2, 2) = Aview2(e, 2, 0) * Bview2(e, 0, 2) + + Aview2(e, 2, 1) * Bview2(e, 1, 2) + + Aview2(e, 2, 2) * Bview2(e, 2, 2); + + } + ); + timer.stop(); + + RAJA::Timer::ElapsedType tMin = timer.elapsed(); + if (tMin < minRun) minRun = tMin; + timer.reset(); + } + std::cout<< "\trun time : "<< minRun << " seconds" << std::endl; + checkResult(Cview2, N, N_r, N_c); +#endif + +//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_HIP) + + std::cout << " \n Running batched matrix multiplication" + << " with layout 1 (RAJA - hip) ... 
" << std::endl; + + double *d_A = memoryManager::allocate_gpu(N_c * N_r * N); + double *d_B = memoryManager::allocate_gpu(N_c * N_r * N); + double *d_C = memoryManager::allocate_gpu(N_c * N_r * N); + + double *d_A2 = memoryManager::allocate_gpu(N_c * N_r * N); + double *d_B2 = memoryManager::allocate_gpu(N_c * N_r * N); + double *d_C2 = memoryManager::allocate_gpu(N_c * N_r * N); + + RAJA::View> d_Aview(d_A, layout1); + RAJA::View> d_Bview(d_B, layout1); + RAJA::View> d_Cview(d_C, layout1); + + RAJA::View> d_Aview2(d_A2, layout2); + RAJA::View> d_Bview2(d_B2, layout2); + RAJA::View> d_Cview2(d_C2, layout2); + + hipErrchk(hipMemcpy( d_A, A, N_c * N_r * N * sizeof(double), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy( d_B, B, N_c * N_r * N * sizeof(double), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy( d_A2, A2, N_c * N_r * N * sizeof(double), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy( d_B2, B2, N_c * N_r * N * sizeof(double), hipMemcpyHostToDevice )); + + minRun = std::numeric_limits::max(); + for (int i = 0; i < NITER; ++i) { + + timer.start(); + RAJA::forall>(RAJA::TypedRangeSegment(0, N), + [=] RAJA_DEVICE(int e) { + + d_Cview(e, 0, 0) = d_Aview(e, 0, 0) * d_Bview(e, 0, 0) + + d_Aview(e, 0, 1) * d_Bview(e, 1, 0) + + d_Aview(e, 0, 2) * d_Bview(e, 2, 0); + d_Cview(e, 0, 1) = d_Aview(e, 0, 0) * d_Bview(e, 0, 1) + + d_Aview(e, 0, 1) * d_Bview(e, 1, 1) + + d_Aview(e, 0, 2) * d_Bview(e, 2, 1); + d_Cview(e, 0, 2) = d_Aview(e, 0, 0) * d_Bview(e, 0, 2) + + d_Aview(e, 0, 1) * d_Bview(e, 1, 2) + + d_Aview(e, 0, 2) * d_Bview(e, 2, 2); + + d_Cview(e, 1, 0) = d_Aview(e, 1, 0) * d_Bview(e, 0, 0) + + d_Aview(e, 1, 1) * d_Bview(e, 1, 0) + + d_Aview(e, 1, 2) * d_Bview(e, 2, 0); + d_Cview(e, 1, 1) = d_Aview(e, 1, 0) * d_Bview(e, 0, 1) + + d_Aview(e, 1, 1) * d_Bview(e, 1, 1) + + d_Aview(e, 1, 2) * d_Bview(e, 2, 1); + d_Cview(e, 1, 2) = d_Aview(e, 1, 0) * d_Bview(e, 0, 2) + + d_Aview(e, 1, 1) * d_Bview(e, 1, 2) + + d_Aview(e, 1, 2) * d_Bview(e, 2, 2); + + d_Cview(e, 2, 0) = d_Aview(e, 2, 0) * d_Bview(e, 0, 0) + + d_Aview(e, 2, 1) * d_Bview(e, 1, 0) + + d_Aview(e, 2, 2) * d_Bview(e, 2, 0); + d_Cview(e, 2, 1) = d_Aview(e, 2, 0) * d_Bview(e, 0, 1) + + d_Aview(e, 2, 1) * d_Bview(e, 1, 1) + + d_Aview(e, 2, 2) * d_Bview(e, 2, 1); + d_Cview(e, 2, 2) = d_Aview(e, 2, 0) * d_Bview(e, 0, 2) + + d_Aview(e, 2, 1) * d_Bview(e, 1, 2) + + d_Aview(e, 2, 2) * d_Bview(e, 2, 2); + + } + ); + timer.stop(); + + RAJA::Timer::ElapsedType tMin = timer.elapsed(); + if (tMin < minRun) minRun = tMin; + timer.reset(); + } + + hipErrchk(hipMemcpy( C, d_C, N_c * N_r * N * sizeof(double), hipMemcpyDeviceToHost )); + + std::cout<< "\trun time: "<< minRun << " seconds" << std::endl; + checkResult(Cview, N, N_r, N_c); + +//----------------------------------------------------------------------------// + + std::cout << " \n Running batched matrix multiplication" + << " with layout 2 (RAJA - hip) ... 
" << std::endl; + + minRun = std::numeric_limits::max(); + for (int i = 0; i < NITER; ++i) { + + timer.start(); + RAJA::forall>(RAJA::TypedRangeSegment(0, N), + [=] RAJA_DEVICE(int e) { + + d_Cview2(e, 0, 0) = d_Aview2(e, 0, 0) * d_Bview2(e, 0, 0) + + d_Aview2(e, 0, 1) * d_Bview2(e, 1, 0) + + d_Aview2(e, 0, 2) * d_Bview2(e, 2, 0); + d_Cview2(e, 0, 1) = d_Aview2(e, 0, 0) * d_Bview2(e, 0, 1) + + d_Aview2(e, 0, 1) * d_Bview2(e, 1, 1) + + d_Aview2(e, 0, 2) * d_Bview2(e, 2, 1); + d_Cview2(e, 0, 2) = d_Aview2(e, 0, 0) * d_Bview2(e, 0, 2) + + d_Aview2(e, 0, 1) * d_Bview2(e, 1, 2) + + d_Aview2(e, 0, 2) * d_Bview2(e, 2, 2); + + d_Cview2(e, 1, 0) = d_Aview2(e, 1, 0) * d_Bview2(e, 0, 0) + + d_Aview2(e, 1, 1) * d_Bview2(e, 1, 0) + + d_Aview2(e, 1, 2) * d_Bview2(e, 2, 0); + d_Cview2(e, 1, 1) = d_Aview2(e, 1, 0) * d_Bview2(e, 0, 1) + + d_Aview2(e, 1, 1) * d_Bview2(e, 1, 1) + + d_Aview2(e, 1, 2) * d_Bview2(e, 2, 1); + d_Cview2(e, 1, 2) = d_Aview2(e, 1, 0) * d_Bview2(e, 0, 2) + + d_Aview2(e, 1, 1) * d_Bview2(e, 1, 2) + + d_Aview2(e, 1, 2) * d_Bview2(e, 2, 2); + + d_Cview2(e, 2, 0) = d_Aview2(e, 2, 0) * d_Bview2(e, 0, 0) + + d_Aview2(e, 2, 1) * d_Bview2(e, 1, 0) + + d_Aview2(e, 2, 2) * d_Bview2(e, 2, 0); + d_Cview2(e, 2, 1) = d_Aview2(e, 2, 0) * d_Bview2(e, 0, 1) + + d_Aview2(e, 2, 1) * d_Bview2(e, 1, 1) + + d_Aview2(e, 2, 2) * d_Bview2(e, 2, 1); + d_Cview2(e, 2, 2) = d_Aview2(e, 2, 0) * d_Bview2(e, 0, 2) + + d_Aview2(e, 2, 1) * d_Bview2(e, 1, 2) + + d_Aview2(e, 2, 2) * d_Bview2(e, 2, 2); + + } + ); + timer.stop(); + + RAJA::Timer::ElapsedType tMin = timer.elapsed(); + if (tMin < minRun) minRun = tMin; + timer.reset(); + } + + hipErrchk(hipMemcpy( C2, d_C2, N_c * N_r * N * sizeof(double), hipMemcpyDeviceToHost )); + + std::cout<< "\trun time : "<< minRun << " seconds" << std::endl; + checkResult(Cview2, N, N_r, N_c); + + memoryManager::deallocate_gpu(d_A); + memoryManager::deallocate_gpu(d_B); + memoryManager::deallocate_gpu(d_C); + memoryManager::deallocate_gpu(d_A2); + memoryManager::deallocate_gpu(d_B2); + memoryManager::deallocate_gpu(d_C2); +#endif + +//----------------------------------------------------------------------------// + +// +// Clean up. +// + memoryManager::deallocate(A); + memoryManager::deallocate(B); + memoryManager::deallocate(C); + memoryManager::deallocate(A2); + memoryManager::deallocate(B2); + memoryManager::deallocate(C2); + + std::cout << "\n DONE!...\n"; + return 0; +} + +// +// check result +// +template +void checkResult(T C, int nMat, int nRows, int nCols) +{ + + bool status = true; + for (int e = 0; e < nMat; ++e) { + for (int row = 0; row < nRows; ++row) { + for (int col = 0; col < nCols; ++col) { + if (std::abs(C(e, row, col) - row * col * nCols) > 10e-12) { + status = false; + } + } + } + } + + if ( status ) { + std::cout << "\tresult -- PASS\n"; + } else { + std::cout << "\tresult -- FAIL\n"; + } +} diff --git a/exercises/reductions.cpp b/exercises/reductions.cpp new file mode 100644 index 0000000000..40d58c287b --- /dev/null +++ b/exercises/reductions.cpp @@ -0,0 +1,289 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-22, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include +#include +#include + +#include "memoryManager.hpp" + +#include "RAJA/RAJA.hpp" + +/* + * Reduction Example + * + * This example illustrates use of the RAJA reduction types: min, max, + * sum, min-loc, and max-loc. + * + * RAJA features shown: + * - `forall` loop iteration template method + * - Index range segment + * - Execution policies + * - Reduction types + * + * If CUDA is enabled, CUDA unified memory is used. + */ + +/* + Specify the number of threads in a GPU thread block +*/ +#if defined(RAJA_ENABLE_CUDA) +//constexpr int CUDA_BLOCK_SIZE = 256; +#endif + +#if defined(RAJA_ENABLE_HIP) +constexpr int HIP_BLOCK_SIZE = 256; +#endif + +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) +{ + + std::cout << "\n\nRAJA reductions example...\n"; + + // _reductions_array_init_start +// +// Define array length +// + constexpr int N = 1000000; + +// +// Allocate array data and initialize data to alternating sequence of 1, -1. +// + int* a = memoryManager::allocate(N); + + for (int i = 0; i < N; ++i) { + if ( i % 2 == 0 ) { + a[i] = 1; + } else { + a[i] = -1; + } + } + +// +// Set min and max loc values +// + constexpr int minloc_ref = N / 2; + a[minloc_ref] = -100; + + constexpr int maxloc_ref = N / 2 + 1; + a[maxloc_ref] = 100; + // _reductions_array_init_end + +// +// Note: with this data initialization scheme, the following results will +// be observed for all reduction kernels below: +// +// - the sum will be zero +// - the min will be -100 +// - the max will be 100 +// - the min loc will be N/2 +// - the max loc will be N/2 + 1 +// +// + +// +// Define index range for iterating over a elements in all examples +// + // _reductions_range_start +//RAJA::TypedRangeSegment arange(0, N); + // _reductions_range_end + +//----------------------------------------------------------------------------// + + std::cout << "\n Running RAJA sequential reductions...\n"; + + /// + /// TODO... + /// + /// EXERCISE: Define EXEC_POL1 and REDCUE_POL1 for executing sequentially. + /// + + /// TODO... + /// + /// EXERCISE: Remove comments for remainder of sequential section. + /// + /// Uncomment 'arange' variable above so it can be used in kernel. + /// + /* + RAJA::ReduceSum seq_sum(0); + RAJA::ReduceMin seq_min(std::numeric_limits::max()); + RAJA::ReduceMax seq_max(std::numeric_limits::min()); + RAJA::ReduceMinLoc seq_minloc(std::numeric_limits::max(), -1); + RAJA::ReduceMaxLoc seq_maxloc(std::numeric_limits::min(), -1); + + RAJA::forall(arange, [=](int i) { + + seq_sum += a[i]; + + seq_min.min(a[i]); + seq_max.max(a[i]); + + seq_minloc.minloc(a[i], i); + seq_maxloc.maxloc(a[i], i); + + }); + + std::cout << "\tsum = " << seq_sum.get() << std::endl; + std::cout << "\tmin = " << seq_min.get() << std::endl; + std::cout << "\tmax = " << seq_max.get() << std::endl; + std::cout << "\tmin, loc = " << seq_minloc.get() << " , " + << seq_minloc.getLoc() << std::endl; + std::cout << "\tmax, loc = " << seq_maxloc.get() << " , " + << seq_maxloc.getLoc() << std::endl; + */ + + +//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_OPENMP) + std::cout << "\n Running RAJA OpenMP reductions...\n"; + + // _reductions_raja_omppolicy_start + /* + using EXEC_POL2 = RAJA::omp_parallel_for_exec; + using REDUCE_POL2 = RAJA::omp_reduce; + */ + // _reductions_raja_omppolicy_end + + /// + /// TODO... 
+ /// + /// EXERCISE: Define Reduce(Sum, Min, Max, MinLoc, MaxLoc) to complete this exercise. + /// + /// Uncomment 'arange' variable above so it can be used in kernel. + /// + + /// TODO... + /// + /// EXERCISE: Remove comments for remainder of openmp section and uncomment + /// policy types above to use in kernel. + /// + /* + RAJA::forall(arange, [=](int i) { + + omp_sum += a[i]; + + omp_min.min(a[i]); + omp_max.max(a[i]); + + omp_minloc.minloc(a[i], i); + omp_maxloc.maxloc(a[i], i); + + }); + + std::cout << "\tsum = " << omp_sum.get() << std::endl; + std::cout << "\tmin = " << omp_min.get() << std::endl; + std::cout << "\tmax = " << omp_max.get() << std::endl; + std::cout << "\tmin, loc = " << omp_minloc.get() << " , " + << omp_minloc.getLoc() << std::endl; + std::cout << "\tmax, loc = " << omp_maxloc.get() << " , " + << omp_maxloc.getLoc() << std::endl; + */ +#endif + + +//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_CUDA) + std::cout << "\n Running RAJA CUDA reductions...\n"; + + /// + /// TODO... + /// + /// EXERCISE: Define EXEC_POL3 and REDCUE_POL3 for executing on a CUDA device. + /// + + /// + /// TODO... + /// + /// EXERCISE: Define Reduce(Sum, Min, Max, MinLoc, MaxLoc) to complete this exercise. + /// + /// Uncomment 'arange' variable above so it can be used in kernel. + /// + + /// TODO... + /// + /// EXERCISE: Remove comments for remainder of cuda section. + /// + /* + RAJA::forall(arange, [=] RAJA_DEVICE (int i) { + + cuda_sum += a[i]; + + cuda_min.min(a[i]); + cuda_max.max(a[i]); + + cuda_minloc.minloc(a[i], i); + cuda_maxloc.maxloc(a[i], i); + + }); + + std::cout << "\tsum = " << cuda_sum.get() << std::endl; + std::cout << "\tmin = " << cuda_min.get() << std::endl; + std::cout << "\tmax = " << cuda_max.get() << std::endl; + std::cout << "\tmin, loc = " << cuda_minloc.get() << " , " + << cuda_minloc.getLoc() << std::endl; + std::cout << "\tmax, loc = " << cuda_maxloc.get() << " , " + << cuda_maxloc.getLoc() << std::endl; + */ +#endif + +//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_HIP) + std::cout << "\n Running RAJA HIP reductions...\n"; + + RAJA::TypedRangeSegment arange1(0, N); + + int* d_a = memoryManager::allocate_gpu(N); + hipErrchk(hipMemcpy( d_a, a, N * sizeof(int), hipMemcpyHostToDevice )); + + // _reductions_raja_hippolicy_start + using EXEC_POL3 = RAJA::hip_exec; + using REDUCE_POL3 = RAJA::hip_reduce; + // _reductions_raja_hippolicy_end + + RAJA::ReduceSum hip_sum(0); + RAJA::ReduceMin hip_min(std::numeric_limits::max()); + RAJA::ReduceMax hip_max(std::numeric_limits::min()); + RAJA::ReduceMinLoc hip_minloc(std::numeric_limits::max(), -1); + RAJA::ReduceMaxLoc hip_maxloc(std::numeric_limits::min(), -1); + + RAJA::forall(arange1, [=] RAJA_DEVICE (int i) { + + hip_sum += d_a[i]; + + hip_min.min(d_a[i]); + hip_max.max(d_a[i]); + + hip_minloc.minloc(d_a[i], i); + hip_maxloc.maxloc(d_a[i], i); + + }); + + std::cout << "\tsum = " << hip_sum.get() << std::endl; + std::cout << "\tmin = " << hip_min.get() << std::endl; + std::cout << "\tmax = " << hip_max.get() << std::endl; + std::cout << "\tmin, loc = " << hip_minloc.get() << " , " + << hip_minloc.getLoc() << std::endl; + std::cout << "\tmax, loc = " << hip_maxloc.get() << " , " + << hip_maxloc.getLoc() << std::endl; + + memoryManager::deallocate_gpu(d_a); +#endif + +//----------------------------------------------------------------------------// + +// +// Clean up. 
+// + memoryManager::deallocate(a); + + std::cout << "\n DONE!...\n"; + + return 0; +} diff --git a/examples/tut_reductions.cpp b/exercises/reductions_solution.cpp similarity index 96% rename from examples/tut_reductions.cpp rename to exercises/reductions_solution.cpp index aa35f44e8f..068e8a0986 100644 --- a/examples/tut_reductions.cpp +++ b/exercises/reductions_solution.cpp @@ -29,14 +29,14 @@ */ /* - CUDA_BLOCK_SIZE - specifies the number of threads in a CUDA thread block + Specify the number of threads in a GPU thread block */ #if defined(RAJA_ENABLE_CUDA) -const int CUDA_BLOCK_SIZE = 256; +constexpr int CUDA_BLOCK_SIZE = 256; #endif #if defined(RAJA_ENABLE_HIP) -const int HIP_BLOCK_SIZE = 256; +constexpr int HIP_BLOCK_SIZE = 256; #endif int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) @@ -48,7 +48,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) // // Define array length // - const int N = 1000000; + constexpr int N = 1000000; // // Allocate array data and initialize data to alternating sequence of 1, -1. @@ -66,10 +66,10 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) // // Set min and max loc values // - const int minloc_ref = N / 2; + constexpr int minloc_ref = N / 2; a[minloc_ref] = -100; - const int maxloc_ref = N / 2 + 1; + constexpr int maxloc_ref = N / 2 + 1; a[maxloc_ref] = 100; // _reductions_array_init_end @@ -89,7 +89,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) // Define index range for iterating over a elements in all examples // // _reductions_range_start - RAJA::RangeSegment arange(0, N); + RAJA::TypedRangeSegment arange(0, N); // _reductions_range_end //----------------------------------------------------------------------------// diff --git a/exercises/scan.cpp b/exercises/scan.cpp new file mode 100644 index 0000000000..e00b80d330 --- /dev/null +++ b/exercises/scan.cpp @@ -0,0 +1,409 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-22, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#define OP_PLUS_INT RAJA::operators::plus +#define OP_MIN_INT RAJA::operators::minimum +#define OP_MAX_INT RAJA::operators::maximum +#define CHECK_INC_SCAN_RESULTS(X) checkInclusiveScanResult(in, out, N); +#define CHECK_EXC_SCAN_RESULTS(X) checkExclusiveScanResult(in, out, N); + +#include +#include +#include +#include + +#include "memoryManager.hpp" + +#include "RAJA/RAJA.hpp" + +/* + * Scan Exercise + * + * This exercise demonstrates RAJA inclusive and exclusive scan operations + * for integer arrays, including in-place, using different operators. + * Other array data types, operators, etc. are similar + * + * RAJA features shown: + * - `RAJA::inclusive_scan` and `RAJA::inclusive_scan_inplace` methods + * - `RAJA::exclusive_scan` and `RAJA::exclusive_scan_inplace` methods + * - RAJA operators + * - Execution policies + * + * If CUDA is enabled, CUDA unified memory is used. 
+ */ + +/* + Specify the number of threads in a GPU thread block +*/ +#if defined(RAJA_ENABLE_CUDA) +//constexpr int CUDA_BLOCK_SIZE = 16; +#endif + +#if defined(RAJA_ENABLE_HIP) +//constexpr int HIP_BLOCK_SIZE = 16; +#endif + +// +// Functions for checking results and printing vectors +// +template +void checkInclusiveScanResult(const T* in, const T* out, int N); +// +template +void checkExclusiveScanResult(const T* in, const T* out, int N); +// +template +void printArray(const T* v, int N); + + +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) +{ + + std::cout << "\n\nRAJA scan example...\n"; + + // _scan_array_init_start +// +// Define array length +// + constexpr int N = 20; + +// +// Allocate and initialize vector data +// + int* in = memoryManager::allocate(N); + int* out = memoryManager::allocate(N); + + std::iota(in, in + N, -1); + + std::cout << "\n in values...\n"; + printArray(in, N); + std::cout << "\n"; + // _scan_array_init_end + + + +//----------------------------------------------------------------------------// +// Perform various sequential scans to illustrate inclusive/exclusive, +// in-place, default scans with different operators +//----------------------------------------------------------------------------// + + std::cout << "\n Running sequential inclusive_scan (default)...\n"; + + /// + /// TODO... + /// + /// EXERCISE: Implement an inclusive RAJA scan with RAJA::seq_exec + /// execution policy type. + /// + /// NOTE: We've done this one for you to help you get started... + /// + + // _scan_inclusive_seq_start + RAJA::inclusive_scan(RAJA::make_span(in, N), + RAJA::make_span(out, N)); + // _scan_inclusive_seq_end + + CHECK_INC_SCAN_RESULTS(OP_PLUS_INT) + printArray(out, N); + std::cout << "\n"; + +//----------------------------------------------------------------------------// + + std::cout << "\n Running sequential inclusive_scan (plus)...\n"; + + std::copy_n(in, N, out); + + /// + /// TODO... + /// + /// EXERCISE: Implement an inclusive RAJA scan with RAJA::seq_exec + /// execution policy type and an explicit plus operator. + /// + + CHECK_INC_SCAN_RESULTS(OP_PLUS_INT) + printArray(out, N); + std::cout << "\n"; + +//----------------------------------------------------------------------------// + + std::cout << "\n Running sequential exclusive_scan (plus)...\n"; + + std::copy_n(in, N, out); + + /// + /// TODO... + /// + /// EXERCISE: Implement an exclusive RAJA scan with RAJA::seq_exec + /// execution policy type and an explicit plus operator. + /// + + CHECK_EXC_SCAN_RESULTS(OP_PLUS_INT) + printArray(out, N); + std::cout << "\n"; + +//----------------------------------------------------------------------------// + + std::cout << "\n Running sequential inclusive_scan_inplace (minimum)...\n"; + + std::copy_n(in, N, out); + + /// + /// TODO... + /// + /// EXERCISE: Implement an inclusive inplace RAJA scan with RAJA::seq_exec + /// execution policy type and an explicit minimum operator. + /// + + CHECK_INC_SCAN_RESULTS(OP_MIN_INT) + printArray(out, N); + std::cout << "\n"; + +//----------------------------------------------------------------------------// + + std::cout << "\n Running sequential exclusive_scan_inplace (maximum)...\n"; + + std::copy_n(in, N, out); + + /// + /// TODO... + /// + /// EXERCISE: Implement an exclusive inplace RAJA scan with RAJA::seq_exec + /// execution policy type and an explicit maximum operator. 
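  //
  // A minimal sketch (one possible form, not the official solution) of the
  // sequential scan exercises in this section, following the corresponding
  // calls in scan_solution.cpp:
  //
  /*
  RAJA::inclusive_scan<RAJA::seq_exec>(RAJA::make_span(in, N),
                                       RAJA::make_span(out, N),
                                       RAJA::operators::plus<int>{});

  RAJA::exclusive_scan<RAJA::seq_exec>(RAJA::make_span(in, N),
                                       RAJA::make_span(out, N),
                                       RAJA::operators::plus<int>{});

  RAJA::inclusive_scan_inplace<RAJA::seq_exec>(RAJA::make_span(out, N),
                                               RAJA::operators::minimum<int>{});

  RAJA::exclusive_scan_inplace<RAJA::seq_exec>(RAJA::make_span(out, N),
                                               RAJA::operators::maximum<int>{});
  */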
+ /// + + CHECK_EXC_SCAN_RESULTS(OP_MAX_INT) + printArray(out, N); + std::cout << "\n"; + + +#if defined(RAJA_ENABLE_OPENMP) + +//----------------------------------------------------------------------------// +// Perform a couple of OpenMP scans... +//----------------------------------------------------------------------------// + + std::cout << "\n Running OpenMP inclusive_scan (plus)...\n"; + + /// + /// TODO... + /// + /// EXERCISE: Implement an inclusive RAJA scan with RAJA::omp_parallel_for_exec + /// execution policy type and an explicit plus operator. + /// + + CHECK_INC_SCAN_RESULTS(OP_PLUS_INT) + printArray(out, N); + std::cout << "\n"; + +//----------------------------------------------------------------------------// + + std::cout << "\n Running OpenMP exclusive_scan_inplace (plus)...\n"; + + std::copy_n(in, N, out); + + /// + /// TODO... + /// + /// EXERCISE: Implement an exclusive inplace RAJA scan with RAJA::omp_parallel_for_exec + /// execution policy type and an explicit plus operator. + /// + + CHECK_EXC_SCAN_RESULTS(OP_PLUS_INT) + printArray(out, N); + std::cout << "\n"; + +#endif + +//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_CUDA) + +//----------------------------------------------------------------------------// +// Perform a couple of CUDA scans... +//----------------------------------------------------------------------------// + + std::cout << "\n Running CUDA inclusive_scan_inplace (plus)...\n"; + + std::copy_n(in, N, out); + + /// + /// TODO... + /// + /// EXERCISE: Implement an inclusive inplace RAJA scan with RAJA::cuda_exec + /// execution policy type and an explicit plus operator. + /// + /// NOTE: You will have to uncomment 'CUDA_BLOCK_SIZE' near the top + /// of the file if you want to use it here. + /// + + CHECK_INC_SCAN_RESULTS(OP_PLUS_INT) + printArray(out, N); + std::cout << "\n"; + +//----------------------------------------------------------------------------// + + std::cout << "\n Running CUDA exclusive_scan_inplace (plus)...\n"; + + std::copy_n(in, N, out); + + /// + /// TODO... + /// + /// EXERCISE: Implement an exclusive inplace RAJA scan with RAJA::cuda_exec + /// execution policy type and an explicit plus operator. + /// + /// NOTE: You will have to uncomment 'CUDA_BLOCK_SIZE' near the top + /// of the file if you want to use it here. + /// + + CHECK_EXC_SCAN_RESULTS(OP_PLUS_INT) + printArray(out, N); + std::cout << "\n"; + +//----------------------------------------------------------------------------// + + std::cout << "\n Running CUDA exclusive_scan (plus)...\n"; + + std::copy_n(in, N, out); + + /// + /// TODO... + /// + /// EXERCISE: Implement an exclusive RAJA scan with RAJA::cuda_exec + /// execution policy type and an explicit plus operator. + /// + /// NOTE: You will have to uncomment 'CUDA_BLOCK_SIZE' near the top + /// of the file if you want to use it here. + /// + + CHECK_EXC_SCAN_RESULTS(OP_PLUS_INT) + printArray(out, N); + std::cout << "\n"; + +#endif + +//----------------------------------------------------------------------------// + + +#if defined(RAJA_ENABLE_HIP) + +//----------------------------------------------------------------------------// +// Perform a couple of HIP scans... 
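//
// A minimal sketch (one possible form, not the official solution) of the
// OpenMP and CUDA scan exercises above; CUDA_BLOCK_SIZE near the top of the
// file would need to be uncommented. The HIP exercises below follow the same
// pattern with RAJA::hip_exec<HIP_BLOCK_SIZE> applied to the d_in/d_out spans:
//
/*
RAJA::inclusive_scan<RAJA::omp_parallel_for_exec>(RAJA::make_span(in, N),
                                                  RAJA::make_span(out, N),
                                                  RAJA::operators::plus<int>{});

RAJA::inclusive_scan_inplace<RAJA::cuda_exec<CUDA_BLOCK_SIZE>>(
    RAJA::make_span(out, N), RAJA::operators::plus<int>{});

RAJA::exclusive_scan<RAJA::cuda_exec<CUDA_BLOCK_SIZE>>(
    RAJA::make_span(in, N), RAJA::make_span(out, N),
    RAJA::operators::plus<int>{});
*/
//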
+//----------------------------------------------------------------------------// + + std::cout << "\n Running HIP inclusive_scan_inplace (plus)...\n"; + + std::copy_n(in, N, out); + int* d_in = memoryManager::allocate_gpu(N); + int* d_out = memoryManager::allocate_gpu(N); + + hipErrchk(hipMemcpy( d_out, out, N * sizeof(int), hipMemcpyHostToDevice )); + + /// + /// TODO... + /// + /// EXERCISE: Implement an inclusive inplace RAJA scan with RAJA::hip_exec + /// execution policy type and an explicit plus operator. + /// + /// NOTE: You will have to uncomment 'HIP_BLOCK_SIZE' near the top + /// of the file if you want to use it here. + /// + + hipErrchk(hipMemcpy( out, d_out, N * sizeof(int), hipMemcpyDeviceToHost )); + + CHECK_INC_SCAN_RESULTS(OP_PLUS_INT) + printArray(out, N); + std::cout << "\n"; + +//----------------------------------------------------------------------------// + + std::cout << "\n Running HIP exclusive_scan (plus)...\n"; + + hipErrchk(hipMemcpy( d_in, in, N * sizeof(int), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy( d_out, out, N * sizeof(int), hipMemcpyHostToDevice )); + + /// + /// TODO... + /// + /// EXERCISE: Implement an exclusive RAJA scan with RAJA::hip_exec + /// execution policy type and an explicit plus operator. + /// + /// NOTE: You will have to uncomment 'HIP_BLOCK_SIZE' near the top + /// of the file if you want to use it here. + /// + + hipErrchk(hipMemcpy( out, d_out, N * sizeof(int), hipMemcpyDeviceToHost )); + + CHECK_EXC_SCAN_RESULTS(OP_PLUS_INT) + printArray(out, N); + std::cout << "\n"; + + memoryManager::deallocate_gpu(d_in); + memoryManager::deallocate_gpu(d_out); + +#endif + +//----------------------------------------------------------------------------// + +// +// Clean up. +// + memoryManager::deallocate(in); + memoryManager::deallocate(out); + + std::cout << "\n DONE!...\n"; + + return 0; +} + + +// +// Function to check inclusive scan result +// +template +void checkInclusiveScanResult(const T* in, const T* out, int N) +{ + T val = Function::identity(); + for (int i = 0; i < N; ++i) { + val = Function()(val, in[i]); + if (out[i] != val) { + std::cout << "\n\t result -- WRONG\n"; + std::cout << "\t" << out[i] << " != " << val + << " (at index " << i << ")\n"; + } + } + std::cout << "\n\t result -- CORRECT\n"; +} + +// +// Function to check exclusive scan result +// +template +void checkExclusiveScanResult(const T* in, const T* out, int N) +{ + T val = Function::identity(); + for (int i = 0; i < N; ++i) { + if (out[i] != val) { + std::cout << "\n\t result -- WRONG\n"; + std::cout << "\t" << out[i] << " != " << val + << " (at index " << i << ")\n"; + } + val = Function()(val, in[i]); + } + std::cout << "\n\t result -- CORRECT\n"; +} + +// +// Function to print vector. 
+// +template +void printArray(const T* v, int N) +{ + std::cout << std::endl; + for (int i = 0; i < N; ++i) { std::cout << " " << v[i]; } + std::cout << std::endl; +} diff --git a/examples/tut_scan.cpp b/exercises/scan_solution.cpp similarity index 79% rename from examples/tut_scan.cpp rename to exercises/scan_solution.cpp index 1c89c16107..a72a4cdebd 100644 --- a/examples/tut_scan.cpp +++ b/exercises/scan_solution.cpp @@ -5,6 +5,12 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +#define OP_PLUS_INT RAJA::operators::plus +#define OP_MIN_INT RAJA::operators::minimum +#define OP_MAX_INT RAJA::operators::maximum +#define CHECK_INC_SCAN_RESULTS(X) checkInclusiveScanResult(in, out, N); +#define CHECK_EXC_SCAN_RESULTS(X) checkExclusiveScanResult(in, out, N); + #include #include #include @@ -15,9 +21,9 @@ #include "RAJA/RAJA.hpp" /* - * Scan Example + * Scan Exercise * - * Example shows how to perform RAJA inclusive and exclusive scan operations + * This exercise demonstrates RAJA inclusive and exclusive scan operations * for integer arrays, including in-place, using different operators. * Other array data types, operators, etc. are similar * @@ -31,14 +37,14 @@ */ /* - CUDA_BLOCK_SIZE - specifies the number of threads in a CUDA thread block + Specify the number of threads in a GPU thread block */ #if defined(RAJA_ENABLE_CUDA) -const int CUDA_BLOCK_SIZE = 16; + constexpr int CUDA_BLOCK_SIZE = 16; #endif #if defined(RAJA_ENABLE_HIP) -const int HIP_BLOCK_SIZE = 16; + constexpr int HIP_BLOCK_SIZE = 16; #endif // @@ -63,7 +69,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) // // Define array length // - const int N = 20; + constexpr int N = 20; // // Allocate and initialize vector data @@ -73,11 +79,11 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::iota(in, in + N, -1); - // _scan_array_init_end - std::cout << "\n in values...\n"; printArray(in, N); std::cout << "\n"; + // _scan_array_init_end + //----------------------------------------------------------------------------// @@ -92,7 +98,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) RAJA::make_span(out, N)); // _scan_inclusive_seq_end - checkInclusiveScanResult>(in, out, N); + CHECK_INC_SCAN_RESULTS(OP_PLUS_INT) printArray(out, N); std::cout << "\n"; @@ -108,7 +114,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) RAJA::operators::plus{}); // _scan_inclusive_seq_plus_end - checkInclusiveScanResult>(in, out, N); + CHECK_INC_SCAN_RESULTS(OP_PLUS_INT) printArray(out, N); std::cout << "\n"; @@ -124,7 +130,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) RAJA::operators::plus{}); // _scan_exclusive_seq_plus_end - checkExclusiveScanResult>(in, out, N); + CHECK_EXC_SCAN_RESULTS(OP_PLUS_INT) printArray(out, N); std::cout << "\n"; @@ -132,14 +138,14 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::cout << "\n Running sequential inclusive_scan_inplace (minimum)...\n"; + // _scan_inclusive_inplace_seq_min_start std::copy_n(in, N, out); - // _scan_inclusive_inplace_seq_min_start RAJA::inclusive_scan_inplace(RAJA::make_span(out, N), RAJA::operators::minimum{}); // _scan_inclusive_inplace_seq_min_end - checkInclusiveScanResult>(in, out, N); + CHECK_INC_SCAN_RESULTS(OP_MIN_INT) printArray(out, N); std::cout << "\n"; @@ -154,7 +160,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) RAJA::operators::maximum{}); // 
_scan_exclusive_inplace_seq_max_end - checkExclusiveScanResult>(in, out, N); + CHECK_EXC_SCAN_RESULTS(OP_MAX_INT) printArray(out, N); std::cout << "\n"; @@ -173,7 +179,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) RAJA::operators::plus{}); // _scan_inclusive_omp_plus_end - checkInclusiveScanResult>(in, out, N); + CHECK_INC_SCAN_RESULTS(OP_PLUS_INT) printArray(out, N); std::cout << "\n"; @@ -184,11 +190,12 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::copy_n(in, N, out); // _scan_exclusive_inplace_omp_plus_start - RAJA::exclusive_scan_inplace(RAJA::make_span(out, N), - RAJA::operators::plus{}); + RAJA::exclusive_scan_inplace( + RAJA::make_span(out, N), + RAJA::operators::plus{}); // _scan_exclusive_inplace_omp_plus_end - checkExclusiveScanResult>(in, out, N); + CHECK_EXC_SCAN_RESULTS(OP_PLUS_INT) printArray(out, N); std::cout << "\n"; @@ -199,7 +206,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) #if defined(RAJA_ENABLE_CUDA) //----------------------------------------------------------------------------// -// Perform a couple of CUDA scans... +// Perform a few CUDA scans... //----------------------------------------------------------------------------// std::cout << "\n Running CUDA inclusive_scan_inplace (plus)...\n"; @@ -207,11 +214,28 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::copy_n(in, N, out); // _scan_inclusive_inplace_cuda_plus_start - RAJA::inclusive_scan_inplace>(RAJA::make_span(out, N), - RAJA::operators::plus{}); + RAJA::inclusive_scan_inplace>( + RAJA::make_span(out, N), + RAJA::operators::plus{}); // _scan_inclusive_inplace_cuda_plus_end - checkInclusiveScanResult>(in, out, N); + CHECK_INC_SCAN_RESULTS(OP_PLUS_INT) + printArray(out, N); + std::cout << "\n"; + +//----------------------------------------------------------------------------// + + std::cout << "\n Running CUDA exclusive_scan_inplace (plus)...\n"; + + std::copy_n(in, N, out); + + // _scan_exclusive_inplace_cuda_plus_start + RAJA::exclusive_scan_inplace>( + RAJA::make_span(out, N), + RAJA::operators::plus{}); + // _scan_exclusive_inplace_cuda_plus_end + + CHECK_EXC_SCAN_RESULTS(OP_PLUS_INT) printArray(out, N); std::cout << "\n"; @@ -222,12 +246,13 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::copy_n(in, N, out); // _scan_exclusive_cuda_plus_start - RAJA::exclusive_scan>(RAJA::make_span(in, N), - RAJA::make_span(out, N), - RAJA::operators::plus{}); + RAJA::exclusive_scan>( + RAJA::make_span(in, N), + RAJA::make_span(out, N), + RAJA::operators::plus{}); // _scan_exclusive_cuda_plus_end - checkExclusiveScanResult>(in, out, N); + CHECK_EXC_SCAN_RESULTS(OP_PLUS_INT) printArray(out, N); std::cout << "\n"; @@ -235,6 +260,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) //----------------------------------------------------------------------------// + #if defined(RAJA_ENABLE_HIP) //----------------------------------------------------------------------------// @@ -249,28 +275,33 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) hipErrchk(hipMemcpy( d_out, out, N * sizeof(int), hipMemcpyHostToDevice )); - RAJA::inclusive_scan_inplace>(RAJA::make_span(d_out, N), - RAJA::operators::plus{}); + // _scan_inclusive_inplace_hip_plus_start + RAJA::inclusive_scan_inplace>( + RAJA::make_span(d_out, N), + RAJA::operators::plus{}); + // _scan_inclusive_inplace_hip_plus_end hipErrchk(hipMemcpy( out, d_out, N * sizeof(int), hipMemcpyDeviceToHost )); - 
checkInclusiveScanResult>(in, out, N); + CHECK_INC_SCAN_RESULTS(OP_PLUS_INT) printArray(out, N); std::cout << "\n"; //----------------------------------------------------------------------------// + std::cout << "\n Running HIP exclusive_scan (plus)...\n"; + hipErrchk(hipMemcpy( d_in, in, N * sizeof(int), hipMemcpyHostToDevice )); hipErrchk(hipMemcpy( d_out, out, N * sizeof(int), hipMemcpyHostToDevice )); - std::cout << "\n Running HIP exclusive_scan (plus)...\n"; - RAJA::exclusive_scan>(RAJA::make_span(d_in, N), - RAJA::make_span(d_out, N), - RAJA::operators::plus{}); + RAJA::exclusive_scan>( + RAJA::make_span(d_in, N), + RAJA::make_span(d_out, N), + RAJA::operators::plus{}); hipErrchk(hipMemcpy( out, d_out, N * sizeof(int), hipMemcpyDeviceToHost )); - checkExclusiveScanResult>(in, out, N); + CHECK_EXC_SCAN_RESULTS(OP_PLUS_INT) printArray(out, N); std::cout << "\n"; diff --git a/exercises/segment-indexset-basics.cpp b/exercises/segment-indexset-basics.cpp new file mode 100644 index 0000000000..44546fb940 --- /dev/null +++ b/exercises/segment-indexset-basics.cpp @@ -0,0 +1,284 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-22, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include +#include +#include +#include +#include + +#include "RAJA/RAJA.hpp" + +#include "camp/resource.hpp" + +/* + * Segments and Index Sets exercise + * + * In this exercise, you will learn how to create RAJA segments and index sets + * and use them to execute kernels. There are no computations performed in the + * exercises and no parallel execution. The kernels contain only print + * statements to illustrate various iteration patterns. Thus, all kernels + * look the same. The only thing that changes in these versions is the object + * passed to the 'forall' method that defines the iteration space. + * + * RAJA features shown: + * - `forall` loop iteration template method + * - TypedRangeSegment iteration space + * - TypedRangeStrideSegment iteration space + * - TypedListSegment iteration space + * - TypedIndexSet segment container + * - Hierarchical execution policies + */ + +//----------------------------------------------------------------------------// +// Define aliases for types used in the exercises +// (so example code is less verbose) +//----------------------------------------------------------------------------// +// _raja_segment_type_start +using IdxType = int; +using RangeSegType = RAJA::TypedRangeSegment; +using RangeStrideSegType = RAJA::TypedRangeStrideSegment; +using ListSegType = RAJA::TypedListSegment; +using IndexSetType = RAJA::TypedIndexSet< RangeSegType, ListSegType >; +// _raja_segment_type_end + + +int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +{ + + std::cout << "\n\nRAJA segments index sets and index sets...\n"; + +// Resource object used to construct list segment objects with indices +// living in host (CPU) memory. 
+ camp::resources::Resource host_res{camp::resources::Host()}; + + +//----------------------------------------------------------------------------// +// Stride-1 iteration spaces +//----------------------------------------------------------------------------// + + std::cout << "\n Running C-version range kernel...\n"; + + // _cstyle_range1_start + for (IdxType i = 0; i < 20; i++) { + std::cout << i << " "; + } + // _cstyle_range1_end + + std::cout << std::endl; + +//----------------------------------// + + std::cout << "\n Running RAJA range kernel...\n"; + + // _raja_range1_start + RAJA::forall(RangeSegType(0, 20), [=] (IdxType i) { + std::cout << i << " "; + }); + // _raja_range1_end + + std::cout << std::endl; + +//----------------------------------// + + std::cout << "\n Running RAJA stride-1 range kernel...\n"; + + // _raja_striderange1_start + RAJA::forall(RangeStrideSegType(0, 20, 1), [=] (IdxType i) { + std::cout << i << " "; + }); + // _raja_striderange1_end + + std::cout << std::endl; + +//----------------------------------// + + std::cout << "\n Running RAJA stride-1 list kernel...\n"; + + // _raja_list1_start + // + // Collect indices in a vector to create list segment + // + std::vector idx; + for (IdxType i = 0; i < 20; ++i) { + idx.push_back(i); + } + + ListSegType idx_list1( idx, host_res ); + + RAJA::forall(idx_list1, [=] (IdxType i) { + std::cout << i << " "; + }); + // _raja_list1_end + + std::cout << std::endl; + +//----------------------------------// + + std::cout << "\n Running C-style stride-1 list kernel...\n"; + + // _cstyle_list1_start + IdxType iis = static_cast(idx.size()); // to avoid compiler warning + for (IdxType ii = 0; ii < iis; ++ii) { + std::cout << idx[ ii ] << " "; + } + // _cstyle_list1_end + + std::cout << std::endl; + +//----------------------------------------------------------------------------// +// Negative stride iteration spaces +//----------------------------------------------------------------------------// + + std::cout << "\n Running C-version negative stride kernel...\n"; + + // _cstyle_negstriderange1_start + for (IdxType i = 19; i > -1; i--) { + std::cout << i << " "; + } + // _cstyle_negstriderange1_end + + std::cout << std::endl; + +//----------------------------------// + + std::cout << "\n Running RAJA negative stride kernel...\n"; + + /// + /// TODO... + /// + /// EXERCISE: Make a RAJA -1 stride version of the kernel. 
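  //
  // A minimal sketch of the -1 stride version, following
  // segment-indexset-basics_solution.cpp: iterate from 19 down to, but not
  // including, -1:
  //
  /*
  RAJA::forall<RAJA::seq_exec>(RangeStrideSegType(19, -1, -1), [=] (IdxType i) {
    std::cout << i << " ";
  });
  */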
+ /// + + std::cout << std::endl; + +//----------------------------------// +// List variant +//----------------------------------// + + std::cout << "\n Running RAJA negative stride list kernel...\n"; + + // _raja_negstridelist1_start + // + // Reverse the order of indices in the vector + // + std::reverse( idx.begin(), idx.end() ); + ListSegType idx_list1_reverse( &idx[0], idx.size(), host_res ); + + RAJA::forall(idx_list1_reverse, [=] (IdxType i) { + std::cout << i << " "; + }); + // _raja_negstridelist1_end + + std::cout << std::endl; + +//----------------------------------------------------------------------------// +// Non-unit uniform stride iteration spaces +//----------------------------------------------------------------------------// + + std::cout << "\n Running C-version stride-2 range kernel...\n"; + + // _cstyle_range2_start + for (IdxType i = 0; i < 20; i += 2) { + std::cout << i << " "; + } + // _cstyle_range2_end + + std::cout << std::endl; + +//----------------------------------// + + std::cout << "\n Running RAJA stride-2 range kernel...\n"; + + // _raja_range2_start + RAJA::forall(RangeStrideSegType(0, 20, 2), [=] (IdxType i) { + std::cout << i << " "; + }); + // _raja_range2_end + + std::cout << std::endl; + +//----------------------------------// + + std::cout << "\n Running RAJA stride-3 range kernel...\n"; + + /// + /// TODO... + /// + /// EXERCISE: Make a RAJA stride-3 version of the kernel. + /// + + std::cout << std::endl; + +//----------------------------------------------------------------------------// +// IndexSets: complex iteration spaces +//----------------------------------------------------------------------------// + +// +// Sequential index set execution policy used in several of the following +// example implementations. +// + + // _raja_seq_indexset_policy_start + using SEQ_ISET_EXECPOL = RAJA::ExecPolicy; + // _raja_seq_indexset_policy__end + + std::cout << "\n Running RAJA index set (2 RangeSegments) kernel...\n"; + + // _raja_indexset_2ranges_start + IndexSetType is2; + is2.push_back( RangeSegType(0, 10) ); + is2.push_back( RangeSegType(15, 20) ); + + RAJA::forall(is2, [=] (IdxType i) { + std::cout << i << " "; + }); + // _raja_indexset_2ranges_end + + std::cout << std::endl; + +//----------------------------------// + + std::cout << "\n Running C-version of two segment kernel...\n"; + + // _cstyle_2ranges_start + for (IdxType i = 0; i < 10; ++i) { + std::cout << i << " "; + } + for (IdxType i = 15; i < 20; ++i) { + std::cout << i << " "; + } + // _cstyle_2ranges_end + + std::cout << std::endl; + +//----------------------------------// + + std::cout << "\n Running RAJA index set (3 segments) kernel...\n"; + + /// + /// TODO... + /// + /// EXERCISE: Make a RAJA version of a kernel that prints the sequence + /// + /// 0 1 2 3 4 5 6 7 10 11 14 20 22 24 25 26 27 + /// + /// using a RAJA::TypedIndexSet containing two + /// RAJA::TypedRangeSegment objects and on + /// RAJA::TypedListSegment object. 
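  //
  // A minimal sketch of one way to build that iteration space, following
  // segment-indexset-basics_solution.cpp: a range segment [0, 8), a list
  // segment holding {10, 11, 14, 20, 22}, and a second range segment [24, 28):
  //
  /*
  IndexSetType is3;
  is3.push_back( RangeSegType(0, 8) );

  IdxType indx[ ] = {10, 11, 14, 20, 22};
  ListSegType list2( indx, 5, host_res );
  is3.push_back( list2 );

  is3.push_back( RangeSegType(24, 28) );

  RAJA::forall<SEQ_ISET_EXECPOL>(is3, [=] (IdxType i) {
    std::cout << i << " ";
  });
  */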
+ /// + + std::cout << std::endl; + +//----------------------------------------------------------------------------// + + std::cout << "\n DONE!...\n"; + + return 0; +} + diff --git a/exercises/segment-indexset-basics_solution.cpp b/exercises/segment-indexset-basics_solution.cpp new file mode 100644 index 0000000000..4e736bb9f7 --- /dev/null +++ b/exercises/segment-indexset-basics_solution.cpp @@ -0,0 +1,286 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-22, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include +#include +#include +#include +#include + +#include "RAJA/RAJA.hpp" + +#include "camp/resource.hpp" + +/* + * Segments and Index Sets exercise + * + * In this exercise, you will learn how to create RAJA segments and index sets + * and use them to execute kernels. There are no computations performed in the + * exercises and no parallel execution. The kernels contain only print + * statements to illustrate various iteration patterns. Thus, all kernels + * look the same. The only thing that changes in these versions is the object + * passed to the 'forall' method that defines the iteration space. + * + * RAJA features shown: + * - `forall` loop iteration template method + * - TypedRangeSegment iteration space + * - TypedRangeStrideSegment iteration space + * - TypedListSegment iteration space + * - TypedIndexSet segment container + * - Hierarchical execution policies + */ + +//----------------------------------------------------------------------------// +// Define aliases for types used in the exercises +// (so example code is less verbose) +//----------------------------------------------------------------------------// +// _raja_segment_type_start +using IdxType = int; +using RangeSegType = RAJA::TypedRangeSegment; +using RangeStrideSegType = RAJA::TypedRangeStrideSegment; +using ListSegType = RAJA::TypedListSegment; +using IndexSetType = RAJA::TypedIndexSet< RangeSegType, ListSegType >; +// _raja_segment_type_end + + +int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +{ + + std::cout << "\n\nRAJA segments index sets and index sets...\n"; + +// Resource object used to construct list segment objects with indices +// living in host (CPU) memory. 
+ camp::resources::Resource host_res{camp::resources::Host()}; + + +//----------------------------------------------------------------------------// +// Stride-1 iteration spaces +//----------------------------------------------------------------------------// + + std::cout << "\n Running C-version range kernel...\n"; + +// _cstyle_range1_start + for (IdxType i = 0; i < 20; i++) { + std::cout << i << " "; + } +// _cstyle_range1_end + + std::cout << std::endl; + +//----------------------------------// + + std::cout << "\n Running RAJA range kernel...\n"; + + // _raja_range1_start + RAJA::forall(RangeSegType(0, 20), [=] (IdxType i) { + std::cout << i << " "; + }); + // _raja_range1_end + + std::cout << std::endl; + +//----------------------------------// + + std::cout << "\n Running RAJA stride-1 range kernel...\n"; + + // _raja_striderange1_start + RAJA::forall(RangeStrideSegType(0, 20, 1), [=] (IdxType i) { + std::cout << i << " "; + }); + // _raja_striderange1_end + + std::cout << std::endl; + +//----------------------------------// + + std::cout << "\n Running RAJA stride-1 list kernel...\n"; + + // _raja_list1_start + // + // Collect indices in a vector to create list segment + // + std::vector idx; + for (IdxType i = 0; i < 20; ++i) { + idx.push_back(i); + } + + ListSegType idx_list1( idx, host_res ); + + RAJA::forall(idx_list1, [=] (IdxType i) { + std::cout << i << " "; + }); + // _raja_list1_end + + std::cout << std::endl; + +//----------------------------------// + + std::cout << "\n Running C-style stride-1 list kernel...\n"; + + // _cstyle_list1_start + IdxType iis = static_cast(idx.size()); // to avoid compiler warning + for (IdxType ii = 0; ii < iis; ++ii) { + std::cout << idx[ ii ] << " "; + } + // _cstyle_list1_end + + std::cout << std::endl; + +//----------------------------------------------------------------------------// +// Negative stride iteration spaces +//----------------------------------------------------------------------------// + + std::cout << "\n Running C-version negative stride kernel...\n"; + + // _cstyle_negstriderange1_start + for (IdxType i = 19; i > -1; i--) { + std::cout << i << " "; + } + // _cstyle_negstriderange1_end + + std::cout << std::endl; + +//----------------------------------// + + std::cout << "\n Running RAJA negative stride kernel...\n"; + + // _raja_negstriderange1_start + RAJA::forall(RangeStrideSegType(19, -1, -1), [=] (IdxType i) { + std::cout << i << " "; + }); + // _raja_negstriderange1_end + + std::cout << std::endl; + +//----------------------------------// +// List variant +//----------------------------------// + + std::cout << "\n Running RAJA negative stride list kernel...\n"; + + // _raja_negstridelist1_start + // + // Reverse the order of indices in the vector + // + std::reverse( idx.begin(), idx.end() ); + ListSegType idx_list1_reverse( &idx[0], idx.size(), host_res ); + + RAJA::forall(idx_list1_reverse, [=] (IdxType i) { + std::cout << i << " "; + }); + // _raja_negstridelist1_end + + std::cout << std::endl; + +//----------------------------------------------------------------------------// +// Non-unit uniform stride iteration spaces +//----------------------------------------------------------------------------// + + std::cout << "\n Running C-version stride-2 range kernel...\n"; + + // _cstyle_range2_start + for (IdxType i = 0; i < 20; i += 2) { + std::cout << i << " "; + } + // _cstyle_range2_end + + std::cout << std::endl; + +//----------------------------------// + + std::cout << "\n Running RAJA stride-2 range 
kernel...\n"; + + // _raja_range2_start + RAJA::forall(RangeStrideSegType(0, 20, 2), [=] (IdxType i) { + std::cout << i << " "; + }); + // _raja_range2_end + + std::cout << std::endl; + +//----------------------------------// + + std::cout << "\n Running RAJA stride-3 range kernel...\n"; + + // _raja_range3_start + RAJA::forall(RangeStrideSegType(0, 20, 3), [=] (IdxType i) { + std::cout << i << " "; + }); + // _raja_range3_end + + std::cout << std::endl; + +//----------------------------------------------------------------------------// +// IndexSets: complex iteration spaces +//----------------------------------------------------------------------------// + +// +// Sequential index set execution policy used in several of the following +// example implementations. +// + + std::cout << "\n Running RAJA index set (2 RangeSegments) kernel...\n"; + + // _raja_indexset_2ranges_start + using SEQ_ISET_EXECPOL = RAJA::ExecPolicy; + + IndexSetType is2; + is2.push_back( RangeSegType(0, 10) ); + is2.push_back( RangeSegType(15, 20) ); + + RAJA::forall(is2, [=] (IdxType i) { + std::cout << i << " "; + }); + // _raja_indexset_2ranges_end + + std::cout << std::endl; + +//----------------------------------// + + std::cout << "\n Running C-version of two segment kernel...\n"; + + // _cstyle_2ranges_start + for (IdxType i = 0; i < 10; ++i) { + std::cout << i << " "; + } + for (IdxType i = 15; i < 20; ++i) { + std::cout << i << " "; + } + // _cstyle_2ranges_end + + std::cout << std::endl; + +//----------------------------------// + + std::cout << "\n Running RAJA index set (3 segments) kernel...\n"; + + // _raja_indexset_3segs_start + IndexSetType is3; + + is3.push_back( RangeSegType(0, 8) ); + + IdxType indx[ ] = {10, 11, 14, 20, 22}; + ListSegType list2( indx, 5, host_res ); + is3.push_back( list2 ); + + is3.push_back( RangeSegType(24, 28) ); + + RAJA::forall(is3, [=] (IdxType i) { + std::cout << i << " "; + }); + // _raja_indexset_3segs_end + + std::cout << std::endl; + +//----------------------------------------------------------------------------// + + std::cout << "\n DONE!...\n"; + + return 0; +} + diff --git a/exercises/sort.cpp b/exercises/sort.cpp new file mode 100644 index 0000000000..26a0e6e1f4 --- /dev/null +++ b/exercises/sort.cpp @@ -0,0 +1,702 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-22, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#define OP_GREATER RAJA::operators::greater +#define OP_LESS RAJA::operators::less + +#define CHECK_UNSTABLE_SORT_RESULT(X) checkUnstableSortResult(in, out, N) +#define CHECK_UNSTABLE_SORT_PAIR_RESULT(X) checkUnstableSortResult(in, out, in_vals, out_vals, N) +#define CHECK_STABLE_SORT_RESULT(X) checkStableSortResult(in, out, N) +#define CHECK_STABLE_SORT_PAIR_RESULT(X) checkStableSortResult(in, out, in_vals, out_vals, N) + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "memoryManager.hpp" + +#include "RAJA/RAJA.hpp" + +/* + * Sort Exercise + * + * Exercise demonstrates how to perform RAJA unstable and stable sort operations + * for integer arrays, including pairs variant, using different comparators. + * Other array data types, comparators, etc. 
are similar + * + * RAJA features shown: + * - `RAJA::sort` and `RAJA::sort_pairs` methods + * - `RAJA::stable_sort` and `RAJA::stable_sort_pairs` methods + * - RAJA operators + * - Execution policies + * + * If CUDA is enabled, CUDA unified memory is used. + */ + +/* + Specify the number of threads in a GPU thread block +*/ +#if defined(RAJA_ENABLE_CUDA) +//constexpr int CUDA_BLOCK_SIZE = 16; +#endif + +#if defined(RAJA_ENABLE_HIP) +//constexpr int HIP_BLOCK_SIZE = 16; +#endif + +// +// Functions for checking results and printing vectors +// +template +void checkUnstableSortResult(const T* in, const T* out, int N); +template +void checkUnstableSortResult(const T* in, const T* out, + const U* in_vals, const U* out_vals, int N); +// +template +void checkStableSortResult(const T* in, const T* out, int N); +template +void checkStableSortResult(const T* in, const T* out, + const U* in_vals, const U* out_vals, int N); +// +template +void printArray(const T* k, int N); +template +void printArray(const T* k, const U* v, int N); + + +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) +{ + + std::cout << "\n\nRAJA sort example...\n"; + + // _sort_array_init_start +// +// Define array length +// + constexpr int N = 20; + +// +// Allocate and initialize vector data +// + int* in = memoryManager::allocate(N); + int* out = memoryManager::allocate(N); + + unsigned* in_vals = memoryManager::allocate(N); + unsigned* out_vals = memoryManager::allocate(N); + + std::iota(in , in + N/2, 0); + std::iota(in + N/2, in + N , 0); + std::shuffle(in , in + N/2, std::mt19937{12345u}); + std::shuffle(in + N/2, in + N , std::mt19937{67890u}); + + std::fill(in_vals , in_vals + N/2, 0); + std::fill(in_vals + N/2, in_vals + N , 1); + + std::cout << "\n in keys...\n"; + printArray(in, N); + std::cout << "\n in (key, value) pairs...\n"; + printArray(in, in_vals, N); + std::cout << "\n"; + + // _sort_array_init_end + + +//----------------------------------------------------------------------------// +// Perform various sequential sorts to illustrate unstable/stable, +// pairs, default sorts with different comparators +//----------------------------------------------------------------------------// + + std::cout << "\n Running sequential sort (default)...\n"; + + /// + /// TODO... + /// + /// EXERCISE: Implement a RAJA sort with RAJA::seq_exec + /// execution policy type. + /// + /// NOTE: We've done this one for you to help you get started... + /// + + // _sort_seq_start + std::copy_n(in, N, out); + + RAJA::sort(RAJA::make_span(out, N)); + // _sort_seq_end + + //checkUnstableSortResult>(in, out, N); + CHECK_UNSTABLE_SORT_RESULT(OP_LESS); + printArray(out, N); + std::cout << "\n"; + +//----------------------------------------------------------------------------// + + std::cout << "\n Running sequential sort (non-decreasing)...\n"; + + std::copy_n(in, N, out); + + /// + /// TODO... + /// + /// EXERCISE: Implement a RAJA sort with RAJA::seq_exec execution + /// policy type and an explicit less operation. + /// + + //checkUnstableSortResult>(in, out, N); + CHECK_UNSTABLE_SORT_RESULT(OP_LESS); + printArray(out, N); + std::cout << "\n"; + +//----------------------------------------------------------------------------// + + std::cout << "\n Running sequential stable_sort (non-decreasing)...\n"; + + std::copy_n(in, N, out); + + /// + /// TODO... + /// + /// EXERCISE: Implement a stable RAJA sort with RAJA::seq_exec execution + /// policy type and an explicit less operation. 
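  //
  // A minimal sketch (one possible form, following sort_solution.cpp) of the
  // sequential sort exercises in this section:
  //
  /*
  RAJA::sort<RAJA::seq_exec>(RAJA::make_span(out, N),
                             RAJA::operators::less<int>{});

  RAJA::stable_sort<RAJA::seq_exec>(RAJA::make_span(out, N),
                                    RAJA::operators::greater<int>{});

  RAJA::sort_pairs<RAJA::seq_exec>(RAJA::make_span(out, N),
                                   RAJA::make_span(out_vals, N),
                                   RAJA::operators::less<int>{});

  RAJA::stable_sort_pairs<RAJA::seq_exec>(RAJA::make_span(out, N),
                                          RAJA::make_span(out_vals, N),
                                          RAJA::operators::greater<int>{});
  */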
+ /// + + //checkStableSortResult>(in, out, N); + CHECK_STABLE_SORT_RESULT(OP_LESS); + printArray(out, N); + std::cout << "\n"; + +//----------------------------------------------------------------------------// + + std::cout << "\n Running sequential stable_sort (non-increasing)...\n"; + + std::copy_n(in, N, out); + + /// + /// TODO... + /// + /// EXERCISE: Implement a stable RAJA sort with RAJA::seq_exec execution + /// policy type and an explicit greater operation. + /// + + //checkStableSortResult>(in, out, N); + CHECK_STABLE_SORT_RESULT(OP_GREATER); + printArray(out, N); + std::cout << "\n"; + +//----------------------------------------------------------------------------// + + std::cout << "\n Running sequential sort_pairs (non-decreasing)...\n"; + + std::copy_n(in, N, out); + std::copy_n(in_vals, N, out_vals); + + /// + /// TODO... + /// + /// EXERCISE: Implement a RAJA pair sort with RAJA::seq_exec execution + /// policy type and an explicit less operation. + /// + + //checkUnstableSortResult>(in, out, in_vals, out_vals, N); + CHECK_UNSTABLE_SORT_PAIR_RESULT(OP_LESS); + printArray(out, out_vals, N); + std::cout << "\n"; + +//----------------------------------------------------------------------------// + + std::cout << "\n Running sequential stable_sort_pairs (non-increasing)...\n"; + + std::copy_n(in, N, out); + std::copy_n(in_vals, N, out_vals); + + /// + /// TODO... + /// + /// EXERCISE: Implement a stable RAJA pair sort with RAJA::seq_exec execution + /// policy type and an explicit greater operation. + /// + + //checkStableSortResult>(in, out, in_vals, out_vals, N); + CHECK_STABLE_SORT_PAIR_RESULT(OP_GREATER); + printArray(out, out_vals, N); + std::cout << "\n"; + + +#if defined(RAJA_ENABLE_OPENMP) + +//----------------------------------------------------------------------------// +// Perform a couple of OpenMP sorts... +//----------------------------------------------------------------------------// + + std::cout << "\n Running OpenMP sort (non-decreasing)...\n"; + + std::copy_n(in, N, out); + + /// + /// TODO... + /// + /// EXERCISE: Implement a RAJA sort with RAJA::omp_parallel_for_exec execution + /// policy type and an explicit less operation. + /// + + //checkUnstableSortResult>(in, out, N); + CHECK_UNSTABLE_SORT_RESULT(OP_LESS); + printArray(out, N); + std::cout << "\n"; + +//----------------------------------------------------------------------------// + + std::cout << "\n Running OpenMP stable_sort_pairs (non-increasing)...\n"; + + std::copy_n(in, N, out); + std::copy_n(in_vals, N, out_vals); + + /// + /// TODO... + /// + /// EXERCISE: Implement a stable RAJA sort with RAJA::omp_parallel_for_exec execution + /// policy type and an explicit greater operation. + /// + + //checkStableSortResult>(in, out, in_vals, out_vals, N); + CHECK_STABLE_SORT_PAIR_RESULT(OP_GREATER); + printArray(out, out_vals, N); + std::cout << "\n"; + +#endif + +//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_CUDA) + +//----------------------------------------------------------------------------// +// Perform a couple of CUDA sorts... +//----------------------------------------------------------------------------// + + std::cout << "\n Running CUDA sort_pairs (non-increasing)...\n"; + + std::copy_n(in, N, out); + std::copy_n(in_vals, N, out_vals); + + /// + /// TODO... + /// + /// EXERCISE: Implement a RAJA pair sort with RAJA::cuda_exec execution + /// policy type and an explicit greater operation. 
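  //
  // A minimal sketch (one possible form, following sort_solution.cpp) of the
  // OpenMP and CUDA sort exercises above; CUDA_BLOCK_SIZE near the top of the
  // file would need to be uncommented for the CUDA variants:
  //
  /*
  RAJA::sort<RAJA::omp_parallel_for_exec>(RAJA::make_span(out, N),
                                          RAJA::operators::less<int>{});

  RAJA::stable_sort_pairs<RAJA::omp_parallel_for_exec>(
      RAJA::make_span(out, N),
      RAJA::make_span(out_vals, N),
      RAJA::operators::greater<int>{});

  RAJA::sort_pairs<RAJA::cuda_exec<CUDA_BLOCK_SIZE>>(
      RAJA::make_span(out, N),
      RAJA::make_span(out_vals, N),
      RAJA::operators::greater<int>{});
  */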
+ /// + /// NOTE: You will need to uncomment 'CUDA_BLOCK_SIZE' near the + /// top of the file if you want to use it here. + /// + + //checkUnstableSortResult>(in, out, in_vals, out_vals, N); + CHECK_UNSTABLE_SORT_PAIR_RESULT(OP_GREATER); + printArray(out, out_vals, N); + std::cout << "\n"; + +//----------------------------------------------------------------------------// + + std::cout << "\n Running CUDA stable_sort (non-decreasing)...\n"; + + std::copy_n(in, N, out); + + /// + /// TODO... + /// + /// EXERCISE: Implement a stable RAJA pair sort with RAJA::cuda_exec execution + /// policy type and an explicit less operation. + /// + /// NOTE: You will need to uncomment 'CUDA_BLOCK_SIZE' near the + /// top of the file if you want to use it here. + /// + + //checkStableSortResult>(in, out, N); + CHECK_STABLE_SORT_RESULT(OP_LESS); + printArray(out, N); + std::cout << "\n"; + +#endif + +//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_HIP) + +//----------------------------------------------------------------------------// +// Perform a couple of HIP sorts... +//----------------------------------------------------------------------------// + + std::cout << "\n Running HIP sort_pairs (non-decreasing)...\n"; + + std::copy_n(in, N, out); + std::copy_n(in_vals, N, out_vals); + + int* d_out = memoryManager::allocate_gpu(N); + int* d_out_vals = memoryManager::allocate_gpu(N); + + hipErrchk(hipMemcpy( d_out, out, N * sizeof(int), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy( d_out_vals, out_vals, N * sizeof(int), hipMemcpyHostToDevice )); + + /// + /// TODO... + /// + /// EXERCISE: Implement a RAJA pair sort with RAJA::hip_exec execution + /// policy type and an explicit less operation. + /// + /// NOTE: You will need to uncomment 'CUDA_BLOCK_SIZE' near the + /// top of the file if you want to use it here. + /// + + hipErrchk(hipMemcpy( out, d_out, N * sizeof(int), hipMemcpyDeviceToHost )); + hipErrchk(hipMemcpy( out_vals, d_out_vals, N * sizeof(int), hipMemcpyDeviceToHost )); + + //checkUnstableSortResult>(in, out, in_vals, out_vals, N); + CHECK_UNSTABLE_SORT_PAIR_RESULT(OP_LESS); + printArray(out, out_vals, N); + std::cout << "\n"; + +//----------------------------------------------------------------------------// + + std::cout << "\n Running HIP stable_sort (non-increasing)...\n"; + + std::copy_n(in, N, out); + + hipErrchk(hipMemcpy( d_out, out, N * sizeof(int), hipMemcpyHostToDevice )); + + /// + /// TODO... + /// + /// EXERCISE: Implement a stable RAJA sort with RAJA::hip_exec execution + /// policy type and an explicit less operation. + /// + /// NOTE: You will need to uncomment 'CUDA_BLOCK_SIZE' near the + /// top of the file if you want to use it here. + /// + + hipErrchk(hipMemcpy( out, d_out, N * sizeof(int), hipMemcpyDeviceToHost )); + + //checkStableSortResult>(in, out, N); + CHECK_STABLE_SORT_RESULT(OP_GREATER); + printArray(out, N); + std::cout << "\n"; + + memoryManager::deallocate_gpu(d_out); + memoryManager::deallocate_gpu(d_out_vals); + +#endif + + +//----------------------------------------------------------------------------// + +// +// Clean up. 
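//
// A minimal sketch (one possible form) of the HIP sort exercises above; they
// operate on the device arrays, and the block-size constant to uncomment for
// HIP is presumably HIP_BLOCK_SIZE rather than CUDA_BLOCK_SIZE:
//
/*
RAJA::sort_pairs<RAJA::hip_exec<HIP_BLOCK_SIZE>>(
    RAJA::make_span(d_out, N),
    RAJA::make_span(d_out_vals, N),
    RAJA::operators::less<int>{});

RAJA::stable_sort<RAJA::hip_exec<HIP_BLOCK_SIZE>>(
    RAJA::make_span(d_out, N),
    RAJA::operators::greater<int>{});
*/
//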
+// + memoryManager::deallocate(in); + memoryManager::deallocate(out); + + memoryManager::deallocate(in_vals); + memoryManager::deallocate(out_vals); + + std::cout << "\n DONE!...\n"; + + return 0; +} + +template +bool equivalent(T const& a, T const& b, Comparator comp) +{ + return !comp(a, b) && !comp(b, a); +} + +// +// Function to check unstable sort result +// +template +void checkUnstableSortResult(const T* in, const T* out, int N) +{ + Comparator comp; + bool correct = true; + + // make map of keys to keys + using val_map = std::unordered_multiset; + std::unordered_map keys; + for (RAJA::Index_type i = 0; i < N; i++) { + auto key_iter = keys.find(in[i]); + if (key_iter == keys.end()) { + auto ret = keys.emplace(in[i], val_map{}); + assert(ret.second); + key_iter = ret.first; + } + key_iter->second.emplace(in[i]); + } + + for (RAJA::Index_type i = 0; i < N; i++) { + // test ordering + if (i > 0 && comp(out[i], out[i-1])) { + if (correct) { + std::cout << "\n\t result -- WRONG\n"; + correct = false; + } + std::cout << "\t" + << out[i-1] << ", " << out[i] + << " out of order" + << " (at index " << i-1 << ")\n"; + } + // test there is an item with this + auto key_iter = keys.find(out[i]); + if (key_iter == keys.end()) { + if (correct) { + std::cout << "\n\t result -- WRONG\n"; + correct = false; + } + std::cout << "\t" + << out[i] + << " unknown or duplicate key" + << " (at index " << i << ")\n"; + } + auto val_iter = key_iter->second.find(out[i]); + if (val_iter == key_iter->second.end()) { + if (correct) { + std::cout << "\n\t result -- WRONG\n"; + correct = false; + } + std::cout << "\t" + << out[i] + << " unknown or duplicate val" + << " (at index " << i << ")\n"; + } + key_iter->second.erase(val_iter); + if (key_iter->second.size() == 0) { + keys.erase(key_iter); + } + } + if (correct) { + std::cout << "\n\t result -- CORRECT\n"; + } +} + +template +void checkUnstableSortResult(const T* in, const T* out, + const U* in_vals, const U* out_vals, int N) +{ + Comparator comp; + bool correct = true; + + // make map of keys to vals + using val_map = std::unordered_multiset; + std::unordered_map keys_to_vals; + for (RAJA::Index_type i = 0; i < N; i++) { + auto key_iter = keys_to_vals.find(in[i]); + if (key_iter == keys_to_vals.end()) { + auto ret = keys_to_vals.emplace(in[i], val_map{}); + assert(ret.second); + key_iter = ret.first; + } + key_iter->second.emplace(in_vals[i]); + } + + for (RAJA::Index_type i = 0; i < N; i++) { + // test ordering + if (i > 0 && comp(out[i], out[i-1])) { + if (correct) { + std::cout << "\n\t result -- WRONG\n"; + correct = false; + } + std::cout << "\t" + << "(" << out[i-1] << "," << out_vals[i-1] << ")," + << " (" << out[i] << "," << out_vals[i] << ")" + << " out of order" + << " (at index " << i-1 << ")\n"; + } + // test there is a pair with this key and val + auto key_iter = keys_to_vals.find(out[i]); + if (key_iter == keys_to_vals.end()) { + if (correct) { + std::cout << "\n\t result -- WRONG\n"; + correct = false; + } + std::cout << "\t" + << "(" << out[i] << "," << out_vals[i] << ")" + << " unknown or duplicate key" + << " (at index " << i << ")\n"; + } + auto val_iter = key_iter->second.find(out_vals[i]); + if (val_iter == key_iter->second.end()) { + if (correct) { + std::cout << "\n\t result -- WRONG\n"; + correct = false; + } + std::cout << "\t" + << "(" << out[i] << "," << out_vals[i] << ")" + << " unknown or duplicate val" + << " (at index " << i << ")\n"; + } + key_iter->second.erase(val_iter); + if (key_iter->second.size() == 0) { + 
keys_to_vals.erase(key_iter); + } + } + if (correct) { + std::cout << "\n\t result -- CORRECT\n"; + } +} + +// +// Function to check stable sort result +// +template +void checkStableSortResult(const T* in, const T* out, int N) +{ + Comparator comp; + bool correct = true; + + // make map of keys to keys + using val_map = std::list; + std::unordered_map keys; + for (RAJA::Index_type i = 0; i < N; i++) { + auto key_iter = keys.find(in[i]); + if (key_iter == keys.end()) { + auto ret = keys.emplace(in[i], val_map{}); + assert(ret.second); + key_iter = ret.first; + } + key_iter->second.emplace_back(in[i]); + } + + for (RAJA::Index_type i = 0; i < N; i++) { + // test ordering + if (i > 0 && comp(out[i], out[i-1])) { + if (correct) { + std::cout << "\n\t result -- WRONG\n"; + correct = false; + } + std::cout << "\t" + << out[i-1] << ", " << out[i] + << " out of order " + << " (at index " << i-1 << ")\n"; + } + // test there is an item with this + auto key_iter = keys.find(out[i]); + if (key_iter == keys.end()) { + if (correct) { + std::cout << "\n\t result -- WRONG\n"; + correct = false; + } + std::cout << "\t" + << out[i] + << " unknown or duplicate key " + << " (at index " << i << ")\n"; + } + if (key_iter->second.front() != out[i]) { + if (correct) { + std::cout << "\n\t result -- WRONG\n"; + correct = false; + } + std::cout << "\t" + << out[i] + << " out of stable order or unknown val " + << " (at index " << i << ")\n"; + } + key_iter->second.pop_front(); + if (key_iter->second.size() == 0) { + keys.erase(key_iter); + } + } + if (correct) { + std::cout << "\n\t result -- CORRECT\n"; + } +} + +template +void checkStableSortResult(const T* in, const T* out, + const U* in_vals, const U* out_vals, int N) +{ + Comparator comp; + bool correct = true; + + // make map of keys to vals + using val_map = std::list; + std::unordered_map keys_to_vals; + for (RAJA::Index_type i = 0; i < N; i++) { + auto key_iter = keys_to_vals.find(in[i]); + if (key_iter == keys_to_vals.end()) { + auto ret = keys_to_vals.emplace(in[i], val_map{}); + assert(ret.second); + key_iter = ret.first; + } + key_iter->second.emplace_back(in_vals[i]); + } + + for (RAJA::Index_type i = 0; i < N; i++) { + // test ordering + if (i > 0 && comp(out[i], out[i-1])) { + if (correct) { + std::cout << "\n\t result -- WRONG\n"; + correct = false; + } + std::cout << "\t" + << "(" << out[i-1] << "," << out_vals[i-1] << ")," + << " (" << out[i] << "," << out_vals[i] << ")" + << " out of order " + << " (at index " << i-1 << ")\n"; + } + // test there is a pair with this key and val + auto key_iter = keys_to_vals.find(out[i]); + if (key_iter == keys_to_vals.end()) { + if (correct) { + std::cout << "\n\t result -- WRONG\n"; + correct = false; + } + std::cout << "\t" + << "(" << out[i] << "," << out_vals[i] << ")" + << " unknown or duplicate key " + << " (at index " << i << ")\n"; + } + if (key_iter->second.front() != out_vals[i]) { + if (correct) { + std::cout << "\n\t result -- WRONG\n"; + correct = false; + } + std::cout << "\t" + << "(" << out[i] << "," << out_vals[i] << ")" + << " out of stable order or unknown val " + << " (at index " << i << ")\n"; + } + key_iter->second.pop_front(); + if (key_iter->second.size() == 0) { + keys_to_vals.erase(key_iter); + } + } + if (correct) { + std::cout << "\n\t result -- CORRECT\n"; + } +} + + +// +// Function to print vector. 
+// +template +void printArray(const T* k, int N) +{ + std::cout << std::endl; + for (int i = 0; i < N; ++i) { std::cout << " " << k[i]; } + std::cout << std::endl; +} +/// +template +void printArray(const T* k, const U* v, int N) +{ + std::cout << std::endl; + for (int i = 0; i < N; ++i) { std::cout << " (" << k[i] << "," << v[i] << ")"; } + std::cout << std::endl; +} + diff --git a/examples/tut_sort.cpp b/exercises/sort_solution.cpp similarity index 88% rename from examples/tut_sort.cpp rename to exercises/sort_solution.cpp index 18ec192de0..d86cd72b70 100644 --- a/examples/tut_sort.cpp +++ b/exercises/sort_solution.cpp @@ -5,6 +5,14 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +#define OP_GREATER RAJA::operators::greater +#define OP_LESS RAJA::operators::less + +#define CHECK_UNSTABLE_SORT_RESULT(X) checkUnstableSortResult(in, out, N) +#define CHECK_UNSTABLE_SORT_PAIR_RESULT(X) checkUnstableSortResult(in, out, in_vals, out_vals, N) +#define CHECK_STABLE_SORT_RESULT(X) checkStableSortResult(in, out, N) +#define CHECK_STABLE_SORT_PAIR_RESULT(X) checkStableSortResult(in, out, in_vals, out_vals, N) + #include #include #include @@ -20,9 +28,9 @@ #include "RAJA/RAJA.hpp" /* - * Sort Example + * Sort Exercise * - * Example shows how to perform RAJA unstable and stable sort operations + * Exercise demonstrates how to perform RAJA unstable and stable sort operations * for integer arrays, including pairs variant, using different comparators. * Other array data types, comparators, etc. are similar * @@ -36,14 +44,14 @@ */ /* - CUDA_BLOCK_SIZE - specifies the number of threads in a CUDA thread block + Specify the number of threads in a GPU thread block */ #if defined(RAJA_ENABLE_CUDA) -const int CUDA_BLOCK_SIZE = 16; +constexpr int CUDA_BLOCK_SIZE = 16; #endif #if defined(RAJA_ENABLE_HIP) -const int HIP_BLOCK_SIZE = 16; +constexpr int HIP_BLOCK_SIZE = 16; #endif // @@ -76,7 +84,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) // // Define array length // - const int N = 20; + constexpr int N = 20; // // Allocate and initialize vector data @@ -95,14 +103,14 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::fill(in_vals , in_vals + N/2, 0); std::fill(in_vals + N/2, in_vals + N , 1); - // _sort_array_init_end - std::cout << "\n in keys...\n"; printArray(in, N); std::cout << "\n in (key, value) pairs...\n"; printArray(in, in_vals, N); std::cout << "\n"; + // _sort_array_init_end + //----------------------------------------------------------------------------// // Perform various sequential sorts to illustrate unstable/stable, @@ -111,13 +119,14 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::cout << "\n Running sequential sort (default)...\n"; + // _sort_seq_start std::copy_n(in, N, out); - // _sort_seq_start RAJA::sort(RAJA::make_span(out, N)); // _sort_seq_end - checkUnstableSortResult>(in, out, N); + //checkUnstableSortResult>(in, out, N); + CHECK_UNSTABLE_SORT_RESULT(OP_LESS); printArray(out, N); std::cout << "\n"; @@ -132,7 +141,8 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) RAJA::operators::less{}); // _sort_seq_less_end - checkUnstableSortResult>(in, out, N); + //checkUnstableSortResult>(in, out, N); + CHECK_UNSTABLE_SORT_RESULT(OP_LESS); printArray(out, N); std::cout << "\n"; @@ -147,13 +157,14 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) RAJA::operators::less{}); // 
_sort_stable_seq_less_end - checkStableSortResult>(in, out, N); + //checkStableSortResult>(in, out, N); + CHECK_STABLE_SORT_RESULT(OP_LESS); printArray(out, N); std::cout << "\n"; //----------------------------------------------------------------------------// - std::cout << "\n Running sequential stable_sort (non-decreasing)...\n"; + std::cout << "\n Running sequential stable_sort (non-increasing)...\n"; std::copy_n(in, N, out); @@ -162,7 +173,8 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) RAJA::operators::greater{}); // _sort_stable_seq_greater_end - checkStableSortResult>(in, out, N); + //checkStableSortResult>(in, out, N); + CHECK_STABLE_SORT_RESULT(OP_GREATER); printArray(out, N); std::cout << "\n"; @@ -179,7 +191,8 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) RAJA::operators::less{}); // _sort_pairs_seq_less_end - checkUnstableSortResult>(in, out, in_vals, out_vals, N); + //checkUnstableSortResult>(in, out, in_vals, out_vals, N); + CHECK_UNSTABLE_SORT_PAIR_RESULT(OP_LESS); printArray(out, out_vals, N); std::cout << "\n"; @@ -196,7 +209,8 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) RAJA::operators::greater{}); // _sort_stable_pairs_seq_greater_end - checkStableSortResult>(in, out, in_vals, out_vals, N); + //checkStableSortResult>(in, out, in_vals, out_vals, N); + CHECK_STABLE_SORT_PAIR_RESULT(OP_GREATER); printArray(out, out_vals, N); std::cout << "\n"; @@ -216,7 +230,8 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) RAJA::operators::less{}); // _sort_omp_less_end - checkUnstableSortResult>(in, out, N); + //checkUnstableSortResult>(in, out, N); + CHECK_UNSTABLE_SORT_RESULT(OP_LESS); printArray(out, N); std::cout << "\n"; @@ -233,7 +248,8 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) RAJA::operators::greater{}); // _sort_stable_pairs_omp_greater_end - checkStableSortResult>(in, out, in_vals, out_vals, N); + //checkStableSortResult>(in, out, in_vals, out_vals, N); + CHECK_STABLE_SORT_PAIR_RESULT(OP_GREATER); printArray(out, out_vals, N); std::cout << "\n"; @@ -258,7 +274,8 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) RAJA::operators::greater{}); // _sort_pairs_cuda_greater_end - checkUnstableSortResult>(in, out, in_vals, out_vals, N); + //checkUnstableSortResult>(in, out, in_vals, out_vals, N); + CHECK_UNSTABLE_SORT_PAIR_RESULT(OP_GREATER); printArray(out, out_vals, N); std::cout << "\n"; @@ -273,7 +290,8 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) RAJA::operators::less{}); // _sort_stable_cuda_less_end - checkStableSortResult>(in, out, N); + //checkStableSortResult>(in, out, N); + CHECK_STABLE_SORT_RESULT(OP_LESS); printArray(out, N); std::cout << "\n"; @@ -305,7 +323,8 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) hipErrchk(hipMemcpy( out, d_out, N * sizeof(int), hipMemcpyDeviceToHost )); hipErrchk(hipMemcpy( out_vals, d_out_vals, N * sizeof(int), hipMemcpyDeviceToHost )); - checkUnstableSortResult>(in, out, in_vals, out_vals, N); + //checkUnstableSortResult>(in, out, in_vals, out_vals, N); + CHECK_UNSTABLE_SORT_PAIR_RESULT(OP_LESS); printArray(out, out_vals, N); std::cout << "\n"; @@ -317,12 +336,16 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) hipErrchk(hipMemcpy( d_out, out, N * sizeof(int), hipMemcpyHostToDevice )); - RAJA::stable_sort>(RAJA::make_span(d_out, N), - RAJA::operators::greater{}); + // _sort_stable_hip_greater_start + RAJA::stable_sort>( + 
RAJA::make_span(d_out, N), + RAJA::operators::greater{}); + // _sort_stable_hip_greater_end hipErrchk(hipMemcpy( out, d_out, N * sizeof(int), hipMemcpyDeviceToHost )); - checkStableSortResult>(in, out, N); + //checkStableSortResult>(in, out, N); + CHECK_STABLE_SORT_RESULT(OP_GREATER); printArray(out, N); std::cout << "\n"; @@ -331,6 +354,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) #endif + //----------------------------------------------------------------------------// // @@ -419,7 +443,7 @@ void checkUnstableSortResult(const T* in, const T* out, int N) std::cout << "\n\t result -- CORRECT\n"; } } -/// + template void checkUnstableSortResult(const T* in, const T* out, const U* in_vals, const U* out_vals, int N) @@ -551,7 +575,7 @@ void checkStableSortResult(const T* in, const T* out, int N) std::cout << "\n\t result -- CORRECT\n"; } } -/// + template void checkStableSortResult(const T* in, const T* out, const U* in_vals, const U* out_vals, int N) @@ -628,7 +652,7 @@ void printArray(const T* k, int N) for (int i = 0; i < N; ++i) { std::cout << " " << k[i]; } std::cout << std::endl; } -/// + template void printArray(const T* k, const U* v, int N) { diff --git a/exercises/supervisord.conf b/exercises/supervisord.conf new file mode 100644 index 0000000000..f40fe78616 --- /dev/null +++ b/exercises/supervisord.conf @@ -0,0 +1,10 @@ +[supervisord] +nodaemon = true +user = XXX +logfile = /tmp/supervisord.log + +[program:openvscode-server] +environment=HOME="/home/XXX",USER="XXX" +redirect_stderr = true +stdout_logfile = /var/log/openvscode-server.log +command = /opt/archives/openvscode-server-v1.69.1-linux-x64/bin/openvscode-server --without-connection-token --host 0.0.0.0 diff --git a/exercises/tutorial_halfday/CMakeLists.txt b/exercises/tutorial_halfday/CMakeLists.txt index 2c9a0ab86b..7fbaa2437b 100644 --- a/exercises/tutorial_halfday/CMakeLists.txt +++ b/exercises/tutorial_halfday/CMakeLists.txt @@ -5,14 +5,6 @@ # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################### -raja_add_executable( - NAME ex1_vector-addition - SOURCES ex1_vector-addition.cpp) - -raja_add_executable( - NAME ex1_vector-addition_solution - SOURCES ex1_vector-addition_solution.cpp) - raja_add_executable( NAME ex2_approx-pi SOURCES ex2_approx-pi.cpp) @@ -21,22 +13,6 @@ raja_add_executable( NAME ex2_approx-pi_solution SOURCES ex2_approx-pi_solution.cpp) -raja_add_executable( - NAME ex3_colored-indexset - SOURCES ex3_colored-indexset.cpp) - -raja_add_executable( - NAME ex3_colored-indexset_solution - SOURCES ex3_colored-indexset_solution.cpp) - -raja_add_executable( - NAME ex4_atomic-histogram - SOURCES ex4_atomic-histogram.cpp) - -raja_add_executable( - NAME ex4_atomic-histogram_solution - SOURCES ex4_atomic-histogram_solution.cpp) - raja_add_executable( NAME ex5_line-of-sight SOURCES ex5_line-of-sight.cpp) @@ -53,14 +29,6 @@ raja_add_executable( NAME ex6_stencil-offset-layout_solution SOURCES ex6_stencil-offset-layout_solution.cpp) -raja_add_executable( - NAME ex7_nested-loop-reorder - SOURCES ex7_nested-loop-reorder.cpp) - -raja_add_executable( - NAME ex7_nested-loop-reorder_solution - SOURCES ex7_nested-loop-reorder_solution.cpp) - raja_add_executable( NAME ex8_tiled-matrix-transpose SOURCES ex8_tiled-matrix-transpose.cpp) diff --git a/exercises/tutorial_halfday/ex7_nested-loop-reorder.cpp b/exercises/tutorial_halfday/ex7_nested-loop-reorder.cpp deleted file mode 100644 index 2c33f46344..0000000000 --- 
a/exercises/tutorial_halfday/ex7_nested-loop-reorder.cpp +++ /dev/null @@ -1,158 +0,0 @@ -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2016-22, Lawrence Livermore National Security, LLC -// and RAJA project contributors. See the RAJA/LICENSE file for details. -// -// SPDX-License-Identifier: (BSD-3-Clause) -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - -#include -#include - -#include "RAJA/RAJA.hpp" - -/* - * EXERCISE #6: Nested Loop Reordering - * - * In this exercise, you will use RAJA::kernel execution policies - * to permute the order of loops in a triple loop nest. In particular, - * you will reorder loop statements in execution policies. The exercise - * does no actual computation and just prints out the loop indices to show - * the different orderings. - * - * To avoid the complexity of interpreting parallel output, the execution - * policies you will write will use sequential execution. - * - * RAJA features shown: - * - Index range segment - * - 'RAJA::kernel' loop abstractions and execution policies - * - Nested loop reordering - * - Strongly-typed loop indices - */ - -// -// Define three named loop index types used in the triply-nested loops. -// These will trigger compilation errors if lambda index argument ordering -// and types do not match the typed range index ordering. See final -// example in this file. -// -RAJA_INDEX_VALUE(KIDX, "KIDX"); -RAJA_INDEX_VALUE(JIDX, "JIDX"); -RAJA_INDEX_VALUE(IIDX, "IIDX"); - - -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) -{ - - std::cout << "\n\nExercise #7: RAJA nested loop reorder example...\n"; - - std::cout << "\n Running C-style loop nest with loop ordering: K-outer, J-middle, I-inner" - << "...\n\n" << " (I, J, K)\n" << " ---------\n"; - - for (int k = 2; k < 4; ++k) { - for (int j = 1; j < 3; ++j) { - for (int i = 0; i < 2; ++i) { - printf( " (%d, %d, %d) \n", i, j, k); - } - } - } - -// -// The RAJA variants of the loop nest used following typed range segments -// based on the typed indices defined above, outside of main(). -// - RAJA::TypedRangeSegment KRange(2, 4); - RAJA::TypedRangeSegment JRange(1, 3); - RAJA::TypedRangeSegment IRange(0, 2); - -//----------------------------------------------------------------------------// - - std::cout << "\n Running RAJA nested loop example (K-outer, J-middle, I-inner)" - << "...\n\n" << " (I, J, K)\n" << " ---------\n"; - - using KJI_EXECPOL = RAJA::KernelPolicy< - RAJA::statement::For<2, RAJA::seq_exec, // k - RAJA::statement::For<1, RAJA::seq_exec, // j - RAJA::statement::For<0, RAJA::seq_exec,// i - RAJA::statement::Lambda<0> - > - > - > - >; - - RAJA::kernel( RAJA::make_tuple(IRange, JRange, KRange), - [=] (IIDX i, JIDX j, KIDX k) { - printf( " (%d, %d, %d) \n", (int)(*i), (int)(*j), (int)(*k)); - }); - - -//----------------------------------------------------------------------------// - - std::cout << "\n Running RAJA nested loop example (J-outer, I-middle, K-inner)" - << "...\n\n" << " (I, J, K)\n" << " ---------\n"; - - /// - /// TODO... - /// - /// EXERCISE: Define an execution policy (JIK_EXECPOL) that reorders the - /// loop nest so that the outer loop is the j-loop (slowest - /// running index), the inner loop is the k-loop (fastest - /// running index), and the i-loop is the middle loop. - /// - /// NOTE: You will have to enable this code section to compile and run it. 
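For the JIK exercise described above, one policy that produces j-outer, i-middle, k-inner ordering is sketched below; it only permutes the For statements of the KJI policy shown earlier in this file and is offered as a possible answer, not the shipped solution.

  // Possible JIK_EXECPOL sketch: outer loop over j (tuple index 1),
  // middle loop over i (index 0), inner loop over k (index 2).
  using JIK_EXECPOL = RAJA::KernelPolicy<
    RAJA::statement::For<1, RAJA::seq_exec,     // j
      RAJA::statement::For<0, RAJA::seq_exec,   // i
        RAJA::statement::For<2, RAJA::seq_exec, // k
          RAJA::statement::Lambda<0>
        >
      >
    >
  >;

  RAJA::kernel<JIK_EXECPOL>( RAJA::make_tuple(IRange, JRange, KRange),
    [=] (IIDX i, JIDX j, KIDX k) {
      printf( " (%d, %d, %d) \n", (int)(*i), (int)(*j), (int)(*k));
  });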
- /// -#if 0 - using JIK_EXECPOL = - - RAJA::kernel( RAJA::make_tuple(IRange, JRange, KRange), - [=] (IIDX i, JIDX j, KIDX k) { - printf( " (%d, %d, %d) \n", (int)(*i), (int)(*j), (int)(*k)); - }); -#endif - - -//----------------------------------------------------------------------------// - - std::cout << "\n Running RAJA nested loop example (I-outer, K-middle, J-inner)" - << "...\n\n" << " (I, J, K)\n" << " ---------\n"; - - /// - /// TODO... - /// - /// EXERCISE: Define an execution policy (IKJ_EXECPOL) that reorders the - /// loop nest so that the outer loop is the i-loop (slowest - /// running index), the inner loop is the j-loop (fastest - /// running index), and the k-loop is the middle loop. - /// - /// NOTE: You will have to enable this code section to compile and run it. - /// - -#if 0 - using IKJ_EXECPOL = - - RAJA::kernel( RAJA::make_tuple(IRange, JRange, KRange), - [=] (IIDX i, JIDX j, KIDX k) { - printf( " (%d, %d, %d) \n", (int)(*i), (int)(*j), (int)(*k)); - }); -#endif - - -#if 0 -//----------------------------------------------------------------------------// -// The following demonstrates that code will not compile if lambda argument -// types/order do not match the types/order For statements in the execution -// policy. To see this, enable this code section and try to compile this file. -//----------------------------------------------------------------------------// - - RAJA::kernel( RAJA::make_tuple(IRange, JRange, KRange), - [=] (JIDX i, IIDX j, KIDX k) { - printf( " (%d, %d, %d) \n", (int)(*i), (int)(*j), (int)(*k)); - }); - -#endif - - std::cout << "\n DONE!...\n"; - - return 0; -} - diff --git a/exercises/user-data.sh b/exercises/user-data.sh new file mode 100644 index 0000000000..dd557ae116 --- /dev/null +++ b/exercises/user-data.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +yum update -y +amazon-linux-extras install docker +systemctl start docker +systemctl enable docker + +wget https://raw.githubusercontent.com/LLNL/RAJA/task/tut-reorg-aws/exercises/Dockerfile +wget https://raw.githubusercontent.com/LLNL/RAJA/task/tut-reorg-aws/exercises/supervisord.conf + +env DOCKER_BUILDKIT=1 docker build . -t raja-aws-tut +docker run --init --gpus all -p 3000:3000 raja-aws-tut diff --git a/exercises/tutorial_halfday/ex1_vector-addition.cpp b/exercises/vector-addition.cpp similarity index 59% rename from exercises/tutorial_halfday/ex1_vector-addition.cpp rename to exercises/vector-addition.cpp index ae6a85c403..89b6e45fc4 100644 --- a/exercises/tutorial_halfday/ex1_vector-addition.cpp +++ b/exercises/vector-addition.cpp @@ -14,7 +14,7 @@ #include "memoryManager.hpp" /* - * EXERCISE #1: Vector Addition + * Vector Addition Exercise * * In this exercise, you will compute c = a + b, where a, b, c are * integer vectors. @@ -36,14 +36,19 @@ */ /* - CUDA_BLOCK_SIZE - specifies the number of threads in a CUDA thread block + Specify the number of threads in a GPU thread block +*/ +#if defined(RAJA_ENABLE_CUDA) +//constexpr int CUDA_BLOCK_SIZE = 256; +#endif - Uncomment to use when filling in exercises. 
+#if defined(RAJA_ENABLE_HIP) +//constexpr int HIP_BLOCK_SIZE = 256; +#endif -#if defined(RAJA_ENABLE_CUDA) -const int CUDA_BLOCK_SIZE = 256; +#if defined(RAJA_ENABLE_SYCL) +//constexpr int SYCL_BLOCK_SIZE = 256; #endif -*/ // // Functions for checking and printing arrays @@ -55,12 +60,17 @@ void printArray(int* v, int len); int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) { - std::cout << "\n\nExercise #1: RAJA Vector Addition...\n"; + std::cout << "\n\nExercise: RAJA Vector Addition...\n"; + +#if defined(RAJA_ENABLE_SYCL) + memoryManager::sycl_res = new camp::resources::Resource{camp::resources::Sycl()}; + ::RAJA::sycl::detail::setQueue(memoryManager::sycl_res); +#endif // // Define vector length // - const int N = 1000000; + constexpr int N = 1000000; // // Allocate and initialize vector data to random numbers in [1, 10]. @@ -84,9 +94,11 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::cout << "\n Running C-style sequential vector addition...\n"; + // _cstyle_vector_add_start for (int i = 0; i < N; ++i) { c_ref[i] = a[i] + b[i]; } + // _cstyle_vector_add_end //printArray(c_ref, N); @@ -108,11 +120,11 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// NOTE: We've done this one for you to help you get started... /// - using EXEC_POL1 = RAJA::seq_exec; - - RAJA::forall< EXEC_POL1 >(RAJA::RangeSegment(0, N), [=] (int i) { - c[i] = a[i] + b[i]; + // _rajaseq_vector_add_start + RAJA::forall(RAJA::TypedRangeSegment(0, N), [=] (int i) { + c[i] = a[i] + b[i]; }); + // _rajaseq_vector_add_end checkResult(c, c_ref, N); //printArray(c, N); @@ -212,15 +224,120 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::cout << "\n Running RAJA CUDA vector addition...\n"; + int *d_a = memoryManager::allocate_gpu(N); + int *d_b = memoryManager::allocate_gpu(N); + int *d_c = memoryManager::allocate_gpu(N); + + cudaErrchk(cudaMemcpy( d_a, a, N * sizeof(int), cudaMemcpyHostToDevice )); + cudaErrchk(cudaMemcpy( d_b, b, N * sizeof(int), cudaMemcpyHostToDevice )); + /// /// TODO... /// /// EXERCISE: Implement the vector addition kernel using a RAJA::forall /// method and RAJA::cuda_exec execution policy type. /// + /// NOTE: You will have to uncomment 'CUDA_BLOCK_SIZE' near the + /// top of the file if you want to use it here. + /// + + cudaErrchk(cudaMemcpy( c, d_c, N * sizeof(int), cudaMemcpyDeviceToHost )); checkResult(c, c_ref, N); //printArray(c, N); + +//----------------------------------------------------------------------------// +// RAJA::cuda_exec policy runs the loop as a CUDA kernel asynchronously on a +// GPU device with 2 blocks per SM. +//----------------------------------------------------------------------------// + + std::memset(c, 0, N * sizeof(int)); + + std::cout << "\n Running RAJA CUDA explicit (2 blocks per SM) vector addition...\n"; + + /// + /// TODO... + /// + /// EXERCISE: Implement the vector addition kernel using a RAJA::forall + /// method and RAJA::cuda_exec execution policy type with + /// arguments defining 2 blocks per SM and asynchronous execution. + /// + /// NOTE: You will have to uncomment 'CUDA_BLOCK_SIZE' near the + /// top of the file if you want to use it here. + /// + + cudaErrchk(cudaMemcpy( c, d_c, N * sizeof(int), cudaMemcpyDeviceToHost )); + + checkResult(c, c_ref, N); +//printResult(c, N); +#endif + +//----------------------------------------------------------------------------// +// RAJA::hip_exec policy runs the loop as a HIP kernel on a GPU device. 
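One way the CUDA TODO above could be filled in, assuming the commented-out CUDA_BLOCK_SIZE constant has been re-enabled; the solution file later in this diff follows the same pattern.

  // Sketch: RAJA CUDA vector addition over the device copies d_a, d_b, d_c.
  RAJA::forall< RAJA::cuda_exec<CUDA_BLOCK_SIZE> >(
    RAJA::TypedRangeSegment<int>(0, N), [=] RAJA_DEVICE (int i) {
      d_c[i] = d_a[i] + d_b[i];
  });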
+//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_HIP) + std::cout << "\n Running RAJA HIP vector addition...\n"; + + int *d_a = memoryManager::allocate_gpu(N); + int *d_b = memoryManager::allocate_gpu(N); + int *d_c = memoryManager::allocate_gpu(N); + + hipErrchk(hipMemcpy( d_a, a, N * sizeof(int), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy( d_b, b, N * sizeof(int), hipMemcpyHostToDevice )); + + /// + /// TODO... + /// + /// EXERCISE: Implement the vector addition kernel using a RAJA::forall + /// method and RAJA::hip_exec execution policy type. + /// + /// NOTE: You will have to uncomment 'HIP_BLOCK_SIZE' near the + /// top of the file if you want to use it here. + /// + + hipErrchk(hipMemcpy( c, d_c, N * sizeof(int), hipMemcpyDeviceToHost )); + + checkResult(c, c_ref, N); +//printResult(c, N); + + memoryManager::deallocate_gpu(d_a); + memoryManager::deallocate_gpu(d_b); + memoryManager::deallocate_gpu(d_c); +#endif + +//----------------------------------------------------------------------------// +// RAJA::sycl_exec policy runs the loop as a SYCL kernel. +//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_SYCL) + std::cout << "\n Running RAJA SYCL vector addition...\n"; + + int *d_a = memoryManager::allocate_gpu(N); + int *d_b = memoryManager::allocate_gpu(N); + int *d_c = memoryManager::allocate_gpu(N); + + memoryManager::sycl_res->memcpy(d_a, a, N * sizeof(int)); + memoryManager::sycl_res->memcpy(d_b, b, N * sizeof(int)); + + /// + /// TODO... + /// + /// EXERCISE: Implement the vector addition kernel using a RAJA::forall + /// method and RAJA::hip_exec execution policy type. + /// + /// NOTE: You will have to uncomment 'SYCL_BLOCK_SIZE' near the + /// top of the file if you want to use it here. + /// + + memoryManager::sycl_res->memcpy(c, d_c, N * sizeof(int)); + + checkResult(c, c_ref, N); +//printResult(c, N); + + memoryManager::deallocate_gpu(d_a); + memoryManager::deallocate_gpu(d_b); + memoryManager::deallocate_gpu(d_c); #endif //----------------------------------------------------------------------------// diff --git a/exercises/tutorial_halfday/ex1_vector-addition_solution.cpp b/exercises/vector-addition_solution.cpp similarity index 54% rename from exercises/tutorial_halfday/ex1_vector-addition_solution.cpp rename to exercises/vector-addition_solution.cpp index d02c2cb26f..31bf643488 100644 --- a/exercises/tutorial_halfday/ex1_vector-addition_solution.cpp +++ b/exercises/vector-addition_solution.cpp @@ -14,7 +14,7 @@ #include "memoryManager.hpp" /* - * EXERCISE #1: Vector Addition + * Vector Addition Exercise * * In this exercise, you will compute c = a + b, where a, b, c are * integer vectors. 
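The HIP and SYCL TODOs above follow the same shape. Note that the comment in the SYCL block mentions RAJA::hip_exec; given the SYCL_BLOCK_SIZE hint and the solution file below, the intended policy is presumably RAJA::sycl_exec. A sketch, assuming the corresponding block-size constants are uncommented:

  // Sketch: HIP variant of the vector-addition kernel.
  RAJA::forall< RAJA::hip_exec<HIP_BLOCK_SIZE> >(
    RAJA::TypedRangeSegment<int>(0, N), [=] RAJA_DEVICE (int i) {
      d_c[i] = d_a[i] + d_b[i];
  });

  // Sketch: SYCL variant (sycl_exec, despite the hip_exec wording in the
  // exercise comment).
  RAJA::forall< RAJA::sycl_exec<SYCL_BLOCK_SIZE> >(
    RAJA::TypedRangeSegment<int>(0, N), [=] RAJA_DEVICE (int i) {
      d_c[i] = d_a[i] + d_b[i];
  });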
@@ -36,10 +36,18 @@ */ /* - CUDA_BLOCK_SIZE - specifies the number of threads in a CUDA thread block + Specify the number of threads in a GPU thread block */ #if defined(RAJA_ENABLE_CUDA) -const int CUDA_BLOCK_SIZE = 256; +constexpr int CUDA_BLOCK_SIZE = 256; +#endif + +#if defined(RAJA_ENABLE_HIP) +constexpr int HIP_BLOCK_SIZE = 256; +#endif + +#if defined(RAJA_ENABLE_SYCL) +constexpr int SYCL_BLOCK_SIZE = 256; #endif // @@ -52,12 +60,17 @@ void printArray(int* v, int len); int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) { - std::cout << "\n\nExercise #1: RAJA Vector Addition...\n"; + std::cout << "\n\nExercise: RAJA Vector Addition...\n"; + +#if defined(RAJA_ENABLE_SYCL) + memoryManager::sycl_res = new camp::resources::Resource{camp::resources::Sycl()}; + ::RAJA::sycl::detail::setQueue(memoryManager::sycl_res); +#endif // // Define vector length // - const int N = 1000000; + constexpr int N = 1000000; // // Allocate and initialize vector data to random numbers in [1, 10]. @@ -81,9 +94,11 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::cout << "\n Running C-style sequential vector addition...\n"; + // _cstyle_vector_add_start for (int i = 0; i < N; ++i) { c_ref[i] = a[i] + b[i]; } + // _cstyle_vector_add_end //printArray(c_ref, N); @@ -96,11 +111,13 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::cout << "\n Running RAJA sequential vector addition...\n"; - using EXEC_POL1 = RAJA::seq_exec; - - RAJA::forall< EXEC_POL1 >(RAJA::RangeSegment(0, N), [=] (int i) { - c[i] = a[i] + b[i]; - }); + // _rajaseq_vector_add_start + RAJA::forall< RAJA::seq_exec >( + RAJA::TypedRangeSegment(0, N), [=] (int i) { + c[i] = a[i] + b[i]; + } + ); + // _rajaseq_vector_add_end checkResult(c, c_ref, N); //printArray(c, N); @@ -115,11 +132,11 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::cout << "\n Running RAJA SIMD vector addition...\n"; - using EXEC_POL2 = RAJA::simd_exec; - - RAJA::forall< EXEC_POL2 >(RAJA::RangeSegment(0, N), [=] (int i) { - c[i] = a[i] + b[i]; - }); + RAJA::forall( + RAJA::TypedRangeSegment(0, N), [=] (int i) { + c[i] = a[i] + b[i]; + } + ); checkResult(c, c_ref, N); //printArray(c, N); @@ -134,11 +151,11 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::cout << "\n Running RAJA loop-exec vector addition...\n"; - using EXEC_POL3 = RAJA::loop_exec; - - RAJA::forall< EXEC_POL3 >(RAJA::RangeSegment(0, N), [=] (int i) { - c[i] = a[i] + b[i]; - }); + RAJA::forall< RAJA::loop_exec >( + RAJA::TypedRangeSegment(0, N), [=] (int i) { + c[i] = a[i] + b[i]; + } + ); checkResult(c, c_ref, N); //printArray(c, N); @@ -176,11 +193,13 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::cout << "\n Running RAJA OpenMP multithreaded vector addition...\n"; - using EXEC_POL4 = RAJA::omp_parallel_for_exec; - - RAJA::forall< EXEC_POL4 >(RAJA::RangeSegment(0, N), [=] (int i) { - c[i] = a[i] + b[i]; - }); + // _rajaomp_vector_add_start + RAJA::forall< RAJA::omp_parallel_for_exec >( + RAJA::TypedRangeSegment(0, N), [=] (int i) { + c[i] = a[i] + b[i]; + } + ); + // _rajaomp_vector_add_end checkResult(c, c_ref, N); //printArray(c, N); @@ -197,14 +216,109 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::cout << "\n Running RAJA CUDA vector addition...\n"; - using EXEC_POL5 = RAJA::cuda_exec; + int *d_a = memoryManager::allocate_gpu(N); + int *d_b = memoryManager::allocate_gpu(N); + int *d_c = memoryManager::allocate_gpu(N); - RAJA::forall< 
EXEC_POL5 >(RAJA::RangeSegment(0, N), [=] RAJA_DEVICE (int i) { - c[i] = a[i] + b[i]; + cudaErrchk(cudaMemcpy( d_a, a, N * sizeof(int), cudaMemcpyHostToDevice )); + cudaErrchk(cudaMemcpy( d_b, b, N * sizeof(int), cudaMemcpyHostToDevice )); + + // _rajacuda_vector_add_start + RAJA::forall< RAJA::cuda_exec >(RAJA::TypedRangeSegment(0, N), + [=] RAJA_DEVICE (int i) { + d_c[i] = d_a[i] + d_b[i]; }); + // _rajacuda_vector_add_end + + cudaErrchk(cudaMemcpy( c, d_c, N * sizeof(int), cudaMemcpyDeviceToHost )); checkResult(c, c_ref, N); //printArray(c, N); + +//----------------------------------------------------------------------------// +// RAJA::cuda_exec policy runs the loop as a CUDA kernel asynchronously on a +// GPU device with 2 blocks per SM. +//----------------------------------------------------------------------------// + + std::memset(c, 0, N * sizeof(int)); + + std::cout << "\n Running RAJA CUDA explicit (2 blocks per SM) vector addition...\n"; + + // _rajacuda_explicit_vector_add_start + const bool Asynchronous = true; + + RAJA::forall>(RAJA::TypedRangeSegment(0, N), + [=] RAJA_DEVICE (int i) { + d_c[i] = d_a[i] + d_b[i]; + }); + // _rajacuda_explicit_vector_add_end + + cudaErrchk(cudaMemcpy( c, d_c, N * sizeof(int), cudaMemcpyDeviceToHost )); + + checkResult(c, c_ref, N); +//printResult(c, N); +#endif + +//----------------------------------------------------------------------------// +// RAJA::hip_exec policy runs the loop as a HIP kernel on a GPU device. +//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_HIP) + std::cout << "\n Running RAJA HIP vector addition...\n"; + + int *d_a = memoryManager::allocate_gpu(N); + int *d_b = memoryManager::allocate_gpu(N); + int *d_c = memoryManager::allocate_gpu(N); + + hipErrchk(hipMemcpy( d_a, a, N * sizeof(int), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy( d_b, b, N * sizeof(int), hipMemcpyHostToDevice )); + + // _rajahip_vector_add_start + RAJA::forall>(RAJA::TypedRangeSegment(0, N), + [=] RAJA_DEVICE (int i) { + d_c[i] = d_a[i] + d_b[i]; + }); + // _rajahip_vector_add_end + + hipErrchk(hipMemcpy( c, d_c, N * sizeof(int), hipMemcpyDeviceToHost )); + + checkResult(c, c_ref, N); +//printResult(c, N); + + memoryManager::deallocate_gpu(d_a); + memoryManager::deallocate_gpu(d_b); + memoryManager::deallocate_gpu(d_c); +#endif + +//----------------------------------------------------------------------------// +// RAJA::sycl_exec policy runs the loop as a SYCL kernel. 
+//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_SYCL) + std::cout << "\n Running RAJA SYCL vector addition...\n"; + + int *d_a = memoryManager::allocate_gpu(N); + int *d_b = memoryManager::allocate_gpu(N); + int *d_c = memoryManager::allocate_gpu(N); + + memoryManager::sycl_res->memcpy(d_a, a, N * sizeof(int)); + memoryManager::sycl_res->memcpy(d_b, b, N * sizeof(int)); + + // _rajasycl_vector_add_start + RAJA::forall>(RAJA::TypedRangeSegment(0, N), + [=] RAJA_DEVICE (int i) { + d_c[i] = d_a[i] + d_b[i]; + }); + // _rajasycl_vector_add_end + + memoryManager::sycl_res->memcpy(c, d_c, N * sizeof(int)); + + checkResult(c, c_ref, N); +//printResult(c, N); + + memoryManager::deallocate_gpu(d_a); + memoryManager::deallocate_gpu(d_b); + memoryManager::deallocate_gpu(d_c); #endif //----------------------------------------------------------------------------// diff --git a/exercises/tutorial_halfday/ex3_colored-indexset.cpp b/exercises/vertexsum-indexset.cpp similarity index 51% rename from exercises/tutorial_halfday/ex3_colored-indexset.cpp rename to exercises/vertexsum-indexset.cpp index f42d047648..60709ddee9 100644 --- a/exercises/tutorial_halfday/ex3_colored-indexset.cpp +++ b/exercises/vertexsum-indexset.cpp @@ -18,10 +18,10 @@ #include "memoryManager.hpp" /* - * EXERCISE #3: Mesh vertex area with "colored" TypedIndexSet + * Mesh vertex area exercise * * In this exercise, you will use a RAJA TypedIndexSet containing 4 - * ListSegments to parallelize the mesh vertex area computation. + * TypedListSegments to parallelize the mesh vertex area computation. * A sum is computed at each vertex on a logically-Cartesian 2D mesh * where the sum represents the vertex "area" as an average of the 4 * element areas surrounding the vertex. The computation is written as @@ -29,7 +29,7 @@ * contributions may be written to the same vertex value at the same time, * the elements are partitioned into 4 subsets, where no two elements in * each subset share a vertex. A ListSegment enumerates the elements in - * each subset. When the ListSegments are put into an TypedIndexSet, the entire + * each subset. When the ListSegments are put into an IndexSet, the entire * computation can be executed with one RAJA::forall() statement, where * you iterate over the segments sequentially and execute each segment in * parallel. This exercise illustrates how RAJA can be used to enable one @@ -43,22 +43,23 @@ * * RAJA features you will use: * - `forall` loop iteration template method - * - Index list segment - * - TypedIndexSet segment container + * - List segment + * - IndexSet segment container * - Hierarchical execution policies * * If CUDA is enabled, CUDA unified memory is used. */ /* - CUDA_BLOCK_SIZE - specifies the number of threads in a CUDA thread block - - Uncomment to use when filling in exercises. - + Specify the number of threads in a GPU thread block +*/ #if defined(RAJA_ENABLE_CUDA) -const int CUDA_BLOCK_SIZE = 256; +constexpr int CUDA_BLOCK_SIZE = 256; +#endif + +#if defined(RAJA_ENABLE_HIP) +constexpr int HIP_BLOCK_SIZE = 256; #endif -*/ // // Functions to check and print result. @@ -70,47 +71,50 @@ void printMeshData(double* v, int n, int joff); int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) { - std::cout << "\n\nExercise #3: Mesh vertex area with 'colored' TypedIndexSet...\n"; + std::cout << "\n\nExercise: Mesh vertex area with 'colored' IndexSet...\n"; +// _vertexsum_define_start // // 2D mesh has N^2 elements (N+1)^2 vertices. 
// - const int N = 1000; - const int Nelem = N; - const int Nvert = N + 1; - double* areae = memoryManager::allocate(Nelem*Nelem); - double* areav = memoryManager::allocate(Nvert*Nvert); - double* areav_ref = memoryManager::allocate(Nvert*Nvert); - int* e2v_map = memoryManager::allocate(4*Nelem*Nelem); - + constexpr int N = 1000; + constexpr int Nelem = N; + constexpr int Nelem_tot = Nelem * Nelem; + constexpr int Nvert = N + 1; + constexpr int Nvert_tot = Nvert * Nvert; +// _vertexsum_define_end + double* areae = memoryManager::allocate(Nelem_tot); + double* areav = memoryManager::allocate(Nvert_tot); + double* areav_ref = memoryManager::allocate(Nvert_tot); + int* e2v_map = memoryManager::allocate(4*Nelem_tot); + +// _vertexsum_elemarea_start // // Define mesh spacing factor 'h' and set up elem to vertex mapping array. // - double h = 0.1; - - for (int j = 0 ; j < Nelem ; ++j) { - for (int i = 0 ; i < Nelem ; ++i) { - int ielem = i + j*Nelem ; - int imap = 4 * ielem ; - e2v_map[imap] = ielem + j; - e2v_map[imap+1] = ielem + j + 1; - e2v_map[imap+2] = ielem + j + Nvert; - e2v_map[imap+3] = ielem + j + 1 + Nvert; - } + constexpr double h = 0.1; + + for (int ie = 0; ie < Nelem_tot; ++ie) { + int j = ie / Nelem; + int imap = 4 * ie ; + e2v_map[imap] = ie + j; + e2v_map[imap+1] = ie + j + 1; + e2v_map[imap+2] = ie + j + Nvert; + e2v_map[imap+3] = ie + j + 1 + Nvert; } // // Initialize element areas so each element area // depends on the i,j coordinates of the element. // - std::memset(areae, 0, Nelem*Nelem * sizeof(double)); + std::memset(areae, 0, Nelem_tot * sizeof(double)); - for (int j = 0 ; j < Nelem ; ++j) { - for (int i = 0 ; i < Nelem ; ++i) { - int ielem = i + j*Nelem ; - areae[ielem] = h*(i+1) * h*(j+1); - } + for (int ie = 0; ie < Nelem_tot; ++ie) { + int i = ie % Nelem; + int j = ie / Nelem; + areae[ie] = h*(i+1) * h*(j+1); } +// _vertexsum_elemarea_end //std::cout << "\n Element areas...\n"; //printMeshData(areae, Nelem, Nelem); @@ -121,15 +125,17 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::cout << "\n Running sequential C-style version of vertex sum...\n"; - std::memset(areav_ref, 0, Nvert*Nvert * sizeof(double)); +// _cstyle_vertexarea_seq_start + std::memset(areav_ref, 0, Nvert_tot * sizeof(double)); - for (int ie = 0; ie < Nelem*Nelem; ++ie) { - int* iv = &(e2v_map[4*ie]); - areav_ref[ iv[0] ] += areae[ie] / 4.0 ; - areav_ref[ iv[1] ] += areae[ie] / 4.0 ; - areav_ref[ iv[2] ] += areae[ie] / 4.0 ; - areav_ref[ iv[3] ] += areae[ie] / 4.0 ; + for (int ie = 0; ie < Nelem_tot; ++ie) { + int* iv = &(e2v_map[4*ie]); + areav_ref[ iv[0] ] += areae[ie] / 4.0 ; + areav_ref[ iv[1] ] += areae[ie] / 4.0 ; + areav_ref[ iv[2] ] += areae[ie] / 4.0 ; + areav_ref[ iv[3] ] += areae[ie] / 4.0 ; } +// _cstyle_vertexarea_seq_end //std::cout << "\n Vertex areas (reference)...\n"; //printMeshData(areav_ref, Nvert, jvoff); @@ -153,33 +159,34 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // Since none of the elements with the same number share a common vertex, // we can iterate over each subset ("color") in parallel. // -// We use RAJA ListSegments and a RAJA TypedIndexSet to define the element +// We use RAJA ListSegments and a RAJA IndexSet to define the element // partitioning. // +// _vertexarea_color_start // // Gather the element indices for each color in a vector. 
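The flattened element-to-vertex formula introduced earlier in this file (ie + j, ie + j + 1, ie + j + Nvert, ie + j + 1 + Nvert) is equivalent to the usual (i, j) vertex indexing because ie = i + j*Nelem and Nvert = Nelem + 1, so ie + j = i + j*Nvert. A small standalone check of that identity follows; the helper name checkE2VFormula is ours, not part of the exercise.

// Sketch: verify the four corner-vertex indices of element ie on an
// Nelem x Nelem mesh with (Nelem+1) x (Nelem+1) vertices.
#include <cassert>

void checkE2VFormula(int Nelem)
{
  const int Nvert = Nelem + 1;
  for (int ie = 0; ie < Nelem * Nelem; ++ie) {
    const int i = ie % Nelem;
    const int j = ie / Nelem;
    assert(ie + j             == i     + j     * Nvert);   // lower-left
    assert(ie + j + 1         == (i+1) + j     * Nvert);   // lower-right
    assert(ie + j + Nvert     == i     + (j+1) * Nvert);   // upper-left
    assert(ie + j + 1 + Nvert == (i+1) + (j+1) * Nvert);   // upper-right
  }
}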
// std::vector< std::vector > idx(4); - for (int j = 0 ; j < Nelem ; ++j) { - for (int i = 0 ; i < Nelem ; ++i) { - int ie = i + j*Nelem ; - if ( i % 2 == 0 ) { - if ( j % 2 == 0 ) { - idx[0].push_back(ie); - } else { - idx[2].push_back(ie); - } + for (int ie = 0; ie < Nelem_tot; ++ie) { + int i = ie % Nelem; + int j = ie / Nelem; + if ( i % 2 == 0 ) { + if ( j % 2 == 0 ) { + idx[0].push_back(ie); + } else { + idx[2].push_back(ie); + } + } else { + if ( j % 2 == 0 ) { + idx[1].push_back(ie); } else { - if ( j % 2 == 0 ) { - idx[1].push_back(ie); - } else { - idx[3].push_back(ie); - } + idx[3].push_back(ie); } } } +// _vertexarea_color_end //----------------------------------------------------------------------------// @@ -191,7 +198,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::cout << "\n Running C-style OpenMP vertex sum...\n"; - std::memset(areav, 0, Nvert*Nvert * sizeof(double)); + +// _cstyle_vertexarea_omp_start + std::memset(areav, 0, Nvert_tot * sizeof(double)); for (int icol = 0; icol < 4; ++icol) { const std::vector& ievec = idx[icol]; @@ -208,6 +217,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } } +// _cstyle_vertexarea_omp_end checkResult(areav, areav_ref, Nvert); //std::cout << "\n Vertex areas (reference)...\n"; @@ -216,13 +226,13 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) #endif -// The TypedIndexSet is a variadic template, where the template arguments -// are the segment types that the TypedIndexSet can hold. -// -#if defined(RAJA_ENABLE_OPENMP) || defined(RAJA_ENABLE_CUDA) -#if 0 // needed for exercises, but if-def'd out to quiet compiler warnings. +// The IndexSet is a variadic template, where the template arguments +// are the segment types that the IndexSet can hold. +// +#if defined(RAJA_ENABLE_OPENMP) || defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) +// _vertexarea_listsegtype_start using SegmentType = RAJA::TypedListSegment; -#endif +// _vertexarea_listsegtype_end #endif #if defined(RAJA_ENABLE_OPENMP) @@ -233,25 +243,25 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // camp::resources::Resource host_res{camp::resources::Host()}; -// -// Create a RAJA TypedIndexSet with four ListSegments, one for the indices of -// the elements in each subsut. This will be used in the RAJA OpenMP and CUDA +// +// Create a RAJA IndexSet with four ListSegments, one for the indices of +// the elements in each subsut. This will be used in the RAJA OpenMP and CUDA // variants of the vertex sum calculation. -#if 0 // needed for exercises, but if-def'd out to quiet compiler warnings. + RAJA::TypedIndexSet colorset; -#endif + + colorset.push_back( SegmentType(&idx[0][0], idx[0].size(), host_res) ); /// /// TODO... /// - /// EXERCISE: Add four SegmentType objects to the coloret, one for each of - /// the 'idx' arrays above. Remember to pass the 'host_res' - /// object to the SegmentType constructor. + /// EXERCISE: Add the three list segments to the index set to account + /// for all mesh elements. Then, run the OpenMP kernel variant + /// below to check if it's correct. 
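For reference, the remaining segments asked for in the TODO above follow the same pattern as the idx[0] segment already added; this is the form used by the solution file later in this diff.

  colorset.push_back( SegmentType(&idx[1][0], idx[1].size(), host_res) );
  colorset.push_back( SegmentType(&idx[2][0], idx[2].size(), host_res) );
  colorset.push_back( SegmentType(&idx[3][0], idx[3].size(), host_res) );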
/// - //----------------------------------------------------------------------------// -// RAJA OpenMP vertex sum calculation using TypedIndexSet (sequential iteration +// RAJA OpenMP vertex sum calculation using IndexSet (sequential iteration // over segments, OpenMP parallel iteration of each segment) //----------------------------------------------------------------------------// @@ -259,19 +269,18 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::memset(areav, 0, Nvert*Nvert * sizeof(double)); - /// - /// TODO... - /// - /// EXERCISE: Implement the vertex sum kernel a RAJA::forall - /// method with execution policy type - /// - /// RAJA::ExecPolicy - /// - /// so that the kernel iterates over the segments sequentially - /// and executes each segment in parallel using OpenMP. - /// +// _raja_vertexarea_omp_start + using EXEC_POL1 = RAJA::ExecPolicy; + RAJA::forall(colorset, [=](int ie) { + int* iv = &(e2v_map[4*ie]); + areav[ iv[0] ] += areae[ie] / 4.0 ; + areav[ iv[1] ] += areae[ie] / 4.0 ; + areav[ iv[2] ] += areae[ie] / 4.0 ; + areav[ iv[3] ] += areae[ie] / 4.0 ; + }); +// _raja_vertexarea_omp_end checkResult(areav, areav_ref, Nvert); //std::cout << "\n Vertex volumes...\n"; @@ -281,7 +290,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) //----------------------------------------------------------------------------// -// RAJA CUDA vertex sum calculation using TypedIndexSet (sequential iteration +// RAJA CUDA vertex sum calculation using IndexSet (sequential iteration // over segments, CUDA kernel launched for each segment) //----------------------------------------------------------------------------// @@ -289,58 +298,112 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Resource object used to construct list segment objects with indices -// living in host (CPU) memory. +// living in device (GPU) memory. // camp::resources::Resource cuda_res{camp::resources::Cuda()}; -#if 0 // needed for exercises, but if-def'd out to quiet compiler warnings. - RAJA::TypedIndexSet cuda_colorset; -#endif +// +// Create a RAJA IndexSet with four ListSegments, one for the indices of +// the elements in each subsut. This will be used in the RAJA OpenMP and CUDA +// variants of the vertex sum calculation. - /// - /// TODO... - /// - /// EXERCISE: Add four SegmentType objects to the cuda_coloret, one for - /// each of the 'idx' arrays above. Remember to pass the 'cuda_res' - /// object to the SegmentType constructor. - /// + RAJA::TypedIndexSet cuda_colorset; + cuda_colorset.push_back( SegmentType(&idx[0][0], idx[0].size(), cuda_res) ); /// /// TODO... /// - /// EXERCISE: Implement the vertex sum kernel a RAJA::forall - /// method with execution policy type - /// - /// RAJA::ExecPolicy> + /// EXERCISE: Add the three list segments to the index set to account + /// for all mesh elements. Then, run the CUDA kernel variant + /// below to check if it's correct. /// - /// so that the kernel iterates over the segments sequentially - /// and executes each segment in parallel as a CUDA kernel. - std::cout << "\n Running RAJA CUDA index set vertex sum...\n"; std::memset(areav, 0, Nvert*Nvert * sizeof(double)); - /// - /// TODO... - /// - /// EXERCISE: Implement the vertex sum kernel a RAJA::forall - /// method with execution policy type - /// - /// RAJA::ExecPolicy> - /// - /// so that the kernel iterates over the segments sequentially - /// and executes each segment in parallel as a CUDA kernel. 
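Similarly, the CUDA index set TODO can be completed by adding the remaining three segments with the device resource, so the segment indices live in GPU memory; a sketch mirroring the HIP variant shown below:

  cuda_colorset.push_back( SegmentType(&idx[1][0], idx[1].size(), cuda_res) );
  cuda_colorset.push_back( SegmentType(&idx[2][0], idx[2].size(), cuda_res) );
  cuda_colorset.push_back( SegmentType(&idx[3][0], idx[3].size(), cuda_res) );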
- /// +// _raja_vertexarea_cuda_start + using EXEC_POL2 = RAJA::ExecPolicy>; + + RAJA::forall(cuda_colorset, [=] RAJA_DEVICE (int ie) { + int* iv = &(e2v_map[4*ie]); + areav[ iv[0] ] += areae[ie] / 4.0 ; + areav[ iv[1] ] += areae[ie] / 4.0 ; + areav[ iv[2] ] += areae[ie] / 4.0 ; + areav[ iv[3] ] += areae[ie] / 4.0 ; + }); +// _raja_vertexarea_cuda_end + + checkResult(areav, areav_ref, Nvert); +//std::cout << "\n Vertex volumes...\n"; +//printMeshData(areav, Nvert, jvoff); + +#endif + +//----------------------------------------------------------------------------// +// RAJA HIP vertex sum calculation using IndexSet (sequential iteration +// over segments, HIP kernel launched for each segment) +//----------------------------------------------------------------------------// +#if defined(RAJA_ENABLE_HIP) +// +// Allocate and initialize device memory arrays +// + double* d_areae = memoryManager::allocate_gpu(Nelem_tot); + double* d_areav = memoryManager::allocate_gpu(Nvert_tot); + int* d_e2v_map = memoryManager::allocate_gpu(4*Nelem_tot); + + hipMemcpy(d_areae, areae, Nelem_tot*sizeof(double), hipMemcpyHostToDevice); + hipMemcpy(d_e2v_map, e2v_map, 4*Nelem_tot*sizeof(int), hipMemcpyHostToDevice); + + std::memset(areav, 0, Nvert_tot * sizeof(double)); + hipMemcpy(d_areav, areav, Nvert_tot*sizeof(double), hipMemcpyHostToDevice); + +// +// Resource object used to construct list segment objects with indices +// living in device (GPU) memory. +// + camp::resources::Resource hip_res{camp::resources::Hip()}; + +// +// Create a RAJA IndexSet with four ListSegments, one for the indices of +// the elements in each subsut. This will be used in the RAJA OpenMP and CUDA +// variants of the vertex sum calculation. + + RAJA::TypedIndexSet hip_colorset; + + hip_colorset.push_back( SegmentType(&idx[0][0], idx[0].size(), hip_res) ); + hip_colorset.push_back( SegmentType(&idx[1][0], idx[1].size(), hip_res) ); + hip_colorset.push_back( SegmentType(&idx[2][0], idx[2].size(), hip_res) ); + hip_colorset.push_back( SegmentType(&idx[3][0], idx[3].size(), hip_res) ); + + std::cout << "\n Running RAJA HIP index set vertex sum...\n"; + +// _raja_vertexarea_hip_start + using EXEC_POL3 = RAJA::ExecPolicy>; + + RAJA::forall(hip_colorset, [=] RAJA_DEVICE (int ie) { + int* iv = &(d_e2v_map[4*ie]); + d_areav[ iv[0] ] += d_areae[ie] / 4.0 ; + d_areav[ iv[1] ] += d_areae[ie] / 4.0 ; + d_areav[ iv[2] ] += d_areae[ie] / 4.0 ; + d_areav[ iv[3] ] += d_areae[ie] / 4.0 ; + }); +// _raja_vertexarea_hip_end + + hipMemcpy(areav, d_areav, Nvert_tot*sizeof(double), hipMemcpyDeviceToHost); checkResult(areav, areav_ref, Nvert); //std::cout << "\n Vertex volumes...\n"; //printMeshData(areav, Nvert, jvoff); + memoryManager::deallocate_gpu(d_areae); + memoryManager::deallocate_gpu(d_areav); + memoryManager::deallocate_gpu(d_e2v_map); + #endif //----------------------------------------------------------------------------// diff --git a/exercises/tutorial_halfday/ex3_colored-indexset_solution.cpp b/exercises/vertexsum-indexset_solution.cpp similarity index 59% rename from exercises/tutorial_halfday/ex3_colored-indexset_solution.cpp rename to exercises/vertexsum-indexset_solution.cpp index 98804fb933..e941c7ec51 100644 --- a/exercises/tutorial_halfday/ex3_colored-indexset_solution.cpp +++ b/exercises/vertexsum-indexset_solution.cpp @@ -18,10 +18,10 @@ #include "memoryManager.hpp" /* - * EXERCISE #3: Mesh vertex area with "colored" TypedIndexSet + * Mesh vertex area exercise * * In this exercise, you will use a RAJA TypedIndexSet containing 4 - * 
ListSegments to parallelize the mesh vertex area computation. + * TypedListSegments to parallelize the mesh vertex area computation. * A sum is computed at each vertex on a logically-Cartesian 2D mesh * where the sum represents the vertex "area" as an average of the 4 * element areas surrounding the vertex. The computation is written as @@ -29,7 +29,7 @@ * contributions may be written to the same vertex value at the same time, * the elements are partitioned into 4 subsets, where no two elements in * each subset share a vertex. A ListSegment enumerates the elements in - * each subset. When the ListSegments are put into an TypedIndexSet, the entire + * each subset. When the ListSegments are put into an IndexSet, the entire * computation can be executed with one RAJA::forall() statement, where * you iterate over the segments sequentially and execute each segment in * parallel. This exercise illustrates how RAJA can be used to enable one @@ -43,18 +43,22 @@ * * RAJA features you will use: * - `forall` loop iteration template method - * - Index list segment - * - TypedIndexSet segment container + * - List segment + * - IndexSet segment container * - Hierarchical execution policies * * If CUDA is enabled, CUDA unified memory is used. */ /* - CUDA_BLOCK_SIZE - specifies the number of threads in a CUDA thread block + Specify the number of threads in a GPU thread block */ #if defined(RAJA_ENABLE_CUDA) -const int CUDA_BLOCK_SIZE = 256; +constexpr int CUDA_BLOCK_SIZE = 256; +#endif + +#if defined(RAJA_ENABLE_HIP) +constexpr int HIP_BLOCK_SIZE = 256; #endif // @@ -67,47 +71,50 @@ void printMeshData(double* v, int n, int joff); int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) { - std::cout << "\n\nExercise #3: Mesh vertex area with 'colored' TypedIndexSet...\n"; + std::cout << "\n\nExercise: Mesh vertex area with 'colored' IndexSet...\n"; +// _vertexsum_define_start // // 2D mesh has N^2 elements (N+1)^2 vertices. // - const int N = 1000; - const int Nelem = N; - const int Nvert = N + 1; - double* areae = memoryManager::allocate(Nelem*Nelem); - double* areav = memoryManager::allocate(Nvert*Nvert); - double* areav_ref = memoryManager::allocate(Nvert*Nvert); - int* e2v_map = memoryManager::allocate(4*Nelem*Nelem); - + constexpr int N = 1000; + constexpr int Nelem = N; + constexpr int Nelem_tot = Nelem * Nelem; + constexpr int Nvert = N + 1; + constexpr int Nvert_tot = Nvert * Nvert; +// _vertexsum_define_end + double* areae = memoryManager::allocate(Nelem_tot); + double* areav = memoryManager::allocate(Nvert_tot); + double* areav_ref = memoryManager::allocate(Nvert_tot); + int* e2v_map = memoryManager::allocate(4*Nelem_tot); + +// _vertexsum_elemarea_start // // Define mesh spacing factor 'h' and set up elem to vertex mapping array. // - double h = 0.1; - - for (int j = 0 ; j < Nelem ; ++j) { - for (int i = 0 ; i < Nelem ; ++i) { - int ielem = i + j*Nelem ; - int imap = 4 * ielem ; - e2v_map[imap] = ielem + j; - e2v_map[imap+1] = ielem + j + 1; - e2v_map[imap+2] = ielem + j + Nvert; - e2v_map[imap+3] = ielem + j + 1 + Nvert; - } + constexpr double h = 0.1; + + for (int ie = 0; ie < Nelem_tot; ++ie) { + int j = ie / Nelem; + int imap = 4 * ie ; + e2v_map[imap] = ie + j; + e2v_map[imap+1] = ie + j + 1; + e2v_map[imap+2] = ie + j + Nvert; + e2v_map[imap+3] = ie + j + 1 + Nvert; } // // Initialize element areas so each element area // depends on the i,j coordinates of the element. 
// - std::memset(areae, 0, Nelem*Nelem * sizeof(double)); + std::memset(areae, 0, Nelem_tot * sizeof(double)); - for (int j = 0 ; j < Nelem ; ++j) { - for (int i = 0 ; i < Nelem ; ++i) { - int ielem = i + j*Nelem ; - areae[ielem] = h*(i+1) * h*(j+1); - } + for (int ie = 0; ie < Nelem_tot; ++ie) { + int i = ie % Nelem; + int j = ie / Nelem; + areae[ie] = h*(i+1) * h*(j+1); } +// _vertexsum_elemarea_end //std::cout << "\n Element areas...\n"; //printMeshData(areae, Nelem, Nelem); @@ -118,15 +125,17 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::cout << "\n Running sequential C-style version of vertex sum...\n"; - std::memset(areav_ref, 0, Nvert*Nvert * sizeof(double)); +// _cstyle_vertexarea_seq_start + std::memset(areav_ref, 0, Nvert_tot * sizeof(double)); - for (int ie = 0; ie < Nelem*Nelem; ++ie) { - int* iv = &(e2v_map[4*ie]); - areav_ref[ iv[0] ] += areae[ie] / 4.0 ; - areav_ref[ iv[1] ] += areae[ie] / 4.0 ; - areav_ref[ iv[2] ] += areae[ie] / 4.0 ; - areav_ref[ iv[3] ] += areae[ie] / 4.0 ; + for (int ie = 0; ie < Nelem_tot; ++ie) { + int* iv = &(e2v_map[4*ie]); + areav_ref[ iv[0] ] += areae[ie] / 4.0 ; + areav_ref[ iv[1] ] += areae[ie] / 4.0 ; + areav_ref[ iv[2] ] += areae[ie] / 4.0 ; + areav_ref[ iv[3] ] += areae[ie] / 4.0 ; } +// _cstyle_vertexarea_seq_end //std::cout << "\n Vertex areas (reference)...\n"; //printMeshData(areav_ref, Nvert, jvoff); @@ -150,33 +159,34 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // Since none of the elements with the same number share a common vertex, // we can iterate over each subset ("color") in parallel. // -// We use RAJA ListSegments and a RAJA TypedIndexSet to define the element +// We use RAJA ListSegments and a RAJA IndexSet to define the element // partitioning. // +// _vertexarea_color_start // // Gather the element indices for each color in a vector. // std::vector< std::vector > idx(4); - for (int j = 0 ; j < Nelem ; ++j) { - for (int i = 0 ; i < Nelem ; ++i) { - int ie = i + j*Nelem ; - if ( i % 2 == 0 ) { - if ( j % 2 == 0 ) { - idx[0].push_back(ie); - } else { - idx[2].push_back(ie); - } + for (int ie = 0; ie < Nelem_tot; ++ie) { + int i = ie % Nelem; + int j = ie / Nelem; + if ( i % 2 == 0 ) { + if ( j % 2 == 0 ) { + idx[0].push_back(ie); } else { - if ( j % 2 == 0 ) { - idx[1].push_back(ie); - } else { - idx[3].push_back(ie); - } + idx[2].push_back(ie); + } + } else { + if ( j % 2 == 0 ) { + idx[1].push_back(ie); + } else { + idx[3].push_back(ie); } } } +// _vertexarea_color_end //----------------------------------------------------------------------------// @@ -188,7 +198,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::cout << "\n Running C-style OpenMP vertex sum...\n"; - std::memset(areav, 0, Nvert*Nvert * sizeof(double)); + +// _cstyle_vertexarea_omp_start + std::memset(areav, 0, Nvert_tot * sizeof(double)); for (int icol = 0; icol < 4; ++icol) { const std::vector& ievec = idx[icol]; @@ -205,6 +217,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } } +// _cstyle_vertexarea_omp_end checkResult(areav, areav_ref, Nvert); //std::cout << "\n Vertex areas (reference)...\n"; @@ -213,11 +226,13 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) #endif -// The TypedIndexSet is a variadic template, where the template arguments -// are the segment types that the TypedIndexSet can hold. +// The IndexSet is a variadic template, where the template arguments +// are the segment types that the IndexSet can hold. 
// -#if defined(RAJA_ENABLE_OPENMP) || defined(RAJA_ENABLE_CUDA) +#if defined(RAJA_ENABLE_OPENMP) || defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) +// _vertexarea_listsegtype_start using SegmentType = RAJA::TypedListSegment; +// _vertexarea_listsegtype_end #endif #if defined(RAJA_ENABLE_OPENMP) @@ -229,19 +244,21 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) camp::resources::Resource host_res{camp::resources::Host()}; // -// Create a RAJA TypedIndexSet with four ListSegments, one for the indices of +// Create a RAJA IndexSet with four ListSegments, one for the indices of // the elements in each subsut. This will be used in the RAJA OpenMP and CUDA // variants of the vertex sum calculation. +// _vertexarea_indexset_start RAJA::TypedIndexSet colorset; colorset.push_back( SegmentType(&idx[0][0], idx[0].size(), host_res) ); colorset.push_back( SegmentType(&idx[1][0], idx[1].size(), host_res) ); colorset.push_back( SegmentType(&idx[2][0], idx[2].size(), host_res) ); - colorset.push_back( SegmentType(&idx[3][0], idx[3].size(), host_res) ); + colorset.push_back( SegmentType(&idx[3][0], idx[3].size(), host_res) ); +// _vertexarea_indexset_end //----------------------------------------------------------------------------// -// RAJA OpenMP vertex sum calculation using TypedIndexSet (sequential iteration +// RAJA OpenMP vertex sum calculation using IndexSet (sequential iteration // over segments, OpenMP parallel iteration of each segment) //----------------------------------------------------------------------------// @@ -249,16 +266,18 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::memset(areav, 0, Nvert*Nvert * sizeof(double)); - using EXEC_POL3 = RAJA::ExecPolicy; - RAJA::forall(colorset, [=](int ie) { + RAJA::forall(colorset, [=](int ie) { int* iv = &(e2v_map[4*ie]); areav[ iv[0] ] += areae[ie] / 4.0 ; areav[ iv[1] ] += areae[ie] / 4.0 ; areav[ iv[2] ] += areae[ie] / 4.0 ; areav[ iv[3] ] += areae[ie] / 4.0 ; }); +// _raja_vertexarea_omp_end checkResult(areav, areav_ref, Nvert); //std::cout << "\n Vertex volumes...\n"; @@ -268,7 +287,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) //----------------------------------------------------------------------------// -// RAJA CUDA vertex sum calculation using TypedIndexSet (sequential iteration +// RAJA CUDA vertex sum calculation using IndexSet (sequential iteration // over segments, CUDA kernel launched for each segment) //----------------------------------------------------------------------------// @@ -276,12 +295,12 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Resource object used to construct list segment objects with indices -// living in host (CPU) memory. +// living in device (GPU) memory. // camp::resources::Resource cuda_res{camp::resources::Cuda()}; // -// Create a RAJA TypedIndexSet with four ListSegments, one for the indices of +// Create a RAJA IndexSet with four ListSegments, one for the indices of // the elements in each subsut. This will be used in the RAJA OpenMP and CUDA // variants of the vertex sum calculation. 
@@ -296,16 +315,18 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::memset(areav, 0, Nvert*Nvert * sizeof(double)); - using EXEC_POL4 = RAJA::ExecPolicy>; - RAJA::forall(cuda_colorset, [=] RAJA_DEVICE (int ie) { + RAJA::forall(cuda_colorset, [=] RAJA_DEVICE (int ie) { int* iv = &(e2v_map[4*ie]); areav[ iv[0] ] += areae[ie] / 4.0 ; areav[ iv[1] ] += areae[ie] / 4.0 ; areav[ iv[2] ] += areae[ie] / 4.0 ; areav[ iv[3] ] += areae[ie] / 4.0 ; }); +// _raja_vertexarea_cuda_end checkResult(areav, areav_ref, Nvert); //std::cout << "\n Vertex volumes...\n"; @@ -313,6 +334,70 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) #endif +//----------------------------------------------------------------------------// +// RAJA HIP vertex sum calculation using IndexSet (sequential iteration +// over segments, HIP kernel launched for each segment) +//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_HIP) + +// +// Allocate and initialize device memory arrays +// + double* d_areae = memoryManager::allocate_gpu(Nelem_tot); + double* d_areav = memoryManager::allocate_gpu(Nvert_tot); + int* d_e2v_map = memoryManager::allocate_gpu(4*Nelem_tot); + + hipMemcpy(d_areae, areae, Nelem_tot*sizeof(double), hipMemcpyHostToDevice); + hipMemcpy(d_e2v_map, e2v_map, 4*Nelem_tot*sizeof(int), hipMemcpyHostToDevice); + + std::memset(areav, 0, Nvert_tot * sizeof(double)); + hipMemcpy(d_areav, areav, Nvert_tot*sizeof(double), hipMemcpyHostToDevice); + +// +// Resource object used to construct list segment objects with indices +// living in device (GPU) memory. +// + camp::resources::Resource hip_res{camp::resources::Hip()}; + +// +// Create a RAJA IndexSet with four ListSegments, one for the indices of +// the elements in each subsut. This will be used in the RAJA OpenMP and CUDA +// variants of the vertex sum calculation. + + RAJA::TypedIndexSet hip_colorset; + + hip_colorset.push_back( SegmentType(&idx[0][0], idx[0].size(), hip_res) ); + hip_colorset.push_back( SegmentType(&idx[1][0], idx[1].size(), hip_res) ); + hip_colorset.push_back( SegmentType(&idx[2][0], idx[2].size(), hip_res) ); + hip_colorset.push_back( SegmentType(&idx[3][0], idx[3].size(), hip_res) ); + + std::cout << "\n Running RAJA HIP index set vertex sum...\n"; + +// _raja_vertexarea_hip_start + using EXEC_POL3 = RAJA::ExecPolicy>; + + RAJA::forall(hip_colorset, [=] RAJA_DEVICE (int ie) { + int* iv = &(d_e2v_map[4*ie]); + d_areav[ iv[0] ] += d_areae[ie] / 4.0 ; + d_areav[ iv[1] ] += d_areae[ie] / 4.0 ; + d_areav[ iv[2] ] += d_areae[ie] / 4.0 ; + d_areav[ iv[3] ] += d_areae[ie] / 4.0 ; + }); +// _raja_vertexarea_hip_end + + hipMemcpy(areav, d_areav, Nvert_tot*sizeof(double), hipMemcpyDeviceToHost); + checkResult(areav, areav_ref, Nvert); +//std::cout << "\n Vertex volumes...\n"; +//printMeshData(areav, Nvert, jvoff); + + memoryManager::deallocate_gpu(d_areae); + memoryManager::deallocate_gpu(d_areav); + memoryManager::deallocate_gpu(d_e2v_map); + +#endif + //----------------------------------------------------------------------------// // Clean up... diff --git a/exercises/view-layout.cpp b/exercises/view-layout.cpp new file mode 100644 index 0000000000..a96998d349 --- /dev/null +++ b/exercises/view-layout.cpp @@ -0,0 +1,625 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-22, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include +#include +#include +#include + +#include "memoryManager.hpp" + +#include "RAJA/RAJA.hpp" + +/* + * View and Layout Exercise + * + * Examples illustrate the use of RAJA View and Layout types. + * + * RAJA features shown: + * - RAJA::View + * - RAJA::Layout + * - Layout permutations + * - OffsetLayout + * - OffsetLayout permutations + * + * NOTE: no RAJA kernel execution methods are used in these examples. + */ + +// +// Functions to check and print arrays +// +template +void checkResult(T* C, T* Cref, int N); + +template +void printValues(T* C, int N); + +int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +{ + + std::cout << "\n\nRAJA view & layout exercises...\n"; + +//----------------------------------------------------------------------------// +// +// Matrix-matrix multiplication: default layout +// +//----------------------------------------------------------------------------// + + // _matmult_init_start + // + // Define the size of N x N of matrices. + // + constexpr int N = 4; + + // + // Allocate storage for matrices and initialize matrix entries + // + double *A = new double[ N * N ]; + double *B = new double[ N * N ]; + double *C = new double[ N * N ]; + double *Cref = new double[ N * N ]; + + for (int row = 0; row < N; ++row) { + for (int col = 0; col < N; ++col) { + A[ col + N*row ] = row + 1; + B[ col + N*row ] = col + 1; + C[ col + N*row ] = 0.0; + Cref[ col + N*row ] = 0.0; + } + } + // _matmult_init_end + +//printValues(A, N*N); +//printValues(B, N*N); +//printValues(C, N*N); +//printValues(Cref, N*N); + +//----------------------------------------------------------------------------// + + std::cout << "\n Running matrix multiplication reference solution...\n"; + + // _cstyle_matmult_start + for (int row = 0; row < N; ++row) { + for (int col = 0; col < N; ++col) { + for (int k = 0; k < N; ++k) { + Cref[col + N*row] += A[k + N*row] * B[col + N*k]; + } + } + } + // _cstyle_matmult_end + +//printValues(Cref, N*N); + + +//----------------------------------------------------------------------------// + + std::cout << "\n Running matrix multiplication w/Views...\n"; + + // + // Define RAJA View objects to simplify access to the matrix entries. + // + // Note: we use default Layout + // + // _matmult_views_start + RAJA::View< double, RAJA::Layout<2, int> > Aview(A, N, N); + RAJA::View< double, RAJA::Layout<2, int> > Bview(B, N, N); + RAJA::View< double, RAJA::Layout<2, int> > Cview(C, N, N); + // _matmult_views_end + + // _cstyle_matmult_views_start + for (int row = 0; row < N; ++row) { + for (int col = 0; col < N; ++col) { + for (int k = 0; k < N; ++k) { + Cview(row, col) += Aview(row, k) * Bview(k, col); + } + } + } + // _cstyle_matmult_views_end + + checkResult(C, Cref, N*N); +//printValues(C, N*N); + +// +// Clean up. 
+// + delete [] A; + delete [] B; + delete [] C; + delete [] Cref; + +//----------------------------------------------------------------------------// +// +// Default layouts use row-major data ordering +// +//----------------------------------------------------------------------------// + + // + // Define dimensions and allocate arrays + // + // _default_views_init_start + constexpr int Nx = 3; + constexpr int Ny = 5; + constexpr int Nz = 2; + constexpr int Ntot = Nx*Ny*Nz; + int* a = new int[ Ntot ]; + int* aref = new int[ Ntot ]; + + for (int i = 0; i < Ntot; ++i) + { + aref[i] = i; + } + // _default_views_init_end + +//printValues(ref, Ntot); + +//----------------------------------------// + + std::cout << "\n Running default layout view cases...\n"; + + std::cout << "\n\t Running 1D view case...\n"; + + std::memset(a, 0, Ntot * sizeof(int)); + + // _default_view1D_start + RAJA::View< int, RAJA::Layout<1, int> > view_1D(a, Ntot); + + for (int i = 0; i < Ntot; ++i) { + view_1D(i) = i; + } + // _default_view1D_end + + checkResult(a, aref, Ntot); +//printValues(a, Ntot); + +//----------------------------------------// + + std::cout << "\n\t Running 2D default layout view case...\n"; + + std::memset(a, 0, Ntot * sizeof(int)); + + // _default_view2D_start + RAJA::View< int, RAJA::Layout<2, int> > view_2D(a, Nx, Ny); + + int iter{0}; + for (int i = 0; i < Nx; ++i) { + for (int j = 0; j < Ny; ++j) { + view_2D(i, j) = iter; + ++iter; + } + } + // _default_view2D_end + + checkResult(a, aref, Nx*Ny); +//printValues(a, Nx*Ny); + +//----------------------------------------// + + std::cout << "\n\t Running 3D default layout view case...\n"; + + std::memset(a, 0, Ntot * sizeof(int)); + + /// + /// TODO... + /// + /// EXERCISE: Implement a triple loop nest using a RAJA::View and + /// three-dimensional RAJA::Layout that iterates over the + /// data array 'a' with unit stride. + /// + + checkResult(a, aref, Nx*Ny*Nz); +//printValues(a, Nx*Ny*Nz); + +//----------------------------------------------------------------------------// +// +// Permuted layouts change the data striding order +// +//----------------------------------------------------------------------------// + + std::cout << "\n Running permuted layout cases...\n"; + +//----------------------------------------// + + std::cout << "\n\t Running 2D default permutation view case...\n"; + + std::memset(a, 0, Ntot * sizeof(int)); + + // _default_perm_view2D_start + std::array defperm2 {{0, 1}}; + RAJA::Layout< 2, int > defperm2_layout = + RAJA::make_permuted_layout( {{Nx, Ny}}, defperm2); + RAJA::View< int, RAJA::Layout<2, int> > defperm_view_2D(a, defperm2_layout); + + iter = 0; + for (int i = 0; i < Nx; ++i) { + for (int j = 0; j < Ny; ++j) { + defperm_view_2D(i, j) = iter; + ++iter; + } + } + // _default_perm_view2D_end + + checkResult(a, aref, Nx*Ny); +//printValues(a, Nx*Ny); + +//----------------------------------------// + + std::cout << "\n\t Running 3D default permutation view case...\n"; + + std::memset(a, 0, Ntot * sizeof(int)); + + /// + /// TODO... + /// + /// EXERCISE: Implement a triple loop nest using a RAJA::View and + /// three-dimensional RAJA::Layout with the identity permutation. 
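  /// Hint: as shown in view-layout_solution.cpp later in this diff, the
  /// identity permutation can be built with something like
  ///   std::array<RAJA::idx_t, 3> defperm3 {{0, 1, 2}};
  ///   RAJA::Layout<3, int> defperm3_layout =
  ///     RAJA::make_permuted_layout( {{Nx, Ny, Nz}}, defperm3);
  /// (the RAJA::idx_t element type is an assumption here); it reproduces the
  /// default row-major striding.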
+ /// + + checkResult(a, aref, Nx*Ny*Nz); +//printValues(a, Nx*Ny*Nz); + +//----------------------------------------// +//----------------------------------------// + + std::cout << "\n\t Running 2D permuted layout view case...\n"; + + std::memset(a, 0, Ntot * sizeof(int)); + + // _perm_2D_start + std::array perm2 {{1, 0}}; + RAJA::Layout< 2, int > perm2_layout = + RAJA::make_permuted_layout( {{Nx, Ny}}, perm2); + RAJA::View< int, RAJA::Layout<2, int> > perm_view_2D(a, perm2_layout); + + iter = 0; + for (int j = 0; j < Ny; ++j) { + for (int i = 0; i < Nx; ++i) { + perm_view_2D(i, j) = iter; + ++iter; + } + } + // _perm_2D_end + + checkResult(a, aref, Nx*Ny); +//printValues(a, Nx*Ny); + +//----------------------------------------// + + std::cout << "\n\t Running 3D perma layout view case...\n"; + + std::memset(a, 0, Ntot * sizeof(int)); + + /// + /// TODO... + /// + /// EXERCISE: Implement a triple loop nest using a RAJA::View and + /// three-dimensional RAJA::Layout with the permutation + /// {2, 1, 0}. + /// + /// Name the Layout object 'perm3a_layout' so it can be used + /// with the index conversion methods in the section below. + /// Uncomment those methods if you want to try them with the + /// Layout object you create here. + /// + + checkResult(a, aref, Nx*Ny*Nz); +//printValues(a, Nx*Ny*Nz); + +//----------------------------------------// + + std::cout << "\n\t Running 3D permb layout view case...\n"; + + std::memset(a, 0, Ntot * sizeof(int)); + + // _permb_view3D_start + std::array perm3b {{1, 2, 0}}; + RAJA::Layout< 3, int > perm3b_layout = + RAJA::make_permuted_layout( {{Nx, Ny, Nz}}, perm3b); + RAJA::View< int, RAJA::Layout<3, int> > perm3b_view_3D(a, perm3b_layout); + + iter = 0; + for (int j = 0; j < Ny; ++j) { + for (int k = 0; k < Nz; ++k) { + for (int i = 0; i < Nx; ++i) { + perm3b_view_3D(i, j, k) = iter; + ++iter; + } + } + } + // _permb_view3D_end + + checkResult(a, aref, Nx*Ny*Nz); +//printValues(a, Nx*Ny*Nz); + +// +// Clean up. +// + delete [] a; + delete [] aref; + +//----------------------------------------------------------------------------// +// +// Layouts: multi-dimensional indices vs. linear indicies +// +// RAJA::Layout type has methods that can be used to convert between +// multi-dimensional and linear indices. We show these below using the +// three-dimensional layouts in the examples above. 
Recall the Nx, Ny, Nz +// sizes defined earlier: +// +// constexpr int Nx = 3; +// constexpr int Ny = 5; +// constexpr int Nz = 2; +// +//----------------------------------------------------------------------------// + + std::cout << "\n Multi-dimensional indices to linear indices...\n"; + + + std::cout << "\nperm3a_layout...\n" << std::endl; + + int lin = -1; + int i = -1; + int j = -1; + int k = -1; + +/* + // _perm3d_layout_start + lin = perm3a_layout(1, 2, 0); + std::cout << "\tperm3a_layout(1, 2, 0) = " << lin << std::endl; + std::cout << "\t Should be 7 = 1 + 2 * Nx + 0 * Nx * Ny " + << "(since perm is {2, 1, 0})" << std::endl; + + perm3a_layout.toIndices(7, i, j, k); + std::cout << "\tperm3a_layout.toIndices(7, i, j, k) --> (i, j, k) = " + << "(" << i << ", " << j << ", " << k << ")\n" << std::endl; + // _perm3d_layout_end + + + lin = perm3a_layout(2, 3, 1); + std::cout << "\tperm3a_layout(2, 3, 1) = " << lin << std::endl; + std::cout << "\t Should be 26 = 2 + 3 * Nx + 1 * Nx * Ny " + << "(since perm is {2, 1, 0})" << std::endl; + + perm3a_layout.toIndices(26, i, j, k); + std::cout << "\tperm3a_layout.toIndices(26, i, j, k) --> (i, j, k) = " + << "(" << i << ", " << j << ", " << k << ")\n" << std::endl; + + + lin = perm3a_layout(0, 2, 1); + std::cout << "\tperm3a_layout(0, 2, 1) = " << lin << std::endl; + std::cout << "\t Should be 21 = 0 + 2 * Nx + 1 * Nx * Ny " + << "(since perm is {2, 1, 0})" << std::endl; + + perm3a_layout.toIndices(21, i, j, k); + std::cout << "\tperm3a_layout.toIndices(21, i, j, k) --> (i, j, k) = " + << "(" << i << ", " << j << ", " << k << ")\n" << std::endl; +*/ + +//----------------------------------------------------------------------------// + + std::cout << "\nperm3b_layout...\n" << std::endl; + + lin = perm3b_layout(1, 2, 0); + std::cout << "\tperm3b_layout(1, 2, 0) = " << lin << std::endl; + std::cout << "\t Should be 13 = 1 + 0 * Nx + 2 * Nx * Nz " + << "(since perm is {1, 2, 0})" << std::endl; + + perm3b_layout.toIndices(13, i, j, k); + std::cout << "\tperm3b_layout.toIndices(13, i, j, k) --> (i, j, k) = " + << "(" << i << ", " << j << ", " << k << ")\n" << std::endl; + + + lin = perm3b_layout(2, 3, 1); + std::cout << "\tperm3b_layout(2, 3, 1) = " << lin << std::endl; + std::cout << "\t Should be 23 = 2 + 1 * Nx + 3 * Nx * Nz " + << "(since perm is {1, 2, 0})" << std::endl; + + perm3b_layout.toIndices(23, i, j, k); + std::cout << "\tperm3b_layout.toIndices(23, i, j, k) --> (i, j, k) = " + << "(" << i << ", " << j << ", " << k << ")\n" << std::endl; + + + lin = perm3b_layout(0, 2, 1); + std::cout << "\tperm3b_layout(0, 2, 1) = " << lin << std::endl; + std::cout << "\t Should be 15 = 0 + 1 * Nx + 2 * Nx * Nz " + << "(since perm is {1, 2, 0})" << std::endl; + perm3b_layout.toIndices(15, i, j, k); + std::cout << "\tperm3b_layout.toIndices(15, i, j, k) --> (i, j, k) = " + << "(" << i << ", " << j << ", " << k << ")\n" << std::endl; + + /// + /// TODO... + /// + /// EXERCISE: Implement a triple loop nest using a RAJA::View and + /// three-dimensional RAJA::Layout that iterates over the + /// data array 'a' with unit stride. 
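  //
  // Illustrative sketch only (not part of the original exercise): every
  // linear index produced by perm3b_layout maps back to the same (i, j, k)
  // triple via toIndices(), regardless of the permutation used.
  //
  {
    bool roundtrip_ok = true;
    for (int kk = 0; kk < Nz; ++kk) {
      for (int jj = 0; jj < Ny; ++jj) {
        for (int ii = 0; ii < Nx; ++ii) {
          int lin_idx = perm3b_layout(ii, jj, kk);
          int i2 = -1, j2 = -1, k2 = -1;
          perm3b_layout.toIndices(lin_idx, i2, j2, k2);
          if (i2 != ii || j2 != jj || k2 != kk) { roundtrip_ok = false; }
        }
      }
    }
    std::cout << "\n perm3b_layout index round-trip check: "
              << (roundtrip_ok ? "PASS" : "FAIL") << "\n";
  }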
+ /// + +//----------------------------------------------------------------------------// +// +// Offset layouts apply offsets to indices +// +//----------------------------------------------------------------------------// + + std::cout << "\n Running offset layout cases...\n"; + + // + // Define some dimensions, and allocate arrays + // + constexpr int Ntot_ao = 40; + int* ao = new int[ Ntot_ao ]; + int* ao_ref = new int[ Ntot_ao ]; + +//----------------------------------------// + + std::cout << "\n\t Running 1D offset layout case...\n"; + + // + // Set reference solution to compare with + // + + std::memset(ao_ref, 0, Ntot_ao * sizeof(int)); + + // _cstyle_offlayout1D_start + int imin = -5; + int imax = 6; + + for (int i = imin; i < imax; ++i) { + ao_ref[ i-imin ] = i; + } + // _cstyle_offlayout1D_end + +//printValues(ao_ref, imax-imin); + +//----------------------------------------// + + std::memset(ao, 0, Ntot_ao * sizeof(int)); + + // _raja_offlayout1D_start + RAJA::OffsetLayout<1, int> offlayout_1D = + RAJA::make_offset_layout<1, int>( {{imin}}, {{imax}} ); + + RAJA::View< int, RAJA::OffsetLayout<1, int> > aoview_1Doff(ao, + offlayout_1D); + + for (int i = imin; i < imax; ++i) { + aoview_1Doff(i) = i; + } + // _raja_offlayout1D_end + + checkResult(ao, ao_ref, imax-imin); +//printValues(ao, 11); + +//----------------------------------------// + + std::cout << "\n\t Running 2D offset layout case...\n"; + + // + // Set reference solution to compare with + // + + std::memset(ao_ref, 0, Ntot_ao * sizeof(int)); + + // _cstyle_offlayout2D_start + imin = -1; + imax = 2; + int jmin = -5; + int jmax = 5; + + iter = 0; + for (int i = imin; i < imax; ++i) { + for (int j = jmin; j < jmax; ++j) { + ao_ref[ (j-jmin) + (i-imin) * (jmax-jmin) ] = iter; + iter++; + } + } + // _cstyle_offlayout2D_end + +//printValues(ao_ref, (imax-imin)*(jmax-jmin)); + +//----------------------------------------// + + std::memset(ao, 0, Ntot_ao * sizeof(int)); + + /// + /// TODO... + /// + /// EXERCISE: Implement a double loop nest using a RAJA::View and + /// two-dimensional RAJA::OffsetLayout which performs the + /// same operations as the C-style example above. + /// + + checkResult(ao, ao_ref, (imax-imin)*(jmax-jmin)); +//printValues(ao, (imax-imin)*(jmax-jmin)); + +//----------------------------------------// + + std::cout << "\n\t Running 2D permuted offset layout case...\n"; + + // + // Set reference solution to compare with + // + + std::memset(ao, 0, Ntot_ao * sizeof(int)); + + // _cstyle_permofflayout2D_start + iter = 0; + for (int j = jmin; j < jmax; ++j) { + for (int i = imin; i < imax; ++i) { + ao_ref[ (i-imin) + (j-jmin) * (imax-imin) ] = iter; + iter++; + } + } + // _cstyle_permofflayout2D_end + +//printValues(ao_ref, (imax-imin)*(jmax-jmin)); + +//----------------------------------------// + + std::memset(ao, 0, Ntot_ao * sizeof(int)); + + // _raja_permofflayout2D_start + std::array perm1D {{1, 0}}; + RAJA::OffsetLayout<2> permofflayout_2D = + RAJA::make_permuted_offset_layout<2>( {{imin, jmin}}, + {{imax, jmax}}, + perm1D ); + + RAJA::View< int, RAJA::OffsetLayout<2> > aoview_2Dpermoff(ao, + permofflayout_2D); + + iter = 0; + for (int j = jmin; j < jmax; ++j) { + for (int i = imin; i < imax; ++i) { + aoview_2Dpermoff(i, j) = iter; + iter++; + } + } + // _raja_permofflayout2D_end + + checkResult(ao, ao_ref, (imax-imin)*(jmax-jmin)); +//printValues(ao, (imax-imin)*(jmax-jmin)); + +// +// Clean up. 
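  // Illustrative sketch only (not part of the original exercise): with the
  // {1, 0} permutation, 'i' becomes the stride-1 index of the offset view,
  // which is why the reference fills ao_ref[ (i-imin) + (j-jmin)*(imax-imin) ].
  // Pointer differences through the view expose those strides directly.
  {
    auto stride_i = &aoview_2Dpermoff(imin+1, jmin) - &aoview_2Dpermoff(imin, jmin);
    auto stride_j = &aoview_2Dpermoff(imin, jmin+1) - &aoview_2Dpermoff(imin, jmin);
    std::cout << "\n aoview_2Dpermoff strides: i -> " << stride_i
              << ", j -> " << stride_j
              << " (expected 1 and " << (imax - imin) << ")\n";
  }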
+// + delete [] ao; + delete [] ao_ref; + +//----------------------------------------------------------------------------// +//----------------------------------------------------------------------------// + + std::cout << "\n DONE!...\n"; + + return 0; +} + +// +// Function to check result and report P/F. +// +template +void checkResult(T* C, T* Cref, int N) +{ + bool match = true; + for (int i = 0; i < N; ++i) { + if ( std::abs( C[i] - Cref[i] ) > 10e-12 ) { + match = false; + } + } + if ( match ) { + std::cout << "\n\t result -- PASS\n"; + } else { + std::cout << "\n\t result -- FAIL\n"; + } +}; + +template +void printValues(T* C, int N) +{ + for (int i = 0; i < N; ++i) { + std::cout << "array[" << i << "] = " << C[i] << std::endl; + } +}; diff --git a/exercises/view-layout_solution.cpp b/exercises/view-layout_solution.cpp new file mode 100644 index 0000000000..3da033953e --- /dev/null +++ b/exercises/view-layout_solution.cpp @@ -0,0 +1,643 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-22, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include +#include +#include +#include + +#include "memoryManager.hpp" + +#include "RAJA/RAJA.hpp" + +/* + * View and Layout Exercise + * + * Examples illustrate the use of RAJA View and Layout types. + * + * RAJA features shown: + * - RAJA::View + * - RAJA::Layout + * - Layout permutations + * - OffsetLayout + * - OffsetLayout permutations + * + * NOTE: no RAJA kernel execution methods are used in these examples. + */ + +// +// Functions to check and print arrays +// +template +void checkResult(T* C, T* Cref, int N); + +template +void printValues(T* C, int N); + +int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +{ + + std::cout << "\n\nRAJA view & layout exercises...\n"; + +//----------------------------------------------------------------------------// +// +// Matrix-matrix multiplication: default layout +// +//----------------------------------------------------------------------------// + + // _matmult_init_start + // + // Define the size of N x N of matrices. + // + constexpr int N = 4; + + // + // Allocate storage for matrices and initialize matrix entries + // + double *A = new double[ N * N ]; + double *B = new double[ N * N ]; + double *C = new double[ N * N ]; + double *Cref = new double[ N * N ]; + + for (int row = 0; row < N; ++row) { + for (int col = 0; col < N; ++col) { + A[ col + N*row ] = row + 1; + B[ col + N*row ] = col + 1; + C[ col + N*row ] = 0.0; + Cref[ col + N*row ] = 0.0; + } + } + // _matmult_init_end + +//printValues(A, N*N); +//printValues(B, N*N); +//printValues(C, N*N); +//printValues(Cref, N*N); + +//----------------------------------------------------------------------------// + + std::cout << "\n Running matrix multiplication reference solution...\n"; + + // _cstyle_matmult_start + for (int row = 0; row < N; ++row) { + for (int col = 0; col < N; ++col) { + for (int k = 0; k < N; ++k) { + Cref[col + N*row] += A[k + N*row] * B[col + N*k]; + } + } + } + // _cstyle_matmult_end + +//printValues(Cref, N*N); + + +//----------------------------------------------------------------------------// + + std::cout << "\n Running matrix multiplication w/Views...\n"; + + // + // Define RAJA View objects to simplify access to the matrix entries. 
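  // (With the default two-dimensional Layout, Aview(row, col) refers to the
  //  same element as A[col + N*row] in the C-style reference loop above.)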
+ // + // Note: we use default Layout + // + // _matmult_views_start + RAJA::View< double, RAJA::Layout<2, int> > Aview(A, N, N); + RAJA::View< double, RAJA::Layout<2, int> > Bview(B, N, N); + RAJA::View< double, RAJA::Layout<2, int> > Cview(C, N, N); + // _matmult_views_end + + // _cstyle_matmult_views_start + for (int row = 0; row < N; ++row) { + for (int col = 0; col < N; ++col) { + for (int k = 0; k < N; ++k) { + Cview(row, col) += Aview(row, k) * Bview(k, col); + } + } + } + // _cstyle_matmult_views_end + + checkResult(C, Cref, N*N); +//printValues(C, N*N); + +// +// Clean up. +// + delete [] A; + delete [] B; + delete [] C; + delete [] Cref; + +//----------------------------------------------------------------------------// +// +// Default layouts use row-major data ordering +// +//----------------------------------------------------------------------------// + + // + // Define dimensions and allocate arrays + // + // _default_views_init_start + constexpr int Nx = 3; + constexpr int Ny = 5; + constexpr int Nz = 2; + constexpr int Ntot = Nx*Ny*Nz; + int* a = new int[ Ntot ]; + int* aref = new int[ Ntot ]; + + for (int i = 0; i < Ntot; ++i) + { + aref[i] = i; + } + // _default_views_init_end + +//printValues(ref, Ntot); + +//----------------------------------------// + + std::cout << "\n Running default layout view cases...\n"; + + std::cout << "\n\t Running 1D view case...\n"; + + std::memset(a, 0, Ntot * sizeof(int)); + + // _default_view1D_start + RAJA::View< int, RAJA::Layout<1, int> > view_1D(a, Ntot); + + for (int i = 0; i < Ntot; ++i) { + view_1D(i) = i; + } + // _default_view1D_end + + checkResult(a, aref, Ntot); +//printValues(a, Ntot); + +//----------------------------------------// + + std::cout << "\n\t Running 2D default layout view case...\n"; + + std::memset(a, 0, Ntot * sizeof(int)); + + // _default_view2D_start + RAJA::View< int, RAJA::Layout<2, int> > view_2D(a, Nx, Ny); + + int iter{0}; + for (int i = 0; i < Nx; ++i) { + for (int j = 0; j < Ny; ++j) { + view_2D(i, j) = iter; + ++iter; + } + } + // _default_view2D_end + + checkResult(a, aref, Nx*Ny); +//printValues(a, Nx*Ny); + +//----------------------------------------// + + std::cout << "\n\t Running 3D default layout view case...\n"; + + std::memset(a, 0, Ntot * sizeof(int)); + + // _default_view3D_start + RAJA::View< int, RAJA::Layout<3, int> > view_3D(a, Nx, Ny, Nz); + + iter = 0; + for (int i = 0; i < Nx; ++i) { + for (int j = 0; j < Ny; ++j) { + for (int k = 0; k < Nz; ++k) { + view_3D(i, j, k) = iter; + ++iter; + } + } + } + // _default_view3D_end + + checkResult(a, aref, Nx*Ny*Nz); +//printValues(a, Nx*Ny*Nz); + +//----------------------------------------------------------------------------// +// +// Permuted layouts change the data striding order +// +//----------------------------------------------------------------------------// + + std::cout << "\n Running permuted layout cases...\n"; + +//----------------------------------------// + + std::cout << "\n\t Running 2D default permutation view case...\n"; + + std::memset(a, 0, Ntot * sizeof(int)); + + // _default_perm_view2D_start + std::array defperm2 {{0, 1}}; + RAJA::Layout< 2, int > defperm2_layout = + RAJA::make_permuted_layout( {{Nx, Ny}}, defperm2); + RAJA::View< int, RAJA::Layout<2, int> > defperm_view_2D(a, defperm2_layout); + + iter = 0; + for (int i = 0; i < Nx; ++i) { + for (int j = 0; j < Ny; ++j) { + defperm_view_2D(i, j) = iter; + ++iter; + } + } + // _default_perm_view2D_end + + checkResult(a, aref, Nx*Ny); +//printValues(a, Nx*Ny); + 
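  // Illustrative sketch only (not part of the original solution): the
  // identity permutation {0, 1} reproduces the default row-major striding,
  // so defperm_view_2D and the earlier view_2D address 'a' identically.
  {
    bool same_mapping = true;
    for (int i = 0; i < Nx; ++i) {
      for (int j = 0; j < Ny; ++j) {
        if ( &defperm_view_2D(i, j) != &view_2D(i, j) ) { same_mapping = false; }
      }
    }
    std::cout << "\n identity-permuted layout matches default layout: "
              << (same_mapping ? "yes" : "no") << "\n";
  }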
+//----------------------------------------// + + std::cout << "\n\t Running 3D default permutation view case...\n"; + + std::memset(a, 0, Ntot * sizeof(int)); + + // _default_perm_view3D_start + std::array defperm3 {{0, 1, 2}}; + RAJA::Layout< 3, int > defperm3_layout = + RAJA::make_permuted_layout( {{Nx, Ny, Nz}}, defperm3); + RAJA::View< int, RAJA::Layout<3, int> > defperm_view_3D(a, defperm3_layout); + + iter = 0; + for (int i = 0; i < Nx; ++i) { + for (int j = 0; j < Ny; ++j) { + for (int k = 0; k < Nz; ++k) { + defperm_view_3D(i, j, k) = iter; + ++iter; + } + } + } + // _default_perm_view3D_end + + checkResult(a, aref, Nx*Ny*Nz); +//printValues(a, Nx*Ny*Nz); + +//----------------------------------------// +//----------------------------------------// + + std::cout << "\n\t Running 2D permuted layout view case...\n"; + + std::memset(a, 0, Ntot * sizeof(int)); + + // _perm_2D_start + std::array perm2 {{1, 0}}; + RAJA::Layout< 2, int > perm2_layout = + RAJA::make_permuted_layout( {{Nx, Ny}}, perm2); + RAJA::View< int, RAJA::Layout<2, int> > perm_view_2D(a, perm2_layout); + + iter = 0; + for (int j = 0; j < Ny; ++j) { + for (int i = 0; i < Nx; ++i) { + perm_view_2D(i, j) = iter; + ++iter; + } + } + // _perm_2D_end + + checkResult(a, aref, Nx*Ny); +//printValues(a, Nx*Ny); + +//----------------------------------------// + + std::cout << "\n\t Running 3D perma layout view case...\n"; + + std::memset(a, 0, Ntot * sizeof(int)); + + // _perma_view3D_start + std::array perm3a {{2, 1, 0}}; + RAJA::Layout< 3, int > perm3a_layout = + RAJA::make_permuted_layout( {{Nx, Ny, Nz}}, perm3a); + RAJA::View< int, RAJA::Layout<3, int> > perm3a_view_3D(a, perm3a_layout); + + iter = 0; + for (int k = 0; k < Nz; ++k) { + for (int j = 0; j < Ny; ++j) { + for (int i = 0; i < Nx; ++i) { + perm3a_view_3D(i, j, k) = iter; + ++iter; + } + } + } + // _perma_view3D_end + + checkResult(a, aref, Nx*Ny*Nz); +//printValues(a, Nx*Ny*Nz); + +//----------------------------------------// + + std::cout << "\n\t Running 3D permb layout view case...\n"; + + std::memset(a, 0, Ntot * sizeof(int)); + + // _permb_view3D_start + std::array perm3b {{1, 2, 0}}; + RAJA::Layout< 3, int > perm3b_layout = + RAJA::make_permuted_layout( {{Nx, Ny, Nz}}, perm3b); + RAJA::View< int, RAJA::Layout<3, int> > perm3b_view_3D(a, perm3b_layout); + + iter = 0; + for (int j = 0; j < Ny; ++j) { + for (int k = 0; k < Nz; ++k) { + for (int i = 0; i < Nx; ++i) { + perm3b_view_3D(i, j, k) = iter; + ++iter; + } + } + } + // _permb_view3D_end + + checkResult(a, aref, Nx*Ny*Nz); +//printValues(a, Nx*Ny*Nz); + +// +// Clean up. +// + delete [] a; + delete [] aref; + +//----------------------------------------------------------------------------// +// +// Layouts: multi-dimensional indices vs. linear indicies +// +// RAJA::Layout type has methods that can be used to convert between +// multi-dimensional and linear indices. We show these below using the +// three-dimensional layouts in the examples above. 
Recall the Nx, Ny, Nz +// sizes defined earlier: +// +// constexpr int Nx = 3; +// constexpr int Ny = 5; +// constexpr int Nz = 2; +// +//----------------------------------------------------------------------------// + + std::cout << "\n Multi-dimensional indices to linear indices...\n"; + + + std::cout << "\nperm3a_layout...\n" << std::endl; + + int lin = -1; + int i = -1; + int j = -1; + int k = -1; + + // _perm3d_layout_start + lin = perm3a_layout(1, 2, 0); + std::cout << "\tperm3a_layout(1, 2, 0) = " << lin << std::endl; + std::cout << "\t Should be 7 = 1 + 2 * Nx + 0 * Nx * Ny " + << "(since perm is {2, 1, 0})" << std::endl; + + perm3a_layout.toIndices(7, i, j, k); + std::cout << "\tperm3a_layout.toIndices(7, i, j, k) --> (i, j, k) = " + << "(" << i << ", " << j << ", " << k << ")\n" << std::endl; + // _perm3d_layout_end + + + lin = perm3a_layout(2, 3, 1); + std::cout << "\tperm3a_layout(2, 3, 1) = " << lin << std::endl; + std::cout << "\t Should be 26 = 2 + 3 * Nx + 1 * Nx * Ny " + << "(since perm is {2, 1, 0})" << std::endl; + + perm3a_layout.toIndices(26, i, j, k); + std::cout << "\tperm3a_layout.toIndices(26, i, j, k) --> (i, j, k) = " + << "(" << i << ", " << j << ", " << k << ")\n" << std::endl; + + + lin = perm3a_layout(0, 2, 1); + std::cout << "\tperm3a_layout(0, 2, 1) = " << lin << std::endl; + std::cout << "\t Should be 21 = 0 + 2 * Nx + 1 * Nx * Ny " + << "(since perm is {2, 1, 0})" << std::endl; + + perm3a_layout.toIndices(21, i, j, k); + std::cout << "\tperm3a_layout.toIndices(21, i, j, k) --> (i, j, k) = " + << "(" << i << ", " << j << ", " << k << ")\n" << std::endl; + +//----------------------------------------------------------------------------// + + std::cout << "\nperm3b_layout...\n" << std::endl; + + lin = perm3b_layout(1, 2, 0); + std::cout << "\tperm3b_layout(1, 2, 0) = " << lin << std::endl; + std::cout << "\t Should be 13 = 1 + 0 * Nx + 2 * Nx * Nz " + << "(since perm is {1, 2, 0})" << std::endl; + + perm3b_layout.toIndices(13, i, j, k); + std::cout << "\tperm3b_layout.toIndices(13, i, j, k) --> (i, j, k) = " + << "(" << i << ", " << j << ", " << k << ")\n" << std::endl; + + + lin = perm3b_layout(2, 3, 1); + std::cout << "\tperm3b_layout(2, 3, 1) = " << lin << std::endl; + std::cout << "\t Should be 23 = 2 + 1 * Nx + 3 * Nx * Nz " + << "(since perm is {1, 2, 0})" << std::endl; + + perm3b_layout.toIndices(23, i, j, k); + std::cout << "\tperm3b_layout.toIndices(23, i, j, k) --> (i, j, k) = " + << "(" << i << ", " << j << ", " << k << ")\n" << std::endl; + + + lin = perm3b_layout(0, 2, 1); + std::cout << "\tperm3b_layout(0, 2, 1) = " << lin << std::endl; + std::cout << "\t Should be 15 = 0 + 1 * Nx + 2 * Nx * Nz " + << "(since perm is {1, 2, 0})" << std::endl; + + perm3b_layout.toIndices(15, i, j, k); + std::cout << "\tperm3b_layout.toIndices(15, i, j, k) --> (i, j, k) = " + << "(" << i << ", " << j << ", " << k << ")\n" << std::endl; + +//----------------------------------------------------------------------------// +// +// Offset layouts apply offsets to indices +// +//----------------------------------------------------------------------------// + + std::cout << "\n Running offset layout cases...\n"; + + // + // Define some dimensions, and allocate arrays + // + constexpr int Ntot_ao = 40; + int* ao = new int[ Ntot_ao ]; + int* ao_ref = new int[ Ntot_ao ]; + +//----------------------------------------// + + std::cout << "\n\t Running 1D offset layout case...\n"; + + // + // Set reference solution to compare with + // + + std::memset(ao_ref, 0, Ntot_ao * 
sizeof(int)); + + // _cstyle_offlayout1D_start + int imin = -5; + int imax = 6; + + for (int i = imin; i < imax; ++i) { + ao_ref[ i-imin ] = i; + } + // _cstyle_offlayout1D_end + +//printValues(ao_ref, imax-imin); + +//----------------------------------------// + + std::memset(ao, 0, Ntot_ao * sizeof(int)); + + // _raja_offlayout1D_start + RAJA::OffsetLayout<1, int> offlayout_1D = + RAJA::make_offset_layout<1, int>( {{imin}}, {{imax}} ); + + RAJA::View< int, RAJA::OffsetLayout<1, int> > aoview_1Doff(ao, + offlayout_1D); + + for (int i = imin; i < imax; ++i) { + aoview_1Doff(i) = i; + } + // _raja_offlayout1D_end + + checkResult(ao, ao_ref, imax-imin); +//printValues(ao, 11); + +//----------------------------------------// + + std::cout << "\n\t Running 2D offset layout case...\n"; + + // + // Set reference solution to compare with + // + + std::memset(ao_ref, 0, Ntot_ao * sizeof(int)); + + // _cstyle_offlayout2D_start + imin = -1; + imax = 2; + int jmin = -5; + int jmax = 5; + + iter = 0; + for (int i = imin; i < imax; ++i) { + for (int j = jmin; j < jmax; ++j) { + ao_ref[ (j-jmin) + (i-imin) * (jmax-jmin) ] = iter; + iter++; + } + } + // _cstyle_offlayout2D_end + +//printValues(ao_ref, (imax-imin)*(jmax-jmin)); + +//----------------------------------------// + + std::memset(ao, 0, Ntot_ao * sizeof(int)); + + // _raja_offlayout2D_start + RAJA::OffsetLayout<2, int> offlayout_2D = + RAJA::make_offset_layout<2, int>( {{imin, jmin}}, {{imax, jmax}} ); + + RAJA::View< int, RAJA::OffsetLayout<2, int> > aoview_2Doff(ao, + offlayout_2D); + iter = 0; + for (int i = imin; i < imax; ++i) { + for (int j = jmin; j < jmax; ++j) { + aoview_2Doff(i, j) = iter; + iter++; + } + } + // _raja_offlayout2D_end + + checkResult(ao, ao_ref, (imax-imin)*(jmax-jmin)); +//printValues(ao, (imax-imin)*(jmax-jmin)); + +//----------------------------------------// + + std::cout << "\n\t Running 2D permuted offset layout case...\n"; + + // + // Set reference solution to compare with + // + + std::memset(ao, 0, Ntot_ao * sizeof(int)); + + // _cstyle_permofflayout2D_start + iter = 0; + for (int j = jmin; j < jmax; ++j) { + for (int i = imin; i < imax; ++i) { + ao_ref[ (i-imin) + (j-jmin) * (imax-imin) ] = iter; + iter++; + } + } + // _cstyle_permofflayout2D_end + +//printValues(ao_ref, (imax-imin)*(jmax-jmin)); + +//----------------------------------------// + + std::memset(ao, 0, Ntot_ao * sizeof(int)); + + // _raja_permofflayout2D_start + std::array perm1D {{1, 0}}; + RAJA::OffsetLayout<2> permofflayout_2D = + RAJA::make_permuted_offset_layout<2>( {{imin, jmin}}, + {{imax, jmax}}, + perm1D ); + + RAJA::View< int, RAJA::OffsetLayout<2> > aoview_2Dpermoff(ao, + permofflayout_2D); + + iter = 0; + for (int j = jmin; j < jmax; ++j) { + for (int i = imin; i < imax; ++i) { + aoview_2Dpermoff(i, j) = iter; + iter++; + } + } + // _raja_permofflayout2D_end + + checkResult(ao, ao_ref, (imax-imin)*(jmax-jmin)); +//printValues(ao, (imax-imin)*(jmax-jmin)); + +// +// Clean up. +// + delete [] ao; + delete [] ao_ref; + +//----------------------------------------------------------------------------// +//----------------------------------------------------------------------------// + + std::cout << "\n DONE!...\n"; + + return 0; +} + +// +// Function to check result and report P/F. 
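// Note: the tolerance used below, 10e-12, is 1.0e-11; for integer
// instantiations of checkResult the comparison is effectively exact.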
+// +template +void checkResult(T* C, T* Cref, int N) +{ + bool match = true; + for (int i = 0; i < N; ++i) { + if ( std::abs( C[i] - Cref[i] ) > 10e-12 ) { + match = false; + } + } + if ( match ) { + std::cout << "\n\t result -- PASS\n"; + } else { + std::cout << "\n\t result -- FAIL\n"; + } +}; + +template +void printValues(T* C, int N) +{ + for (int i = 0; i < N; ++i) { + std::cout << "array[" << i << "] = " << C[i] << std::endl; + } +}; diff --git a/host-configs/alcf-builds/sycl.cmake b/host-configs/alcf-builds/sycl.cmake index 35d7557bb8..f3efb32477 100755 --- a/host-configs/alcf-builds/sycl.cmake +++ b/host-configs/alcf-builds/sycl.cmake @@ -14,15 +14,16 @@ set(RAJA_COMPILER "RAJA_COMPILER_CLANG" CACHE STRING "") -set(CMAKE_CXX_COMPILER "clang++" CACHE PATH "") -#set(CMAKE_CXX_COMPILER "dpcpp" CACHE PATH "") +#set(CMAKE_CXX_COMPILER "clang++" CACHE PATH "") +#set(CMAKE_CXX_COMPILER "g++" CACHE PATH "") +set(CMAKE_CXX_COMPILER "dpcpp" CACHE PATH "") #set(CMAKE_CXX_FLAGS_RELEASE "-O3 -fsycl -fsycl-unnamed-lambda -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend '-device skl' " CACHE STRING "") #set(CMAKE_CXX_FLAGS_RELWITHDEBINFO " -O3 -g -fsycl -fsycl-unnamed-lambda -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend '-device skl'" CACHE STRING "") #set(CMAKE_CXX_FLAGS_DEBUG " -O0 -g -fsycl -fsycl-unnamed-lambda -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend '-device skl'" CACHE STRING "") -set(CMAKE_CXX_FLAGS_RELEASE "-O3 -std=c++17 -fsycl -fsycl-unnamed-lambda --gcc-toolchain=/usr/tce/packages/gcc/gcc-7.1.0" CACHE STRING "") -set(CMAKE_CXX_FLAGS_RELWITHDEBINFO " -O3 -g -std=c++17 -fsycl -fsycl-unnamed-lambda --gcc-toolchain=/usr/tce/packages/gcc/gcc-7.1.0" CACHE STRING "") -set(CMAKE_CXX_FLAGS_DEBUG " -O0 -g -std=c++17 -fsycl -fsycl-unnamed-lambda --gcc-toolchain=/usr/tce/packages/gcc/gcc-7.1.0" CACHE STRING "") +set(CMAKE_CXX_FLAGS_RELEASE "-O3 -std=c++17 -fsycl -fsycl-unnamed-lambda" CACHE STRING "") +set(CMAKE_CXX_FLAGS_RELWITHDEBINFO " -O3 -g -std=c++17 -fsycl -fsycl-unnamed-lambda" CACHE STRING "") +set(CMAKE_CXX_FLAGS_DEBUG " -O0 -g -std=c++17 -fsycl -fsycl-unnamed-lambda" CACHE STRING "") set(CMAKE_CXX_LINK_FLAGS "-fsycl -Wl,-rpath,/usr/tce/packages/oneapi/oneapi-2021.2/compiler/2021.2.0/linux/compiler/lib/intel64_lin/" CACHE STRING "") set(RAJA_RANGE_ALIGN 4 CACHE INT "") diff --git a/host-configs/lc-builds/toss3/oneapi_X.cmake b/host-configs/lc-builds/toss3/oneapi_X.cmake new file mode 100644 index 0000000000..680cc0e25d --- /dev/null +++ b/host-configs/lc-builds/toss3/oneapi_X.cmake @@ -0,0 +1,16 @@ +############################################################################### +# Copyright (c) 2016-22, Lawrence Livermore National Security, LLC +# and RAJA project contributors. See the RAJA/LICENSE file for details. 
+# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################### + +set(CMAKE_CXX_FLAGS_RELEASE "${COMMON_FLAGS} -O3 -march=native -finline-functions" CACHE STRING "") +set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${COMMON_FLAGS} -O3 -g -march=native -finline-functions" CACHE STRING "") +set(CMAKE_CXX_FLAGS_DEBUG "${COMMON_FLAGS} -O0 -g" CACHE STRING "") + +set(RAJA_RANGE_ALIGN 4 CACHE INT "") +set(RAJA_RANGE_MIN_LENGTH 32 CACHE INT "") +set(RAJA_DATA_ALIGN 64 CACHE INT "") + +set(RAJA_HOST_CONFIG_LOADED On CACHE BOOL "") diff --git a/host-configs/lc-builds/toss4/corona_sycl.cmake b/host-configs/lc-builds/toss4/corona_sycl.cmake new file mode 100755 index 0000000000..ea240f745f --- /dev/null +++ b/host-configs/lc-builds/toss4/corona_sycl.cmake @@ -0,0 +1,25 @@ +############################################################################### +# Copyright (c) 2016-22, Lawrence Livermore National Security, LLC +# and RAJA project contributors. See the RAJA/LICENSE file for details. +# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################### + +set(RAJA_COMPILER "RAJA_COMPILER_CLANG" CACHE STRING "") + +set(CMAKE_CXX_COMPILER "clang++" CACHE PATH "") +#set(CMAKE_CXX_COMPILER "dpcpp" CACHE PATH "") + +set(CMAKE_CXX_FLAGS_RELEASE "-O3 -L${SYCL_LIB_PATH} -fsycl -fsycl-unnamed-lambda -fsycl-targets=amdgcn-amd-amdhsa -Xsycl-target-backend --offload-arch=gfx90a" CACHE STRING "") +set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -L${SYCL_LIB_PATH} -fsycl -fsycl-unnamed-lambda -fsycl-targets=amdgcn-amd-amdhsa -Xsycl-target-backend --offload-arch=gfx90a" CACHE STRING "") +set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g -L${SYCL_LIB_PATH} -fsycl -fsycl-unnamed-lambda -fsycl-targets=amdgcn-amd-amdhsa -Xsycl-target-backend --offload-arch=gfx90a" CACHE STRING "") +#set(CMAKE_CXX_FLAGS_RELEASE "-O3 -std=c++17 -fsycl -fsycl-unnamed-lambda --gcc-toolchain=/usr/tce/packages/gcc/gcc-7.1.0" CACHE STRING "") +#set(CMAKE_CXX_FLAGS_RELWITHDEBINFO " -O3 -g -std=c++17 -fsycl -fsycl-unnamed-lambda --gcc-toolchain=/usr/tce/packages/gcc/gcc-7.1.0" CACHE STRING "") +#set(CMAKE_CXX_FLAGS_DEBUG " -O0 -g -std=c++17 -fsycl -fsycl-unnamed-lambda --gcc-toolchain=/usr/tce/packages/gcc/gcc-7.1.0" CACHE STRING "") +#set(CMAKE_CXX_LINK_FLAGS "-fsycl -Wl,-rpath,/usr/tce/packages/oneapi/oneapi-2021.2/compiler/2021.2.0/linux/compiler/lib/intel64_lin/" CACHE STRING "") + +set(RAJA_RANGE_ALIGN 4 CACHE STRING "") +set(RAJA_RANGE_MIN_LENGTH 32 CACHE STRING "") +set(RAJA_DATA_ALIGN 64 CACHE STRING "") + +set(RAJA_HOST_CONFIG_LOADED On CACHE BOOL "") diff --git a/include/RAJA/RAJA.hpp b/include/RAJA/RAJA.hpp index 7a2df45ffb..c01f9167bc 100644 --- a/include/RAJA/RAJA.hpp +++ b/include/RAJA/RAJA.hpp @@ -43,12 +43,14 @@ // #include "RAJA/pattern/forall.hpp" #include "RAJA/pattern/kernel.hpp" -#include "RAJA/pattern/teams.hpp" +#include "RAJA/pattern/launch.hpp" // // Generic templates to describe SIMD/SIMT registers and vectors // +#if defined(RAJA_ENABLE_VECTORIZATION) #include "RAJA/pattern/tensor.hpp" +#endif // // All platforms must support sequential execution. @@ -64,7 +66,9 @@ // All platforms should support simd and vector execution. 
// #include "RAJA/policy/simd.hpp" +#if defined(RAJA_ENABLE_VECTORIZATION) #include "RAJA/policy/tensor.hpp" +#endif #if defined(RAJA_ENABLE_TBB) #include "RAJA/policy/tbb.hpp" @@ -195,9 +199,9 @@ namespace RAJA { namespace expt{} - // provide a RAJA::expt namespace for experimental work, but bring alias - // it into RAJA so it doesn't affect user code - using namespace expt; +// // provide a RAJA::expt namespace for experimental work, but bring alias +// // it into RAJA so it doesn't affect user code +// using namespace expt; } #endif // closing endif for header file include guard diff --git a/include/RAJA/config.hpp.in b/include/RAJA/config.hpp.in index 550fdc4198..404d8beebf 100644 --- a/include/RAJA/config.hpp.in +++ b/include/RAJA/config.hpp.in @@ -100,6 +100,12 @@ static_assert(RAJA_HAS_SOME_CXX14, "compiler and/or standard library does not claim support for " "C++14 features we need"); +#if defined(__cpp_lib_is_invocable) && (__cpp_lib_is_invocable >= 201703L) +#define RAJA_HAS_CXX17_IS_INVOCABLE 1 +#else +#define RAJA_HAS_CXX17_IS_INVOCABLE 0 +#endif + /*! ****************************************************************************** * @@ -169,6 +175,7 @@ static_assert(RAJA_HAS_SOME_CXX14, #cmakedefine RAJA_ENABLE_CLANG_CUDA #cmakedefine RAJA_ENABLE_HIP #cmakedefine RAJA_ENABLE_SYCL +#cmakedefine RAJA_ENABLE_VECTORIZATION #cmakedefine RAJA_ENABLE_NV_TOOLS_EXT #cmakedefine RAJA_ENABLE_ROCTX @@ -234,15 +241,15 @@ static_assert(RAJA_HAS_SOME_CXX14, namespace RAJA { -#if defined(RAJA_ENABLE_OPENMP) +#if defined(RAJA_ENABLE_OPENMP) && !defined(__HIP_DEVICE_COMPILE__) #if defined(_OPENMP) -#if _OPENMP >= 200805 +#if (_OPENMP >= 200805) #define RAJA_ENABLE_OPENMP_TASK #endif #else -#error RAJA configured with RAJA_ENABLE_OPENMP, but OpenMP not supported by current compiler +#error RAJA configured with RAJA_ENABLE_OPENMP, but _OPENMP is not defined in this code section #endif // _OPENMP -#endif // RAJA_ENABLE_OPENMP +#endif // RAJA_ENABLE_OPENMP && __HIP_DEVICE_COMPILE__ #if defined(RAJA_ENABLE_CUDA) && defined(__CUDACC__) #define RAJA_CUDA_ACTIVE @@ -252,10 +259,12 @@ namespace RAJA { #define RAJA_HIP_ACTIVE #include -#if (HIP_VERSION_MAJOR > 4) || \ - (HIP_VERSION_MAJOR == 4 && HIP_VERSION_MINOR >= 3) -// enable device function pointers with rocm version >= 4.3 +#if (HIP_VERSION_MAJOR > 5) || \ + (HIP_VERSION_MAJOR == 5 && HIP_VERSION_MINOR >= 1) +// enable device function pointers with rocm version >= 5.1 +// this used to be set to 4.3, but tests start passing with 5.1 #define RAJA_ENABLE_HIP_INDIRECT_FUNCTION_CALL +#define RAJA_ENABLE_HIP_DOUBLE_ATOMICADD #endif #if (HIP_VERSION_MAJOR > 4) || \ (HIP_VERSION_MAJOR == 4 && HIP_VERSION_MINOR >= 2) @@ -265,8 +274,15 @@ namespace RAJA { #endif #endif // RAJA_ENABLE_HIP && __HIPCC__ +#if defined(RAJA_ENABLE_SYCL) +#if defined(SYCL_LANGUAGE_VERSION) +#define RAJA_SYCL_ACTIVE +#endif +#endif + #if defined(RAJA_CUDA_ACTIVE) || \ - defined(RAJA_HIP_ACTIVE) + defined(RAJA_HIP_ACTIVE) || \ + defined(RAJA_SYCL_ACTIVE) #define RAJA_DEVICE_ACTIVE #endif @@ -372,6 +388,7 @@ const int DATA_ALIGN = @RAJA_DATA_ALIGN@; #endif #define RAJA_UNROLL RAJA_PRAGMA(unroll) +#define RAJA_UNROLL_COUNT(N) RAJA_PRAGMA(unroll(N)) #if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) #define RAJA_ALIGN_DATA(d) d @@ -400,8 +417,10 @@ const int DATA_ALIGN = @RAJA_DATA_ALIGN@; #if !defined(__NVCC__) #define RAJA_UNROLL RAJA_PRAGMA(GCC unroll 10000) +#define RAJA_UNROLL_COUNT(N) RAJA_PRAGMA(GCC unroll N) #else #define RAJA_UNROLL RAJA_PRAGMA(unroll) +#define 
RAJA_UNROLL_COUNT(N) RAJA_PRAGMA(unroll N) #endif #if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) @@ -429,7 +448,9 @@ const int DATA_ALIGN = @RAJA_DATA_ALIGN@; // #define RAJA_FORCEINLINE_RECURSIVE #define RAJA_INLINE inline __attribute__((always_inline)) -#define RAJA_UNROLL +#define RAJA_UNROLL +#define RAJA_UNROLL_COUNT(N) + // FIXME: alignx is breaking CUDA+xlc #if defined(RAJA_ENABLE_CUDA) #define RAJA_ALIGN_DATA(d) d @@ -458,6 +479,7 @@ const int DATA_ALIGN = @RAJA_DATA_ALIGN@; #define RAJA_FORCEINLINE_RECURSIVE #define RAJA_INLINE inline __attribute__((always_inline)) #define RAJA_UNROLL RAJA_PRAGMA(clang loop unroll(enable)) +#define RAJA_UNROLL_COUNT(N) RAJA_PRAGMA(clang loop unroll_count(N)) // note that neither nvcc nor Apple Clang compiler currently doesn't support @@ -499,6 +521,7 @@ const int DATA_ALIGN = @RAJA_DATA_ALIGN@; #define RAJA_SIMD #define RAJA_NO_SIMD #define RAJA_UNROLL +#define RAJA_UNROLL_COUNT(N) #else @@ -509,6 +532,7 @@ const int DATA_ALIGN = @RAJA_DATA_ALIGN@; #define RAJA_SIMD #define RAJA_NO_SIMD #define RAJA_UNROLL +#define RAJA_UNROLL_COUNT(N) #endif @@ -546,10 +570,16 @@ T * align_hint(T * x) #define RAJA_UNROLL #endif -// If we're in CUDA device code, we can use the nvcc unroll pragma -#if defined(__CUDA_ARCH__) && defined(RAJA_CUDA_ACTIVE) +#ifndef RAJA_UNROLL_COUNT +#define RAJA_UNROLL_COUNT(N) +#endif + +// If we're in CUDA or HIP device code, we can use the unroll pragma +#if (defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)) && defined(RAJA_DEVICE_ACTIVE) #undef RAJA_UNROLL +#undef RAJA_UNROLL_COUNT #define RAJA_UNROLL RAJA_PRAGMA(unroll) +#define RAJA_UNROLL_COUNT(N) RAJA_PRAGMA(unroll N) #endif #endif // closing endif for header file include guard diff --git a/include/RAJA/index/IndexSet.hpp b/include/RAJA/index/IndexSet.hpp index d9d1de4010..52a1161577 100644 --- a/include/RAJA/index/IndexSet.hpp +++ b/include/RAJA/index/IndexSet.hpp @@ -299,16 +299,16 @@ class TypedIndexSet : public TypedIndexSet //! Add copy of segment to back end of index set. template - RAJA_INLINE void push_back(Tnew const &val) + RAJA_INLINE void push_back(Tnew &&val) { - push_internal(new Tnew(val), PUSH_BACK, PUSH_COPY); + push_internal(new typename std::decay::type(std::forward(val)), PUSH_BACK, PUSH_COPY); } //! Add copy of segment to front end of index set. template - RAJA_INLINE void push_front(Tnew const &val) + RAJA_INLINE void push_front(Tnew &&val) { - push_internal(new Tnew(val), PUSH_FRONT, PUSH_COPY); + push_internal(new typename std::decay::type(std::forward(val)), PUSH_FRONT, PUSH_COPY); } //! Return total length -- sum of lengths of all segments diff --git a/include/RAJA/index/ListSegment.hpp b/include/RAJA/index/ListSegment.hpp index 105fa6650b..de998abbbe 100644 --- a/include/RAJA/index/ListSegment.hpp +++ b/include/RAJA/index/ListSegment.hpp @@ -49,12 +49,12 @@ namespace RAJA * end() -- returns a StorageT* * size() -- returns size of the Segment iteration space (RAJA::Index_type) * - * NOTE: TypedListSegment supports the option for the segment to own the + * NOTE: TypedListSegment supports the option for the segment to own the * its index data or simply use the index array passed to the constructor. - * Owning the index data is the default; an array is created in the + * Owning the index data is the default; an array is created in the * memory space specified by the camp resource object and the values are - * copied from the input array to that. 
Ownership of the indices is - * determined by an optional ownership enum value passed to the + * copied from the input array to that. Ownership of the indices is + * determined by an optional ownership enum value passed to the * constructor. * * Usage: @@ -62,14 +62,14 @@ namespace RAJA * A common C-style loop traversal pattern using an indirection array would be: * * \verbatim - * const T* indices = ...; + * const T* indices = ...; * for (T i = begin; i < end; ++i) { * // loop body -- use indices[i] as index value * } * \endverbatim * * A TypedListSegment would be used with a RAJA forall execution template as: - * + * * \verbatim * camp::resources::Resource resource{ camp resource type }; * TypedListSegment listseg(indices, length, resource); @@ -88,7 +88,7 @@ class TypedListSegment //@{ //! @name Types used in implementation based on template parameter. - + //! The underlying value type for index storage using value_type = StorageT; @@ -107,11 +107,12 @@ class TypedListSegment * \brief Construct a list segment from given array with specified length * and use given camp resource to allocate list segment index data * if owned by this list segment. - * + * * \param values array of indices defining iteration space of segment * \param length number of indices * \param resource camp resource defining memory space where index data live - * \param owned optional enum value indicating whether segment owns indices (Owned or Unowned). Default is Owned. + * \param owned optional enum value indicating whether segment owns indices + * (Owned or Unowned). Default is Owned. * * If 'Unowned' is passed as last argument, the segment will not own its * index data. In this case, caller must manage array lifetime properly. @@ -120,9 +121,9 @@ class TypedListSegment Index_type length, camp::resources::Resource resource, IndexOwnership owned = Owned) - : m_resource(resource) + : m_resource(nullptr), m_owned(Unowned), m_data(nullptr), m_size(0) { - initIndexData(values, length, owned); + initIndexData(values, length, resource, owned); } /*! @@ -140,8 +141,7 @@ class TypedListSegment template TypedListSegment(const Container& container, camp::resources::Resource resource) - : m_resource(resource), - m_owned(Unowned), m_data(nullptr), m_size(container.size()) + : m_resource(nullptr), m_owned(Unowned), m_data(nullptr), m_size(container.size()) { if (m_size > 0) { @@ -158,8 +158,9 @@ class TypedListSegment ++src; } - m_data = m_resource.allocate(m_size); - m_resource.memcpy(m_data, tmp, sizeof(value_type) * m_size); + m_resource = new camp::resources::Resource(resource); + m_data = m_resource->allocate(m_size); + m_resource->memcpy(m_data, tmp, sizeof(value_type) * m_size); m_owned = Owned; host_res.deallocate(tmp); @@ -171,29 +172,74 @@ class TypedListSegment TypedListSegment() = delete; //! Copy constructor for list segment - TypedListSegment(const TypedListSegment& other) - : m_resource(other.m_resource), - m_owned(Unowned), m_data(nullptr), m_size(0) + // As this may be called from a lambda in a + // RAJA method we perform a shallow copy + RAJA_HOST_DEVICE TypedListSegment(const TypedListSegment& other) + : m_resource(nullptr), + m_owned(Unowned), m_data(other.m_data), m_size(other.m_size) + { + } + + //! 
Copy assignment for list segment + // As this may be called from a lambda in a + // RAJA method we perform a shallow copy + RAJA_HOST_DEVICE TypedListSegment& operator=(const TypedListSegment& other) { - bool from_copy_ctor = true; - initIndexData(other.m_data, other.m_size, other.m_owned, from_copy_ctor); + clear(); + m_resource = nullptr; + m_owned = Unowned; + m_data = other.m_data; + m_size = other.m_size; + } + + //! move assignment for list segment + // As this may be called from a lambda in a + // RAJA method we perform a shallow copy + RAJA_HOST_DEVICE TypedListSegment& operator=(TypedListSegment&& rhs) + { + clear(); + m_resource = rhs.m_resource; + m_owned = rhs.m_owned; + m_data = rhs.m_data; + m_size = rhs.m_size; + + rhs.m_resource = nullptr; + rhs.m_owned = Unowned; + rhs.m_data = nullptr; + rhs.m_size = 0; } //! Move constructor for list segment - TypedListSegment(TypedListSegment&& rhs) + RAJA_HOST_DEVICE TypedListSegment(TypedListSegment&& rhs) : m_resource(rhs.m_resource), m_owned(rhs.m_owned), m_data(rhs.m_data), m_size(rhs.m_size) { - // make the rhs non-owning so it's destructor won't have any side effects rhs.m_owned = Unowned; + rhs.m_resource = nullptr; + rhs.m_size = 0; + rhs.m_data = nullptr; } //! List segment destructor - ~TypedListSegment() + RAJA_HOST_DEVICE ~TypedListSegment() { + clear(); + } + + //! Clear method to be called + RAJA_HOST_DEVICE void clear() + { + +#if !defined(RAJA_DEVICE_CODE) if (m_data != nullptr && m_owned == Owned) { - m_resource.deallocate(m_data); + m_resource->deallocate(m_data); + delete m_resource; } +#endif + m_data = nullptr; + m_resource = nullptr; + m_owned = Unowned; + m_size = 0; } //@} @@ -235,7 +281,7 @@ class TypedListSegment * \return true if segment size is same as given length value and values in * given array match segment index values, else false * - * Method assumes values in given array and segment indices both live in host + * Method assumes values in given array and segment indices both live in host * memory space. */ RAJA_HOST_DEVICE bool indicesEqual(const value_type* container, @@ -252,9 +298,9 @@ class TypedListSegment /*! * \brief Compare this segment to another for equality * - * \return true if both segments are the same size and indices match, + * \return true if both segments are the same size and indices match, * else false - * + * * Method assumes indices in both segments live in host memory space. */ RAJA_HOST_DEVICE bool operator==(const TypedListSegment& other) const @@ -265,9 +311,9 @@ class TypedListSegment /*! * \brief Compare this segment to another for inequality * - * \return true if segments are not the same size or indices do not match, + * \return true if segments are not the same size or indices do not match, * else false - * + * * Method assumes indices in both segments live in host memory space. 
*/ RAJA_HOST_DEVICE bool operator!=(const TypedListSegment& other) const @@ -294,8 +340,8 @@ class TypedListSegment // void initIndexData(const value_type* container, Index_type len, - IndexOwnership container_own, - bool from_copy_ctor = false) + camp::resources::Resource resource_, + IndexOwnership container_own) { // empty list segment @@ -311,12 +357,7 @@ class TypedListSegment m_owned = container_own; if (m_owned == Owned) { - if ( from_copy_ctor ) { - - m_data = m_resource.allocate(m_size); - m_resource.memcpy(m_data, container, sizeof(value_type) * m_size); - - } else { + m_resource = new camp::resources::Resource(resource_); camp::resources::Resource host_res{camp::resources::Host()}; @@ -326,16 +367,14 @@ class TypedListSegment tmp[i] = container[i]; } - m_data = m_resource.allocate(m_size); - m_resource.memcpy(m_data, tmp, sizeof(value_type) * m_size); + m_data = m_resource->allocate(m_size); + m_resource->memcpy(m_data, tmp, sizeof(value_type) * m_size); host_res.deallocate(tmp); - } - return; } - + // list segment accesses container data directly. // Uh-oh. Using evil const_cast.... m_data = const_cast(container); @@ -343,7 +382,7 @@ class TypedListSegment // Copy of camp resource passed to ctor - camp::resources::Resource m_resource; + camp::resources::Resource *m_resource; // Ownership flag to guide data copying/management IndexOwnership m_owned; diff --git a/include/RAJA/internal/RAJAVec.hpp b/include/RAJA/internal/RAJAVec.hpp index b64508fd11..9775f4771a 100644 --- a/include/RAJA/internal/RAJAVec.hpp +++ b/include/RAJA/internal/RAJAVec.hpp @@ -21,6 +21,7 @@ #include "RAJA/config.hpp" +#include #include #include #include diff --git a/include/RAJA/internal/foldl.hpp b/include/RAJA/internal/foldl.hpp index cc0d7af4e3..78f18ee0d3 100644 --- a/include/RAJA/internal/foldl.hpp +++ b/include/RAJA/internal/foldl.hpp @@ -48,6 +48,28 @@ struct foldl_impl { using Ret = Arg1; }; +#if RAJA_HAS_CXX17_IS_INVOCABLE + +template +struct foldl_impl { + using Ret = typename std::invoke_result::type; +}; + +template +struct foldl_impl { + using Ret = typename foldl_impl< + Op, + typename std::invoke_result::type, + Arg3>::type, + Rest...>::Ret; +}; + +#else + template struct foldl_impl { using Ret = typename std::result_of::type; @@ -66,6 +88,8 @@ struct foldl_impl { Rest...>::Ret; }; +#endif + } // namespace detail template diff --git a/include/RAJA/pattern/WorkGroup.hpp b/include/RAJA/pattern/WorkGroup.hpp index 15b4b94e77..65b8cd53a7 100644 --- a/include/RAJA/pattern/WorkGroup.hpp +++ b/include/RAJA/pattern/WorkGroup.hpp @@ -193,12 +193,14 @@ struct WorkSite { template struct WorkPool, + STORAGE_POLICY_T, + DISPATCH_POLICY_T>, INDEX_T, xargs, ALLOCATOR_T> @@ -206,7 +208,8 @@ struct WorkPool; + using dispatch_policy = DISPATCH_POLICY_T; + using policy = WorkGroupPolicy; using index_type = INDEX_T; using xarg_type = xargs; using Allocator = ALLOCATOR_T; @@ -216,9 +219,9 @@ struct WorkPool; + exec_policy, order_policy, dispatch_policy, Allocator, index_type, Args...>; using storage_type = detail::WorkStorage< - storage_policy, Allocator, typename workrunner_type::vtable_type>; + storage_policy, Allocator, typename workrunner_type::dispatcher_type>; friend workgroup_type; friend worksite_type; @@ -302,12 +305,14 @@ struct WorkPool struct WorkGroup, + STORAGE_POLICY_T, + DISPATCH_POLICY_T>, INDEX_T, xargs, ALLOCATOR_T> @@ -315,7 +320,8 @@ struct WorkGroup; + using dispatch_policy = DISPATCH_POLICY_T; + using policy = WorkGroupPolicy; using index_type = INDEX_T; using xarg_type = xargs; using 
Allocator = ALLOCATOR_T; @@ -372,12 +378,14 @@ struct WorkGroup struct WorkSite, + STORAGE_POLICY_T, + DISPATCH_POLICY_T>, INDEX_T, xargs, ALLOCATOR_T> @@ -385,7 +393,8 @@ struct WorkSite; + using dispatch_policy = DISPATCH_POLICY_T; + using policy = WorkGroupPolicy; using index_type = INDEX_T; using xarg_type = xargs; using Allocator = ALLOCATOR_T; @@ -439,17 +448,18 @@ struct WorkSite inline typename WorkPool< - WorkGroupPolicy, + WorkGroupPolicy, INDEX_T, xargs, ALLOCATOR_T>::workgroup_type WorkPool< - WorkGroupPolicy, + WorkGroupPolicy, INDEX_T, xargs, ALLOCATOR_T>::instantiate() @@ -465,21 +475,22 @@ WorkPool< template inline typename WorkGroup< - WorkGroupPolicy, + WorkGroupPolicy, INDEX_T, xargs, ALLOCATOR_T>::worksite_type WorkGroup< - WorkGroupPolicy, + WorkGroupPolicy, INDEX_T, xargs, ALLOCATOR_T>::run(typename WorkGroup< - WorkGroupPolicy, + WorkGroupPolicy, INDEX_T, xargs, ALLOCATOR_T>::resource_type r, diff --git a/include/RAJA/pattern/WorkGroup/Dispatcher.hpp b/include/RAJA/pattern/WorkGroup/Dispatcher.hpp new file mode 100644 index 0000000000..221f900b98 --- /dev/null +++ b/include/RAJA/pattern/WorkGroup/Dispatcher.hpp @@ -0,0 +1,725 @@ +/*! + ****************************************************************************** + * + * \file + * + * \brief Header file providing RAJA Dispatcher for workgroup. + * + ****************************************************************************** + */ + +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-22, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#ifndef RAJA_PATTERN_WORKGROUP_Dispatcher_HPP +#define RAJA_PATTERN_WORKGROUP_Dispatcher_HPP + + +#include "RAJA/config.hpp" + +#include "RAJA/policy/WorkGroup.hpp" + +#include "camp/number.hpp" +#include "camp/list.hpp" +#include "camp/helpers.hpp" + +#include + + +namespace RAJA +{ + +namespace detail +{ + +template < typename > +struct DispatcherVoidPtrWrapper +{ + void* ptr; + DispatcherVoidPtrWrapper() = default; + // implicit constructor from void* + RAJA_HOST_DEVICE DispatcherVoidPtrWrapper(void* p) : ptr(p) { } +}; + +template < typename > +struct DispatcherVoidConstPtrWrapper +{ + const void* ptr; + DispatcherVoidConstPtrWrapper() = default; + // implicit constructor from const void* + RAJA_HOST_DEVICE DispatcherVoidConstPtrWrapper(const void* p) : ptr(p) { } +}; + + +constexpr bool dispatcher_use_host_invoke(Platform platform) { + return !(platform == Platform::cuda || platform == Platform::hip); +} + +// Transforms one dispatch policy into another by creating a dispatch policy +// of holder_type objects. See usage in WorkRunner for more explanation. +template < typename dispatch_policy, typename holder_type > +struct dispatcher_transform_types; +/// +template < typename dispatch_policy, typename holder_type > +using dispatcher_transform_types_t = + typename dispatcher_transform_types::type; + +/*! + * A dispatcher abstraction that provides an interface to some basic + * functionality that is implemented differently based on the dispatch_policy. + * + * DispatcherID is used to differentiate function pointers based on their + * function signature. + */ +template < Platform platform, typename dispatch_policy, typename DispatcherID, typename ... 
CallArgs > +struct Dispatcher; + + +template < typename holder_type > +struct dispatcher_transform_types<::RAJA::indirect_function_call_dispatch, holder_type> { + using type = ::RAJA::indirect_function_call_dispatch; +}; + +/*! + * Version of Dispatcher that acts essentially like a vtable. It implements + * the interface with function pointers. + * + * DispatcherID can be helpful to avoid function signature collisions + * with functions that will not be used through this class. This is useful + * during device linking when functions with high register counts may cause + * device linking to fail. + */ +template < Platform platform, typename DispatcherID, typename ... CallArgs > +struct Dispatcher { + static constexpr bool use_host_invoke = dispatcher_use_host_invoke(platform); + using dispatch_policy = ::RAJA::indirect_function_call_dispatch; + using void_ptr_wrapper = DispatcherVoidPtrWrapper; + using void_cptr_wrapper = DispatcherVoidConstPtrWrapper; + + /// + /// move construct an object of type T in dest as a copy of a T from src and + /// destroy the T obj in src + /// + template < typename T > + static void s_move_construct_destroy(void_ptr_wrapper dest, void_ptr_wrapper src) + { + T* dest_as_T = static_cast(dest.ptr); + T* src_as_T = static_cast(src.ptr); + new(dest_as_T) T(std::move(*src_as_T)); + (*src_as_T).~T(); + } + + /// + /// invoke the call operator of the object of type T in obj with args + /// + template < typename T > + static void s_host_invoke(void_cptr_wrapper obj, CallArgs... args) + { + const T* obj_as_T = static_cast(obj.ptr); + (*obj_as_T)(std::forward(args)...); + } + /// + template < typename T > + static RAJA_DEVICE void s_device_invoke(void_cptr_wrapper obj, CallArgs... args) + { + const T* obj_as_T = static_cast(obj.ptr); + (*obj_as_T)(std::forward(args)...); + } + + /// + /// destroy the object of type T in obj + /// + template < typename T > + static void s_destroy(void_ptr_wrapper obj) + { + T* obj_as_T = static_cast(obj.ptr); + (*obj_as_T).~T(); + } + + using mover_type = void(*)(void_ptr_wrapper /*dest*/, void_ptr_wrapper /*src*/); + using invoker_type = void(*)(void_cptr_wrapper /*obj*/, CallArgs... /*args*/); + using destroyer_type = void(*)(void_ptr_wrapper /*obj*/); + + // This can't be a cuda device lambda due to compiler limitations + template < typename T > + struct DeviceInvokerFactory { + using value_type = invoker_type; + RAJA_DEVICE value_type operator()() { +#if defined(RAJA_ENABLE_HIP) && !defined(RAJA_ENABLE_HIP_INDIRECT_FUNCTION_CALL) + return nullptr; +#else + return &s_device_invoke; +#endif + } + }; + + /// + /// create a Dispatcher that can be used on the host for objects of type T + /// + template< typename T, + bool uhi = use_host_invoke, std::enable_if_t* = nullptr > + static inline Dispatcher makeDispatcher() { + return { mover_type{&s_move_construct_destroy}, + invoker_type{&s_host_invoke}, + destroyer_type{&s_destroy}, + sizeof(T) + }; + } + /// + /// create a Dispatcher that can be used on the device for objects of type T + /// + /// To do this the invoker_type must be created on the device to get the + /// device function pointer. The createOnDevice parameter is responsible for + /// providing the device context and returning the invoker object created. + /// The createOnDevice object uses an invoker factory provided as an argument + /// to create the invoker object. This allows for a separation between + /// object creation and the device context (cuda, hip, etc) and copying. 
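  /// (For orientation: which Dispatcher specialization is used is selected by
  ///  the dispatch policy carried in the WorkGroup policy, e.g. something like
  ///    RAJA::WorkGroupPolicy< RAJA::seq_work, RAJA::ordered,
  ///                           RAJA::constant_stride_array_of_objects,
  ///                           RAJA::indirect_function_call_dispatch >;
  ///  the first three policy names are assumed from RAJA's WorkGroup
  ///  documentation, while the dispatch policies are the ones handled in this
  ///  header.)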
+ /// + template< typename T, typename CreateOnDevice, + bool uhi = use_host_invoke, std::enable_if_t* = nullptr > + static inline Dispatcher makeDispatcher(CreateOnDevice&& createOnDevice) { + return { mover_type{&s_move_construct_destroy}, + invoker_type{std::forward(createOnDevice)(DeviceInvokerFactory{})}, + destroyer_type{&s_destroy}, + sizeof(T) + }; + } + + mover_type move_construct_destroy; + invoker_type invoke; + destroyer_type destroy; + size_t size; +}; + + +template < typename holder_type > +struct dispatcher_transform_types<::RAJA::indirect_virtual_function_dispatch, holder_type> { + using type = ::RAJA::indirect_virtual_function_dispatch; +}; + +/*! + * Version of Dispatcher that uses a class hierarchy and virtual functions to + * implement the interface. + * + * DispatcherID can be helpful to avoid function signature collisions + * with functions that will not be used through this class. This is useful + * during device linking when functions with high register counts may cause + * device linking to fail. + */ +template < Platform platform, typename DispatcherID, typename ... CallArgs > +struct Dispatcher { + static constexpr bool use_host_invoke = dispatcher_use_host_invoke(platform); + using dispatch_policy = ::RAJA::indirect_virtual_function_dispatch; + using void_ptr_wrapper = DispatcherVoidPtrWrapper; + using void_cptr_wrapper = DispatcherVoidConstPtrWrapper; + + struct impl_base { + virtual void move_destroy(void_ptr_wrapper dest, void_ptr_wrapper src) const = 0; + virtual void destroy(void_ptr_wrapper obj) const = 0; + }; + + struct host_impl_base { + virtual void invoke(void_cptr_wrapper obj, CallArgs... args) const = 0; + }; + + struct device_impl_base { + virtual RAJA_DEVICE void invoke(void_cptr_wrapper obj, CallArgs... args) const = 0; + }; + + template < typename T > + struct base_impl_type : impl_base + { + /// + /// move construct an object of type T in dest as a copy of a T from src and + /// destroy the T obj in src + /// + virtual void move_destroy(void_ptr_wrapper dest, void_ptr_wrapper src) const override + { + T* dest_as_T = static_cast(dest.ptr); + T* src_as_T = static_cast(src.ptr); + new(dest_as_T) T(std::move(*src_as_T)); + (*src_as_T).~T(); + } + + /// + /// destroy the object of type T in obj + /// + virtual void destroy(void_ptr_wrapper obj) const override + { + T* obj_as_T = static_cast(obj.ptr); + (*obj_as_T).~T(); + } + }; + + template < typename T > + struct host_impl_type : host_impl_base + { + /// + /// invoke the call operator of the object of type T in obj with args + /// + virtual void invoke(void_cptr_wrapper obj, CallArgs... args) const override + { + const T* obj_as_T = static_cast(obj.ptr); + (*obj_as_T)(std::forward(args)...); + } + }; + + template < typename T > + struct device_impl_type : device_impl_base + { + /// + /// invoke the call operator of the object of type T in obj with args + /// + virtual RAJA_DEVICE void invoke(void_cptr_wrapper obj, CallArgs... args) const override + { + const T* obj_as_T = static_cast(obj.ptr); + (*obj_as_T)(std::forward(args)...); + } + }; + + struct mover_type { + impl_base* m_impl; + void operator()(void_ptr_wrapper dest, void_ptr_wrapper src) const + { + m_impl->move_destroy(dest, src); + } + }; + + struct host_invoker_type { + host_impl_base* m_impl; + void operator()(void_cptr_wrapper obj, CallArgs... 
args) const + { + m_impl->invoke(obj, std::forward(args)...); + } + }; + /// + struct device_invoker_type { + device_impl_base* m_impl; + RAJA_DEVICE void operator()(void_cptr_wrapper obj, CallArgs... args) const + { + m_impl->invoke(obj, std::forward(args)...); + } + }; + using invoker_type = std::conditional_t; + + struct destroyer_type { + impl_base* m_impl; + void operator()(void_ptr_wrapper obj) const + { + m_impl->destroy(obj); + } + }; + + // This can't be a cuda device lambda due to compiler limitations + template < typename T > + struct DeviceImplTypeFactory { + using value_type = device_impl_type*; + RAJA_DEVICE value_type operator()() { +#if defined(RAJA_ENABLE_HIP) && !defined(RAJA_ENABLE_HIP_INDIRECT_FUNCTION_CALL) + return nullptr; +#else + static device_impl_type s_device_impl; + return &s_device_impl; +#endif + } + }; + + /// + /// create a Dispatcher that can be used on the host for objects of type T + /// + template< typename T, + bool uhi = use_host_invoke, std::enable_if_t* = nullptr > + static inline Dispatcher makeDispatcher() { + static base_impl_type s_base_impl; + static host_impl_type s_host_impl; + return { mover_type{&s_base_impl}, + host_invoker_type{&s_host_impl}, + destroyer_type{&s_base_impl}, + sizeof(T) + }; + } + /// + /// create a Dispatcher that can be used on the device for objects of type T + /// + /// To do this the invoker_type must be created on the device to get the + /// device function pointer. The createOnDevice parameter is responsible for + /// providing the device context and returning the invoker object created. + /// The createOnDevice object uses an invoker factory provided as an argument + /// to create the invoker object. This allows for a separation between + /// object creation and the device context (cuda, hip, etc) and copying. + /// + template< typename T, typename CreateOnDevice, + bool uhi = use_host_invoke, std::enable_if_t* = nullptr> + static inline Dispatcher makeDispatcher(CreateOnDevice&& createOnDevice) { + static base_impl_type s_base_impl; + static device_impl_type* s_device_impl_ptr{ + std::forward(createOnDevice)(DeviceImplTypeFactory{}) }; + return { mover_type{&s_base_impl}, + device_invoker_type{s_device_impl_ptr}, + destroyer_type{&s_base_impl}, + sizeof(T) + }; + } + + mover_type move_construct_destroy; + invoker_type invoke; + destroyer_type destroy; + size_t size; +}; + + +// direct_dispatch expects a list of types +template < typename ... Ts, typename holder_type > +struct dispatcher_transform_types<::RAJA::direct_dispatch, holder_type> { + using type = ::RAJA::direct_dispatch...>; +}; + +/*! + * Version of Dispatcher that does direct dispatch to zero callable types. + * It implements the interface with callable objects. + */ +template < Platform platform, typename DispatcherID, typename ... CallArgs > +struct Dispatcher, DispatcherID, CallArgs...> { + static constexpr bool use_host_invoke = dispatcher_use_host_invoke(platform); + using dispatch_policy = ::RAJA::direct_dispatch<>; + using void_ptr_wrapper = DispatcherVoidPtrWrapper; + using void_cptr_wrapper = DispatcherVoidConstPtrWrapper; + + /// + /// move construct an object of type T in dest as a copy of a T from src and + /// destroy the T obj in src + /// + struct mover_type { + void operator()(void_ptr_wrapper, void_ptr_wrapper) const + { } + }; + + /// + /// invoke the call operator of the object of type T in obj with args + /// + struct host_invoker_type { + void operator()(void_cptr_wrapper, CallArgs...) 
const + { } + }; + struct device_invoker_type { + RAJA_DEVICE void operator()(void_cptr_wrapper, CallArgs...) const + { } + }; + using invoker_type = std::conditional_t; + + /// + /// destroy the object of type T in obj + /// + struct destroyer_type { + void operator()(void_ptr_wrapper) const + { } + }; + + /// + /// create a Dispatcher that can be used on the host for objects of type T + /// + template< typename T, + bool uhi = use_host_invoke, std::enable_if_t* = nullptr > + static inline Dispatcher makeDispatcher() { + return {mover_type{}, host_invoker_type{}, destroyer_type{}, sizeof(T)}; + } + /// + /// create a Dispatcher that can be used on the device for objects of type T + /// + /// Ignore the CreateOnDevice object as the same invoker object can be used + /// on the host and device. + /// + template< typename T, typename CreateOnDevice, + bool uhi = use_host_invoke, std::enable_if_t* = nullptr > + static inline Dispatcher makeDispatcher(CreateOnDevice&&) { + return {mover_type{}, device_invoker_type{}, destroyer_type{}, sizeof(T)}; + } + + mover_type move_construct_destroy; + invoker_type invoke; + destroyer_type destroy; + size_t size; +}; + +/*! + * Version of Dispatcher that does direct dispatch to a single callable type. + * It implements the interface with callable objects. + */ +template < Platform platform, typename T, typename DispatcherID, typename ... CallArgs > +struct Dispatcher, DispatcherID, CallArgs...> { + static constexpr bool use_host_invoke = dispatcher_use_host_invoke(platform); + using dispatch_policy = ::RAJA::direct_dispatch; + using void_ptr_wrapper = DispatcherVoidPtrWrapper; + using void_cptr_wrapper = DispatcherVoidConstPtrWrapper; + + /// + /// move construct an object of type T in dest as a copy of a T from src and + /// destroy the T obj in src + /// + struct mover_type { + void operator()(void_ptr_wrapper dest, void_ptr_wrapper src) const + { + T* dest_as_T = static_cast(dest.ptr); + T* src_as_T = static_cast(src.ptr); + new(dest_as_T) T(std::move(*src_as_T)); + (*src_as_T).~T(); + } + }; + + /// + /// invoke the call operator of the object of type T in obj with args + /// + struct host_invoker_type { + void operator()(void_cptr_wrapper obj, CallArgs... args) const + { + const T* obj_as_T = static_cast(obj.ptr); + (*obj_as_T)(std::forward(args)...); + } + }; + struct device_invoker_type { + RAJA_DEVICE void operator()(void_cptr_wrapper obj, CallArgs... args) const + { + const T* obj_as_T = static_cast(obj.ptr); + (*obj_as_T)(std::forward(args)...); + } + }; + using invoker_type = std::conditional_t; + + /// + /// destroy the object of type T in obj + /// + struct destroyer_type { + void operator()(void_ptr_wrapper obj) const + { + T* obj_as_T = static_cast(obj.ptr); + (*obj_as_T).~T(); + } + }; + + /// + /// create a Dispatcher that can be used on the host for objects of type T + /// + template< typename U, + bool uhi = use_host_invoke, std::enable_if_t* = nullptr > + static inline Dispatcher makeDispatcher() { + static_assert(std::is_same::value, "U must be in direct_dispatch types"); + return {mover_type{}, host_invoker_type{}, destroyer_type{}, sizeof(T)}; + } + /// + /// create a Dispatcher that can be used on the device for objects of type T + /// + /// Ignore the CreateOnDevice object as the same invoker object can be used + /// on the host and device. 
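The direct_dispatch specializations avoid function pointers and virtual calls entirely: the set of callable types is fixed at compile time, and (in the multi-type specialization that follows) an integer id recorded at enqueue time selects which branch actually runs, with a pack expansion guarding every candidate call behind an id comparison. A standalone sketch of that selection trick, using std::index_sequence and an array-expansion idiom in place of camp::sink (illustrative only, assuming C++14):

#include <cstdio>
#include <utility>

// Invoke the Is-th type in Ts... on the erased object, selected by a runtime
// id. Every candidate call is guarded by an id comparison; only the matching
// branch fires. The int array forces the pack expansion, the same idea as
// camp::sink in the Dispatcher above.
template <typename... Ts, std::size_t... Is>
void direct_invoke(int id, const void* obj, int arg,
                   std::index_sequence<Is...>) {
  int expand[] = {(Is == static_cast<std::size_t>(id)
                       ? ((*static_cast<const Ts*>(obj))(arg), 0)
                       : 0)...};
  (void)expand; // quiet unused-variable warning
}

struct Doubler {
  void operator()(int i) const { std::printf("2*i = %d\n", 2 * i); }
};
struct Squarer {
  void operator()(int i) const { std::printf("i*i = %d\n", i * i); }
};

int main() {
  Squarer s;
  // id 1 is Squarer's position in the list <Doubler, Squarer>
  direct_invoke<Doubler, Squarer>(1, &s, 7, std::make_index_sequence<2>{});
  return 0;
}

Because the callable types are known, the compiler can inline the selected call; the trade-off is that every enqueued callable must come from the listed types, which the static_asserts in these specializations enforce.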
+ /// + template< typename U, typename CreateOnDevice, + bool uhi = use_host_invoke, std::enable_if_t* = nullptr > + static inline Dispatcher makeDispatcher(CreateOnDevice&&) { + static_assert(std::is_same::value, "U must be in direct_dispatch types"); + return {mover_type{}, device_invoker_type{}, destroyer_type{}, sizeof(T)}; + } + + mover_type move_construct_destroy; + invoker_type invoke; + destroyer_type destroy; + size_t size; +}; + +/*! + * Version of Dispatcher that does direct dispatch to multiple callable types. + * It implements the interface with callable objects. + */ +template < typename T0, typename T1, typename ... TNs, + Platform platform, typename DispatcherID, typename ... CallArgs > +struct Dispatcher, + DispatcherID, CallArgs...> { + static constexpr bool use_host_invoke = dispatcher_use_host_invoke(platform); + using dispatch_policy = ::RAJA::direct_dispatch; + using void_ptr_wrapper = DispatcherVoidPtrWrapper; + using void_cptr_wrapper = DispatcherVoidConstPtrWrapper; + + using id_type = int; + using callable_indices = camp::make_int_seq_t; + using callable_types = camp::list; + + /// + /// move construct an object of type T in dest as a copy of a T from src and + /// destroy the T obj in src + /// + struct mover_type { + id_type id; + + void operator()(void_ptr_wrapper dest, void_ptr_wrapper src) const + { + impl_helper(callable_indices{}, callable_types{}, + dest, src); + } + + private: + template < int ... id_types, typename ... Ts > + void impl_helper(camp::int_seq, camp::list, + void_ptr_wrapper dest, void_ptr_wrapper src) const + { + camp::sink(((id_types == id) ? (impl(dest, src), 0) : 0)...); + } + + template < typename T > + void impl(void_ptr_wrapper dest, void_ptr_wrapper src) const + { + T* dest_as_T = static_cast(dest.ptr); + T* src_as_T = static_cast(src.ptr); + new(dest_as_T) T(std::move(*src_as_T)); + (*src_as_T).~T(); + } + }; + + /// + /// invoke the call operator of the object of type T in obj with args + /// + struct host_invoker_type { + id_type id; + + void operator()(void_cptr_wrapper obj, CallArgs... args) const + { + impl_helper(callable_indices{}, callable_types{}, + obj, std::forward(args)...); + } + + private: + template < int ... id_types, typename ... Ts > + void impl_helper(camp::int_seq, camp::list, + void_cptr_wrapper obj, CallArgs... args) const + { + camp::sink(((id_types == id) ? (impl(obj, std::forward(args)...), 0) : 0)...); + } + + template < typename T > + void impl(void_cptr_wrapper obj, CallArgs... args) const + { + const T* obj_as_T = static_cast(obj.ptr); + (*obj_as_T)(std::forward(args)...); + } + }; + struct device_invoker_type { + id_type id; + + RAJA_DEVICE void operator()(void_cptr_wrapper obj, CallArgs... args) const + { + impl_helper(callable_indices{}, callable_types{}, + obj, std::forward(args)...); + } + + private: + template < int ... id_types, typename ... Ts > + RAJA_DEVICE void impl_helper(camp::int_seq, camp::list, + void_cptr_wrapper obj, CallArgs... args) const + { + camp::sink(((id_types == id) ? (impl(obj, std::forward(args)...), 0) : 0)...); + } + + template < typename T > + RAJA_DEVICE void impl(void_cptr_wrapper obj, CallArgs... 
args) const + { + const T* obj_as_T = static_cast(obj.ptr); + (*obj_as_T)(std::forward(args)...); + } + }; + using invoker_type = std::conditional_t; + + /// + /// destroy the object of type T in obj + /// + struct destroyer_type { + id_type id; + + void operator()(void_ptr_wrapper obj) const + { + impl_helper(callable_indices{}, callable_types{}, + obj); + } + + private: + template < int ... id_types, typename ... Ts > + void impl_helper(camp::int_seq, camp::list, + void_ptr_wrapper obj) const + { + camp::sink(((id_types == id) ? (impl(obj), 0) : 0)...); + } + + template < typename T > + void impl(void_ptr_wrapper obj) const + { + T* obj_as_T = static_cast(obj.ptr); + (*obj_as_T).~T(); + } + }; + + /// + /// get the id of type T + /// + /// The id is just the index of T in the list of callable_types. + /// If T is not in Ts return -1. + /// + template < typename T, int ... id_types, typename ... Ts > + static constexpr id_type get_id(camp::int_seq, camp::list) + { + id_type id{-1}; + // quiet UB warning by sequencing assignment to id with list initialization + int unused[] {0, (std::is_same::value ? ((id = id_types), 0) : 0)...}; + camp::sink(unused); // quiet unused var warning + return id; + } + + /// + /// create a Dispatcher that can be used on the host for objects of type T + /// + template< typename T, + bool uhi = use_host_invoke, std::enable_if_t* = nullptr > + static inline Dispatcher makeDispatcher() { + static constexpr id_type id = get_id(callable_indices{}, callable_types{}); + static_assert(id != id_type(-1), "T must be in direct_dispatch types"); + return {mover_type{id}, host_invoker_type{id}, destroyer_type{id}, sizeof(T)}; + } + /// + /// create a Dispatcher that can be used on the device for objects of type T + /// + /// Ignore the CreateOnDevice object as the same invoker object can be used + /// on the host and device. + /// + template< typename T, typename CreateOnDevice, + bool uhi = use_host_invoke, std::enable_if_t* = nullptr > + static inline Dispatcher makeDispatcher(CreateOnDevice&&) { + static constexpr id_type id = get_id(callable_indices{}, callable_types{}); + static_assert(id != id_type(-1), "T must be in direct_dispatch types"); + return {mover_type{id}, device_invoker_type{id}, destroyer_type{id}, sizeof(T)}; + } + + mover_type move_construct_destroy; + invoker_type invoke; + destroyer_type destroy; + size_t size; +}; + +/*! + * Populate and return a pointer to a Dispatcher object for the given policy. + * NOTE: there is a function overload is in each policy/WorkGroup/Dispatcher.hpp + */ +// template < typename T, typename Dispatcher_T > +// inline const Dispatcher_T* get_Dispatcher(work_policy const&); + +} // namespace detail + +} // namespace RAJA + +#endif // closing endif for header file include guard diff --git a/include/RAJA/pattern/WorkGroup/Vtable.hpp b/include/RAJA/pattern/WorkGroup/Vtable.hpp deleted file mode 100644 index 4dfd9c6718..0000000000 --- a/include/RAJA/pattern/WorkGroup/Vtable.hpp +++ /dev/null @@ -1,127 +0,0 @@ -/*! - ****************************************************************************** - * - * \file - * - * \brief Header file providing RAJA Vtable for workgroup. - * - ****************************************************************************** - */ - -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2016-22, Lawrence Livermore National Security, LLC -// and RAJA project contributors. See the RAJA/LICENSE file for details. 
-// -// SPDX-License-Identifier: (BSD-3-Clause) -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - -#ifndef RAJA_PATTERN_WORKGROUP_Vtable_HPP -#define RAJA_PATTERN_WORKGROUP_Vtable_HPP - - -#include "RAJA/config.hpp" - -#include - - -namespace RAJA -{ - -namespace detail -{ - -template < typename > -struct VtableVoidPtrWrapper -{ - void* ptr; - VtableVoidPtrWrapper() = default; - // implicit constructor from void* - RAJA_HOST_DEVICE VtableVoidPtrWrapper(void* p) : ptr(p) { } -}; - -template < typename > -struct VtableVoidConstPtrWrapper -{ - const void* ptr; - VtableVoidConstPtrWrapper() = default; - // implicit constructor from const void* - RAJA_HOST_DEVICE VtableVoidConstPtrWrapper(const void* p) : ptr(p) { } -}; - -/*! - * A vtable abstraction - * - * Provides function pointers for basic functions. - * - * VtableID is used to differentiate function pointers based on their - * function signature. This is helpful to avoid function signature collisions - * with functions that will not be used through this class. This is useful - * during device linking when functions with high register counts may cause - * device linking to fail. - */ -template < typename VtableID, typename ... CallArgs > -struct Vtable { - using void_ptr_wrapper = VtableVoidPtrWrapper; - using void_cptr_wrapper = VtableVoidConstPtrWrapper; - using move_sig = void(*)(void_ptr_wrapper /*dest*/, void_ptr_wrapper /*src*/); - using call_sig = void(*)(void_cptr_wrapper /*obj*/, CallArgs... /*args*/); - using destroy_sig = void(*)(void_ptr_wrapper /*obj*/); - - /// - /// move construct an object of type T in dest as a copy of a T from src and - /// destroy the T obj in src - /// - template < typename T > - static void move_construct_destroy(void_ptr_wrapper dest, void_ptr_wrapper src) - { - T* dest_as_T = static_cast(dest.ptr); - T* src_as_T = static_cast(src.ptr); - new(dest_as_T) T(std::move(*src_as_T)); - (*src_as_T).~T(); - } - - /// - /// call the call operator of the object of type T in obj with args - /// - template < typename T > - static void host_call(void_cptr_wrapper obj, CallArgs... args) - { - const T* obj_as_T = static_cast(obj.ptr); - (*obj_as_T)(std::forward(args)...); - } - /// - template < typename T > - static RAJA_DEVICE void device_call(void_cptr_wrapper obj, CallArgs... args) - { - const T* obj_as_T = static_cast(obj.ptr); - (*obj_as_T)(std::forward(args)...); - } - - /// - /// destoy the object of type T in obj - /// - template < typename T > - static void destroy(void_ptr_wrapper obj) - { - T* obj_as_T = static_cast(obj.ptr); - (*obj_as_T).~T(); - } - - move_sig move_construct_destroy_function_ptr; - call_sig call_function_ptr; - destroy_sig destroy_function_ptr; - size_t size; -}; - -/*! - * Populate and return a pointer to a Vtable object for the given policy. 
- * NOTE: there is a function overload is in each policy/WorkGroup/Vtable.hpp - */ -// template < typename T, typename Vtable_T > -// inline const Vtable_T* get_Vtable(work_policy const&); - -} // namespace detail - -} // namespace RAJA - -#endif // closing endif for header file include guard diff --git a/include/RAJA/pattern/WorkGroup/WorkRunner.hpp b/include/RAJA/pattern/WorkGroup/WorkRunner.hpp index 3168b22ca0..b2775b3226 100644 --- a/include/RAJA/pattern/WorkGroup/WorkRunner.hpp +++ b/include/RAJA/pattern/WorkGroup/WorkRunner.hpp @@ -27,7 +27,7 @@ #include "RAJA/pattern/forall.hpp" -#include "RAJA/pattern/WorkGroup/Vtable.hpp" +#include "RAJA/pattern/WorkGroup/Dispatcher.hpp" #include "RAJA/policy/WorkGroup.hpp" @@ -140,6 +140,7 @@ struct HoldForall */ template @@ -152,6 +153,7 @@ struct WorkRunner; template @@ -159,12 +161,34 @@ struct WorkRunnerForallOrdered_base { using exec_policy = EXEC_POLICY_T; using order_policy = ORDER_POLICY_T; + using dispatch_policy = DISPATCH_POLICY_T; using Allocator = ALLOCATOR_T; using index_type = INDEX_T; using resource_type = typename resources::get_resource::type; using forall_exec_policy = FORALL_EXEC_POLICY; - using vtable_type = Vtable; + + // The type that will hold the segment and loop body in work storage + struct holder_type { + template < typename T > + using type = HoldForall>::type, // segment_type + typename camp::at>::type, // loop_type + index_type, Args...>; + }; + /// + template < typename T > + using holder_type_t = typename holder_type::template type; + + // The policy indicating where the call function is invoked + // in this case the values are called on the host in a loop + using dispatcher_exec_policy = RAJA::loop_work; + + // The Dispatcher policy with holder_types used internally to handle the + // ranges and callables passed in by the user. 
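A minimal sketch, not the RAJA implementation, of the holder_type idea described above: work storage keeps one object per enqueued loop that owns both the segment and the user's loop body, and running the work group simply invokes that object with the extra xargs. MiniHolder below stands in for HoldForall:

#include <cstdio>
#include <vector>

// Owns a copy of the segment and the loop body; its call operator replays
// the loop. In this diff the equivalent role is played by HoldForall, and
// the dispatcher is built over these holder types rather than the raw
// callables the user passed in.
template <typename Segment, typename LoopBody>
struct MiniHolder {
  Segment segment;
  LoopBody body;

  template <typename... Args>
  void operator()(Args&&... args) const {
    for (auto idx : segment) {
      body(idx, args...); // pass the extra xargs through to every iterate
    }
  }
};

int main() {
  std::vector<int> seg{0, 1, 2, 3};
  auto loop = [](int i, int offset) { std::printf("%d\n", i + offset); };

  MiniHolder<std::vector<int>, decltype(loop)> h{seg, loop};
  h(10); // replays the stored loop over the stored segment with xarg 10
  return 0;
}

The dispatcher_holder_policy transform above exists for exactly this reason: for direct_dispatch the dispatch policy must list these holder types, not the raw segment and loop types the user supplied.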
+ using dispatcher_holder_policy = dispatcher_transform_types_t; + + using dispatcher_type = Dispatcher; WorkRunnerForallOrdered_base() = default; @@ -174,24 +198,15 @@ struct WorkRunnerForallOrdered_base WorkRunnerForallOrdered_base(WorkRunnerForallOrdered_base &&) = default; WorkRunnerForallOrdered_base& operator=(WorkRunnerForallOrdered_base &&) = default; - // The type that will hold the segment and loop body in work storage - template < typename segment_type, typename loop_type > - using holder_type = HoldForall; - - // The policy indicating where the call function is invoked - // in this case the values are called on the host in a loop - using vtable_exec_policy = RAJA::loop_work; - // runner interfaces with storage to enqueue so the runner can get // information from the segment and loop at enqueue time template < typename WorkContainer, typename segment_T, typename loop_T > inline void enqueue(WorkContainer& storage, segment_T&& seg, loop_T&& loop) { - using holder = holder_type, camp::decay>; + using holder = holder_type_t, camp::decay>>; storage.template emplace( - get_Vtable(vtable_exec_policy{}), + get_Dispatcher(dispatcher_exec_policy{}), std::forward(seg), std::forward(loop)); } @@ -209,6 +224,7 @@ struct WorkRunnerForallOrdered_base template @@ -217,6 +233,7 @@ struct WorkRunnerForallOrdered FORALL_EXEC_POLICY, EXEC_POLICY_T, ORDER_POLICY_T, + DISPATCH_POLICY_T, ALLOCATOR_T, INDEX_T, Args...> @@ -225,6 +242,7 @@ struct WorkRunnerForallOrdered FORALL_EXEC_POLICY, EXEC_POLICY_T, ORDER_POLICY_T, + DISPATCH_POLICY_T, ALLOCATOR_T, INDEX_T, Args...>; @@ -242,7 +260,7 @@ struct WorkRunnerForallOrdered auto end = storage.end(); for (auto iter = storage.begin(); iter != end; ++iter) { - value_type::call(&*iter, r, args...); + value_type::host_call(&*iter, r, args...); } return run_storage; @@ -255,6 +273,7 @@ struct WorkRunnerForallOrdered template @@ -263,6 +282,7 @@ struct WorkRunnerForallReverse FORALL_EXEC_POLICY, EXEC_POLICY_T, ORDER_POLICY_T, + DISPATCH_POLICY_T, ALLOCATOR_T, INDEX_T, Args...> @@ -271,6 +291,7 @@ struct WorkRunnerForallReverse FORALL_EXEC_POLICY, EXEC_POLICY_T, ORDER_POLICY_T, + DISPATCH_POLICY_T, ALLOCATOR_T, INDEX_T, Args...>; @@ -288,7 +309,7 @@ struct WorkRunnerForallReverse auto begin = storage.begin(); for (auto iter = storage.end(); iter != begin; --iter) { - value_type::call(&*(iter-1), r, args...); + value_type::host_call(&*(iter-1), r, args...); } return run_storage; diff --git a/include/RAJA/pattern/WorkGroup/WorkStorage.hpp b/include/RAJA/pattern/WorkGroup/WorkStorage.hpp index b4e0bb4632..8cc442c01e 100644 --- a/include/RAJA/pattern/WorkGroup/WorkStorage.hpp +++ b/include/RAJA/pattern/WorkGroup/WorkStorage.hpp @@ -191,11 +191,11 @@ struct random_access_iterator : iterator_base /*! 
* A storage container for work groups */ -template < typename STORAGE_POLICY_T, typename ALLOCATOR_T, typename Vtable_T > +template < typename STORAGE_POLICY_T, typename ALLOCATOR_T, typename Dispatcher_T > class WorkStorage; -template < typename ALLOCATOR_T, typename Vtable_T > -class WorkStorage +template < typename ALLOCATOR_T, typename Dispatcher_T > +class WorkStorage { using allocator_traits_type = std::allocator_traits; using propagate_on_container_copy_assignment = @@ -208,12 +208,12 @@ class WorkStorage "WorkStorage expects an allocator for 'char's."); public: using storage_policy = RAJA::array_of_pointers; - using vtable_type = Vtable_T; + using dispatcher_type = Dispatcher_T; template < typename holder > - using true_value_type = WorkStruct; + using true_value_type = WorkStruct; - using value_type = GenericWorkStruct; + using value_type = GenericWorkStruct; using allocator_type = ALLOCATOR_T; using size_type = std::size_t; using difference_type = std::ptrdiff_t; @@ -338,10 +338,10 @@ class WorkStorage } template < typename holder, typename ... holder_ctor_args > - void emplace(const vtable_type* vtable, holder_ctor_args&&... ctor_args) + void emplace(const dispatcher_type* dispatcher, holder_ctor_args&&... ctor_args) { m_vec.emplace_back(create_value( - vtable, std::forward(ctor_args)...)); + dispatcher, std::forward(ctor_args)...)); } // destroy all stored loops, deallocates all storage @@ -390,7 +390,7 @@ class WorkStorage // allocate and construct value in storage template < typename holder, typename ... holder_ctor_args > - pointer_and_size create_value(const vtable_type* vtable, + pointer_and_size create_value(const dispatcher_type* dispatcher, holder_ctor_args&&... ctor_args) { const size_type value_size = sizeof(true_value_type); @@ -399,7 +399,7 @@ class WorkStorage allocator_traits_type::allocate(m_aloc, value_size)); value_type::template construct( - value_ptr, vtable, std::forward(ctor_args)...); + value_ptr, dispatcher, std::forward(ctor_args)...); return pointer_and_size{value_ptr, value_size}; } @@ -429,8 +429,8 @@ class WorkStorage } }; -template < typename ALLOCATOR_T, typename Vtable_T > -class WorkStorage +template < typename ALLOCATOR_T, typename Dispatcher_T > +class WorkStorage { using allocator_traits_type = std::allocator_traits; using propagate_on_container_copy_assignment = @@ -443,12 +443,12 @@ class WorkStorage "WorkStorage expects an allocator for 'char's."); public: using storage_policy = RAJA::ragged_array_of_objects; - using vtable_type = Vtable_T; + using dispatcher_type = Dispatcher_T; template < typename holder > - using true_value_type = WorkStruct; + using true_value_type = WorkStruct; - using value_type = GenericWorkStruct; + using value_type = GenericWorkStruct; using allocator_type = ALLOCATOR_T; using size_type = std::size_t; using difference_type = std::ptrdiff_t; @@ -568,11 +568,11 @@ class WorkStorage } template < typename holder, typename ... holder_ctor_args > - void emplace(const vtable_type* vtable, holder_ctor_args&&... ctor_args) + void emplace(const dispatcher_type* dispatcher, holder_ctor_args&&... ctor_args) { size_type value_offset = storage_size(); size_type value_size = create_value(value_offset, - vtable, std::forward(ctor_args)...); + dispatcher, std::forward(ctor_args)...); m_offsets.emplace_back(value_offset); m_array_end += value_size; } @@ -698,7 +698,7 @@ class WorkStorage // and store the loop body template < typename holder, typename ... 
holder_ctor_args > size_type create_value(size_type value_offset, - const vtable_type* vtable, + const dispatcher_type* dispatcher, holder_ctor_args&&... ctor_args) { const size_type value_size = sizeof(true_value_type); @@ -710,7 +710,7 @@ class WorkStorage pointer value_ptr = reinterpret_cast(m_array_begin + value_offset); value_type::template construct( - value_ptr, vtable, std::forward(ctor_args)...); + value_ptr, dispatcher, std::forward(ctor_args)...); return value_size; } @@ -732,10 +732,10 @@ class WorkStorage } }; -template < typename ALLOCATOR_T, typename Vtable_T > +template < typename ALLOCATOR_T, typename Dispatcher_T > class WorkStorage + Dispatcher_T> { using allocator_traits_type = std::allocator_traits; using propagate_on_container_copy_assignment = @@ -748,12 +748,12 @@ class WorkStorage - using true_value_type = WorkStruct; + using true_value_type = WorkStruct; - using value_type = GenericWorkStruct; + using value_type = GenericWorkStruct; using allocator_type = ALLOCATOR_T; using size_type = std::size_t; using difference_type = std::ptrdiff_t; @@ -873,9 +873,9 @@ class WorkStorage - void emplace(const vtable_type* vtable, holder_ctor_args&&... ctor_args) + void emplace(const dispatcher_type* dispatcher, holder_ctor_args&&... ctor_args) { - create_value(vtable, std::forward(ctor_args)...); + create_value(dispatcher, std::forward(ctor_args)...); m_array_end += m_stride; } @@ -1003,7 +1003,7 @@ class WorkStorage - void create_value(const vtable_type* vtable, + void create_value(const dispatcher_type* dispatcher, holder_ctor_args&&... ctor_args) { const size_type value_size = sizeof(true_value_type); @@ -1020,7 +1020,7 @@ class WorkStorage(m_array_begin + value_offset); value_type::template construct( - value_ptr, vtable, std::forward(ctor_args)...); + value_ptr, dispatcher, std::forward(ctor_args)...); } // move construct the loop body in value from other and diff --git a/include/RAJA/pattern/WorkGroup/WorkStruct.hpp b/include/RAJA/pattern/WorkGroup/WorkStruct.hpp index 09399d43c6..6bdd56a3c3 100644 --- a/include/RAJA/pattern/WorkGroup/WorkStruct.hpp +++ b/include/RAJA/pattern/WorkGroup/WorkStruct.hpp @@ -23,7 +23,7 @@ #include #include -#include "RAJA/pattern/WorkGroup/Vtable.hpp" +#include "RAJA/pattern/WorkGroup/Dispatcher.hpp" namespace RAJA @@ -35,7 +35,7 @@ namespace detail /*! * A struct that gives a generic way to layout memory for different loops */ -template < size_t size, typename Vtable_T > +template < size_t size, typename Dispatcher_T > struct WorkStruct; /*! @@ -44,22 +44,22 @@ struct WorkStruct; * offsetof(GenericWorkStruct<>, obj) == offsetof(WorkStruct, obj) * sizeof(GenericWorkStruct) <= sizeof(WorkStruct) */ -template < typename Vtable_T > -using GenericWorkStruct = WorkStruct; +template < typename Dispatcher_T > +using GenericWorkStruct = WorkStruct; -template < size_t size, typename VtableID, typename ... CallArgs > -struct WorkStruct> +template < size_t size, Platform platform, typename dispatch_policy, typename DispatcherID, typename ... CallArgs > +struct WorkStruct> { - using vtable_type = Vtable; + using dispatcher_type = Dispatcher; // construct a WorkStruct with a value of type holder from the args and // check a variety of constraints at compile time template < typename holder, typename ... holder_ctor_args > static RAJA_INLINE - void construct(void* ptr, const vtable_type* vtable, holder_ctor_args&&... ctor_args) + void construct(void* ptr, const dispatcher_type* dispatcher, holder_ctor_args&&... 
ctor_args) { - using true_value_type = WorkStruct; - using value_type = GenericWorkStruct; + using true_value_type = WorkStruct; + using value_type = GenericWorkStruct; static_assert(sizeof(holder) <= sizeof(true_value_type::obj), "holder must fit in WorkStruct::obj"); @@ -74,8 +74,8 @@ struct WorkStruct> true_value_type* value_ptr = static_cast(ptr); - value_ptr->vtable = vtable; - value_ptr->call_function_ptr = vtable->call_function_ptr; + value_ptr->dispatcher = dispatcher; + value_ptr->invoke = dispatcher->invoke; new(&value_ptr->obj) holder(std::forward(ctor_args)...); } @@ -84,27 +84,34 @@ struct WorkStruct> void move_destroy(WorkStruct* value_dst, WorkStruct* value_src) { - value_dst->vtable = value_src->vtable; - value_dst->call_function_ptr = value_src->call_function_ptr; - value_dst->vtable->move_construct_destroy_function_ptr(&value_dst->obj, &value_src->obj); + value_dst->dispatcher = value_src->dispatcher; + value_dst->invoke = value_src->invoke; + value_dst->dispatcher->move_construct_destroy(&value_dst->obj, &value_src->obj); } // destroy the value ptr static RAJA_INLINE void destroy(WorkStruct* value_ptr) { - value_ptr->vtable->destroy_function_ptr(&value_ptr->obj); + value_ptr->dispatcher->destroy(&value_ptr->obj); } - // call the call operator of the value ptr with args - static RAJA_HOST_DEVICE RAJA_INLINE - void call(const WorkStruct* value_ptr, CallArgs... args) + // invoke the call operator of the value ptr with args + static RAJA_INLINE + void host_call(const WorkStruct* value_ptr, CallArgs... args) + { + value_ptr->invoke(&value_ptr->obj, std::forward(args)...); + } + /// + // invoke the call operator of the value ptr with args + static RAJA_DEVICE RAJA_INLINE + void device_call(const WorkStruct* value_ptr, CallArgs... args) { - value_ptr->call_function_ptr(&value_ptr->obj, std::forward(args)...); + value_ptr->invoke(&value_ptr->obj, std::forward(args)...); } - const vtable_type* vtable; - typename vtable_type::call_sig call_function_ptr; + const dispatcher_type* dispatcher; + typename dispatcher_type::invoker_type invoke; typename std::aligned_storage::type obj; }; diff --git a/include/RAJA/pattern/forall.hpp b/include/RAJA/pattern/forall.hpp index 0daf58ab1e..f0a2d85bbe 100644 --- a/include/RAJA/pattern/forall.hpp +++ b/include/RAJA/pattern/forall.hpp @@ -120,15 +120,15 @@ struct icount_adapter { }; struct CallForall { - template - RAJA_INLINE camp::resources::EventProxy operator()(T const&, ExecPol, Body, Res) const; + template + RAJA_INLINE camp::resources::EventProxy operator()(T const&, ExecPol, Body, Res, ForallParams) const; }; struct CallForallIcount { constexpr CallForallIcount(int s); - template - RAJA_INLINE camp::resources::EventProxy operator()(T const&, ExecPol, Body, Res) const; + template + RAJA_INLINE camp::resources::EventProxy operator()(T const&, ExecPol, Body, Res, ForallParams) const; const int start; }; @@ -152,6 +152,21 @@ namespace wrap * ****************************************************************************** */ +template +RAJA_INLINE concepts::enable_if_t< + RAJA::resources::EventProxy, + concepts::negate>, + type_traits::is_range> +forall(Res r, ExecutionPolicy&& p, Container&& c, LoopBody&& loop_body, ForallParams&& f_params) +{ + RAJA_FORCEINLINE_RECURSIVE + return forall_impl(r, + std::forward(p), + std::forward(c), + std::forward(loop_body), + std::forward(f_params)); +} + template RAJA_INLINE concepts::enable_if_t< RAJA::resources::EventProxy, @@ -163,7 +178,8 @@ forall(Res r, ExecutionPolicy&& p, Container&& c, LoopBody&& 
loop_body) return forall_impl(r, std::forward(p), std::forward(c), - std::forward(loop_body)); + std::forward(loop_body), + expt::get_empty_forall_param_pack()); } @@ -178,12 +194,14 @@ template + typename LoopBody, + typename ForallParams> RAJA_INLINE resources::EventProxy forall_Icount(Res r, ExecutionPolicy&& p, Container&& c, IndexType&& icount, - LoopBody&& loop_body) + LoopBody&& loop_body, + ForallParams&& f_params) { using std::begin; using std::distance; @@ -194,7 +212,7 @@ RAJA_INLINE resources::EventProxy forall_Icount(Res r, icount); using policy::sequential::forall_impl; RAJA_FORCEINLINE_RECURSIVE - return forall_impl(r, std::forward(p), range, adapted); + return forall_impl(r, std::forward(p), range, adapted, std::forward(f_params)); } /*! @@ -210,12 +228,14 @@ template + typename LoopBody, + typename ForallParams> RAJA_INLINE resources::EventProxy forall_Icount(Res r, ExecPolicy, const TypedIndexSet& iset, - LoopBody loop_body) + LoopBody loop_body, + ForallParams f_params) { // no need for icount variant here auto segIterRes = resources::get_resource::type::get_default(); @@ -224,7 +244,8 @@ RAJA_INLINE resources::EventProxy forall_Icount(Res r, detail::CallForallIcount(iset.getStartingIcount(segID)), SegmentExecPolicy(), loop_body, - r); + r, + f_params); }); return RAJA::resources::EventProxy(r); } @@ -233,16 +254,18 @@ template + typename... SegmentTypes, + typename ForallParams> RAJA_INLINE resources::EventProxy forall(Res r, ExecPolicy, const TypedIndexSet& iset, - LoopBody loop_body) + LoopBody loop_body, + ForallParams f_params) { auto segIterRes = resources::get_resource::type::get_default(); wrap::forall(segIterRes, SegmentIterPolicy(), iset, [=, &r](int segID) { - iset.segmentCall(segID, detail::CallForall{}, SegmentExecPolicy(), loop_body, r); + iset.segmentCall(segID, detail::CallForall{}, SegmentExecPolicy(), loop_body, r, f_params); }); return RAJA::resources::EventProxy(r); } @@ -271,16 +294,20 @@ inline namespace policy_by_value_interface * ****************************************************************************** */ -template +template RAJA_INLINE resources::EventProxy forall_Icount(ExecutionPolicy&& p, Res r, IdxSet&& c, - LoopBody&& loop_body) + Params&&... params) { static_assert(type_traits::is_index_set::value, "Expected a TypedIndexSet but did not get one. Are you using " "a TypedIndexSet policy by mistake?"); + auto f_params = expt::make_forall_param_pack(std::forward(params)...); + auto&& loop_body = expt::get_lambda(std::forward(params)...); + //expt::check_forall_optional_args(loop_body, f_params); + util::PluginContext context{util::make_context>()}; util::callPreCapturePlugins(context); @@ -295,7 +322,8 @@ RAJA_INLINE resources::EventProxy forall_Icount(ExecutionPolicy&& p, r, std::forward(p), std::forward(c), - std::move(body)); + std::move(body), + f_params); util::callPostLaunchPlugins(context); return e; @@ -321,16 +349,20 @@ RAJA_INLINE resources::EventProxy forall_Icount(ExecutionPolicy&& p, * ****************************************************************************** */ -template +template RAJA_INLINE concepts::enable_if_t< resources::EventProxy, type_traits::is_indexset_policy> -forall(ExecutionPolicy&& p, Res r, IdxSet&& c, LoopBody&& loop_body) +forall(ExecutionPolicy&& p, Res r, IdxSet&& c, Params&&... params) { static_assert(type_traits::is_index_set::value, "Expected a TypedIndexSet but did not get one. 
Are you using " "a TypedIndexSet policy by mistake?"); + auto f_params = expt::make_forall_param_pack(std::forward(params)...); + auto&& loop_body = expt::get_lambda(std::forward(params)...); + expt::check_forall_optional_args(loop_body, f_params); + util::PluginContext context{util::make_context>()}; util::callPreCapturePlugins(context); @@ -345,7 +377,8 @@ forall(ExecutionPolicy&& p, Res r, IdxSet&& c, LoopBody&& loop_body) r, std::forward(p), std::forward(c), - std::move(body)); + std::move(body), + f_params); util::callPostLaunchPlugins(context); return e; @@ -374,7 +407,8 @@ forall(ExecutionPolicy&& p, IdxSet&& c, LoopBody&& loop_body) */ template ::type > -RAJA_INLINE concepts::enable_if< +RAJA_INLINE concepts::enable_if_t< + resources::EventProxy, type_traits::is_multi_policy, type_traits::is_range> forall(ExecutionPolicy&& p, Container&& c, LoopBody&& loop_body) @@ -385,7 +419,7 @@ forall(ExecutionPolicy&& p, Container&& c, LoopBody&& loop_body) auto r = Res::get_default(); // plugins handled in multipolicy policy_invoker - forall_impl(r, + return forall_impl(r, std::forward(p), std::forward(c), std::forward(loop_body)); @@ -402,7 +436,8 @@ template + typename FirstParam, + typename... Params> RAJA_INLINE concepts::enable_if_t< resources::EventProxy, type_traits::is_range, @@ -411,11 +446,16 @@ forall_Icount(ExecutionPolicy&& p, Res r, Container&& c, IndexType icount, - LoopBody&& loop_body) + FirstParam&& first, + Params&&... params) { static_assert(type_traits::is_random_access_range::value, "Container does not model RandomAccessIterator"); + auto f_params = expt::make_forall_param_pack(std::forward(first), std::forward(params)...); + auto&& loop_body = expt::get_lambda(std::forward(first), std::forward(params)...); + //expt::check_forall_optional_args(loop_body, f_params); + util::PluginContext context{util::make_context>()}; util::callPreCapturePlugins(context); @@ -431,7 +471,8 @@ forall_Icount(ExecutionPolicy&& p, std::forward(p), std::forward(c), icount, - std::move(body)); + std::move(body), + f_params); util::callPostLaunchPlugins(context); return e; @@ -467,17 +508,22 @@ forall_Icount(ExecutionPolicy&& p, * ****************************************************************************** */ -template + +template RAJA_INLINE concepts::enable_if_t< resources::EventProxy, concepts::negate>, concepts::negate>, type_traits::is_range> -forall(ExecutionPolicy&& p, Res r, Container&& c, LoopBody&& loop_body) +forall(ExecutionPolicy&& p, Res r, Container&& c, Params&&... params) { static_assert(type_traits::is_random_access_range::value, "Container does not model RandomAccessIterator"); + auto f_params = expt::make_forall_param_pack(std::forward(params)...); + auto&& loop_body = expt::get_lambda(std::forward(params)...); + expt::check_forall_optional_args(loop_body, f_params); + util::PluginContext context{util::make_context>()}; util::callPreCapturePlugins(context); @@ -492,11 +538,13 @@ forall(ExecutionPolicy&& p, Res r, Container&& c, LoopBody&& loop_body) r, std::forward(p), std::forward(c), - std::move(body)); + std::move(body), + f_params); util::callPostLaunchPlugins(context); return e; } + template ::type > RAJA_INLINE concepts::enable_if_t< @@ -563,32 +611,142 @@ forall_Icount(Res r, Args&&... 
args) namespace detail { -template +template RAJA_INLINE camp::resources::EventProxy CallForall::operator()(T const& segment, ExecutionPolicy, LoopBody body, - Res r) const + Res r, + ForallParams f_params) const { // this is only called inside a region, use impl using policy::sequential::forall_impl; RAJA_FORCEINLINE_RECURSIVE - return forall_impl(r, ExecutionPolicy(), segment, body); + return forall_impl(r, ExecutionPolicy(), segment, body, f_params); } constexpr CallForallIcount::CallForallIcount(int s) : start(s) {} -template +template RAJA_INLINE camp::resources::EventProxy CallForallIcount::operator()(T const& segment, ExecutionPolicy, LoopBody body, - Res r) const + Res r, + ForallParams f_params) const { // go through wrap to unwrap icount - return wrap::forall_Icount(r, ExecutionPolicy(), segment, start, body); + return wrap::forall_Icount(r, ExecutionPolicy(), segment, start, body, f_params); } } // namespace detail +// +// Experimental support for dynamic policy selection +// +// Future directions: +// - Tuple of resources one for each platform +// - Returns a generic event proxy only if a resource is provided +// avoids overhead of constructing a typed erased resource +// +namespace expt +{ + + template + struct dynamic_helper + { + template + static void invoke_forall(const int pol, SEGMENT const &seg, BODY const &body) + { + if(IDX==pol){ + using t_pol = typename camp::at>::type; + RAJA::forall(seg, body); + return; + } + dynamic_helper::invoke_forall(pol, seg, body); + } + + template + static resources::EventProxy + invoke_forall(RAJA::resources::Resource r, const int pol, SEGMENT const &seg, BODY const &body) + { + + using t_pol = typename camp::at>::type; + using resource_type = typename resources::get_resource::type; + + if(IDX==pol){ + RAJA::forall(r.get(), seg, body); + + //Return a generic event proxy from r, + //because forall returns a typed event proxy + return {r}; + } + + return dynamic_helper::invoke_forall(r, pol, seg, body); + } + + }; + + template + struct dynamic_helper<0, POLICY_LIST> + { + template + static void + invoke_forall(const int pol, SEGMENT const &seg, BODY const &body) + { + if(0==pol){ + using t_pol = typename camp::at>::type; + RAJA::forall(seg, body); + return; + } + RAJA_ABORT_OR_THROW("Policy enum not supported "); + } + + template + static resources::EventProxy + invoke_forall(RAJA::resources::Resource r, const int pol, SEGMENT const &seg, BODY const &body) + { + if(pol != 0) RAJA_ABORT_OR_THROW("Policy value out of range "); + + using t_pol = typename camp::at>::type; + using resource_type = typename resources::get_resource::type; + + RAJA::forall(r.get(), seg, body); + + //Return a generic event proxy from r, + //because forall returns a typed event proxy + return {r}; + } + + }; + + template + void dynamic_forall(const int pol, SEGMENT const &seg, BODY const &body) + { + constexpr int N = camp::size::value; + static_assert(N > 0, "RAJA policy list must not be empty"); + + if(pol > N-1) { + RAJA_ABORT_OR_THROW("Policy enum not supported"); + } + dynamic_helper::invoke_forall(pol, seg, body); + } + + template + resources::EventProxy + dynamic_forall(RAJA::resources::Resource r, const int pol, SEGMENT const &seg, BODY const &body) + { + constexpr int N = camp::size::value; + static_assert(N > 0, "RAJA policy list must not be empty"); + + if(pol > N-1) { + RAJA_ABORT_OR_THROW("Policy value out of range"); + } + + return dynamic_helper::invoke_forall(r, pol, seg, body); + } + +} // namespace expt + + } // namespace RAJA diff --git 
a/include/RAJA/pattern/kernel/For.hpp b/include/RAJA/pattern/kernel/For.hpp index 36b72f9fe8..1f5165a9e3 100644 --- a/include/RAJA/pattern/kernel/For.hpp +++ b/include/RAJA/pattern/kernel/For.hpp @@ -103,7 +103,7 @@ struct StatementExecutor< auto r = data.res; - forall_impl(r, ExecPolicy{}, TypedRangeSegment(0, len), for_wrapper); + forall_impl(r, ExecPolicy{}, TypedRangeSegment(0, len), for_wrapper, RAJA::expt::get_empty_forall_param_pack()); } }; diff --git a/include/RAJA/pattern/kernel/ForICount.hpp b/include/RAJA/pattern/kernel/ForICount.hpp index 1f28fb4740..efb4b6fb10 100644 --- a/include/RAJA/pattern/kernel/ForICount.hpp +++ b/include/RAJA/pattern/kernel/ForICount.hpp @@ -112,7 +112,7 @@ struct StatementExecutor< auto r = resources::get_resource::type::get_default(); - forall_impl(r, ExecPolicy{}, TypedRangeSegment(0, len), for_wrapper); + forall_impl(r, ExecPolicy{}, TypedRangeSegment(0, len), for_wrapper, RAJA::expt::get_empty_forall_param_pack()); } }; diff --git a/include/RAJA/pattern/kernel/Hyperplane.hpp b/include/RAJA/pattern/kernel/Hyperplane.hpp index 9501876bdc..403ae905ff 100644 --- a/include/RAJA/pattern/kernel/Hyperplane.hpp +++ b/include/RAJA/pattern/kernel/Hyperplane.hpp @@ -148,7 +148,8 @@ struct StatementExecutor::type::get_default(); forall_impl(r, HpExecPolicy{}, TypedRangeSegment(0, hp_len), - outer_wrapper); + outer_wrapper, + RAJA::expt::get_empty_forall_param_pack()); } }; diff --git a/include/RAJA/pattern/kernel/Tile.hpp b/include/RAJA/pattern/kernel/Tile.hpp index db3e7fb8e3..013babbaef 100644 --- a/include/RAJA/pattern/kernel/Tile.hpp +++ b/include/RAJA/pattern/kernel/Tile.hpp @@ -243,7 +243,7 @@ struct StatementExecutor< // Loop over tiles, executing enclosed statement list auto r = resources::get_resource::type::get_default(); - forall_impl(r, EPol{}, tiled_iterable, tile_wrapper); + forall_impl(r, EPol{}, tiled_iterable, tile_wrapper, RAJA::expt::get_empty_forall_param_pack()); // Set range back to original values camp::get(data.segment_tuple) = tiled_iterable.it; @@ -277,7 +277,7 @@ struct StatementExecutor< // Loop over tiles, executing enclosed statement list auto r = resources::get_resource::type::get_default(); - forall_impl(r, EPol{}, tiled_iterable, tile_wrapper); + forall_impl(r, EPol{}, tiled_iterable, tile_wrapper, RAJA::expt::get_empty_forall_param_pack()); // Set range back to original values camp::get(data.segment_tuple) = tiled_iterable.it; diff --git a/include/RAJA/pattern/kernel/TileTCount.hpp b/include/RAJA/pattern/kernel/TileTCount.hpp index 4068e36904..f8bc431cfe 100644 --- a/include/RAJA/pattern/kernel/TileTCount.hpp +++ b/include/RAJA/pattern/kernel/TileTCount.hpp @@ -124,7 +124,7 @@ struct StatementExecutor< // Loop over tiles, executing enclosed statement list auto r = resources::get_resource::type::get_default(); - forall_impl(r, EPol{}, tiled_iterable, tile_wrapper); + forall_impl(r, EPol{}, tiled_iterable, tile_wrapper, RAJA::expt::get_empty_forall_param_pack()); // Set range back to original values camp::get(data.segment_tuple) = tiled_iterable.it; diff --git a/include/RAJA/pattern/teams.hpp b/include/RAJA/pattern/launch.hpp similarity index 56% rename from include/RAJA/pattern/teams.hpp rename to include/RAJA/pattern/launch.hpp index a61b1fde8c..e590bf33b2 100644 --- a/include/RAJA/pattern/teams.hpp +++ b/include/RAJA/pattern/launch.hpp @@ -3,7 +3,7 @@ * * \file * - * \brief RAJA header file containing headers for RAJA::Teams backends + * \brief RAJA header file containing headers for RAJA::Launch backends * 
****************************************************************************** */ @@ -15,28 +15,32 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -#ifndef RAJA_pattern_teams_HPP -#define RAJA_pattern_teams_HPP +#ifndef RAJA_pattern_launch_HPP +#define RAJA_pattern_launch_HPP -#include "RAJA/pattern/teams/teams_core.hpp" +#include "RAJA/pattern/launch/launch_core.hpp" // // All platforms must support host execution. // -#include "RAJA/policy/sequential/teams.hpp" -#include "RAJA/policy/loop/teams.hpp" -#include "RAJA/policy/simd/teams.hpp" +#include "RAJA/policy/sequential/launch.hpp" +#include "RAJA/policy/loop/launch.hpp" +#include "RAJA/policy/simd/launch.hpp" #if defined(RAJA_CUDA_ACTIVE) -#include "RAJA/policy/cuda/teams.hpp" +#include "RAJA/policy/cuda/launch.hpp" #endif #if defined(RAJA_HIP_ACTIVE) -#include "RAJA/policy/hip/teams.hpp" +#include "RAJA/policy/hip/launch.hpp" #endif #if defined(RAJA_ENABLE_OPENMP) -#include "RAJA/policy/openmp/teams.hpp" +#include "RAJA/policy/openmp/launch.hpp" #endif -#endif /* RAJA_pattern_teams_HPP */ +#if defined(RAJA_ENABLE_SYCL) +#include "RAJA/policy/sycl/launch.hpp" +#endif + +#endif /* RAJA_pattern_launch_HPP */ diff --git a/include/RAJA/pattern/teams/teams_core.hpp b/include/RAJA/pattern/launch/launch_core.hpp similarity index 74% rename from include/RAJA/pattern/teams/teams_core.hpp rename to include/RAJA/pattern/launch/launch_core.hpp index 0c178df3b7..12d6f16f6f 100644 --- a/include/RAJA/pattern/teams/teams_core.hpp +++ b/include/RAJA/pattern/launch/launch_core.hpp @@ -15,8 +15,8 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -#ifndef RAJA_pattern_teams_core_HPP -#define RAJA_pattern_teams_core_HPP +#ifndef RAJA_pattern_launch_core_HPP +#define RAJA_pattern_launch_core_HPP #include "RAJA/config.hpp" #include "RAJA/internal/get_platform.hpp" @@ -28,7 +28,10 @@ #include "camp/concepts.hpp" #include "camp/tuple.hpp" -#if defined(RAJA_DEVICE_CODE) +//Odd dependecy with atomics is breaking CI builds +//#include "RAJA/util/View.hpp" + +#if defined(RAJA_DEVICE_CODE) && !defined(RAJA_ENABLE_SYCL) #define RAJA_TEAM_SHARED __shared__ #else #define RAJA_TEAM_SHARED @@ -37,11 +40,9 @@ namespace RAJA { -namespace expt -{ - // GPU or CPU threads available -enum ExecPlace { HOST, DEVICE, NUM_PLACES }; +//strongly type the ExecPlace (guards agaist errors) +enum struct ExecPlace : int { HOST, DEVICE, NUM_PLACES }; struct null_launch_t { }; @@ -128,18 +129,17 @@ struct Lanes { constexpr Lanes(int i) : value(i) {} }; -struct Grid { +struct LaunchParams { public: Teams teams; Threads threads; - Lanes lanes; - const char *kernel_name{nullptr}; + size_t shared_mem_size; RAJA_INLINE - Grid() = default; + LaunchParams() = default; - Grid(Teams in_teams, Threads in_threads, const char *in_kernel_name = nullptr) - : teams(in_teams), threads(in_threads), kernel_name(in_kernel_name){}; + LaunchParams(Teams in_teams, Threads in_threads, size_t in_shared_mem_size = 0) + : teams(in_teams), threads(in_threads), shared_mem_size(in_shared_mem_size) {}; private: RAJA_HOST_DEVICE @@ -149,26 +149,63 @@ struct Grid { RAJA_HOST_DEVICE RAJA_INLINE Threads apply(Threads const &a) { return (threads = a); } - - RAJA_HOST_DEVICE - RAJA_INLINE - Lanes apply(Lanes const &a) { return (lanes = a); } }; - -class LaunchContext : public Grid +class LaunchContext { public: - LaunchContext(Grid const &base) - : Grid(base) + //Bump 
style allocator used to + //get memory from the pool + size_t shared_mem_offset; + + void *shared_mem_ptr; + +#if defined(RAJA_ENABLE_SYCL) + mutable cl::sycl::nd_item<3> *itm; +#endif + + RAJA_HOST_DEVICE LaunchContext() + : shared_mem_offset(0), shared_mem_ptr(nullptr) { } + //TODO handle alignment + template + RAJA_HOST_DEVICE T* getSharedMemory(size_t bytes) + { + T * mem_ptr = &((T*) shared_mem_ptr)[shared_mem_offset]; + + shared_mem_offset += bytes*sizeof(T); + return mem_ptr; + } + + /* + //Odd dependecy with atomics is breaking CI builds + template + RAJA_HOST_DEVICE auto getSharedMemoryView(size_t bytes, arg idx, args... idxs) + { + T * mem_ptr = &((T*) shared_mem_ptr)[shared_mem_offset]; + + shared_mem_offset += bytes*sizeof(T); + return RAJA::View>(mem_ptr, idx, idxs...); + } + */ + + RAJA_HOST_DEVICE void releaseSharedMemory() + { + //On the cpu/gpu we want to restart the count + shared_mem_offset = 0; + } + RAJA_HOST_DEVICE void teamSync() { -#if defined(RAJA_DEVICE_CODE) +#if defined(RAJA_DEVICE_CODE) && defined(RAJA_ENABLE_SYCL) + itm->barrier(sycl::access::fence_space::local_space); +#endif + +#if defined(RAJA_DEVICE_CODE) && !defined(RAJA_ENABLE_SYCL) __syncthreads(); #endif } @@ -177,31 +214,44 @@ class LaunchContext : public Grid template struct LaunchExecute; +//Policy based launch without name argument +template +void launch(LaunchParams const ¶ms, BODY const &body) +{ + launch(params, nullptr, body); +} + //Policy based launch template -void launch(Grid const &grid, BODY const &body) +void launch(LaunchParams const ¶ms, const char *kernel_name, BODY const &body) { //Take the first policy as we assume the second policy is not user defined. //We rely on the user to pair launch and loop policies correctly. using launch_t = LaunchExecute; - launch_t::exec(LaunchContext(grid), body); + launch_t::exec(params, kernel_name, body); } //Run time based policy launch template -void launch(ExecPlace place, Grid const &grid, BODY const &body) +void launch(ExecPlace place, LaunchParams const ¶ms, BODY const &body) +{ + launch(place, params, nullptr, body); +} + +template +void launch(ExecPlace place, const LaunchParams ¶ms, const char *kernel_name, BODY const &body) { switch (place) { - case HOST: { + case ExecPlace::HOST: { using launch_t = LaunchExecute; - launch_t::exec(LaunchContext(grid), body); + launch_t::exec(params, kernel_name, body); break; } #ifdef RAJA_DEVICE_ACTIVE - case DEVICE: { + case ExecPlace::DEVICE: { using launch_t = LaunchExecute; - launch_t::exec(LaunchContext(grid), body); + launch_t::exec(params, kernel_name, body); break; } #endif @@ -211,16 +261,16 @@ void launch(ExecPlace place, Grid const &grid, BODY const &body) } // Helper function to retrieve a resource based on the run-time policy - if a device is active -#if defined(RAJA_DEVICE_ACTIVE) +#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) template -RAJA::resources::Resource Get_Runtime_Resource(T host_res, U device_res, RAJA::expt::ExecPlace device){ - if(device == RAJA::expt::DEVICE) {return RAJA::resources::Resource(device_res);} +RAJA::resources::Resource Get_Runtime_Resource(T host_res, U device_res, RAJA::ExecPlace device){ + if(device == RAJA::ExecPlace::DEVICE) {return RAJA::resources::Resource(device_res);} else { return RAJA::resources::Resource(host_res); } } #else template -RAJA::resources::Resource Get_Host_Resource(T host_res, RAJA::expt::ExecPlace device){ - if(device == RAJA::expt::DEVICE) {RAJA_ABORT_OR_THROW("Device is not enabled");} +RAJA::resources::Resource 
Get_Host_Resource(T host_res, RAJA::ExecPlace device){ + if(device == RAJA::ExecPlace::DEVICE) {RAJA_ABORT_OR_THROW("Device is not enabled");} return RAJA::resources::Resource(host_res); } @@ -230,25 +280,32 @@ RAJA::resources::Resource Get_Host_Resource(T host_res, RAJA::expt::ExecPlace de //Launch API which takes team resource struct template resources::EventProxy -launch(RAJA::resources::Resource res, Grid const &grid, BODY const &body) +launch(RAJA::resources::Resource res, LaunchParams const ¶ms, BODY const &body) +{ + return launch(res, params, nullptr, body); +} + +template +resources::EventProxy +launch(RAJA::resources::Resource res, LaunchParams const ¶ms, const char *kernel_name, BODY const &body) { ExecPlace place; if(res.get_platform() == camp::resources::v1::Platform::host) { - place = RAJA::expt::HOST; + place = RAJA::ExecPlace::HOST; }else{ - place = RAJA::expt::DEVICE; + place = RAJA::ExecPlace::DEVICE; } switch (place) { - case HOST: { + case ExecPlace::HOST: { using launch_t = LaunchExecute; - return launch_t::exec(res, LaunchContext(grid), body); break; + return launch_t::exec(res, params, kernel_name, body); break; } #ifdef RAJA_DEVICE_ACTIVE - case DEVICE: { + case ExecPlace::DEVICE: { using launch_t = LaunchExecute; - return launch_t::exec(res, LaunchContext(grid), body); break; + return launch_t::exec(res, params, kernel_name, body); break; } #endif default: { @@ -301,6 +358,9 @@ RAJA_HOST_DEVICE RAJA_INLINE void loop_icount(CONTEXT const &ctx, body); } +namespace expt +{ + RAJA_SUPPRESS_HD_WARN template struct TileExecute; @@ -376,6 +435,9 @@ RAJA_HOST_DEVICE RAJA_INLINE void tile_icount(CONTEXT const &ctx, body); } +namespace expt +{ + template + struct ForallParamPack { + + friend struct ParamMultiplexer; + + using Base = camp::tuple; + Base param_tup; + + static constexpr size_t param_tup_sz = camp::tuple_size::value; + using params_seq = camp::make_idx_seq_t< param_tup_sz >; + + private: + + // Init + template + static constexpr void detail_init(EXEC_POL, camp::idx_seq, ForallParamPack& f_params, Args&& ...args) { + CAMP_EXPAND(expt::detail::init( camp::get(f_params.param_tup), std::forward(args)... )); + } + + // Combine + template + RAJA_HOST_DEVICE + static constexpr void detail_combine(EXEC_POL, camp::idx_seq, ForallParamPack& out, const ForallParamPack& in ) { + CAMP_EXPAND(detail::combine( camp::get(out.param_tup), camp::get(in.param_tup))); + } + + template + RAJA_HOST_DEVICE + static constexpr void detail_combine(EXEC_POL, camp::idx_seq, ForallParamPack& f_params ) { + CAMP_EXPAND(detail::combine( camp::get(f_params.param_tup) )); + } + + // Resolve + template + static constexpr void detail_resolve(EXEC_POL, camp::idx_seq, ForallParamPack& f_params ) { + CAMP_EXPAND(detail::resolve( camp::get(f_params.param_tup) )); + } + + // Used to construct the argument TYPES that will be invoked with the lambda. + template + static constexpr auto LAMBDA_ARG_TUP_T() { return camp::tuple<>{}; }; + template + static constexpr auto LAMBDA_ARG_TUP_T() { return typename First::ARG_TUP_T(); }; + template + static constexpr auto LAMBDA_ARG_TUP_T() { return camp::tuple_cat_pair(typename First::ARG_TUP_T(), LAMBDA_ARG_TUP_T()); }; + + using lambda_arg_tuple_t = decltype(LAMBDA_ARG_TUP_T()); + + //Use the size of param_tup to generate the argument list. 
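The ForallParamPack above, together with the make_forall_param_pack and get_lambda helpers that follow, is what lets forall accept optional parameter objects ahead of the loop body: by convention the lambda is the last argument, so the pack is built from everything before it and the lambda is extracted separately. A standalone sketch of that split using std::tuple in place of camp (illustrative names, not the RAJA API, assuming C++14):

#include <cstdio>
#include <tuple>
#include <utility>

// Keep references to everything except the last argument (the loop body).
template <typename Tuple, std::size_t... Is>
auto take_front(Tuple&& t, std::index_sequence<Is...>) {
  return std::forward_as_tuple(std::get<Is>(std::forward<Tuple>(t))...);
}

template <typename... Args>
auto split_params(Args&&... args) {
  auto all = std::forward_as_tuple(std::forward<Args>(args)...);
  return take_front(std::move(all),
                    std::make_index_sequence<sizeof...(Args) - 1>{});
}

// By convention the loop body is the last argument; pull it back out.
template <typename... Args>
auto&& get_body(Args&&... args) {
  return std::get<sizeof...(Args) - 1>(
      std::forward_as_tuple(std::forward<Args>(args)...));
}

int main() {
  int reducer_stand_in = 0; // stands in for a Reducer-style optional param
  auto body = [](int i) { std::printf("i = %d\n", i); };

  auto params = split_params(reducer_stand_in, body); // std::tuple<int&>
  auto&& l = get_body(reducer_stand_in, body);        // the lambda itself
  l(3);
  (void)params;
  return 0;
}

check_forall_optional_args then verifies that the extracted lambda is actually invocable with the argument types the parameter objects advertise.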
+ RAJA_HOST_DEVICE constexpr auto LAMBDA_ARG_TUP_V(camp::num<0>) { return camp::make_tuple(); } + RAJA_HOST_DEVICE constexpr auto LAMBDA_ARG_TUP_V(camp::num<1>) { return camp::get(param_tup).get_lambda_arg_tup(); } + template + RAJA_HOST_DEVICE constexpr auto LAMBDA_ARG_TUP_V(camp::num) { + return camp::tuple_cat_pair( camp::get(param_tup).get_lambda_arg_tup(), LAMBDA_ARG_TUP_V(camp::num()) ); + } + + public: + ForallParamPack(){} + + RAJA_HOST_DEVICE constexpr lambda_arg_tuple_t lambda_args() {return LAMBDA_ARG_TUP_V(camp::num());} + + using lambda_arg_seq = camp::make_idx_seq_t::value>; + + template + ForallParamPack(camp::tuple&& t) : param_tup(std::move(t)) {}; + }; // struct ForallParamPack + + + + //=========================================================================== + // + // + // ParamMultiplexer is how we hook into the individual calls within forall_impl. + // + // + struct ParamMultiplexer { + template> + static void constexpr init( ForallParamPack& f_params, Args&& ...args) { + FP::detail_init(EXEC_POL(),typename FP::params_seq(), f_params, std::forward(args)... ); + } + template> + static void constexpr combine(ForallParamPack& f_params, Args&& ...args){ + FP::detail_combine(EXEC_POL(), typename FP::params_seq(), f_params, std::forward(args)... ); + } + template> + static void constexpr resolve( ForallParamPack& f_params, Args&& ...args){ + FP::detail_resolve(EXEC_POL(), typename FP::params_seq(), f_params, std::forward(args)... ); + } + }; + //=========================================================================== + + + + //=========================================================================== + // + // + // ForallParamPack generators. + // + // + RAJA_INLINE static auto get_empty_forall_param_pack(){ + static ForallParamPack<> p; + return p; + } + + namespace detail { + // all_true trick to perform variadic expansion in static asserts. + // https://stackoverflow.com/questions/36933176/how-do-you-static-assert-the-values-in-a-parameter-pack-of-a-variadic-template + template struct bool_pack; + template + using all_true = std::is_same, bool_pack>; + + template + using check_types_derive_base = all_true::value...>; + } // namespace detail + + + template + constexpr auto make_forall_param_pack_from_tuple(camp::tuple&& tuple) { + static_assert(detail::check_types_derive_base...>::value, + "Forall optional arguments do not derive ForallParamBase. Please see Reducer, ReducerLoc and KernelName for examples.") ; + return ForallParamPack...>(std::move(tuple)); + } + + + + namespace detail { + // Maybe we should do a lot of these with structs... + template + constexpr auto tuple_from_seq (const camp::idx_seq&, TupleType&& tuple){ + return camp::forward_as_tuple( camp::get< Seq >(std::forward(tuple))... ); + }; + + template + constexpr auto strip_last_elem(camp::tuple&& tuple){ + return tuple_from_seq(camp::make_idx_seq_t{},std::move(tuple)); + }; + } // namespace detail + + + // Make a tuple of the param pack except the final element... + template + constexpr auto make_forall_param_pack(Args&&... args){ + // We assume the last element of the pack is the lambda so we need to strip it from the list. + auto stripped_arg_tuple = detail::strip_last_elem( camp::forward_as_tuple(std::forward(args)...) 
); + return make_forall_param_pack_from_tuple(std::move(stripped_arg_tuple)); + } + //=========================================================================== + + + + //=========================================================================== + // + // + // Callable should be the last argument in the param pack, just extract it... + // + // + template + constexpr auto&& get_lambda(Args&&... args){ + return camp::get( camp::forward_as_tuple(std::forward(args)...) ); + } + //=========================================================================== + + + + //=========================================================================== + // + // + // Checking expected argument list against the assumed lambda. + // + // + namespace detail { + + // + // + // Lambda traits Utilities + // + // + template + struct lambda_traits; + + template + struct lambda_traits + { // non-const specialization + using arg_type = First; + }; + template + struct lambda_traits + { // const specialization + using arg_type = First; + }; + + template + typename lambda_traits::arg_type* lambda_arg_helper(T); + + + // + // + // List manipulation Utilities + // + // + template + constexpr auto list_remove_pointer(const camp::list&){ + return camp::list::type>...>{}; + } + + template + constexpr auto list_add_lvalue_ref(const camp::list&){ + return camp::list::type...>{}; + } + + template + constexpr auto tuple_to_list(const camp::tuple&) { + return camp::list{}; + } + + // TODO : Change to std::is_invocable at c++17 + template + struct is_invocable : + std::is_constructible< + std::function, + std::reference_wrapper::type> + >{}; + + template + using void_t = void; + + template + struct has_empty_op : std::false_type{}; + + template + struct has_empty_op)>> : std::true_type{}; + + template + struct get_lambda_index_type { + typedef typename std::remove_pointer< + decltype(lambda_arg_helper( + &camp::decay::operator()) + ) + >::type type; + }; + + // If LAMBDA::operator() is not available this probably isn't a generic lambda and we can't extract and check args. + template + constexpr concepts::enable_if>> check_invocable(LAMBDA&&, const camp::list&) {} + + template + constexpr concepts::enable_if> check_invocable(LAMBDA&&, const camp::list&) { +#if !defined(RAJA_ENABLE_HIP) + static_assert(is_invocable::type, EXPECTED_ARGS...>::value, "LAMBDA Not invocable w/ EXPECTED_ARGS."); +#endif + } + + } // namespace detail + + + template + constexpr + void + check_forall_optional_args(Lambda&& l, ForallParams& fpp) { + + using expected_arg_type_list = decltype( detail::list_add_lvalue_ref( + detail::list_remove_pointer( + detail::tuple_to_list( + fpp.lambda_args() + ) + ) + )); + + detail::check_invocable(std::forward(l), expected_arg_type_list{}); + } + //=========================================================================== + + + + //=========================================================================== + // + // + // Type trailts for SFINAE work. + // + // + namespace type_traits + { + template struct is_ForallParamPack : std::false_type {}; + template struct is_ForallParamPack> : std::true_type {}; + + template struct is_ForallParamPack_empty : std::true_type {}; + template struct is_ForallParamPack_empty> : std::false_type {}; + template <> struct is_ForallParamPack_empty> : std::true_type {}; + } + //=========================================================================== + + + + //=========================================================================== + // + // + // Invoke Forall with Params. 
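ParamMultiplexer above is the hook the forall back-ends use to drive every parameter in the pack through the same three-phase lifecycle: init before the loop, combine across per-worker copies, and resolve after the loop. The following standalone C++17 mirror of that dispatch uses std::tuple and fold expressions in place of camp; all names are hypothetical.

#include <cstdio>
#include <tuple>
#include <utility>

// Two toy "forall parameters" exposing the init/combine/resolve interface.
struct SumParam {
  double* target = nullptr;
  double  val    = 0.0;
  void init()                       { val = 0.0; }
  void combine(const SumParam& rhs) { val += rhs.val; }
  void resolve()                    { *target = val; }
};

struct CountParam {
  long* target = nullptr;
  long  val    = 0;
  void init()                         { val = 0; }
  void combine(const CountParam& rhs) { val += rhs.val; }
  void resolve()                      { *target = val; }
};

// Pack of parameters; the multiplexer expands each phase over every element.
template <typename... Ps>
struct ParamPack { std::tuple<Ps...> tup; };

struct Multiplexer {
  template <typename... Ps>
  static void init(ParamPack<Ps...>& p) {
    std::apply([](auto&... ps) { (ps.init(), ...); }, p.tup);
  }
  template <typename... Ps>
  static void combine(ParamPack<Ps...>& out, const ParamPack<Ps...>& in) {
    combine_impl(out, in, std::index_sequence_for<Ps...>{});
  }
  template <typename... Ps>
  static void resolve(ParamPack<Ps...>& p) {
    std::apply([](auto&... ps) { (ps.resolve(), ...); }, p.tup);
  }

 private:
  template <typename... Ps, std::size_t... Is>
  static void combine_impl(ParamPack<Ps...>& out, const ParamPack<Ps...>& in,
                           std::index_sequence<Is...>) {
    (std::get<Is>(out.tup).combine(std::get<Is>(in.tup)), ...);
  }
};

int main() {
  double sum = 0.0;
  long   cnt = 0;

  // "Main" pack plus one per-worker copy, as a back-end would create.
  ParamPack<SumParam, CountParam> main_pack{{SumParam{&sum}, CountParam{&cnt}}};
  ParamPack<SumParam, CountParam> worker = main_pack;

  Multiplexer::init(main_pack);
  Multiplexer::init(worker);

  for (int i = 0; i < 10; ++i) {           // worker accumulates locally
    std::get<0>(worker.tup).val += i;
    std::get<1>(worker.tup).val += 1;
  }

  Multiplexer::combine(main_pack, worker); // fold worker results into main
  Multiplexer::resolve(main_pack);         // write back through the targets

  std::printf("sum=%g count=%ld\n", sum, cnt);
  return 0;
}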
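check_forall_optional_args above rewrites the pointer types advertised by each parameter into lvalue references and then static_asserts that the user lambda is callable with them; the hand-rolled is_invocable helper exists only because the code predates std::is_invocable. A small C++17 equivalent of the same check, with hypothetical names:

#include <type_traits>

// Rewrite a recorded pointer type (what get_lambda_arg_tup stores) into the
// lvalue reference the lambda is expected to accept.
template <typename Ptr>
using expected_ref_t = std::add_lvalue_reference_t<std::remove_pointer_t<Ptr>>;

template <typename Lambda, typename... ExpectedArgs>
constexpr void check_body() {
  static_assert(std::is_invocable<Lambda, ExpectedArgs...>::value,
                "forall body is not invocable with (index, param refs...)");
}

int main() {
  // A body taking the loop index plus one reducer reference.
  auto body = [](int /*i*/, double& sum) { sum += 1.0; };

  // double* recorded by the reducer becomes double& for the check.
  check_body<decltype(body), int, expected_ref_t<double*>>();
  return 0;
}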
+ // + // + namespace detail { + template + RAJA_HOST_DEVICE + constexpr + auto get_lambda_args(FP& fpp) + -> decltype( *camp::get( fpp.lambda_args() ) ) { + return ( *camp::get( fpp.lambda_args() ) ); + } + + CAMP_SUPPRESS_HD_WARN + template + RAJA_HOST_DEVICE constexpr auto invoke_with_order(Params&& params, + Fn&& f, + camp::idx_seq, + Ts&&... extra) + { + return f(std::forward(extra...), ( get_lambda_args(params) )...); + } + } // namespace detail + + //CAMP_SUPPRESS_HD_WARN + template + RAJA_HOST_DEVICE constexpr auto invoke_body(Params&& params, Fn&& f, Ts&&... extra) + { + return detail::invoke_with_order( + camp::forward(params), + camp::forward(f), + typename camp::decay::lambda_arg_seq(), + camp::forward(extra)...); + } + //=========================================================================== + +} // namespace expt +} // namespace RAJA + +#endif // FORALL_PARAM_HPP diff --git a/include/RAJA/pattern/params/kernel_name.hpp b/include/RAJA/pattern/params/kernel_name.hpp new file mode 100644 index 0000000000..a3cb18cf46 --- /dev/null +++ b/include/RAJA/pattern/params/kernel_name.hpp @@ -0,0 +1,32 @@ +#ifndef RAJA_KERNEL_NAME_HPP +#define RAJA_KERNEL_NAME_HPP + +#include "RAJA/pattern/params/params_base.hpp" + +namespace RAJA +{ +namespace expt +{ +namespace detail +{ + + struct KernelName : public ForallParamBase { + RAJA_HOST_DEVICE KernelName() {} + KernelName(const char* name_in) : name(name_in) {} + const char* name; + }; + +} // namespace detail + +auto KernelName(const char * n) +{ + return detail::KernelName(n); +} +} // namespace expt + + +} // namespace RAJA + + + +#endif // KERNEL_NAME_HPP diff --git a/include/RAJA/pattern/params/params_base.hpp b/include/RAJA/pattern/params/params_base.hpp new file mode 100644 index 0000000000..51e96260f8 --- /dev/null +++ b/include/RAJA/pattern/params/params_base.hpp @@ -0,0 +1,29 @@ +#ifndef RAJA_PARAMS_BASE +#define RAJA_PARAMS_BASE + + +namespace RAJA +{ +namespace expt +{ +namespace detail +{ + + struct ForallParamBase { + + // Some of this can be made virtual in c++20, for now must be defined in each child class + // if any arguments to the forall lambda are needed (e.g. KernelName is excluded.) 
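invoke_body and invoke_with_order above expand an index sequence over lambda_args(), dereference each stored pointer, and pass the results to the body after any extra arguments (typically the loop index). A standalone mirror of that expansion using std::tuple, with hypothetical names:

#include <cstdio>
#include <tuple>
#include <utility>

// Mirror of invoke_with_order: expand an index sequence over the stored
// pointers, dereference each one, and pass the results to the body after
// the loop index.
template <typename Body, typename Tuple, std::size_t... Is>
void invoke_with_order(Body&& body, int i, Tuple& ptrs, std::index_sequence<Is...>) {
  std::forward<Body>(body)(i, (*std::get<Is>(ptrs))...);
}

int main() {
  double sum = 0.0;
  long   cnt = 0;
  auto ptrs = std::make_tuple(&sum, &cnt);  // what lambda_args() would hold

  auto body = [](int i, double& s, long& c) { s += i; ++c; };
  for (int i = 0; i < 10; ++i) {
    invoke_with_order(body, i, ptrs, std::make_index_sequence<2>{});
  }

  std::printf("sum=%g count=%ld\n", sum, cnt);  // sum=45 count=10
  return 0;
}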
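expt::KernelName in the new header wraps a string as a forall parameter that contributes no lambda argument, so back-ends can label profiling ranges for a kernel. A hedged usage sketch, assuming a back-end that implements the new optional-parameter interface the static_assert message above refers to; it has not been verified against this branch.

#include "RAJA/RAJA.hpp"

int main()
{
  constexpr int N = 100;
  double* a = new double[N];

  RAJA::forall<RAJA::seq_exec>(
      RAJA::RangeSegment(0, N),
      RAJA::expt::KernelName("init_a"),   // named region, adds no lambda argument
      [=](int i) { a[i] = 0.5 * i; });

  delete[] a;
  return 0;
}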
+ using ARG_TUP_T = camp::tuple<>; + using ARG_LIST_T = typename ARG_TUP_T::TList; + RAJA_HOST_DEVICE ARG_TUP_T get_lambda_arg_tup() { return camp::make_tuple(); } + static constexpr size_t num_lambda_args = camp::tuple_size::value; + + }; + +} // namespace detail + +} // namespace expt + +} // namespace RAJA + +#endif // RAJA_PARAMS_BASE diff --git a/include/RAJA/pattern/params/reducer.hpp b/include/RAJA/pattern/params/reducer.hpp new file mode 100644 index 0000000000..5c4858a14a --- /dev/null +++ b/include/RAJA/pattern/params/reducer.hpp @@ -0,0 +1,145 @@ +#ifndef NEW_REDUCE_HPP +#define NEW_REDUCE_HPP + +#include "RAJA/pattern/params/params_base.hpp" +#include "RAJA/util/SoAPtr.hpp" + +#if defined(RAJA_CUDA_ACTIVE) +#define DEVICE cuda +#include "RAJA/policy/cuda/MemUtils_CUDA.hpp" +#elif defined(RAJA_HIP_ACTIVE) +#define DEVICE hip +#include "RAJA/policy/hip/MemUtils_HIP.hpp" +#endif + +namespace RAJA +{ + +namespace expt +{ + +template +struct ValLoc { + using index_type = RAJA::Index_type; + using value_type = T; + + RAJA_HOST_DEVICE ValLoc() {} + RAJA_HOST_DEVICE ValLoc(value_type v) : val(v) {} + RAJA_HOST_DEVICE ValLoc(value_type v, RAJA::Index_type l) : val(v), loc(l) {} + + RAJA_HOST_DEVICE void min(value_type v, index_type l) { if (v < val) { val = v; loc = l; } } + RAJA_HOST_DEVICE void max(value_type v, index_type l) { if (v > val) { val = v; loc = l; } } + + bool constexpr operator < (const ValLoc& rhs) const { return val <= rhs.val; } + bool constexpr operator <=(const ValLoc& rhs) const { return val < rhs.val; } + bool constexpr operator > (const ValLoc& rhs) const { return val >= rhs.val; } + bool constexpr operator >=(const ValLoc& rhs) const { return val > rhs.val; } + + value_type getVal() {return val;} + RAJA::Index_type getLoc() {return loc;} + +private: + value_type val; + index_type loc = -1; +}; + +} // namespace expt + +namespace operators +{ + +template +struct limits> { + RAJA_INLINE RAJA_HOST_DEVICE static constexpr RAJA::expt::ValLoc min() + { + return RAJA::expt::ValLoc(RAJA::operators::limits::min()); + } + RAJA_INLINE RAJA_HOST_DEVICE static constexpr RAJA::expt::ValLoc max() + { + return RAJA::expt::ValLoc(RAJA::operators::limits::max()); + } +}; + +} // namespace operators + +} // namespace RAJA + +namespace RAJA +{ + +namespace expt +{ +namespace detail +{ + + // + // + // Basic Reducer + // + // + template + struct Reducer : public ForallParamBase { + using op = Op; + using value_type = T; + + RAJA_HOST_DEVICE Reducer() {} + Reducer(value_type *target_in) : target(target_in), val(op::identity()) {} + + value_type *target = nullptr; + value_type val = op::identity(); + +#if defined(RAJA_CUDA_ACTIVE) || defined(RAJA_HIP_ACTIVE) + // Device related attributes. + value_type * devicetarget = nullptr; + RAJA::detail::SoAPtr device_mem; + unsigned int * device_count = nullptr; +#endif + + using ARG_TUP_T = camp::tuple; + RAJA_HOST_DEVICE ARG_TUP_T get_lambda_arg_tup() { return camp::make_tuple(&val); } + + using ARG_LIST_T = typename ARG_TUP_T::TList; + static constexpr size_t num_lambda_args = camp::tuple_size::value ; + }; + +} // namespace detail + +template
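ValLoc above pairs a value with the index where it occurred so that min/max reductions can also report a location; its min()/max() members update both fields whenever a better candidate is seen. A standalone mirror of the idea in plain C++ (illustrative only, not RAJA's type):

#include <cstdio>
#include <limits>

// Value-plus-location pair for "loc" reductions (stand-in for expt::ValLoc<T>).
template <typename T>
struct ValueLoc {
  T         val = std::numeric_limits<T>::max();  // identity for a min reduction
  long long loc = -1;

  void min(T v, long long l) { if (v < val) { val = v; loc = l; } }
  void max(T v, long long l) { if (v > val) { val = v; loc = l; } }
};

int main() {
  const double data[] = {3.0, -1.5, 2.0, -7.25, 4.0};

  ValueLoc<double> vmin;
  for (long long i = 0; i < 5; ++i) {
    vmin.min(data[i], i);
  }

  std::printf("min=%g at index %lld\n", vmin.val, vmin.loc);  // min=-7.25 at index 3
  return 0;
}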
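The Reducer parameter above carries a target pointer, a privately accumulated value starting at the operator identity, and (for device back-ends) staging storage; it is what turns a pointer-style reduction argument into an extra reference passed to the forall body. A hedged usage sketch of that interface, assuming the RAJA::expt::Reduce spelling this parameter work enables; illustrative, not verified against this branch.

#include "RAJA/RAJA.hpp"
#include <cstdio>

int main()
{
  constexpr int N = 1000;
  double* a = new double[N];
  for (int i = 0; i < N; ++i) { a[i] = 1.0; }

  double sum = 0.0;

  // Each Reduce<op>(&target) adds one reference argument to the lambda,
  // in the order the parameters are listed before the body.
  RAJA::forall<RAJA::seq_exec>(
      RAJA::RangeSegment(0, N),
      RAJA::expt::Reduce<RAJA::operators::plus>(&sum),
      [=](int i, double& s) { s += a[i]; });

  std::printf("sum = %g\n", sum);  // expected: 1000

  delete[] a;
  return 0;
}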