diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000000..94143827ed --- /dev/null +++ b/.dockerignore @@ -0,0 +1 @@ +Dockerfile diff --git a/.gitignore b/.gitignore index f4f1cd0dc1..10b3b40f79 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,6 @@ +.cache +.idea +*.sync-conflict-* *.pyc *.o *.a diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 32d794b644..81c128f0b9 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,6 +1,6 @@ ############################################################################### -# Copyright (c) 2016-2020, Lawrence Livermore National Security, LLC -# and RAJA project contributors. See the RAJA/COPYRIGHT file for details. +# Copyright (c) 2016-22, Lawrence Livermore National Security, LLC +# and RAJA project contributors. See the RAJA/LICENSE file for details. # # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################### @@ -10,93 +10,56 @@ # at Lawrence Livermore National Laboratory (LLNL). # # This entire pipeline is LLNL-specific -# ############################################################################# - -# We define the following GitLab pipeline variables: -# -# GIT_SUBMODULE_STRATEGY: -# Tells Gitlab to recursively update the submodules when cloning umpire # -# ALLOC_NAME: -# On LLNL's ruby, this pipeline creates only one allocation shared among jobs -# in order to save time and resources. This allocation has to be uniquely named -# so that we are sure to retrieve it. +# Important note: This file is a template provided by +# llnl/radiuss-shared-ci. It should not require any change from the project to +# get started but could feature project-specific stages. # -# BUILD_ROOT: -# The path to the shared resources between all jobs. The BUILD_ROOT is unique to -# the pipeline, preventing any form of concurrency with other pipelines. This -# also means that the BUILD_ROOT directory will never be cleaned. -# -# DEFAULT_TIME: -# Default time to let the Lassen jobs run will be 30 minutes. However, if it is -# a job that requires more time, it will be overwritten in the lassen template -# file. -# TODO: add a clean-up mechanism +# However, each project should provide: +# - .gitlab/custom-jobs-and-variables.yml +# - .gitlab/subscribed-pipelines.yml +# - .gitlab/${MACHINE}-build-and-test-extra.yml +############################################################################### +# We define the following GitLab pipeline variables: variables: + MP_BRANCH: "develop" +# Use a service user to run CI. This prevents from running pipelines as an +# actual user. + LLNL_SERVICE_USER: "" +# Use a service user workspace. Solves permission issues, stores everything +# at the same location whoever triggers a pipeline. +# CUSTOM_CI_BUILDS_DIR: "" +# Tells Gitlab to recursively update the submodules when cloning the project. GIT_SUBMODULE_STRATEGY: recursive - ALLOC_NAME: ${CI_PROJECT_NAME}_ci_${CI_PIPELINE_ID} +# We build the projects in the CI clone directory. +# TODO: add a clean-up mechanism BUILD_ROOT: ${CI_PROJECT_DIR} - DEFAULT_TIME: 30 - MP_BRANCH: "develop" -# Normally, stages are blocking in Gitlab. However, using the keyword "needs" we -# can express dependencies between job that break the ordering of stages, in -# favor of a DAG. -# In practice r_*, l_* and b_* stages are independently run and start immediately. +# We organize the build-and-test stage in sub-pipelines. Each sub-pipeline +# corresponds to a test batch on a given machine. 
+# High level stages stages: - - r_allocate_resources - - r_build_and_test - - r_release_resources - - l_build_and_test - - b_build_and_test - - c_build_and_test + - build-and-test - multi_project -# This is the rules that drives the activation of "advanced" jobs. All advanced -# jobs will share this through a template mechanism. -.advanced_pipeline: - rules: - - if: '$CI_COMMIT_BRANCH == "main" || $CI_COMMIT_BRANCH == "develop" || $ALL_TARGETS == "ON"' #run only if ... - -# These are also templates (.name) that define project specific build commands. -# If an allocation exist with the name defined in this pipeline, the job will -# use it (slurm specific). -.build_toss_3_x86_64_ib_script: - script: - - echo ${ALLOC_NAME} - - export JOBID=$(squeue -h --name=${ALLOC_NAME} --format=%A) - - echo ${JOBID} - - srun $( [[ -n "${JOBID}" ]] && echo "--jobid=${JOBID}" ) -t ${DEFAULT_TIME} -N 1 scripts/gitlab/build_and_test.sh - artifacts: - reports: - junit: junit.xml - -.build_toss_4_x86_64_ib_corona_script: - script: - - srun -p pbatch -t 30 -N 1 scripts/gitlab/build_and_test.sh - -# Lassen and Butte use a different job scheduler (spectrum lsf) that does not -# allow pre-allocation the same way slurm does. -.build_blueos_3_ppc64le_ib_script: - script: - - lalloc 1 -W ${DEFAULT_TIME} scripts/gitlab/build_and_test.sh - artifacts: - reports: - junit: junit.xml - -.build_blueos_3_ppc64le_ib_ats_disabled_script: - script: - - lalloc 1 --atsdisable -W ${DEFAULT_TIME} scripts/gitlab/build_and_test.sh - artifacts: - reports: - junit: junit.xml - -.build_blueos_3_ppc64le_ib_p9_script: - extends: .build_blueos_3_ppc64le_ib_script +# Template for jobs triggering a build-and-test sub-pipelines: +.build-and-test: + stage: build-and-test + trigger: + include: + - local: '.gitlab/custom-jobs-and-variables.yml' + - project: 'radiuss/radiuss-shared-ci' + ref: v2022.09.0 + file: '${CI_MACHINE}-build-and-test.yml' + - local: '.gitlab/${CI_MACHINE}-build-and-test-extra.yml' + strategy: depend + forward: + pipeline_variables: true -# If testing develop branch, trigger CHAI pipeline with this version of RAJA. +# If testing develop branch, trigger RAJAPerf pipeline with this version of +# RAJA. # TODO: Once spack allows to clone a specific commit on demand, then point to the exact commit. # This will prevent from sticking to a branch (here develop). # MP_BRANCH is short for "Multi-Project Branch" and will usually be develop. @@ -111,11 +74,6 @@ trigger-rajaperf: branch: develop strategy: depend -# This is where jobs are included. +# pipelines subscribed by the project include: - - local: .gitlab/ruby-templates.yml - - local: .gitlab/ruby-jobs.yml - - local: .gitlab/lassen-templates.yml - - local: .gitlab/lassen-jobs.yml - - local: .gitlab/corona-templates.yml - - local: .gitlab/corona-jobs.yml + - local: .gitlab/subscribed-pipelines.yml diff --git a/.gitlab/corona-build-and-test-extra.yml b/.gitlab/corona-build-and-test-extra.yml new file mode 100644 index 0000000000..a94300f85b --- /dev/null +++ b/.gitlab/corona-build-and-test-extra.yml @@ -0,0 +1,28 @@ +############################################################################# +# Copyright (c) 2016-22, Lawrence Livermore National Security, LLC +# and RAJA project contributors. See the RAJA/LICENSE file for details. 
+# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################# + +######################## +# Overridden shared jobs +######################## +# We duplicate the shared jobs description and add necessary changes for RAJA. +# We keep ${PROJECT__VARIANTS} and ${PROJECT__DEPS} So that +# the comparison with the original job is easier. + +# No overridden jobs so far. + +############ +# Extra jobs +############ +# We do not recommend using ${PROJECT__VARIANTS} and +# ${PROJECT__DEPS} in the extra jobs. There is no reason not to fully +# describe the spec here. + +rocm_5_1_1_clang_13_0_0_desul_atomics: + variables: + SPEC: " +rocm ~openmp +desul amdgpu_target=gfx906 %clang@13.0.0 ^hip@5.1.1 ^blt@develop" + extends: .build_and_test_on_corona + diff --git a/.gitlab/corona-jobs.yml b/.gitlab/corona-jobs.yml deleted file mode 100644 index d5e72f6fea..0000000000 --- a/.gitlab/corona-jobs.yml +++ /dev/null @@ -1,16 +0,0 @@ -############################################################################# -# Copyright (c) 2016-22, Lawrence Livermore National Security, LLC -# and RAJA project contributors. See the RAJA/LICENSE file for details. -# -# SPDX-License-Identifier: (BSD-3-Clause) -############################################################################# - -hip_4_5_2_clang_13_0_0 (build and test on corona): - variables: - SPEC: "+rocm~openmp amdgpu_target=gfx906 %clang@13.0.0 cxxflags=--offload-arch=gfx906 ^blt@develop ^hip@4.5.2" - extends: .build_and_test_on_corona - -hip_4_5_2_clang_13_0_0_desul_atomics (build and test on corona): - variables: - SPEC: "+rocm~openmp +desul amdgpu_target=gfx906 %clang@13.0.0 cxxflags=--offload-arch=gfx906 ^blt@develop ^hip@4.5.2" - extends: .build_and_test_on_corona diff --git a/.gitlab/corona-templates.yml b/.gitlab/corona-templates.yml deleted file mode 100644 index 4e1a5cb744..0000000000 --- a/.gitlab/corona-templates.yml +++ /dev/null @@ -1,33 +0,0 @@ -############################################################################# -# Copyright (c) 2016-22, Lawrence Livermore National Security, LLC -# and RAJA project contributors. See the RAJA/LICENSE file for details. -# -# SPDX-License-Identifier: (BSD-3-Clause) -############################################################################# - -#### -# This is the share configuration of jobs for corona - -#### -# In pre-build phase, allocate a node for builds -.on_corona: - tags: - - shell - - corona - rules: - - if: '$ON_CORONA == "OFF"' #run except if ... - when: never - - if: '$CI_JOB_NAME =~ /release_resources/' - when: always - - when: on_success - -#### -# Generic corona build job, extending build script -.build_and_test_on_corona: - stage: c_build_and_test - extends: [.build_toss_4_x86_64_ib_corona_script, .on_corona] - needs: [] - -.build_and_test_on_corona_advanced: - extends: [.build_and_test_on_corona, .advanced_pipeline] - diff --git a/.gitlab/custom-jobs-and-variables.yml b/.gitlab/custom-jobs-and-variables.yml new file mode 100644 index 0000000000..53f36c56cd --- /dev/null +++ b/.gitlab/custom-jobs-and-variables.yml @@ -0,0 +1,52 @@ +############################################################################### +# Copyright (c) 2016-22, Lawrence Livermore National Security, LLC and RADIUSS +# project contributors. See the COPYRIGHT file for details. 
+# +# SPDX-License-Identifier: (MIT) +############################################################################### + +# We define the following GitLab pipeline variables: +variables: +# On LLNL's ruby, this pipeline creates only one allocation shared among jobs +# in order to save time and resources. This allocation has to be uniquely named +# so that we are sure to retrieve it and avoid collisions. + ALLOC_NAME: ${CI_PROJECT_NAME}_ci_${CI_PIPELINE_ID} + +# Ruby +# Arguments for top level allocation + RUBY_BUILD_AND_TEST_SHARED_ALLOC: "--exclusive --partition=pdebug --time=60 --nodes=1" +# Arguments for job level allocation + RUBY_BUILD_AND_TEST_JOB_ALLOC: "--time=45 --nodes=1" +# Project specific variants for ruby + PROJECT_RUBY_VARIANTS: "+openmp " +# Project specific deps for ruby + PROJECT_RUBY_DEPS: "" + +# Corona +# Arguments for top level allocation + CORONA_BUILD_AND_TEST_SHARED_ALLOC: "--time-limit=60m --nodes=1" +# Arguments for job level allocation + CORONA_BUILD_AND_TEST_JOB_ALLOC: "--time-limit=45m --nodes=1" +# Project specific variants for corona + PROJECT_CORONA_VARIANTS: "~openmp " +# Project specific deps for corona + PROJECT_CORONA_DEPS: "^blt@develop " + +# Lassen and Butte use a different job scheduler (spectrum lsf) that does not +# allow pre-allocation the same way slurm does. +# Arguments for job level allocation + LASSEN_BUILD_AND_TEST_JOB_ALLOC: "1 -W 60" +# Project specific variants for lassen + PROJECT_LASSEN_VARIANTS: "+openmp " +# Project specific deps for lassen + PROJECT_LASSEN_DEPS: "" + + +# Configuration shared by build and test jobs specific to this project. +# Not all configuration can be shared. Here projects can fine tune the +# CI behavior. +# See Umpire for an example (export junit test reports). +.custom_build_and_test: + artifacts: + reports: + junit: junit.xml diff --git a/.gitlab/lassen-build-and-test-extra.yml b/.gitlab/lassen-build-and-test-extra.yml new file mode 100644 index 0000000000..0442a602bd --- /dev/null +++ b/.gitlab/lassen-build-and-test-extra.yml @@ -0,0 +1,146 @@ +############################################################################## +# Copyright (c) 2016-22, Lawrence Livermore National Security, LLC +# and RAJA project contributors. See the RAJA/LICENSE file for details. +# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################## + +######################## +# Overridden shared jobs +######################## +# We duplicate the shared jobs description and add necessary changes for RAJA. +# We keep ${PROJECT__VARIANTS} and ${PROJECT__DEPS} So that +# the comparison with the original job is easier. 
+ +# Overriding shared spec: Allow failures +ibm_clang_9_0_0: + variables: + SPEC: " ${PROJECT_LASSEN_VARIANTS} %clang@ibm.9.0.0 ${PROJECT_LASSEN_DEPS}" + extends: .build_and_test_on_lassen + allow_failure: true + +# Overriding shared spec: Allow failures +ibm_clang_9_0_0_gcc_8_3_1: + variables: + SPEC: " ${PROJECT_LASSEN_VARIANTS} %clang@ibm.9.0.0 cxxflags=\"--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1\" cflags=\"--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1\" ${PROJECT_LASSEN_DEPS}" + extends: .build_and_test_on_lassen + allow_failure: true + +# Overriding shared spec: Extra flags +gcc_8_3_1: + variables: + SPEC: " ${PROJECT_LASSEN_VARIANTS} %gcc@8.3.1 cxxflags=\"-finline-functions -finline-limit=20000\" cflags=\"-finline-functions -finline-limit=20000\" ${PROJECT_LASSEN_DEPS}" + extends: .build_and_test_on_lassen + +# Overriding shared spec: Longer allocation + Allow failures +pgi_20_4_gcc_8_3_1: + extends: .build_and_test_on_lassen + variables: + SPEC: " ${PROJECT_LASSEN_VARIANTS} %pgi@20.4 cxxflags=\"--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1\" cflags=\"--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1\" ${PROJECT_LASSEN_DEPS}" + LASSEN_BUILD_AND_TEST_JOB_ALLOC: "1 -W 80" + allow_failure: true + +# Overriding shared spec: Extra flags +xl_16_1_1_12: + variables: + SPEC: " ${PROJECT_LASSEN_VARIANTS} %xl@16.1.1.12 cxxflags=\"-qthreaded -std=c++14 -O3 -qstrict -qxlcompatmacros -qlanglvl=extended0x -qalias=noansi -qhot -qpic -qsmp=omp -qsuppress=1500-029 -qsuppress=1500-036\" ${PROJECT_LASSEN_DEPS}" + extends: .build_and_test_on_lassen + +# Overriding shared spec: Extra flags +xl_16_1_1_12_gcc_8_3_1: + variables: + SPEC: " ${PROJECT_LASSEN_VARIANTS} %xl@16.1.1.12 cxxflags=\"--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 -qthreaded -std=c++14 -O3 -qstrict -qxlcompatmacros -qlanglvl=extended0x -qalias=noansi -qhot -qpic -qsmp=omp -qsuppress=1500-029 -qsuppress=1500-036\" cflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 ${PROJECT_LASSEN_DEPS}" + extends: .build_and_test_on_lassen + +# Overriding shared spec: Allow failures +ibm_clang_9_0_0_gcc_8_3_1_cuda_10_1_168: + extends: .build_and_test_on_lassen + variables: + SPEC: " ${PROJECT_LASSEN_VARIANTS} +cuda %clang@ibm.9.0.0 cxxflags=\"--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1\" cflags=\"--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1\" ^cuda@10.1.168 ${PROJECT_LASSEN_DEPS}" + allow_failure: true + +# Overriding shared spec: Longer allocation + Extra flags + Allow failure + Updated cuda +xl_16_1_1_12_cuda_11_1_1: + variables: + SPEC: " ${PROJECT_LASSEN_VARIANTS} +cuda %xl@16.1.1.12 cxxflags=\"-qthreaded -std=c++14 -O3 -qstrict -qxlcompatmacros -qlanglvl=extended0x -qalias=noansi -qhot -qpic -qsmp=omp -qsuppress=1500-029 -qsuppress=1500-036\" cuda_arch=70 ^cuda@11.1.0 ^cmake@3.14.5 ${PROJECT_LASSEN_DEPS}" + LASSEN_BUILD_AND_TEST_JOB_ALLOC: "1 -W 120" + allow_failure: true + extends: .build_and_test_on_lassen + +# Overriding shared spec: Deactivated spec. This spec will be removed soon. 
+xl_16_1_1_12_gcc_8_3_1_cuda_11_0_2: + variables: + SPEC: "${PROJECT_LASSEN_VARIANTS} +cuda %xl@16.1.1.12 cxxflags=\"--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1\" cflags=\"--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1\" ^cuda@11.0.2 ^cmake@3.14.5 ${PROJECT_LASSEN_DEPS}" + extends: .build_and_test_on_lassen + script: + - | + echo -e "\e[31mDeactivated spec !\e[0m" + echo -e "\e[31m${SPEC}\e[0m" + echo -e "\e[31mRAJA won’t build with Cuda 11.0.2 due to a known issue.\e[0m" + - exit 1 + allow_failure: true + +# Overriding shared spec: Longer allocation + Extra flags + Allow failure + Updated cuda +xl_16_1_1_12_gcc_8_3_1_cuda_11_1_0: + variables: + SPEC: " ${PROJECT_LASSEN_VARIANTS} +cuda %xl@16.1.1.12 cxxflags=\"--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 -qthreaded -std=c++14 -O3 -qstrict -qxlcompatmacros -qlanglvl=extended0x -qalias=noansi -qhot -qpic -qsmp=omp -qsuppress=1500-029 -qsuppress=1500-036\" cflags=\"--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1\" cuda_arch=70 ^cuda@11.1.0 ^cmake@3.14.5 ${PROJECT_LASSEN_DEPS}" + LASSEN_BUILD_AND_TEST_JOB_ALLOC: "1 -W 120" + allow_failure: true + extends: .build_and_test_on_lassen + + +############ +# Extra jobs +############ +# We do not recommend using ${PROJECT__VARIANTS} and +# ${PROJECT__DEPS} in the extra jobs. There is no reason not to fully +# describe the spec here. + +########## +# CPU ONLY +########## + +clang_14_0_5: + variables: + SPEC: " +openmp %clang@14.0.5" + extends: .build_and_test_on_lassen + +########## +# CUDA +########## + +clang_12_0_1_cuda_11_5_0: + variables: + SPEC: " +openmp +cuda cuda_arch=70 %clang@12.0.1 ^cuda@11.5.0" + extends: .build_and_test_on_lassen + +gcc_8_3_1_cuda_11_1_0: + variables: + SPEC: " +openmp +cuda %gcc@8.3.1 cuda_arch=70 ^cuda@11.1.0" + extends: .build_and_test_on_lassen + +gcc_8_3_1_cuda_11_5_0_ats_disabled: + extends: .build_and_test_on_lassen + variables: + SPEC: " +openmp +cuda %gcc@8.3.1 cuda_arch=70 ^cuda@11.5.0" + LASSEN_BUILD_AND_TEST_JOB_ALLOC: "1 --atsdisable -W 60" + +########## +# OTHERS +########## + +clang_13_0_1_libcpp: + variables: + SPEC: " +openmp %clang@13.0.1+libcpp" + extends: .build_and_test_on_lassen + +clang_14_0_5_asan: + variables: + SPEC: " +openmp %clang@14.0.5 cxxflags=-fsanitize=address" + ASAN_OPTIONS: "detect_leaks=1" + extends: .build_and_test_on_lassen + +gcc_8_3_1_cuda_10_1_168_desul_atomics: + variables: + SPEC: " +openmp +cuda +desul %gcc@8.3.1 cuda_arch=70 ^cuda@10.1.168" + extends: .build_and_test_on_lassen diff --git a/.gitlab/lassen-jobs.yml b/.gitlab/lassen-jobs.yml deleted file mode 100644 index 8b5d070993..0000000000 --- a/.gitlab/lassen-jobs.yml +++ /dev/null @@ -1,95 +0,0 @@ -############################################################################## -# Copyright (c) 2016-22, Lawrence Livermore National Security, LLC -# and RAJA project contributors. See the RAJA/LICENSE file for details. 
-# -# SPDX-License-Identifier: (BSD-3-Clause) -############################################################################## - -########## -# CPU ONLY -########## - -ibm_clang_9: - variables: - SPEC: "%clang@ibm.9.0.0" - extends: .build_and_test_on_lassen - -ibm_clang_9_gcc_8: - variables: - SPEC: "%clang@ibm.9.0.0 cxxflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 cflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1" - extends: .build_and_test_on_lassen - -gcc_8_3_1: - variables: - SPEC: "%gcc@8.3.1 cxxflags='-finline-functions -finline-limit=20000' cflags='-finline-functions -finline-limit=20000'" - extends: .build_and_test_on_lassen - -xl_16_1_1_11: - variables: - SPEC: "%xl@16.1.1.11 cxxflags='-qthreaded -std=c++14 -O3 -qxlcompatmacros -qlanglvl=extended0x -qalias=noansi -qhot -qpic -qsmp=omp -qsuppress=1500-029 -qsuppress=1500-036'" - DEFAULT_TIME: 50 - extends: .build_and_test_on_lassen - -xl_16_1_1_11_gcc_8_3_1: - variables: - SPEC: "%xl@16.1.1.11 cxxflags='--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 -qthreaded -std=c++14 -O3 -qxlcompatmacros -qlanglvl=extended0x -qalias=noansi -qhot -qpic -qsmp=omp -qsuppress=1500-029 -qsuppress=1500-036' cflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1" - DEFAULT_TIME: 50 - extends: .build_and_test_on_lassen - -########## -# CUDA -########## - -ibm_clang_9_cuda: - variables: - SPEC: "+cuda cuda_arch=70 %clang@ibm.9.0.0 ^cuda@10.1.168" - extends: .build_and_test_on_lassen - -ibm_clang_10_cuda: - variables: - SPEC: "+cuda cuda_arch=70 %clang@ibm.10.0.1 ^cuda@10.1.168" - extends: .build_and_test_on_lassen - -gcc_8_3_1_cuda: - variables: - SPEC: "+cuda %gcc@8.3.1 cuda_arch=70 ^cuda@10.1.168" - extends: .build_and_test_on_lassen - -gcc_8_3_1_cuda_ats_disabled: - variables: - SPEC: "+cuda %gcc@8.3.1 cuda_arch=70 ^cuda@10.1.168" - extends: .build_and_test_on_lassen_ats_disabled - -xl_16_1_1_7_cuda: - variables: - SPEC: "+cuda %xl@16.1.1.7 cuda_arch=70 ^cuda@10.1.168 ^cmake@3.14.5" - DEFAULT_TIME: 60 - allow_failure: true - extends: .build_and_test_on_lassen - -xl_16_1_1_7_gcc_8_3_1_cuda_11: - variables: - SPEC: "+cuda %xl@16.1.1.7 cuda_arch=70 cxxflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 cflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 ^cuda@11.0.2 ^cmake@3.14.5" - DEFAULT_TIME: 60 - allow_failure: true - extends: .build_and_test_on_lassen - -########## -# EXTRAS -########## - -clang_9_0_0_libcpp (build and test on lassen): - variables: - SPEC: "%clang@9.0.0+libcpp" - extends: .build_and_test_on_lassen - -clang_9_0_0_memleak (build and test on lassen): - variables: - SPEC: "%clang@9.0.0 cxxflags=-fsanitize=address" - ASAN_OPTIONS: "detect_leaks=1" - extends: .build_and_test_on_lassen - -gcc_8_3_1_cuda_desul_atomics: - variables: - SPEC: "+cuda +desul %gcc@8.3.1 cuda_arch=70 ^cuda@10.1.168" - extends: .build_and_test_on_lassen diff --git a/.gitlab/lassen-templates.yml b/.gitlab/lassen-templates.yml deleted file mode 100644 index dbc340f22a..0000000000 --- a/.gitlab/lassen-templates.yml +++ /dev/null @@ -1,34 +0,0 @@ -############################################################################## -# Copyright (c) 2016-22, Lawrence Livermore National Security, LLC -# and RAJA project contributors. See the RAJA/LICENSE file for details. 
-# -# SPDX-License-Identifier: (BSD-3-Clause) -############################################################################## - -#### -# Shared configuration of jobs for lassen -.on_lassen: - variables: - tags: - - shell - - lassen - rules: - - if: '$CI_COMMIT_BRANCH =~ /_lnone/ || $ON_LASSEN == "OFF"' #run except if ... - when: never - - when: on_success - -.build_and_test_on_lassen: - stage: l_build_and_test - extends: [.build_blueos_3_ppc64le_ib_p9_script, .on_lassen] - needs: [] - -.build_and_test_on_lassen_ats_disabled: - stage: l_build_and_test - extends: [.build_blueos_3_ppc64le_ib_ats_disabled_script, .on_lassen] - needs: [] - -# Note: .build_and_test_on_lassen_advanced inherits from -# .build_and_test_on_lassen and .advanced_pileline. -# In particular, the rules section will be merged. Careful when changing rules. -.build_and_test_on_lassen_advanced: - extends: [.build_and_test_on_lassen, .advanced_pipeline] diff --git a/.gitlab/ruby-build-and-test-extra.yml b/.gitlab/ruby-build-and-test-extra.yml new file mode 100644 index 0000000000..9bebc62530 --- /dev/null +++ b/.gitlab/ruby-build-and-test-extra.yml @@ -0,0 +1,58 @@ +############################################################################## +# Copyright (c) 2016-22, Lawrence Livermore National Security, LLC +# and RAJA project contributors. See the RAJA/LICENSE file for details. +# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################## + +######################## +# Overridden shared jobs +######################## +# We duplicate the shared jobs description and add necessary changes for RAJA. +# We keep ${PROJECT__VARIANTS} and ${PROJECT__DEPS} So that +# the comparison with the original job is easier. + +# Overriding shared config for longer run +gcc_8_1_0: + variables: + SPEC: " ${PROJECT_RUBY_VARIANTS} %gcc@8.1.0 ${PROJECT_RUBY_DEPS}" + RUBY_BUILD_AND_TEST_JOB_ALLOC: "--time=60 --nodes=1" + extends: .build_and_test_on_ruby + +# Overriding shared spec: Allow failures +pgi_20_1_gcc_local_8_3_1: + variables: + SPEC: " ${PROJECT_RUBY_VARIANTS} %pgi@20.1 cxxflags\"=-rc=/usr/workspace/umpire/pgi/x86_64/local-gcc-8.3.1-rc\" cflags\"=-rc=/usr/workspace/umpire/pgi/x86_64/local-gcc-8.3.1-rc\" fflags=\"-rc=/usr/workspace/umpire/pgi/x86_64/local-gcc-8.3.1-rc\" ${PROJECT_RUBY_DEPS}" + extends: .build_and_test_on_ruby + allow_failure: true + +############ +# Extra jobs +############ +# We do not recommend using ${PROJECT__VARIANTS} and +# ${PROJECT__DEPS} in the extra jobs. There is no reason not to fully +# describe the spec here. 
+ +clang_9_0_0_openmp_off: + variables: + SPEC: " ~openmp %clang@9.0.0" + extends: .build_and_test_on_ruby + +gcc_8_1_0_openmp_default: + variables: + SPEC: " %gcc@8.1.0" + RUBY_BUILD_AND_TEST_JOB_ALLOC: "--time=60 --nodes=1" + extends: .build_and_test_on_ruby + +icpc_19_1_0: + variables: + SPEC: " +openmp %intel@19.1.0" + RUBY_BUILD_AND_TEST_JOB_ALLOC: "--time=40 --nodes=1" + extends: .build_and_test_on_ruby + +# OTHERS +clang_10_0_1_gcc_8_3_1_desul_atomics: + variables: + SPEC: " +openmp +desul %clang@10.0.1 cxxflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 cflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1" + extends: .build_and_test_on_ruby + diff --git a/.gitlab/ruby-jobs.yml b/.gitlab/ruby-jobs.yml deleted file mode 100644 index 2b6cceb5c7..0000000000 --- a/.gitlab/ruby-jobs.yml +++ /dev/null @@ -1,53 +0,0 @@ -############################################################################## -# Copyright (c) 2016-22, Lawrence Livermore National Security, LLC -# and RAJA project contributors. See the RAJA/LICENSE file for details. -# -# SPDX-License-Identifier: (BSD-3-Clause) -############################################################################## - -clang_10: - variables: - SPEC: "%clang@10.0.1" - extends: .build_and_test_on_ruby - -clang_9: - variables: - SPEC: "%clang@9.0.0" - extends: .build_and_test_on_ruby - -gcc_8_1_0: - variables: - SPEC: "%gcc@8.1.0" - DEFAULT_TIME: 60 - extends: .build_and_test_on_ruby - -#icpc_17_0_2: -# variables: -# SPEC: "%intel@17.0.2" -# DEFAULT_TIME: 40 -# extends: .build_and_test_on_ruby - -icpc_18_0_2: - variables: - SPEC: " tests=none %intel@18.0.2" - DEFAULT_TIME: 40 - extends: .build_and_test_on_ruby - -icpc_19_1_0: - variables: - SPEC: "%intel@19.1.0" - DEFAULT_TIME: 40 - extends: .build_and_test_on_ruby - -# EXTRAS - -#gcc_4_9_3: -# variables: -# SPEC: "%gcc@4.9.3" -# DEFAULT_TIME: 60 -# extends: .build_and_test_on_ruby - -clang_10_desul_atomics: - variables: - SPEC: "+openmp +desul %clang@10.0.1 cxxflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1 cflags=--gcc-toolchain=/usr/tce/packages/gcc/gcc-8.3.1" - extends: .build_and_test_on_ruby diff --git a/.gitlab/ruby-templates.yml b/.gitlab/ruby-templates.yml deleted file mode 100644 index b1314534b3..0000000000 --- a/.gitlab/ruby-templates.yml +++ /dev/null @@ -1,54 +0,0 @@ -############################################################################## -# Copyright (c) 2016-22, Lawrence Livermore National Security, LLC -# and RAJA project contributors. See the RAJA/LICENSE file for details. -# -# SPDX-License-Identifier: (BSD-3-Clause) -############################################################################## - -#### -# This is the shared configuration of jobs for ruby - -#### -# In pre-build phase, allocate a node for builds -.on_ruby: - tags: - - shell - - ruby - rules: - - if: '$CI_COMMIT_BRANCH =~ /_qnone/ || $ON_RUBY == "OFF"' #run except if ... 
- when: never - - if: '$CI_JOB_NAME =~ /release_resources/' - when: always - - when: on_success - -#### -# In pre-build phase, allocate a node for builds -# NOTE: Not specifying 'salloc -c 56' should allocate the max number of CPU cores -allocate_resources (on ruby): - variables: - GIT_STRATEGY: none - extends: .on_ruby - stage: r_allocate_resources - script: - - salloc -N 1 -p pdebug -t 45 --no-shell --job-name=${ALLOC_NAME} - -#### -# In post-build phase, deallocate resources -# Note : make sure this is run even on build phase failure -release_resources (on ruby): - variables: - GIT_STRATEGY: none - extends: .on_ruby - stage: r_release_resources - script: - - export JOBID=$(squeue -h --name=${ALLOC_NAME} --format=%A) - - ([[ -n "${JOBID}" ]] && scancel ${JOBID}) - -#### -# Generic ruby build job, extending build script -.build_and_test_on_ruby: - extends: [.build_toss_3_x86_64_ib_script, .on_ruby] - stage: r_build_and_test - -.build_and_test_on_ruby_advanced: - extends: [.build_and_test_on_ruby, .advanced_pipeline] diff --git a/.gitlab/subscribed-pipelines.yml b/.gitlab/subscribed-pipelines.yml new file mode 100644 index 0000000000..c424d3e1e4 --- /dev/null +++ b/.gitlab/subscribed-pipelines.yml @@ -0,0 +1,24 @@ +############################################################################### +# Copyright (c) 2016-22, Lawrence Livermore National Security, LLC and RADIUSS +# project contributors. See the COPYRIGHT file for details. +# +# SPDX-License-Identifier: (MIT) +############################################################################### + +# Uncomment pipelines to subscribe to a shared pipeline. + +# Trigger a build-and-test pipeline for ruby, corona and lassen +ruby-build-and-test: + variables: + CI_MACHINE: "ruby" + extends: [.build-and-test] + +corona-build-and-test: + variables: + CI_MACHINE: "corona" + extends: [.build-and-test] + +lassen-build-and-test: + variables: + CI_MACHINE: "lassen" + extends: [.build-and-test] diff --git a/.uberenv_config.json b/.uberenv_config.json index 335f4c91eb..2fc700f855 100644 --- a/.uberenv_config.json +++ b/.uberenv_config.json @@ -3,10 +3,11 @@ "package_version" : "develop", "package_final_phase" : "hostconfig", "package_source_dir" : "../..", -"spack_url": "https://github.com/spack/spack", -"spack_branch": "develop", -"spack_commit": "be1c4bc563722d0774436cc905fd938c88c61a72", +"spack_url": "https://github.com/spack/spack.git", +"spack_branch": "v0.18.1", "spack_activate" : {}, "spack_configs_path": "scripts/radiuss-spack-configs", -"spack_packages_path": "scripts/spack_packages" +"spack_packages_path": "scripts/spack_packages", +"spack_concretizer": "clingo", +"spack_setup_clingo": false } diff --git a/CMakeLists.txt b/CMakeLists.txt index 0fc49959ac..77d31fe778 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -44,14 +44,21 @@ set(CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake/thirdparty" ${CMAKE_MODULE_PA include(cmake/SetupRajaOptions.cmake) -cmake_minimum_required(VERSION 3.14.5) +if (ENABLE_HIP) + cmake_minimum_required(VERSION 3.23) +else() + cmake_minimum_required(VERSION 3.20) +endif() # Detect C++ standard and add appropriate flag _before_ loading BLT set(COMPILERS_KNOWN_TO_CMAKE33 AppleClang Clang GNU MSVC) include(CheckCXXCompilerFlag) if(NOT DEFINED BLT_CXX_STD) - if("cxx_std_17" IN_LIST CMAKE_CXX_KNOWN_FEATURES) + if("cxx_std_20" IN_LIST CMAKE_CXX_KNOWN_FEATURES) + set(BLT_CXX_STD c++20 CACHE STRING "Version of C++ standard") + message("Using C++ standard: ${BLT_CXX_STD}") + elseif("cxx_std_17" IN_LIST 
CMAKE_CXX_KNOWN_FEATURES) set(BLT_CXX_STD c++17 CACHE STRING "Version of C++ standard") message("Using C++ standard: ${BLT_CXX_STD}") elseif("cxx_std_14" IN_LIST CMAKE_CXX_KNOWN_FEATURES) @@ -216,6 +223,7 @@ if (RAJA_ENABLE_EXTERNAL_ROCPRIM) endif () if (RAJA_ENABLE_SYCL) + set (RAJA_ENABLE_DESUL_ATOMICS "On") set (raja_depends ${raja_depends} sycl) @@ -283,13 +291,13 @@ blt_add_library( install(TARGETS RAJA - EXPORT RAJA + EXPORT RAJATargets ARCHIVE DESTINATION lib LIBRARY DESTINATION lib RUNTIME DESTINATION lib ) -install(EXPORT RAJA DESTINATION lib/cmake/raja) +install(EXPORT RAJATargets DESTINATION lib/cmake/raja) target_include_directories(RAJA PUBLIC diff --git a/Dockerfile b/Dockerfile index b4d6dfc585..3b7f3c4804 100644 --- a/Dockerfile +++ b/Dockerfile @@ -41,7 +41,7 @@ FROM ghcr.io/rse-ops/clang-ubuntu-20.04:llvm-11.0.0 AS clang11 ENV GTEST_COLOR=1 COPY . /home/raja/workspace WORKDIR /home/raja/workspace/build -RUN . /opt/spack/share/spack/setup-env.sh && spack load llvm && \ +RUN . /opt/spack/share/spack/setup-env.sh && export LD_LIBRARY_PATH=/opt/view/lib:$LD_LIBRARY_PATH && \ cmake -DCMAKE_CXX_COMPILER=clang++ -DRAJA_ENABLE_TBB=On -DENABLE_OPENMP=On .. && \ make -j 6 &&\ ctest -T test --output-on-failure @@ -50,7 +50,7 @@ FROM ghcr.io/rse-ops/clang-ubuntu-20.04:llvm-11.0.0 AS clang11-debug ENV GTEST_COLOR=1 COPY . /home/raja/workspace WORKDIR /home/raja/workspace/build -RUN . /opt/spack/share/spack/setup-env.sh && spack load llvm && \ +RUN . /opt/spack/share/spack/setup-env.sh && export LD_LIBRARY_PATH=/opt/view/lib:$LD_LIBRARY_PATH && \ cmake -DCMAKE_CXX_COMPILER=clang++ -DENABLE_OPENMP=On -DCMAKE_BUILD_TYPE=Debug .. && \ make -j 6 &&\ ctest -T test --output-on-failure @@ -59,7 +59,7 @@ FROM ghcr.io/rse-ops/clang-ubuntu-22.04:llvm-13.0.0 AS clang13 ENV GTEST_COLOR=1 COPY . /home/raja/workspace WORKDIR /home/raja/workspace/build -RUN . /opt/spack/share/spack/setup-env.sh && spack load llvm && \ +RUN . /opt/spack/share/spack/setup-env.sh && export LD_LIBRARY_PATH=/opt/view/lib:$LD_LIBRARY_PATH && \ cmake -DCMAKE_CXX_COMPILER=clang++ -DENABLE_OPENMP=On -DCMAKE_BUILD_TYPE=Release .. && \ make -j 6 &&\ ctest -T test --output-on-failure @@ -88,16 +88,16 @@ RUN . /opt/spack/share/spack/setup-env.sh && spack load cuda && \ cmake -DCMAKE_BUILD_TYPE=Debug -DCMAKE_CXX_COMPILER=g++ -DENABLE_CUDA=On -DCMAKE_CUDA_STANDARD=14 -DCMAKE_CUDA_ARCHITECTURES=70 -DENABLE_OPENMP=On .. && \ make -j 4 -FROM ghcr.io/rse-ops/hip-ubuntu-20.04:hip-4.3.1 AS hip +FROM ghcr.io/rse-ops/hip-ubuntu-20.04:hip-5.1.3 AS hip ENV GTEST_COLOR=1 ENV HCC_AMDGPU_TARGET=gfx900 COPY . /home/raja/workspace WORKDIR /home/raja/workspace/build RUN . /opt/spack/share/spack/setup-env.sh && spack load hip llvm-amdgpu && \ - cmake -DCMAKE_CXX_COMPILER=amdclang++ -DRAJA_ENABLE_EXTERNAL_ROCPRIM=Off -DHIP_PATH=/opt -DENABLE_HIP=On -DENABLE_CUDA=Off -DRAJA_ENABLE_WARNINGS_AS_ERRORS=Off .. && \ + cmake -DCMAKE_CXX_COMPILER=clang++ -DHIP_PATH=/opt -DENABLE_HIP=On -DENABLE_CUDA=Off -DRAJA_ENABLE_WARNINGS_AS_ERRORS=Off .. && \ make -j 6 -FROM ghcr.io/rse-ops/intel-ubuntu-22.04:intel-2022.0.1 AS sycl +FROM ghcr.io/rse-ops/intel-ubuntu-22.04:intel-2022.1.0 AS sycl ENV GTEST_COLOR=1 COPY . 
/home/raja/workspace WORKDIR /home/raja/workspace/build diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md index 30f94b6e50..062c604d6a 100644 --- a/RELEASE_NOTES.md +++ b/RELEASE_NOTES.md @@ -20,6 +20,90 @@ Notable changes include: * Bug fixes/improvements: +Version 2022.10.0 -- Release date 2022-10-28 +============================================ + +This release contains new features, bug fixes, and build improvements. Please +see the RAJA user guide for more information about items in this release. + +Notable changes include: + + * New features / API changes: + * Introduced a new RAJA::forall and reduction interface that extend + the execution behavior of reduction operations with RAJA::forall. + The main difference with the pre-existing reduction interface in RAJA + is that reduction variables and operations are passed into the + RAJA::forall method and lambda expression instead of using the lambda + capture mechanism for reduction objects. This offers flexibility and + potential performance advantages when using RAJA reductions as the + new interface enables the ability to integrate with programming model + back-end reduction machinery directly, for OpenMP and SYCL for example. + The interface also enables user-chosen kernel names to be passed to + RAJA::forall for performance analysis annotations that are easier to + understand. Example codes are included as well as a description of + the new interface and comparison with the pre-existing interface in + the RAJA User Guide. + * Added support for run time execution policy selection for RAJA::forall + kernels. Users can specify any number of execution policies in their + code and then select which to use at run time. There is no discussion + of this in the RAJA User Guide yet. However, there are a couple of + example codes in files RAJA/examples/*dynamic-forall*.cpp. + * The RAJA::launch framework has been moved out of the experimental namespace, into the RAJA:: namespace, which introduces an API change. + * Add support for all RAJA segment types in the RAJA::launch framework. + * Add SYCL back-end support for RAJA::launch and dynamic shared memory + for all back-ends in RAJA::launch. These changes introduce API changes. + * Add additional policies to WorkGroup construct that allow for different + methods of dispatching work. + * Add special case implementations to CUDA atomicInc and atomicDec + functions to use special hardware support when available. This can + result in a significant performance boost. + * Rework HIP atomic implementations to support more native data types. + * Added RAJA_UNROLL_COUNT macro which enables users to unroll loops for + a fix unroll count. + * Major User Guide rework: + * New RAJA tutorial sections, including new exercise source files + to work through. Material used in recent RADIUSS/AWS RAJA Tutorial. + * Cleaned up and expanded RAJA feature sections to be more like a + reference guide with links to associated tutorial sections for + implementation examples. + * Improved presentation of build configuration sections. + + * Build changes / improvements: + * Submodule updates: + * BLT updated to v0.5.2 release. + * Camp updated to v2022.10.0 release. + * The minimum CMake version required has changed. For a HIP build, + CMake 3.23 or newer is required. For all other builds CMake 3.20 + or newer is required. + * OpenMP back-end support is now off by default to match behavior of + all other RAJA parallel back-end support. To enable OpenMP, users + must now run CMake with the -DENABLE_OPENMP=On option. 
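(Illustrative sketch, not part of the release notes above: the snippet below shows roughly how the new RAJA::forall reduction interface reads. The `RAJA::expt::Reduce` and `RAJA::expt::KernelName` spellings and the lambda signature are assumptions based on the interface description in the RAJA User Guide, not code taken from this diff; consult the examples shipped with the release for authoritative usage.)

```cpp
// Sketch of the params-style reduction interface: reduction targets are
// passed to RAJA::forall as arguments and received by the lambda as
// references, instead of being captured as RAJA::ReduceSum/ReduceMin objects.
#include "RAJA/RAJA.hpp"
#include <cstdio>
#include <vector>

int main()
{
  const int N = 1000;
  std::vector<double> a(N, 1.0);
  const double* a_ptr = a.data();

  double sum  = 0.0;     // reduction results land back in these variables
  double vmin = 1.0e9;

  RAJA::forall<RAJA::seq_exec>(
      RAJA::RangeSegment(0, N),
      RAJA::expt::Reduce<RAJA::operators::plus>(&sum),
      RAJA::expt::Reduce<RAJA::operators::minimum>(&vmin),
      RAJA::expt::KernelName("vector-sum-min"),  // name used for profiling annotations
      [=](int i, double& partial_sum, double& partial_min) {
        partial_sum += a_ptr[i];
        partial_min = (a_ptr[i] < partial_min) ? a_ptr[i] : partial_min;
      });

  std::printf("sum = %f, min = %f\n", sum, vmin);
  return 0;
}
```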
+ * Support OpenMP back-end enablement in a HIP build configuration. + * RAJA_ENABLE_VECTORIZATION CMake option added to enable/disable + new SIMD/SIMT vectorization support. The default is 'On'. The option + allows users to disable it if they wish. + * Improvements to build target export mechanics coordinated with camp, + BLT, and Spack projects. + * Improve HIP builds to better support the evolving ROCm software stack. + * Add CMake variable RAJA_ALLOW_INCONSISTENT_OPTIONS and CMake messages + to allow users more control when using CMake dependent options. When + CMake is run, the code now checks for cases when RAJA_ENABLE_X=On + but ENABLE_X=Off. Previously, this was confusing because X would not + be enabled despite the value of the RAJA-specific option. + * Build system refactoring to make CMake configurations more robust; added + test to check for installed CMake config. + * Added basic support to compile with the C++20 standard. + * Add missing compilation macro guards for HIP and CUDA policies in + vectorization support when not running on a GPU device. + * Various compiler warnings squashed. + + * Bug fixes / improvements: + * Expanded test coverage to catch more cases that users have run into. + * Various fixes in SIMD/SIMT support for different compilers and versions + users have hit recently. Also, changes to internal implementations to + improve run time performance for those features. + + Version 2022.03.1 -- Release date 2022-08-10 ============================================ diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 6b40fa89ac..c84a71eb18 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -1,11 +1,5 @@ -variables: - DO_BUILD: 'yes' - DO_TEST: 'yes' - DO_INSTALL: 'yes' - COMPILER: 'g++' - jobs: -- job: Windows #temporarily commenting out until cmake/azure version issue resolved +- job: Windows strategy: matrix: shared: @@ -28,15 +22,14 @@ jobs: inputs: workingDir: 'build' cmakeArgs: '--build . 
--config Release --verbose -j 4' -# - task: CmdLine@2 -# inputs: -# script: 'ctest.exe -T test -C Release' -# workingDirectory: 'build' -# condition: eq( variables['Agent.OS'], 'Windows_NT') -# - task: PublishTestResults@2 -# inputs: -# testResultsFormat: 'cTest' -# testResultsFiles: '**/Test.xml' + - task: CmdLine@2 + inputs: + script: 'ctest.exe -T test -C Release' + workingDirectory: 'build' + - task: PublishTestResults@2 + inputs: + testResultsFormat: 'cTest' + testResultsFiles: '**/Test.xml' - job: Docker timeoutInMinutes: 360 strategy: @@ -79,17 +72,6 @@ jobs: command: build dockerFile: 'Dockerfile' arguments: '--target $(docker_target)' - - script: | - CID=$(docker create llnl/raja:$(Build.BuildId)) - echo ${CID} - docker cp ${CID}:/home/axom/workspace/build local-build - docker rm ${CID} - displayName: 'Copy test artifacts' - condition: ne( variables['docker_target'], 'nvcc') - - script: | - bash <(curl -s https://raw.githubusercontent.com/codecov/codecov-bash/0b376529f626b50b7d4a9fb734e0e50d28b9b91e/codecov) >& /dev/null - displayName: 'Upload code coverage' - condition: eq( variables['docker_target'], 'gcc') - task: PublishTestResults@2 inputs: testResultsFormat: 'cTest' @@ -112,12 +94,11 @@ jobs: make -j 4 displayName: 'OSX Build' condition: eq( variables['Agent.OS'], 'Darwin') -# - script: | -# cd build -# ctest -T test --output-on-failure -# displayName: 'OSX Test' -# condition: eq( variables['Agent.OS'], 'Darwin') -# - task: PublishTestResults@2 -# inputs: -# testResultsFormat: 'cTest' -# testResultsFiles: '**/Test.xml' + - script: | + cd build + ctest -T test --output-on-failure + displayName: 'OSX Test' + - task: PublishTestResults@2 + inputs: + testResultsFormat: 'cTest' + testResultsFiles: '**/Test.xml' diff --git a/benchmark/ltimes.cpp b/benchmark/ltimes.cpp index 2f059265d9..6720cda059 100644 --- a/benchmark/ltimes.cpp +++ b/benchmark/ltimes.cpp @@ -473,7 +473,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) timer.start(); for (int iter = 0;iter < num_iter;++ iter){ - RAJA::launch(RAJA::HOST, RAJA::Grid(), [=](RAJA::LaunchContext ctx){ + RAJA::launch(RAJA::ExecPlace::HOST, RAJA::LaunchParams(), [=](RAJA::LaunchContext ctx){ RAJA::loop(ctx, RAJA::TypedRangeSegment(0, num_g), [&](IG g){ RAJA::loop(ctx, RAJA::TypedRangeSegment(0, num_z), [&](IZ z){ @@ -1239,8 +1239,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) for (int iter = 0;iter < num_iter;++ iter){ RAJA::launch( - RAJA::DEVICE, - RAJA::Grid(RAJA::Teams(160, 1, 1), + RAJA::ExecPlace::DEVICE, + RAJA::LaunchParams(RAJA::Teams(160, 1, 1), RAJA::Threads(8, 64, 1)), [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { @@ -1380,8 +1380,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) printf("num_iter=%d\n", (int)num_iter); for (int iter = 0;iter < num_iter;++ iter){ RAJA::launch( - RAJA::DEVICE, - RAJA::Grid(RAJA::Teams(num_g, 1, 1), + RAJA::ExecPlace::DEVICE, + RAJA::LaunchParams(RAJA::Teams(num_g, 1, 1), RAJA::Threads(32, 32, 1)), [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { diff --git a/blt b/blt index 296bf64e64..97ea54d892 160000 --- a/blt +++ b/blt @@ -1 +1 @@ -Subproject commit 296bf64e64edfcfcce6a53e3b396d6529e76b986 +Subproject commit 97ea54d892b4b1d56736830575c3db62e3d7674d diff --git a/cmake/SetupDependentOptions.cmake b/cmake/SetupDependentOptions.cmake index 163608071a..f5a3966bf9 100644 --- a/cmake/SetupDependentOptions.cmake +++ b/cmake/SetupDependentOptions.cmake @@ -9,6 +9,18 @@ ## Here are the CMake dependent options in RAJA. 
## +set(RAJA_DEPENDENT_OPTIONS ENABLE_OPENMP ENABLE_CUDA ENABLE_HIP ENABLE_CLANG_CUDA ENABLE_COVERAGE ENABLE_TESTS ENABLE_EXAMPLES ENABLE_BENCHMARKS) +foreach (option ${RAJA_DEPENDENT_OPTIONS}) + if (${RAJA_${option}}) + if (NOT ${option}) + if (RAJA_ALLOW_INCONSISTENT_OPTIONS) + message(WARNING "RAJA_${option} set to On, but ${option} is Off. Please set ${option} to On to enable this feature.") + else () + message(FATAL_ERROR "RAJA_${option} set to On, but ${option} is Off. Please set ${option} to On enable this feature.") + endif () + endif () + endif () +endforeach () cmake_dependent_option(RAJA_ENABLE_OPENMP "Build with OpenMP support" On "ENABLE_OPENMP" Off) cmake_dependent_option(RAJA_ENABLE_CUDA "Build with CUDA support" On "ENABLE_CUDA" Off) diff --git a/cmake/SetupPackages.cmake b/cmake/SetupPackages.cmake index 8c76eb3b74..45dadbee4b 100644 --- a/cmake/SetupPackages.cmake +++ b/cmake/SetupPackages.cmake @@ -82,7 +82,7 @@ if (RAJA_ENABLE_HIP) endif() if (RAJA_ENABLE_EXTERNAL_ROCPRIM) - find_package(RocPRIM) + include(cmake/thirdparty/FindRocPRIM.cmake) if (ROCPRIM_FOUND) blt_import_library( NAME rocPRIM @@ -105,21 +105,35 @@ if (RAJA_ENABLE_HIP AND RAJA_ENABLE_ROCTX) endif () set(TPL_DEPS) -blt_list_append(TO TPL_DEPS ELEMENTS cuda cuda_runtime IF RAJA_ENABLE_CUDA) blt_list_append(TO TPL_DEPS ELEMENTS nvtoolsext IF RAJA_ENABLE_NV_TOOLS_EXT) blt_list_append(TO TPL_DEPS ELEMENTS cub IF RAJA_ENABLE_EXTERNAL_CUB) -blt_list_append(TO TPL_DEPS ELEMENTS blt_hip blt_hip_runtime IF RAJA_ENABLE_HIP) blt_list_append(TO TPL_DEPS ELEMENTS rocPRIM IF RAJA_ENABLE_EXTERNAL_ROCPRIM) -blt_list_append(TO TPL_DEPS ELEMENTS openmp IF RAJA_ENABLE_OPENMP) -blt_list_append(TO TPL_DEPS ELEMENTS mpi IF RAJA_ENABLE_MPI) + +set(RAJA_NEEDS_BLT_TPLS False) +if (RAJA_ENABLE_CUDA OR RAJA_ENABLE_HIP OR RAJA_ENABLE_OPENMP OR RAJA_ENABLE_MPI) + set(RAJA_NEEDS_BLT_TPLS True) +endif () + +if (RAJA_NEEDS_BLT_TPLS) + if (NOT BLT_EXPORTED) + set(BLT_EXPORTED On CACHE BOOL "" FORCE) + blt_import_library(NAME blt_stub EXPORTABLE On) + set_target_properties(blt_stub PROPERTIES EXPORT_NAME blt::blt_stub) + install(TARGETS blt_stub + EXPORT bltTargets) + blt_export_tpl_targets(EXPORT bltTargets NAMESPACE blt) + install(EXPORT bltTargets + DESTINATION lib/cmake/raja) + endif() +endif () foreach(dep ${TPL_DEPS}) # If the target is EXPORTABLE, add it to the export set get_target_property(_is_imported ${dep} IMPORTED) if(NOT ${_is_imported}) install(TARGETS ${dep} - EXPORT RAJA - DESTINATION lib) + EXPORT RAJATargets + DESTINATION lib/cmake/raja) # Namespace target to avoid conflicts set_target_properties(${dep} PROPERTIES EXPORT_NAME RAJA::${dep}) endif() diff --git a/cmake/SetupRajaOptions.cmake b/cmake/SetupRajaOptions.cmake index 50e2f18c5d..934e720c41 100644 --- a/cmake/SetupRajaOptions.cmake +++ b/cmake/SetupRajaOptions.cmake @@ -5,9 +5,6 @@ # SPDX-License-Identifier: (BSD-3-Clause) ################################################################################ -# Enable OpenMP by by default -set(RAJA_ENABLE_OPENMP On CACHE BOOL "Build OpenMP support") - set(RAJA_ENABLE_WARNINGS_AS_ERRORS Off CACHE BOOL "") set(ENABLE_GTEST_DEATH_TESTS On CACHE BOOL "Enable tests asserting failure.") @@ -18,6 +15,8 @@ option(RAJA_ENABLE_TBB "Build TBB support" Off) option(RAJA_ENABLE_TARGET_OPENMP "Build OpenMP on target device support" Off) option(RAJA_ENABLE_SYCL "Build SYCL support" Off) +option(RAJA_ENABLE_VECTORIZATION "Build experimental vectorization support" On) + option(RAJA_ENABLE_REPRODUCERS "Build issue reproducers" Off) 
option(RAJA_ENABLE_EXERCISES "Build exercises " On) @@ -30,7 +29,7 @@ option(RAJA_ENABLE_BOUNDS_CHECK "Enable bounds checking in RAJA::Views/Layouts" option(RAJA_TEST_EXHAUSTIVE "Build RAJA exhaustive tests" Off) option(RAJA_TEST_OPENMP_TARGET_SUBSET "Build subset of RAJA OpenMP target tests when it is enabled" On) option(RAJA_ENABLE_RUNTIME_PLUGINS "Enable support for loading plugins at runtime" Off) -option(RAJA_ENABLE_HIP_INDIRECT_FUNCTION_CALL "Enable use of device function pointers in hip backend" OFF) +option(RAJA_ALLOW_INCONSISTENT_OPTIONS "Enable inconsistent values for ENABLE_X and RAJA_ENABLE_X options" Off) option(RAJA_ENABLE_DESUL_ATOMICS "Enable support of desul atomics" Off) set(DESUL_ENABLE_TESTS Off CACHE BOOL "") diff --git a/docs/sphinx/dev_guide/branch_development.rst b/docs/sphinx/dev_guide/branch_development.rst index e4092a26c3..16723645fe 100644 --- a/docs/sphinx/dev_guide/branch_development.rst +++ b/docs/sphinx/dev_guide/branch_development.rst @@ -29,14 +29,14 @@ Persistent, Protected Branches --------------------------------- The **main** and **develop** branches are the two primary branches we use. -They always exist and are protected in the RAJA GitHub project in that -changes to them only occur as a result of approved pull requests. The +They always exist and are protected in the RAJA GitHub project, meaning that +changes to them can only occur as a result of approved pull requests. The distinction between the main and develop branches is an important part of Gitflow. * The *main* branch records the release history of the project. Each time the main branch is changed, a new tag for a new code version is made. - See :ref:`semver-label` for a description of the version numbering scheme + See :ref:`version-label` for a description of the version labeling scheme we use. * The *develop* branch is used to integrate and test new features and most @@ -45,15 +45,15 @@ Gitflow. .. important:: **Development never occurs directly on the main branch or develop branch.** -All other branches in the RAJA repo are temporary and are used to perform -specific development tasks. When such a branch is no longer needed (e.g., -after it is merged), the branch is deleted typically. +All other branches are temporary and are used to perform specific development +tasks. When such a branch is no longer needed (e.g., after it is merged), the +branch is deleted typically. ---------------- Feature Branches ---------------- -*Feature* branches are created off of other branches (usually develop) and are +A *feature* branch is created from another branch (usually develop) and is used to develop new features, bug fixes, etc. before they are merged to develop and eventually main. *Feature branches are temporary*, living only as long as they are needed to complete development tasks they contain. @@ -78,8 +78,9 @@ When all issues and comments arising in PR review discussion have been addressed, the PR has been approved, and all continuous integration checks have passed, the pull request can be merged. -.. important:: **Feature branches never interact directly with the main - branch.** +.. important:: **Feature branches almost never interact directly with the main + branch.** One exception is when a bug fix is needed in + the main branch to tag a patch release. --------------------------- Other Important Branches @@ -95,7 +96,7 @@ Gitflow Illustrated The figure below shows the basics of how branches interact in Gitflow. -.. figure:: git-workflow-gitflow2.png +.. 
figure:: ./figures/git-workflow-gitflow2.png This figure shows typical interactions between key branches in the Gitflow workflow. Here, development is shown following the v0.1.0 release. While diff --git a/docs/sphinx/dev_guide/build_configurations.rst b/docs/sphinx/dev_guide/build_configurations.rst index a554b977de..fc43632188 100644 --- a/docs/sphinx/dev_guide/build_configurations.rst +++ b/docs/sphinx/dev_guide/build_configurations.rst @@ -6,40 +6,43 @@ .. ## SPDX-License-Identifier: (BSD-3-Clause) .. ## -.. host_config: +.. _build_config-label: ************************** RAJA Build Configurations ************************** -RAJA must be built and tested with a wide range of compilers and with -all of its supported back-ends. The project currently maintains two -ways to build and test important configurations in a reproducible manner: +To meet user needs, RAJA is built and tested with a wide range of compilers for +all of its supported back-ends. Automated continuous integration (CI) testing +employed by the project is described in :ref:`ci-label`. During day-to-day +development, the project currently maintains two ways to build and test +configurations in a reproducible manner: * **Build scripts.** The RAJA source repository contains a collection of simple build scripts that are used to generate build configurations - for platforms in the Livermore Computing Center primarily. + for a variety of platforms, such as Livermore Computing (LC) systems, + MacOS, and Linux environments. * **Generated host-config files.** The RAJA repository includes a mechanism to generate *host-config* files (i.e., CMake cache files) using `Spack `_. -Each of these specifies compiler versions and options, a build target -(Release, Debug, etc.), RAJA features to enable (OpenMP, CUDA, etc.), +The configurations specify compiler versions and options, build targets +(Release, Debug, etc.), RAJA features to enable (OpenMP, CUDA, HIP, etc.), and paths to required tool chains, such as CUDA, ROCm, etc. They are described briefly in the following sections. +.. _build_scripts-label: + =================== RAJA Build Scripts =================== -The build scripts in the RAJA ``scripts`` directory are used mostly by RAJA -developers to quickly create a build environment to compile and run tests -during code development. - -Each script is executed from the top-level RAJA directory. THe scripts for -CPU-only platforms require an argument that indicate the compiler version. +Build scripts mentioned above live in the +`RAJA/scripts `_ directory. +Each script is executed from the top-level RAJA directory. The scripts for +CPU-only platforms require an argument that indicates the compiler version. For example, .. code-block:: bash @@ -47,18 +50,18 @@ For example, $ ./scripts/lc-builds/toss3_clang.sh 10.0.1 Scripts for GPU-enabled platforms require three arguments: the device -compiler version, followed by the compute architecture, followed by the host +compiler version, the target compute architecture, and the host compiler version. For example, .. code-block:: bash $ ./scripts/lc-builds/blueos_nvcc_gcc.sh 10.2.89 sm_70 8.3.1 -When a script is run, it creates a uniquely-named build directory in the -top-level RAJA directory and runs CMake with arguments contained in the script -to create a build environment in the new directory. One then goes into that -directory and runs make to build RAJA, its tests, example codes, etc. 
-For example, +When a script is run, it creates a build directory named for the configuration +in the top-level RAJA directory and runs CMake with arguments contained in the +script to create a build environment in the new directory. One then goes into +that directory and runs 'make' to build RAJA, and depending on options +passed to CMake RAJA tests, example codes, etc. For example, .. code-block:: bash @@ -67,29 +70,26 @@ For example, $ make -j $ make test -Eventually, these scripts may go away and be superceded by the Spack-based -host-config file generation process when that achieves the level of -compiler coverage that the scripts have. +.. _spack_host_config-label: - -============================ -Generated Host-Config Files -============================ +================================== +Spack-Generated Host-Config Files +================================== The RAJA repository contains two submodules `uberenv `_ and `radiuss-spack-configs `_ that work together to generate host-config files. These are projects in the -GitHub LLNL organization and contain utilities shared by various projects. -The main uberenv script can be used to drive Spack to generate a *host-config* -file that contains all the information required to define a RAJA build -environment. The host-config file can then be passed to CMake using the '-C' -option to create a build configuration. *Spack specs* defining compiler -configurations are maintained in files in the radiuss-spack-configs -repository. +GitHub LLNL organization and contain utilities shared and maintained by +various projects. The main uberenv script is used to drive Spack to generate +a *host-config* file (i.e., a CMake *cache* file) that contains all the +information required to define a RAJA build environment. The generated file +can then be passed to CMake using the '-C' option to create a build +configuration. *Spack specs* defining compiler configurations are maintained +in files in the radiuss-spack-configs repository. -RAJA shares its uberenv workflow with other projects. The documentation -for this is available in `RADIUSS Uberenv Guide `_. +Additional documentation for this process is available in the +`RADIUSS Uberenv Guide `_. Generating a RAJA host-config file @@ -97,34 +97,38 @@ Generating a RAJA host-config file This section describes the host-config file generation process for RAJA. -Machine specific configurations +Platform configurations ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Compiler configurations for Livermore computer platforms are contained in -in sub-directories in the RAJA ``scripts/uberenv/spack_configs`` directory: +Compiler configurations for Livermore computer platforms are contained +in sub-directories of the ``RAJA/scripts/radiuss-spack-configs`` submodule +directory: .. code-block:: bash - $ ls -c1 ./scripts/uberenv/spack_configs + $ ls -c1 ./scripts/radiuss-spack-configs + toss_4_x86_64_ib_cray + toss_4_x86_64_ib + toss_3_x86_64_ib blueos_3_ppc64le_ib darwin - toss_3_x86_64_ib - blueos_3_ppc64le_ib_p9 config.yaml + blueos_3_ppc64le_ib_p9 + ... -To see currently supported configurations, please see the contents of the -``compilers.yaml`` file in each of these sub-directories. +To see available configurations, please see the contents of the +``compilers.yaml`` and ``packages.yaml`` files in each sub-directory. 
Generating a host-config file ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -The main uberenv python script can be invoked from the top-level RAJA directory +The ``uberenv.py`` python script can be run from the top-level RAJA directory to generate a host-config file for a desired configuration. For example, .. code-block:: bash - $ python ./scripts/uberenv/uberenv.py --spec="%gcc@8.1.0" - $ python ./scripts/uberenv/uberenv.py --spec="%gcc@8.1.0~shared+openmp tests=benchmarks" + $ python3 ./scripts/uberenv/uberenv.py --spec="%gcc@8.1.0" + $ python3 ./scripts/uberenv/uberenv.py --spec="%gcc@8.1.0~shared+openmp tests=benchmarks" Each command generates a corresponding host-config file in the top-level RAJA directory. The file name contains the platform and OS to which it applies, and @@ -134,17 +138,16 @@ the compiler and version. For example, hc-quartz-toss_3_x86_64_ib-gcc@8.1.0-fjcjwd6ec3uen5rh6msdqujydsj74ubf.cmake -Specs that are exercised during the Gitlab CI process are found YAML files in -the ``RAJA/.gitlab`` directory. See :ref:`vettedspecs-label` for more -information. +This process is also used by our Gitlab CI testing effort. +See :ref:`ci-label` for more information. Building RAJA with a generated host-config file ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ To build RAJA with one of these host-config files, create a build directory and -run CMake in it by passing the host-config file to CMake using the '-C' option. -Then, run make and RAJA tests, if desired, to make sure the build was done -properly: +run CMake in it by passing a host-config file to CMake using the '-C' option. +Then, run 'make' to build RAJA. To ensure the build was successful, you may +want to run the RAJA tests. For example, .. code-block:: bash @@ -153,8 +156,16 @@ properly: $ cmake --build -j . $ ctest --output-on-failure -T test -It is also possible to use the configuration with a RAJA CI script outside -of the normal CI process: +You may also run the RAJA tests with the command + +.. code-block:: bash + + $ make test + +as an alternative to the 'ctest' command used above. + +It is also possible to use the configuration with the RAJA Gitlab CI script +outside of the Gitlab environment: .. code-block:: bash @@ -164,20 +175,25 @@ MacOS ^^^^^ In RAJA, the Spack configuration for MacOS contains the default compiler -corresponding to the OS version (`compilers.yaml`), and a commented section to -illustrate how to add `CMake` as an external package. You may install CMake +corresponding to the OS version in the ``compilers.yaml`` file in the +``RAJA/scripts/radiuss-spack-configs/darwin/`` directory, and a commented +section to illustrate how to add `CMake` as an external package in the +``packages.yaml`` in the same directory. You may also install CMake with `Homebrew `_, for example, and follow the process outlined above after it is installed. -============================ -Reproducing Docker Builds -============================ +.. _docker_local-label: -RAJA uses docker container images that it shares with other LLNL GitHub projects -for CI testing on GitHub. Currently, we use Azure for Linux, Windows, and MacOS -builds and also have Appveyor builds for Windows. +================================== +Reproducing Docker Builds Locally +================================== -You can reproduce these builds locally for testing with the following steps: +RAJA uses Docker container images that it shares with other LLNL GitHub projects +for Azure CI testing (see :ref:`azure_ci-label` for more information). 
+We use Azure Pipelines for Linux, Windows, and MacOS builds. + +You can reproduce these builds locally for testing with the following steps if +you have Docker installed. + +#. Run the command to build a local Docker image: + @@ -185,8 +201,8 @@ You can reproduce these builds locally for testing with the following steps: $ DOCKER_BUILDKIT=1 docker build --target ${TARGET} --no-cache - Here, ${TARGET} is replaced with one of the names following "AS" in the - `RAJA Dockerfile `_ + Here, ``${TARGET}`` is replaced with one of the names following ``AS`` in + the `RAJA Dockerfile `_. #. To get dropped into a terminal in the Docker image, run the following: @@ -195,10 +211,15 @@ You can reproduce these builds locally for testing with the following steps: $ docker run -it axom/compilers:${COMPILER} /bin/bash - Here, ${COMPILER} is replaced with the compiler you want (see the + Here, ``${COMPILER}`` is replaced with the compiler you want (see the aforementioned Dockerfile). Then, you can build, run tests, edit files, etc. in the Docker image. Note that -the docker command has a '-v' argument that you can use to mount your local -directory in the image; e.g., -v `pwd`:/opt/RAJA would mount the pwd as -/opt/RAJA in the image. +the docker command has a ``-v`` argument that you can use to mount a local +directory in the image. For example + + .. code-block:: bash + + $ docker run -v $(pwd):/opt/RAJA -it axom/compilers:${COMPILER} /bin/bash + +will mount your current local directory as ``/opt/RAJA`` in the image. diff --git a/docs/sphinx/dev_guide/ci.rst b/docs/sphinx/dev_guide/ci.rst index 2ac2876ccb..38634a1301 100644 --- a/docs/sphinx/dev_guide/ci.rst +++ b/docs/sphinx/dev_guide/ci.rst @@ -12,78 +12,348 @@ Continuous Integration (CI) Testing ************************************ -The RAJA project employs multiple tools to run its tests for each GitHub -*pull request*, all of which must pass before the pull request can be merged. -These tools include: +.. important:: * All CI test checks must pass before a pull request can be + merged. + * The status (pass/fail and run) for all checks can be viewed by + clicking the appropriate link in the **checks** section of a + GitHub pull request. - * **Azure.** This runs builds for Linux, Windows, and MacOS environments - using a variety of compilers. While we do GPU builds for CUDA, HIP, and - SYCL on Azure, RAJA tests are run for each non-GPU build. +The CI tools used by the RAJA project, which integrate with GitHub, are: - * **Appveyor.** This runs builds and tests for a Windows environment for two - versions of the Visual Studio compiler. + * **Azure Pipelines** runs builds and tests for Linux, Windows, and MacOS + environments using recent versions of various compilers. While we do GPU + builds for CUDA, HIP, and SYCL on Azure, RAJA tests are only run for + CPU-only pipelines. See the + `RAJA Azure DevOps `_ project to learn + more about our testing there. - * **Gitlab CI.** This runs builds and tests on platforms in the Livermore - Computing *Collaboration Zone*. This is a recent addition for RAJA and - is a work-in-progress to get full coverage of compilers and tests we - need to exercise. + * **Gitlab** instances in the Livermore Computing (LC) Center + run builds and tests in LC resource and compiler environments + important to many RAJA user applications. Execution of RAJA CI + pipelines on LC Gitlab resources has restrictions described below. If + you have access to LC resources, you can access additional information about + `LC GitLab CI `_ -These tools integrate seamlessly with GitHub. 
They will automatically -(re)run RAJA builds and tests as changes are pushed to each PR branch. Gitlab -CI execution on Livermore Computing resources has some restrictions which are -described below. +The tools automatically run RAJA builds and tests when a PR is created and +when changes are pushed to a PR branch. -Gitlab CI support is still being developed to make it more easy to use with -GitHub projects. The current state is described below. +The following sections describe basic elements of the operation of the CI tools. -.. note:: The status of checks (pass/fail, running status) for each of these - tools can be viewed by clicking the appropriate link in the check - section of a pull request. +.. _gitlab_ci-label: +========= Gitlab CI ========= -If all memmbers of a GitHub project are members of the LLNL GitHub organization -and have two-factor authentication enabled on their GitHub accounts, -auto-mirroring on the Livermore Computing Collaboration Zone Gitlab server is -enabled. Thus, Gitlab CI will run automatically for those projects on pull -requests that are made by project members. Otherwise, due to Livermore -Computing security policies, Gitlab CI must be launched manually by a *blessed* -GitHub user satisfying the constraints described above. To manually initiate -Gitlab CI on a pull request, add a comment with 'LGTM' in it. +The Gitlab CI instance used by the RAJA project lives in the Livermore +Computing (LC) Collaboration Zone (CZ). It runs builds and tests in LC +resource and compiler environments important to RAJA user applications at LLNL. + +Constraints +----------- + +Running Gitlab CI on Livermore Computing (LC) resources is constrained by LC +security policies. The policies require that all members of a GitHub project +be members of the LLNL GitHub organization and have two-factor authentication +enabled on their GitHub accounts to automatically mirror a GitHub repo and +trigger Gitlab CI functionality from GitHub. For compliant LLNL GitHub projects, +auto-mirroring of the GitHub repo on LC Gitlab is done when changes are pushed +to PRs for branches in the RAJA repo, but not for PRs for a branch on a fork of +the repo. An alternative procedure we use to handle this is described in +:ref:`contributing-label`. If you have access to LC resources, you can learn +more about `LC Gitlab mirroring `_. + +Gitlab CI (LC) Testing Workflow +-------------------------------------- + +The figure below shows the high-level steps in the RAJA Gitlab CI testing +process. The main steps, which we will discuss in more detail later, are: + + #. A *mirror* of the RAJA GitHub repo in the RAJA Gitlab project is updated + whenever the RAJA ``develop`` or ``main`` branches are changed as well + as when any PR branch in the RAJA GitHub project is changed. + #. Gitlab launches CI test pipelines. While running, the execution and + pass/fail status may be viewed and monitored in the Gitlab CI GUI. + #. For each resource and compiler combination, + `Spack `_ is used to generate a build + configuration in the form of a CMake cache file, or *host-config* file. + #. A host-config file is passed to CMake, which configures a RAJA build + space. Then, RAJA and its tests are compiled. + #. Next, the RAJA tests are run. + #. When a test pipeline completes, final results are reported in Gitlab. + +In the next section, we will describe the roles that specific files in the +RAJA repo play in defining these steps. + +.. 
figure:: ./figures/RAJA-Gitlab-Workflow2.png + + The main steps in the RAJA Gitlab CI testing workflow are shown in the + figure. This process is triggered when a developer makes a PR on the + GitHub project or whenever changes are pushed to the source branch of a PR. + +Gitlab CI (LC) Testing Files +-------------------------------------- + +The following figure shows directories and files in the RAJA repo that +support LC Gitlab CI testing. Files with names in blue are specific to RAJA +and are maintained by the RAJA team. Directories and files with names in red are +in Git submodules, shared and maintained with other projects. + +.. figure:: ./figures/RAJA-Gitlab-Files.png + + The figure shows directories and files in the RAJA repo that support Gitlab + CI testing. Files in blue are specific to RAJA and owned by the RAJA team. + Red directories and files are part of Git submodules shared with other + projects. + +In the following sections, we discuss how these files are used in the +steps in the RAJA Gitlab CI testing process summarized above. + +Launching CI pipelines (step 2) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +In **step 2** of the diagram above, Gitlab launches RAJA test pipelines. +The `RAJA/.gitlab-ci.yml `_ file contains high-level testing information, +such as stages (resource allocation, build-and-test, and resource +deallocation) and locations of files that define which jobs will run +in each pipeline. For example, these items appear in the file as:: + + stages: + - r_allocate_resources + - r_build_and_test + - r_release_resources + - l_build_and_test + - c_build_and_test + - multi_project + +and:: + + include: + - local: .gitlab/ruby-templates.yml + - local: .gitlab/ruby-jobs.yml + - local: .gitlab/lassen-templates.yml + - local: .gitlab/lassen-jobs.yml + - local: .gitlab/corona-templates.yml + - local: .gitlab/corona-jobs.yml + +In the ``stages`` section above, prefixes 'r_', 'l_', and 'c_' refer to +resources in the LC on which tests are run. Specifically, the machines 'ruby', +'lassen', and 'corona', respectively. Jobs that will run in pipeline(s) on each +resource are defined in the files listed in the ``include`` section above. +Note that the stage labels above appear on each Gitlab CI run web page as the +title of a column containing other information about what is run in that stage, +such as build and test jobs. + +The `RAJA/.gitlab `_ +directory contains a *templates* and *jobs* file for each LC resource on which +test pipelines will be run. The ``-templates.yml`` files contain +information that is common across jobs that run on the corresponding resource, +such as commands and scripts that are run for stages identified in the +``RAJA/.gitlab-ci.yml`` file. For example, the +``RAJA/.gitlab/ruby-templates.yml`` file contains a section:: + + allocate_resources (on ruby): + variables: + GIT_STRATEGY: none + extends: .on_ruby + stage: r_allocate_resources + script: + - salloc -N 1 -p pdebug -t 45 --no-shell --job-name=${ALLOC_NAME} + +which contains the resource allocation command associated with the +``r_allocate_resources`` stage identifier on 'ruby'. Analogous stages are +defined similarly in other ``RAJA/.gitlab/-templates.yml`` files. + +The ``-jobs.yml`` files are described in the following sections. 
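For a quick inventory of what a templates file sets up on a particular machine, it is usually enough to search it for the keys discussed above. This is a sketch; it assumes the ``ruby-templates.yml`` file shown above and simply lists the stage each job belongs to, the template it extends, and the ``script`` keyword that introduces its commands.

.. code-block:: bash

   $ grep -E "stage:|extends:|script:" .gitlab/ruby-templates.yml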
+ +Running a CI build/test pipeline (steps 3, 4, 5, 6) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The `RAJA/scripts/gitlab/build_and_test.sh `_ file defines the steps executed +for each build and test run as well as information that will appear in the +log output for each step. First, the script invokes the +``RAJA/scripts/uberenv/uberenv.py`` Python script located in the +`uberenv `_ submodule:: + + ... + + python3 scripts/uberenv/uberenv.py --spec="${spec}" ${prefix_opt} + + ... + +Project specific settings related to which Spack version to use, where +Spack packages live, etc. are located in the +`RAJA/.uberenv_config.json `_ file. + +The uberenv python script invokes Spack to generate a CMake *host-config* +file containing a RAJA build specification **(step 3)**. To generate +a *host-config* file, Spack uses the +`RAJA Spack package `_, plus *Spack spec* information. +The ``RAJA/.gitlab/-jobs.yml`` file defines a build specification +(*Spack spec*) for each job that will be run on the corresponding resource. +For example, in the ``lassen-jobs.yml`` file, you will see an entry such as:: + + gcc_8_3_1_cuda_10_1_168: + variables: + SPEC: "+cuda %gcc@8.3.1 cuda_arch=70 ^cuda@10.1.168" + extends: .build_and_test_on_lassen + +This defines the *Spack spec* for the test job in which CUDA device code will +be built with the nvcc 10.1.168 compiler and non-device code will be compiled +with the GNU 8.3.1 compiler. In the Gitlab CI GUI, this pipeline will be +labeled ``gcc_8_3_1_cuda_10_1_168``. Details for compilers, such as file +system paths, target architecture, etc. are located in the +``RAJA/scripts/radiuss-spack-configs//compilers.yaml`` file for the +system type associated with the resource. Analogous information for packages +like CUDA and ROCm (HIP) are located in the corresponding +``RAJA/scripts/radiuss-spack-configs//packages.yaml`` file. + +.. note:: Please see :ref:`spack_host_config-label` for more information about + Spack-generated host-config files and how to use them for local + debugging. + +After the host-config file is generated, the +``scripts/gitlab/build_and_test.sh`` script creates a build space directory +and runs CMake in it, passing the host-config (cache) file. Then, it builds +the RAJA code and tests **(step 4)**:: + + ... + + build_dir="${build_root}/build_${hostconfig//.cmake/}" + + ... + + date + echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + echo "~ Host-config: ${hostconfig_path}" + echo "~ Build Dir: ${build_dir}" + echo "~ Project Dir: ${project_dir}" + echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + echo "" + + echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + echo "~~~~~ Building RAJA" + echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + + rm -rf ${build_dir} 2>/dev/null + mkdir -p ${build_dir} && cd ${build_dir} + + ... + + cmake \ + -C ${hostconfig_path} \ + ${project_dir} + + cmake --build . -j ${core_counts[$truehostname]} + +Next, it runs the tests **(step 5)**:: + + echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + echo "~~~~~ Testing RAJA" + echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + + ... + + cd ${build_dir} + + ... 
+ + ctest --output-on-failure -T test 2>&1 | tee tests_output.txt + +Lastly, the script packages the test results into a JUnit XML file that +Gitlab uses for reporting the results in its GUI **(step 6)**:: + + echo "Copying Testing xml reports for export" + tree Testing + xsltproc -o junit.xml ${project_dir}/blt/tests/ctest-to-junit.xsl Testing/*/Test.xml + mv junit.xml ${project_dir}/junit.xml + +The commands shown here intermingle with other commands that emit messages, +timing information for various operations, etc. which appear in a log +file that can be viewed in the Gitlab GUI. + +.. _azure_ci-label: + +================== +Azure Pipelines CI +================== + +The Azure Pipelines tool builds and tests for Linux, Windows, and MacOS +environments. While we do builds for CUDA, HIP, and SYCL RAJA back-ends +in the Azure Linux environment, RAJA tests are only run for CPU-only pipelines. + +Azure Pipelines Testing Workflow +-------------------------------- + +The Azure Pipelines testing workflow for RAJA is much simpler than the Gitlab +testing process described above. + +The test jobs we run for each OS environment are specified in the +`RAJA/azure-pipelines.yml `_ file. This file defines the job steps, commands, +compilers, etc. for each OS environment in the associated ``- job:`` section. +A summary of the configurations we build are: + + * **Windows.** The ``- job: Windows`` Windows section contains information + for the Windows test builds. For example, we build and test RAJA as + a static and shared library. This is indicated in the Windows ``strategy`` + section:: + + strategy: + matrix: + shared: + ... + static: + ... + + We use the Windows/compiler image provided by the Azure application + indicated the ``pool`` section; for example:: -It is important to note that RAJA shares its Gitlab CI workflow with -other projects. See `Shared Gitlab CI Workflow `_ for more information. + pool: + vmImage: 'windows-2019' + **MacOS.** The ``- job: Mac`` section contains information for Mac test + builds. For example, we build RAJA using the the MacOS/compiler + image provided by the Azure application indicated in the ``pool`` section; + for example:: -.. _vettedspecs-label: + pool: + vmImage: 'macOS-latest' -Vetted Specs ------------- + **Linux.** The ``- job: Docker`` section contains information for Linux + test builds. We build and test RAJA using Docker container images generated + with recent versions of various compilers. The RAJA project shares these + images with other open-source LLNL RADIUSS projects and they are maintained + in the `RES-ops Docker `_ + project on GitHub. The builds we do at any point in time are located in + the ``strategy`` block:: -The *vetted* compiler specs are those which we use during the RAJA Gitlab CI -testing process. These can be viewed by looking at files in the RAJA -``.gitlab`` directory. For example, + strategy: + matrix: + gccX: + docker_target: ... + ... + clangY: + docker_target: ... + ... + nvccZ: + docker_target: ... -.. code-block:: bash + ... - $ ls -c1 .gitlab/*jobs.yml - .gitlab/lassen-jobs.yml - .gitlab/ruby-jobs.yml + The Linux OS the docker images are run on is indicated in the ``pool`` section; + for example:: -lists the yaml files containing the Gitlab CI jobs for the lassen and ruby -machines. + pool: + vmImage: 'ubuntu-latest' -Then, executing a command such as: +Docker Builds +------------- -.. 
code-block:: bash +For each Linux/Docker pipeline, the base container images, CMake, build, and +test commands are located in `RAJA/Dockerfile `_. - $ git grep -h "SPEC" .gitlab/ruby-jobs.yml | grep "gcc" - SPEC: "%gcc@4.9.3" - SPEC: "%gcc@6.1.0" - SPEC: "%gcc@7.3.0" - SPEC: "%gcc@8.1.0" +The base container images are built and maintained through the `RSE-Ops `_ RADIUSS project. A table of the most up to date containers can be found `here `_. These images are rebuilt regularly ensuring that we have the most up to date builds of each container / compiler. -will list the specs vetted on the ruby platform. +.. note:: Please see :ref:`docker_local-label` for more information about + reproducing Docker builds locally for debugging purposes. -More details to come... diff --git a/docs/sphinx/dev_guide/ci_tasks.rst b/docs/sphinx/dev_guide/ci_tasks.rst new file mode 100644 index 0000000000..ebf813cc97 --- /dev/null +++ b/docs/sphinx/dev_guide/ci_tasks.rst @@ -0,0 +1,233 @@ +.. ## +.. ## Copyright (c) 2016-22, Lawrence Livermore National Security, LLC +.. ## and RAJA project contributors. See the RAJA/LICENSE file +.. ## for details. +.. ## +.. ## SPDX-License-Identifier: (BSD-3-Clause) +.. ## + +.. _ci_tasks-label: + +****************************************************** +Continuous Integration (CI) Testing Maintenance Tasks +****************************************************** + +In :ref:`ci-label`, we described RAJA CI workflows. This section +describes common CI testing maintenance tasks for RAJA and how to +perform them. + +.. _gitlab_ci_tasks-label: + +================= +Gitlab CI Tasks +================= + +The tasks in this section apply to GitLab CI running on Livermore +Computing (LC) resources. + +Changing Build Specs +--------------------- + +The builds for each LC platform on which we run Gitlab CI pipelines are +defined in ``-jobs.yml`` files in the `RAJA/.gitlab `_ directory. The key items +that change when a new build is added are: + + * the unique **label** that identifies the build on a web page for + a Gitlab CI pipeline, and + * the build **Spack spec**, which identifies the compiler and version, + compiler flags, etc. + +For example, an entry for a build using a clang compiler with CUDA is: + +.. code-block:: bash + + ibm_clang_10_0_1_cuda_10_1_168: + variables: + SPEC: "+cuda cuda_arch=70 %clang@ibm.10.0.1 ^cuda@10.1.168" + extends: .build_and_test_on_lassen + +To update, change the corresponding spec item, such as clang compiler +or version, or cuda version. Then, update the label accordingly. + +It is important to note that the build spec information must reside in +the ``compilers.yaml`` and/or ``packages.yaml`` file for the system type +in the `radiuss-spack-configs `_ submodule. If the desired information is not there, +try updating the submodule to a newer version. If the information +is still not available, create a branch in the +`RADIUSS Spack Configs `_ repo, add the needed spec info, and create a pull request. + +.. important:: Build spec information used in RAJA Gitlab CI pipelines + must exist in the ``compilers.yaml`` file and/or + ``packages.yaml`` file for the appropriate system type in + the `RADIUSS Spack Configs `_ repo. + +Changing Build/Run Parameters +------------------------------ + +The commands executed to acquire resources on each +system/system-type on which we run Gitlab CI are defined in the +`RAJA/.gitlab-ci.yml `_ file. The default execution time for each test pipeline is +also defined in the file using the variable ``DEFAULT_TIME``. 
These +commands and settings can remain as is for the most part. + +However, sometimes a particular pipeline will take longer to build and +run than the default allotted time. In this case, the default time can +be adjusted in the build spec information in the associated +``-jobs.yml`` file discussed in the previous section. +For example: + +.. code-block:: bash + + xl_16_1_1_7_cuda: + variables: + SPEC: "+cuda %xl@16.1.1.7 cuda_arch=70 ^cuda@10.1.168 ^cmake@3.14.5" + DEFAULT_TIME: 60 + allow_failure: true + extends: .build_and_test_on_lassen + +This example explicitly sets the build and test allocation time to 60 minutes: +``DEFAULT_TIME: 60``. Note that it also allows the pipeline to fail: +``allow_failure: true``. We do this in some cases where certain tests are known +to fail regularly. This allows the overall check status to report as passing, +even though the test pipeline annotated this way may fail. + + +Adding Test Pipelines +--------------------- + +Adding a test pipeline involves adding a new entry in the +``RAJA/.gitlab-ci.yml`` file. + +.. important:: Build spec information used in RAJA Gitlab CI pipelines + must exist in the ``compilers.yaml`` file and/or + ``packages.yaml`` file for the appropriate system type in + the `RADIUSS Spack Configs `_ repo. + + +.. _azure_ci_tasks-label: + +================= +Azure CI Tasks +================= + +The tasks in this section apply to RAJA Azure Pipelines CI. + +Changing Builds/Container Images +--------------------------------------- + +The builds we run in Azure are defined in the `RAJA/azure-pipelines.yml `_ file. + +Linux/Docker +............ + +To update or add a new compiler / job to Azure CI we need to edit both ``azure-pipelines.yml`` and ``Dockerfile``. + +If we want to add a new Azure pipeline to build with ``compilerX``, then in ``azure-pipelines.yml`` we can add the job like so:: + + -job: Docker + ... + strategy: + matrix: + ... + compilerX: + docker_target: compilerX + +Here, ``compilerX:`` defines the name of a job in Azure. ``docker_target: compilerX`` defines a variable ``docker_target``, which is used to determine what part of the ``Dockerfile`` to run. + +In the ``Dockerfile`` we will want to add our section that defines the commands for the ``compilerX`` job.:: + + FROM ghcr.io/rse-ops/compilerX-ubuntu-20.04:compilerX-XXX AS compilerX + ENV GTEST_COLOR=1 + COPY . /home/raja/workspace + WORKDIR /home/raja/workspace/build + RUN cmake -DCMAKE_CXX_COMPILER=compilerX ... && \ + make -j 6 &&\ + ctest -T test --output-on-failure + +Each of our docker builds is built up on a base image maintained by RSE-Ops, a table of available base containers can be found `here `_. We are also able to add target names to each build with ``AS ...``. This target name correlates to the ``docker_target: ...`` defined in ``azure-pipelines.yml``. + +The base containers are shared across multiple projects and are regularly rebuilt. If bugs are fixed in the base containers the changes will be automatically propagated to all projects using them in their Docker builds. + +Check `here `_ for a list of all currently available RSE-Ops containers. Please see the `RSE-Ops Containers Project `_ on Github to get new containers built that aren't yet available. + +Windows / MacOs +............... + +We run our Windows / MacOS builds directly on the Azure virtual machine instances. In order to update the Windows / MacOS instance we can change the ``pool`` under ``-job: Windows`` or ``-job: Mac``:: + + -job: Windows + ... 
+ pool: + vmImage: 'windows-2019' + ... + -job: Mac + ... + pool: + vmImage: 'macOS-latest' + +Changing Build/Run Parameters +----------------------------- + +Linux/Docker +............ + +We can edit the build and run configurations of each docker build, in the ``RUN`` command. Such as adding CMake options or changing the parallel build value of ``make -j N`` for adjusting throughput. + +Each base image is built using `spack `_. For the most part the container environments are set up to run our CMake and build commands out of the box. However, there are a few exceptions where we need to ``spack load`` specific modules into the path. + + * **Clang** requires us to load LLVM for OpenMP runtime libraries.:: + + . /opt/spack/share/spack/setup-env.sh && spack load llvm + + **CUDA** for the cuda runtime.:: + + . /opt/spack/share/spack/setup-env.sh && spack load cuda + + **HIP** for the hip runtime and llvm-amdgpu runtime libraries.:: + + . /opt/spack/share/spack/setup-env.sh && spack load hip llvm-amdgpu + + **SYCL** requires us to run setupvars.sh:: + + source /opt/view/setvars.sh + +Windows / MacOS +............... + +Windows and MacOS build / run parameters can be configured directly in ``azure-pipelines.yml``. CMake options can be configured with ``CMAKE_EXTRA_FLAGS`` for each job. The ``-j`` value can also be edited directly in the Azure ``script`` definitions for each job. + +The commands executed to configure, build, and test RAJA for each +pipeline in Azure are located in the `RAJA/Dockerfile `_ file. +Each pipeline section begins with a line that ends with ``AS ...`` +where the ellipses in the name of a build-test pipeline. The name label +matches an entry in the Docker test matrix in the +``RAJA/azure-pipelines.yml`` file mentioned above. + + +.. _rajaperf_ci_tasks-label: + +================================ +RAJA Performance Suite CI Tasks +================================ + +The `RAJA Performance Suite `_ project CI +testing processes, directory/file structure, and dependencies are nearly +identical to that for RAJA, which is described in :ref:`ci-label`. Specifically, + + * The RAJA Performance Suite Gitlab CI process is driven by the + `RAJAPerf/.gitlab-ci.yml `_ file. + * The ``-jobs.yml`` and ``-templates.yml`` files reside + in the + `RAJAPerf/.gitlab `_ + directory. + * The ``build_and_test.sh`` script resides in the `RAJAPerf/scripts/gitlab `_ directory. + * The `RAJAPerf/Dockerfile `_ drives the Azure testing pipelines. + +The main difference is that for Gitlab CI, is that the Performance Suite uses +the RAJA submodules for ``uberenv`` and ``radiuss-spack-configs`` located in +the RAJA submodule to avoid redundant submodules. This is reflected in the +`RAJAPerf/.uberenv_config.json `_ +file which point at the relevant RAJA submodule locations. + +Apart from this minor difference, all CI maintenance and development tasks for +the RAJA Performance Suite follow the guidance in :ref:`ci_tasks-label`. diff --git a/docs/sphinx/dev_guide/contributing.rst b/docs/sphinx/dev_guide/contributing.rst index 8ac0dddd35..281da30f69 100644 --- a/docs/sphinx/dev_guide/contributing.rst +++ b/docs/sphinx/dev_guide/contributing.rst @@ -12,165 +12,209 @@ Contributing to RAJA ********************* -Since RAJA is a collaborative open source software project, we embrace -contributions from anyone who wants to add features or improve its existing -capabilities. 
This section describes basic processes to follow -for individuals outside of the core RAJA team to contribute new features or -bugfixes to RAJA. It assumes you are familiar with +RAJA is a collaborative open source software project and we encourage +contributions from anyone who wants to add features or improve its +capabilities. This section describes the following: + + * GitHub project access + * How to develop a RAJA *pull request* (PR) contribution. + * Requirements that must be met for a PR to be merged. + +We assume contributors are familiar with `Git `_, which we use for source code version control, and `GitHub `_, which is where our project is hosted. -This section describes development processes, such as: - - * Making a fork of the RAJA repository - * Creating a branch for development - * Creating a pull request (PR) - * Tests that your PR must pass before it can be merged into RAJA +.. important:: * Before a PR can be merged into RAJA, all test checks must pass + and the PR must be approved by at least one member of the + core RAJA team. + * Each RAJA contribution (feature, bugfix, etc.) must include + adequate tests, documentation, and code examples. The + *adequacy* of PR content, in this respect, is determined by + PR reviewers applying their professional judgment considering + the perspective of RAJA users and developers. + +======================= +GitHub Project Access +======================= + +RAJA maintains three levels of project access on it GitHub project: + + * **Core team members.** Individuals on the core RAJA team are frequent + RAJA contributors and participate regularly in project meetings, + discussions, and other project activities. They are members of + the LLNL GitHub organization and the ``RAJA-core`` GitHub team. Their + project privileges include the ability to create branches in the repository, + push code changes to the RAJA repo, make PRs, and merge them when they are + approved and all checks have passed. + * **Regular contributors.** Individuals, who are not on the core RAJA team, + but are members of the LLNL GitHub organization and are involved in some + aspects of RAJA development are considered regular contributors. They are + members of the ``RAJA-contrib`` GitHub team. Their project privileges + include the ability to create branches in the repository, push code changes + to the RAJA repo, and make PRs. However, they may not merge PRs and must + coordinate with the core team to have their work included in the develop + branch. This is mainly due to the way GitHub structures its project + access levels. + * **Everyone else.** Anyone with a GitHub account is welcome to contribute + to RAJA. Individuals outside of the two groups described above can make PRs + in the RAJA project, but must do so from a branch on a *fork* of + the RAJA repo. This is described below. + +======================= +Pull Request Process +======================= + +The following figure shows the basic elements of the RAJA PR contribution +workflow. Some details vary depending on RAJA GitHub project access level +of the contributor. The process involves four main steps: + + #. A RAJA contributor makes a PR on the RAJA GitHub project to merge a + branch on which she has developed a contribution into another RAJA branch, + typically, the develop branch. + #. When a PR is created, GitHub triggers Azure CI testing checks and + possibly Gitlab CI checks if the branch is part of the RAJA GItHub repo. 
+ Running and pass/fail status is reported back to GitHub where it can be + viewed and monitored. + #. Meanwhile, RAJA team members and other contributors review the PR, + suggesting changes and/or approving when they think it is ready to merge. + #. When all checks pass and the PR is approved, the PR may be merged. + +.. figure:: ./figures/PR-Workflow.png + + The four main steps in the RAJA pull request (PR) process, which are + common practices for many software projects. + +This PR process should be familiar to nearly everyone who contributes to +s software project. If you would like more information about pull requests, +GitHub has a good +`PR guide `_ +on PR basics. + +.. important:: When you create a RAJA PR, you should enter a description of + its contents in the *PR template* form the team maintains for + this purpose. A good PR summary includes a descriptive title + of the the bug you fixed or the feature you have added. Other + relevant details that will assist others in reviewing your + contribution should also be included. ============ Forking RAJA ============ -If you are not a member of the LLNL organization on GitHub and of -the core RAJA team of developers, then you do not have permission to create -a branch in the RAJA repository. This is due to the policy adopted by the LLNL -organization on GitHub in which the RAJA project resides. Fortunately, you may -still contribute to RAJA by `forking the RAJA repo +As noted earlier, if you are not a member of the core RAJA development team, +or a recognized RAJA contributor, then you do not have permission to create a +branch in the RAJA GitHub repository. This choice is due to policies enforced +by the LLNL organization on GitHub (in which the RAJA project resides) and the +Livermore Computing (LC) organization (in which we run our Gitlab CI testing). +Fortunately, you may still contribute to RAJA by `forking the RAJA repo `_. Forking creates a copy of the RAJA -repository that you own. You can push code changes on that copy to GitHub and -create a pull request in the RAJA project. +repository that you own. You can make changes on your local copy and push them +your fork on GitHub. When you are ready to have your RAJA contribution reviewed +ad added to the RAJA project, you may create a pull request in the RAJA project. -.. note:: A contributor who is not a member of the LLNL GitHub organization - and the core team of RAJA developers cannot create a - branch in the RAJA repo. However, anyone can create a fork of the - RAJA project and create a pull request in the RAJA project. +.. note:: A contributor who is not a member of the core RAJA development team, + or a recognized RAJA contributor, cannot create a branch in the RAJA + GitHub repo. However, anyone can create a fork of the + RAJA project and create a pull request based on the fork in the + RAJA project. -========================= -Developing RAJA Code -========================= +=============================== +Developing A RAJA Contribution +=============================== New features, bugfixes, and other changes are developed on a **feature branch.** -Each such branch should be based on the RAJA ``develop`` branch. For more -information on the branch development model used in RAJA, please see +Each such branch should be based on the most current RAJA ``develop`` branch. +For more information on the branch development model used in RAJA, please see :ref:`branching-label`. 
When you want to make a contribution, first ensure -you have an up-to-date copy of the ``develop`` branch locally: +you have a local, up-to-date copy of the ``develop`` branch by running the +following commands: .. code-block:: bash $ git checkout develop $ git pull origin develop + $ git submodule update --init --recursive ----------------------- -Developing a Feature ----------------------- - -Assuming you are on the develop branch in your local copy of the RAJA repo, -and the branch is up-to-date, the first step toward developing a RAJA feature -is to create a new branch on which to perform your development. For example: - -.. code-block:: bash - - $ git checkout -b feature/ - -Proceed to modify your branch by committing changes with reasonably-sized -work portions (i.e., *atomic commits*), and add tests that will exercise your -new code. If you are creating new functionality, please add documentation to -the appropriate section of the `RAJA User Guide `_. The source files for the RAJA documentation are maintained in -the ``RAJA/docs`` directory. - -After your new code is complete, you've tested it, and developed appropriate -documentation, you can push your branch to GitHub and create a PR in the RAJA -project. It will be reviewed by members of the RAJA team, who will provide -comments, suggestions, etc. After it is approved and all CI checks pass, your -contribution will be merged into the RAJA repository. - -.. important:: When creating a branch that you intend to be merged into the - RAJA repo, please give it a succinct name that clearly describes - the contribution. For example, **feature/** - for a new feature, **bugfix/** for a bugfix, etc. +Then, in your local copy, you will be on the current version of develop branch +with all RAJA submodules synchronized with that. --------------------- -Developing a Bug Fix --------------------- +----------------------------------- +Feature and Bugfix Contributions +----------------------------------- -Contributing a bugfix follows the same process as described above. Be sure to -indicate in the name of your branch that it is for a bugfix; for example: +Assuming you are on an up-to-date develop branch in your local copy of RAJA, +the first step toward developing a RAJA contribution is to create a new branch +on which to do your development and push it to the remote origin of your local +copy. For example: .. code-block:: bash - $ git checkout -b bugfix/ - -We recommend that you add a test that reproduces the issue you have found -and demonstrates that the issue is resolved. To verify that you have done -this properly, build the code for your branch and then run ``make test`` to -ensure that your new test passes. - -When you are done, push your branch to GitHub, then create a PR in the RAJA -project. - ------------------------ -Creating a Pull Request ------------------------ - -You can create a pull request (PR) -`here `_. GitHub has a good -`PR guide `_ on -PR basics if you want more information. Ensure that the base branch for your -PR is the ``develop`` branch of RAJA. + $ git checkout -b /feature/ + $ git push -When you create a RAJA PR, you must enter a description of the contents of the -PR. We have a *PR template* for this purpose for you to fill in. Be sure to add -a descriptive title explaining the bug you fixed or the feature you have added -and any other relevant details that will assist the RAJA team in reviewing your -contribution. +where ``/feature/`` is the name of your feature +branch. 
Or, -When a PR is created in RAJA, it will be run through our automated testing -processes and be reviewed by RAJA team members. When the PR passes all -tests and it is approved, a member of the RAJA team will merge it. - -.. note:: Before a PR can be merged into RAJA, all CI checks must pass and - the PR must be approved by a member of the core team. +.. code-block:: bash ------ -Tests ------ + $ git checkout -b /bugfix/ + $ git push -RAJA uses multiple continuous integration (CI) tools to test every pull -request. See :ref:`ci-label` for more information. +where ``/bugfix/`` is the name of your bugfix branch. -All RAJA tests are in the ``RAJA/test`` directory and are split into -*unit tests* and *functional tests*. Unit tests are intended to test basic -interfaces and features of individual classes, methods, etc. Functional tests -are used to test combinations of RAJA features. We have organized our -tests to make it easy to see what is being tested and easy to add new tests. -For example, tests for each programming model back-end are exercised using -the same common, parameterized test code to ensure back-end support is -consistent. +Proceed to modify your branch by committing changes with reasonably-sized +work portions (i.e., *atomic commits*), and add tests that will exercise your +new code, and examples and documentation, as needed. If you are creating new +functionality, please add documentation to the appropriate section of the +`RAJA Documentation `_. The source +files for the RAJA documentation are maintained in the ``RAJA/docs`` directory +of the source repository. Consider adding example code(s) that illustrate +usage of the new features you develop to help users and other developers +understand your addition. These should be placed in the ``RAJA/examples`` +directory and referenced in the RAJA User Guide as appropriate. + +After your work is complete, you've tested it, and developed appropriate +documentation, you can push your local branch to GitHub and create a PR in the +RAJA project to merge your work into the RAJA develop branch. It will be +reviewed by members of the RAJA team, who will provide comments, suggestions, +etc. + +As we stated earlier, not all required :ref:`ci-label` checks can be run on a +PR made from a branch in a fork of RAJA. When the RAJA team has agreed to +accept your work, it will be pulled into the RAJA GitHub repo +(see :ref:`prfromfork-label`). Then, it will run through all required testing +and receive final reviews and approvals. When it is approved and all CI test +checks pass, your contribution will be merged into the RAJA repository, most +likely the develop branch. -.. important:: Please follow the sub-directory structure and code implementation - pattern for existing tests in the ``RAJA/test`` directory when - adding or modifying tests. +.. important:: When creating a branch that you intend to be merged into the + RAJA repo, please give it a succinct name that clearly describes + the contribution. For example, + **username/feature/** for a new feature, + **username/bugfix/** for a bugfix, etc. .. 
_prfromfork-label: ------------------------------------------------------------ -Testing Pull Requests from Branches in Forked Repositories ------------------------------------------------------------ +=========================================================== +Accepting A Pull Request From A Forked Repository +=========================================================== -Due to LLNL security policies and RAJA project policies, only a PR created -by someone on the RAJA core development team will be run automatically -through all RAJA CI tools. In particular, a PR made from branch on a forked -repository will not trigger Gitlab CI checks. Gitlab CI on internal LLNL -platforms will only be run on PRs that are made from branches in the GitHub -RAJA repository. This may change in the future to make it easier to work with -PRs from contributors that are not members of the LLNL organization on GitHub. +Due to LLNL security policies, some RAJA pull requests will not be able to +be run through all RAJA CI tools. The Livermore Computing (LC) Center +Gitlab systems restrict which GitHub PRs may automatically run through its +CI test pipelines. For example, a PR made from branch on a forked repository +will not trigger Gitlab CI checks. Gitlab CI on LC platforms will be run only +on PRs that are made from branches in the GitHub RAJA repository. +See :ref:`ci-label` for more information about RAJA PR testing. -.. note:: **RAJA core team members:** +.. note:: **The following process for accepting PR contributions from a fork + of the RAJA repo must be executed by a member of the RAJA team:** To facilitate testing contributions in PRs from forked repositories, we maintain a script to pull a PR branch from a forked repo into the - RAJA repo. First, identify the number of the PR. Then, run the - script from the top-level RAJA directory:: + RAJA repo. First, identify the number of the PR, which appears at + the top of your PR. Then, run a script from the top-level RAJA + directory:: $ ./scripts/make_local_branch_from_fork_pr -b diff --git a/docs/sphinx/dev_guide/figures/PR-Workflow.png b/docs/sphinx/dev_guide/figures/PR-Workflow.png new file mode 100644 index 0000000000..13d5853e9f Binary files /dev/null and b/docs/sphinx/dev_guide/figures/PR-Workflow.png differ diff --git a/docs/sphinx/dev_guide/figures/RAJA-Gitlab-Files.png b/docs/sphinx/dev_guide/figures/RAJA-Gitlab-Files.png new file mode 100644 index 0000000000..1ee658bdaa Binary files /dev/null and b/docs/sphinx/dev_guide/figures/RAJA-Gitlab-Files.png differ diff --git a/docs/sphinx/dev_guide/figures/RAJA-Gitlab-Workflow2.png b/docs/sphinx/dev_guide/figures/RAJA-Gitlab-Workflow2.png new file mode 100644 index 0000000000..2985739683 Binary files /dev/null and b/docs/sphinx/dev_guide/figures/RAJA-Gitlab-Workflow2.png differ diff --git a/docs/sphinx/dev_guide/git-workflow-gitflow2.png b/docs/sphinx/dev_guide/figures/git-workflow-gitflow2.png similarity index 100% rename from docs/sphinx/dev_guide/git-workflow-gitflow2.png rename to docs/sphinx/dev_guide/figures/git-workflow-gitflow2.png diff --git a/docs/sphinx/dev_guide/index.rst b/docs/sphinx/dev_guide/index.rst index 664fe31e47..bac04460e6 100644 --- a/docs/sphinx/dev_guide/index.rst +++ b/docs/sphinx/dev_guide/index.rst @@ -12,17 +12,19 @@ RAJA Developer Guide #################### -The RAJA Developer Guide is a work-in-progress.... - -This guide documents key software development processes used by the RAJA -project so that they are understood and uniformly applied by contributors. 
+The RAJA Developer Guide documents software development processes +followed by the RAJA project. The main goal of the guide is to ensure +all project contributors understand the key elements of the processes so +that they are consistently applied. .. toctree:: :maxdepth: 1 contributing - ci - build_configurations branch_development + build_configurations + ci + ci_tasks + tests release_process - semantic_versioning + versioning diff --git a/docs/sphinx/dev_guide/release_process.rst b/docs/sphinx/dev_guide/release_process.rst index 2886cd06c7..21196e2f76 100644 --- a/docs/sphinx/dev_guide/release_process.rst +++ b/docs/sphinx/dev_guide/release_process.rst @@ -12,7 +12,14 @@ RAJA Release Process ******************************************* -The RAJA release process typically involves the following sequence of steps: +RAJA is considered part of the **RAJA Portability Suite** set of projects. +Currently, the Suite includes `Umpire `_, `CHAI `_, and `camp `_, in addition to RAJA. + +.. important:: Releases for the Suite are coordinated, meaning that when a + non-patch release is done for one, a new version release is + done for all Suite projects. + +The RAJA release process includes the following sequence of steps: #. Identify all work (features in development, outstanding PRs, etc.) to be to be included in the release. @@ -24,19 +31,19 @@ The RAJA release process typically involves the following sequence of steps: into the **main branch.** When it is approved and all CI checks pass, merge the release candidate branch into the RAJA main branch. #. On GitHub, make a new release with a tag for the release. Following our - convention, the tag label should have the format ``vMM.mm.pp``. See - :ref:`semver-label` for a description of the version numbering scheme we + convention, the tag label should have the format ``YYYY.mm.pp``. See + :ref:`version-label` for a description of the version numbering scheme we use. In the GitHub release description, please note key features, bugfixes, etc. in the release. These should be a high-level summary of the contents of the ``RELEASE_NOTES.md`` file in the RAJA repo, which may contain more detailed information. Also, add a note to the release description to remind users to download the gzipped tarfile for the release instead of the assets GitHub creates for the release. - The GitHub-created assets do not contain the RAJA submodules and will + The GitHub-created assets do not contain the RAJA submodules and may cause issues for users as a result. - .. important:: For consistency, please follow a similar description - pattern for all RAJA releases. + .. important:: For consistency, please follow a similar release + description pattern for all RAJA releases. #. Check out the main branch locally and make sure it is up-to-date. Then, generate the release tarfile by running the script @@ -47,8 +54,12 @@ The RAJA release process typically involves the following sequence of steps: #. Edit the release in GitHub and upload the tarfile to the release. #. Make a PR to merge the main branch into the develop branch. After it passes all CI checks and is approved, merge the PR. This will ensure that - all changes done to finalize the release will not be lost in future - changes to the develop branch. + all changes done to finalize the release will be included in the develop + branch and future work on that branch. + +After a RAJA release is done, there a other tasks that typically need to be +performed to update content in other projects. 
These task are described in +:ref:`post_release-label`. .. _rcbranch-label: @@ -100,9 +111,8 @@ Hotfix Branch =========================== *Hotfix* branches are used in the (hopefully!) rare event that a bug is found -shortly after a release and which has the potential to negatively impact RAJA -users. A hotfix branch is used to address the issue and make a new release -containing only the fixed code. +shortly after a release that may negatively impact RAJA users. A hotfix branch +will address the issue be merged into both develop and main branches. A hotfix branch is *made from main* with the name **hotfix/**. The issue is fixed (hopefully quickly!) and the release notes file is updated on @@ -114,17 +124,19 @@ similar to the process described in :ref:`release-label`. For completeness, the key steps for performing a hotfix release are: #. Make a **hotfix** branch from main for a release (hotfix/), fix the - issue on the branch and verify, testing against user code if necessary, - and update the release notes file as needed. + issue on the branch and verify, testing against user code if necessary. + Update the release notes and RAJA patch version number as described + in :ref:`rcbranch-label`. #. When the hotfix branch is ready, make a PR for it to be merged into the **main branch.** When that is approved and all CI checks pass, merge it into the RAJA main branch. #. On GitHub, make a new release with a tag for the release. Following our - convention, the tag label should have the format ``vMM.mm.ppp``. In the - GitHub release description, note that the release is a bugfix release - and describe the issue that is resolved. Also, add a note to the release - description to download the gzipped tarfile for the release rather than - one of the assets GitHub creates as part of the release. + convention, the tag label should have the format ``YYYY.mm.pp``, where + only the **patch** portion of the release tag should differ from the + last release. In the GitHub release description, note that the release + is a bugfix release and describe the issue that is resolved. Also, add + a note to the release description to download the gzipped tarfile for the + release rather than the assets GitHub creates as part of the release. #. Check out the main branch locally and make sure it is up-to-date. Then, generate the tarfile for the release by running the script ``./scripts/make_release_tarball.sh`` from the top-level RAJA directory. @@ -134,3 +146,32 @@ the key steps for performing a hotfix release are: #. Make a PR to merge the main branch into the develop branch. After it passes all CI checks and is approved, merge the PR. This will ensure that changes for the bugfix will be included in future development. + +.. _post_release-label: + +========================= +Post-release Activities +========================= + +After a RAJA release is complete, other tasks are performed to update content +in other repositories, typically. These tasks include: + + * Update the `RAJAProxies `_ project + to the newly RAJA Portability Suite projects. This typically consists of + updating the submodules to the new RAJA Portability Suite project + versions, making sure the proxy-apps build and run correctly. When this + is done, tag a release for proxy-app project. + * Update the RAJA Spack package in the + `Spack repository `_. This requires some + knowledge of Spack and attention to details and Spack conventions. Please + see :ref:`spack_package-label` for details. + +.. 
_spack_package-label: + +========================= +Spack Package Update +========================= + +Describe how to update the RAJA Spack package.... + + diff --git a/docs/sphinx/dev_guide/tests.rst b/docs/sphinx/dev_guide/tests.rst new file mode 100644 index 0000000000..5d401bbb43 --- /dev/null +++ b/docs/sphinx/dev_guide/tests.rst @@ -0,0 +1,299 @@ +.. ## +.. ## Copyright (c) 2016-22, Lawrence Livermore National Security, LLC +.. ## and RAJA project contributors. See the RAJA/LICENSE file +.. ## for details. +.. ## +.. ## SPDX-License-Identifier: (BSD-3-Clause) +.. ## + +.. _tests-label: + +*************************** +RAJA Tests +*************************** + +As noted in :ref:`ci-label`, all RAJA test checks must pass before any PR +contribution will be merged. Additionally, we recommend that contributors +include new tests in their code contributions when adding new features +and bug fixes. + +.. note:: If RAJA team members think adequate testing is not included in a + PR branch, they will ask for additional testing to be added during + the review process. + +.. _tests_organization-label: + +========================= +Test Organization +========================= + +The goals of the RAJA test organization are to: + + * Make it easy to see what is tested and where tests live. We want + developers and users to be able to find tests easily and know where + to put new tests when they add them. + * Parameterize tests as much as reasonable to ensure that features work with + all supported RAJA back-ends and we are testing them consistently. We want + the source files for each test case to allow testing of each RAJA back-end. + Specifically, tests for each back-end are generated by instantiating the + same source routines with different type information. + * Have test source code generated for compilation by CMake when the code is + configured. This significantly reduces code redundancy and enables our + test parameterization goals. + +.. important: RAJA uses the `GoogleTest `_ framework, which is included in the `BLT `_ build system that RAJA uses. + +All RAJA tests reside in the +`RAJA/test `_ directory. +The test directory structure looks like this:: + + RAJA/test/functional/forall + kernel + scan + ... + include/... + integration/... + unit/algorithm + atomic + index + ... + +RAJA tests are partitioned into three main categories: + + * **Unit tests** exercise basic interfaces and features of individual RAJA + classes and methods in standalone fashion; i.e., integrated with other + parts of RAJA as minimally as is reasonable. RAJA unit tests reside + in sub-directories of the `RAJA/test/unit `_ directory. + * **Functional tests** integrate multiple RAJA features in common ways to + test how RAJA is used in practice. RAJA functional tests reside + in sub-directories of the `RAJA/test/functional `_ directory. + * **Integration tests** exercise features that integrate RAJA with other + libraries, such as Kokkos performance tools as plug-ins. RAJA integration + tests reside in sub-directories of the `RAJA/test/integration `_ directory. + +The `RAJA/test/include `_ directory contains header files that define types and other items that are +commonly used in various tests. + +.. important:: Please follow the existing sub-directory structure and code + implementation patterns for RAJA tests when adding or modifying + tests. + +.. 
_tests_anatomy-label: + +========================= +Anatomy Of A Test Case +========================= + +This section discusses in some detail the structure of files for a single +RAJA test case and how the work together. In particular, we describe the set +of basic tests that exercise ``RAJA::forall`` execution with various RAJA +segment types. + +.. note:: The implementation pattern described in the following sections is + similarly used by all other RAJA tests. + +Since these tests integrate multiple RAJA features, it is considered a +*functional* test. The files for this test are located in the +`RAJA/test/functional/forall/segment `_ directory. The contents of the directory are:: + + $ ls -c1 -R ./test/functional/forall/segment + ./test/functional/forall/segment: + tests + test-forall-segment.cpp.in + CMakeLists.txt + + ./test/functional/forall/segment/tests: + test-forall-RangeStrideSegment.hpp + test-forall-RangeSegment.hpp + test-forall-ListSegment.hpp + +Next, we describe these and their relationships. + +.. _tests_source-label: + +Test Source File +----------------- + +The `test-forall-segment.cpp.in `_ file is the +parameterized test source file. It contains header file include statements:: + + // + // test/include headers + // + #include "RAJA_test-base.hpp" + #include "RAJA_test-camp.hpp" + #include "RAJA_test-index-types.hpp" + + #include "RAJA_test-forall-data.hpp" + #include "RAJA_test-forall-execpol.hpp" + + // + // Header for tests in ./tests directory + // + // Note: CMake adds ./tests as an include dir for these tests. + // + #include "test-forall-@SEGTYPE@.hpp" + +The first set of header files live in the ``RAJA/test/include`` directory +mentioned earlier. The headers are centrally located since their contents +are shared with other test files. The last include statement pulls in the +header file containing the parameterized tests for the corresponding RAJA +segment type. + +Next, a ``camp::cartesian_product`` type is defined to assemble sets of types +used in the parameterized tests:: + + // + // Cartesian product of types used in parameterized tests + // + using @BACKEND@ForallSegmentTypes = + Test< camp::cartesian_product>::Types; + +The first template argument defining the ``camp::cartesian_product object`` +type refers to a list of segment index types defined in the +`RAJA_test-index-types.hpp `_ header file. +The second argument refers to a list +of RAJA/camp resource types appropriate for the RAJA execution back-end defined +in the `RAJA_test-camp.hpp `_ header file (see :ref:`tests_header-label` for +where this is used). The third argument refers to a list of RAJA +execution policy types defined in the +`RAJA_test-forall-execpol.hpp `_ +header file. This results in the generation of a combinatorial collection of +typed tests being run. Each test is defined by a unique tuple of types, +described in :ref:`tests_header-label`. + +Lastly, the parameterized set of tests is instantiated:: + + // + // Instantiate parameterized test + // + INSTANTIATE_TYPED_TEST_SUITE_P(@BACKEND@, + Forall@SEGTYPE@Test, + @BACKEND@ForallSegmentTypes); + +``INSTANTIATE_TYPED_TEST_SUITE_P`` is a GoogleTest macro. The first +argument is a label noting the RAJA back-end used for the generated tests. +This can be used to filter the tests when they are manually run. +The second argument is a label identifying the test set, and the +third argument matches the CMake generated name for the +``camp::cartesian_product`` type described above. + +.. 
important:: The second argument passed to the + ``INSTANTIATE_TYPED_TEST_SUITE_P`` macro must match the name of + the test suite class discussed in :ref:`tests_header-label`. + +.. _tests_cmakelists-label: + +CMakeLists.txt File +-------------------- + +The concrete version of each of the items described above is generated by +CMake when a RAJA build is configured. CMake fills in the segment type and +back-end identifiers, ``@SEGTYPE@`` and ``@BACKEND@``, respectively. These +identifiers and the test file and executable generation process is defined in +the +`CMakeLists.txt `_ file in the test directory. If you look in the file, +you will see nested loops over RAJA back-ends and segment types which +process the test source file ``test-forall-segment.cpp.in`` multiple times +to create a uniquely named source file for each back-end/segment type +combination in the RAJA build space. Each source file will be compiled into +a similarly named, unique test executable when the code is compiled. + +.. _tests_header-label: + +Test Header files +-------------------- + +Recall the line in the test source file:: + + #include "test-forall-@SEGTYPE@.hpp" + +This identifies the header file containing the actual test code used to +generate the tests. The test header files are located in the +`RAJA/test/functional/forall/segment/tests `_ directory. The main elements of +each test header file are described next. We use the +`test-forall-RangeSegment.hpp `_ file to +illustrate the essential test implementation elements. + +The file contains the following important items: + + * test implementation method + * typed test suite class + * typed test invocation + * type test suite registration + +The test implementation is contained in a parameterized template method:: + + template + void ForallRangeSegmentTestImpl(INDEX_TYPE first, INDEX_TYPE last) + { + ... + } + +Here, the template parameters identify the index type of the RAJA +segment ``INDEX_TYPE``, the resource type for allocating test memory in the +proper execution environment ``WORKING_RES``, and the execution policy +``EXEC_POLICY`` for the ``RAJA::forall`` method used to run the tests. + +The test suite class plugs into the GoogleTest framework:: + + TYPED_TEST_SUITE_P(ForallRangeSegmentTest); + template + class ForallRangeSegmentTest : public ::testing::Test + { + }; + +using the ``TYPED_TEST_SUITE_P`` GoogleTest macro. + +.. important:: The name of the test class must be identical to the label passed + to the GoogleTest ``TYPED_TEST_SUITE_P`` macro. + +The specific tests that are run are defined by calls to the test implementation +template method ``ForallRangeSegmentTestImpl`` described above:: + + TYPED_TEST_P(ForallRangeSegmentTest, RangeSegmentForall) + { + using INDEX_TYPE = typename camp::at>::type; + using WORKING_RES = typename camp::at>::type; + using EXEC_POLICY = typename camp::at>::type; + + // test zero-length range segment + ForallRangeSegmentTestImpl(INDEX_TYPE(3), INDEX_TYPE(3)); + + ForallRangeSegmentTestImpl(INDEX_TYPE(0), INDEX_TYPE(27)); + ForallRangeSegmentTestImpl(INDEX_TYPE(1), INDEX_TYPE(2047)); + ForallRangeSegmentTestImpl(INDEX_TYPE(1), INDEX_TYPE(32000)); + + runNegativeTests(); + } + +Here, ``TYPED_TEST_P`` is a GoogleTest macro defining the method for +executing the tests. Note that the first three lines +in the method extract the template parameter types from the ``camp::tuple`` +produced by the ``camp::cartesian_product`` described earlier in +:ref:`tests_source-label`. 
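For orientation, the following is a minimal, self-contained GoogleTest sketch
of the same implement/register/instantiate pattern outside of the RAJA build
system. It is not RAJA code: the ``SimpleRangeTest`` names and the hand-written
type list are hypothetical stand-ins for the pieces that CMake and
``camp::cartesian_product`` generate in RAJA::

  #include "gtest/gtest.h"
  #include <cstddef>

  // Stand-in for a parameterized test implementation method.
  template <typename INDEX_TYPE>
  void SimpleRangeTestImpl(INDEX_TYPE first, INDEX_TYPE last)
  {
    INDEX_TYPE count = 0;
    for (INDEX_TYPE i = first; i < last; ++i) { ++count; }
    ASSERT_EQ(count, last - first);
  }

  // Typed test suite class; its name must match the labels used below.
  template <typename T>
  class SimpleRangeTest : public ::testing::Test {};

  TYPED_TEST_SUITE_P(SimpleRangeTest);

  // Test body; TypeParam is the current type from the instantiated type list.
  TYPED_TEST_P(SimpleRangeTest, RangeCount)
  {
    SimpleRangeTestImpl<TypeParam>(TypeParam(0), TypeParam(10));
  }

  REGISTER_TYPED_TEST_SUITE_P(SimpleRangeTest, RangeCount);

  // In RAJA, this type list is produced by camp::cartesian_product;
  // here it is written out by hand.
  using SimpleIndexTypes = ::testing::Types<int, long, std::size_t>;
  INSTANTIATE_TYPED_TEST_SUITE_P(Host, SimpleRangeTest, SimpleIndexTypes);

The correspondence between the test suite class name and the labels passed to
the macros is the same one called out in the notes above.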
If you look in the file, you will see an example of +how we use C++ SFINAE to exclude running tests with negative index values +for index types that are unsigned. + +.. important:: * The label passed as the first argument to the GoogleTest + ``TYPED_TEST_P`` macro must match the name of the test suite + class. The second argument is discussed below. + * It is critical to use the same type ordering when extracting + the types that was used when the ``camp::cartesian_product`` + type was defined in the test source file, described in + :ref:`tests_source-label`. + +Lastly, the test suite is registered with GoogleTest using the +``REGISTER_TYPED_TEST_SUITE_P`` macro:: + + REGISTER_TYPED_TEST_SUITE_P(ForallRangeSegmentTest, + RangeSegmentForall); + +.. important:: * The label passed as the first argument to the GoogleTest + ``REGISTER_TYPED_TEST_SUITE_P`` macro must match the name of + the test suite class. + * The label passed as the second argument to the GoogleTest + ``REGISTER_TYPED_TEST_SUITE_P`` macro must match the label + passed as the second argument to the ``TYPED_TEST_P`` macro. diff --git a/docs/sphinx/dev_guide/semantic_versioning.rst b/docs/sphinx/dev_guide/versioning.rst similarity index 68% rename from docs/sphinx/dev_guide/semantic_versioning.rst rename to docs/sphinx/dev_guide/versioning.rst index 3e77d2e340..62df10c2b9 100644 --- a/docs/sphinx/dev_guide/semantic_versioning.rst +++ b/docs/sphinx/dev_guide/versioning.rst @@ -6,21 +6,39 @@ .. ## SPDX-License-Identifier: (BSD-3-Clause) .. ## -.. _semver-label: +.. _version-label: -*********************** +**************************** +RAJA Release Version Naming +**************************** + +Prior to the RAJA release in March 2022, the RAJA project used the *Semantic +Versioning* scheme for assigning release tag names. At the March 2022 release, +we changed the release naming scheme to use ``YYYY.mm.pp``, for year, month, +and patch number. So, for example, the March 2022 release is labeled v2022.03.0.The main motivation for the release naming scheme is to do coordinated releases +with the `Umpire `_, +`CHAI `_, and +`camp `_ projects, which are considered parts +of the **RAJA Portability Suite**. In a coordinated release, all the projects +will have the same release name. If a project requires a patch release between +coordinated releases, it will indicate that by incrementing the patch number; +for example, v2022.03.1. + +The following sections describe the Semantic Versioning scheme for reference +and posterity. + +==================== Semantic Versioning -*********************** +==================== -The RAJA project uses the *semantic* versioning scheme for assigning -release numbers. Semantic versioning is a methodology for assigning a version -number to a software release in a way that conveys specific meaning about -code modifications from version to version. +Semantic versioning is a +methodology for assigning a version number to a software release in a way that +conveys specific meaning about code modifications from version to version. See `Semantic Versioning `_ for a more detailed description. 
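For application code that needs to react to the RAJA version it is built
against, the version number is also available programmatically. The sketch
below assumes the ``RAJA_VERSION_MAJOR``, ``RAJA_VERSION_MINOR``, and
``RAJA_VERSION_PATCHLEVEL`` macros from ``RAJA/config.hpp``; check the header
in your RAJA install for the exact names::

  #include "RAJA/config.hpp"
  #include <iostream>

  int main()
  {
  #if defined(RAJA_VERSION_MAJOR)
    // Under the calendar scheme, "major" is the release year and
    // "minor" is the release month (e.g., 2022 and 3 for v2022.03.0).
    std::cout << "Built against RAJA "
              << RAJA_VERSION_MAJOR << "."
              << RAJA_VERSION_MINOR << "."
              << RAJA_VERSION_PATCHLEVEL << std::endl;
  #endif
    return 0;
  }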
-============================ -Version Numbers and Meaning -============================ +------------------------------------- +Semantic Version Numbers and Meaning +------------------------------------- Semantic versioning is based on a three part version number `MM.mm.pp`: @@ -41,9 +59,9 @@ Semantic versioning is based on a three part version number `MM.mm.pp`: number is always changed when a hotfix branch is merged into main, or when changes are made to main that only contain bug fixes. -=========================================== -What Does a Change in Version Number Mean? -=========================================== +----------------------------------------------------- +What Does a Change in Semantic Version Number Mean? +----------------------------------------------------- A key consideration in meaning for these three version numbers is that the software has a public API. Changes to the API or code functionality diff --git a/docs/sphinx/user_guide/config_options.rst b/docs/sphinx/user_guide/config_options.rst index c120e51761..89487f7874 100644 --- a/docs/sphinx/user_guide/config_options.rst +++ b/docs/sphinx/user_guide/config_options.rst @@ -22,9 +22,7 @@ their defaults. RAJA Option Types ============================= -Nearly all Cmake options used in RAJA contain the prefix ``RAJA_`` to give -users flexibility to enable/disable individual compilation features for RAJA, -specifically. RAJA contains two types of options, those that exist in +RAJA contains two types of options, those that exist in RAJA only and those that are similar to standard CMake options or options provided by BLT; i.e., *dependent options* in CMake terminology. RAJA dependent option names are the same as the associated CMake and BLT option @@ -34,16 +32,18 @@ names, but with the ``RAJA_`` prefix added. options that can be controlled with CMake or BLT variants. * Dependent options are typically used for *disabling* features. - For example, providing the option ``-DRAJA_ENABLE_TESTS=Off`` - to CMake will disable compilation of RAJA tests, even if the - option ``-DENABLE_TESTS=On`` is also provided. + For example, when the CMake option ``-DENABLE_TESTS=On`` is + used to enable tests in the build of an application that includes + multiple CMake-based package builds, providing the CMake option + ``-DRAJA_ENABLE_TESTS=Off`` will disable compilation of RAJA + tests, while compiling them for other packages. * We recommend using the option names without the ``RAJA_`` prefix, when available, to enable features at compile time to avoid potential undesired behavior. For example, passing the option ``-DRAJA_ENABLE_CUDA=On`` to CMake will not enable CUDA because ``ENABLE_CUDA`` is off by default. So to enable CUDA, you need - to pass the ``-DENABLE_CUDA=On`` option to Cmake. + to pass the ``-DENABLE_CUDA=On`` option to CMake. ======================= Setting Options @@ -74,26 +74,34 @@ need to do that using appropriate CMake variables. All RAJA options are set like regular CMake variables. RAJA settings for default options, compilers, flags for optimization, etc. can be found in files in the ``RAJA/cmake`` directory and top-level ``CMakeLists.txt`` file. -Configuration variables can be set by passing -arguments to CMake on the command line when CMake is called, or by setting -options in a CMake *cache file* and passing that file to CMake using the -CMake ``-C`` options. 
For example, to enable RAJA OpenMP functionality,
-pass the following argument to CMake::
+Configuration variables can be set by passing arguments to CMake on the
+command line when calling CMake. For example, to enable RAJA OpenMP
+functionality, pass the following argument to CMake::
-  -DENABLE_OPENMP=On
+  cmake ... \
+  -DENABLE_OPENMP=On \
+  ...
-The RAJA repository contains a collection of CMake cache files
-(we call them *host-config* files) that may be used as a guide for users trying
-to set their own options. See :ref:`configopt-raja-hostconfig-label`.
+Alternatively, CMake options may be set in a CMake *cache file*, which is
+then passed to CMake using the CMake ``-C`` option; for example::
-Next, we summarize RAJA options and their defaults.
+  cmake ... \
+  -C my_cache_file.cmake \
+  ...
+
+The directories ``RAJA/scripts/*-builds`` contain scripts that run CMake for
+various build configurations. These contain CMake invocations that use CMake
+cache files (we call them *host-config* files) and may be used as a guide for
+users trying to set their own options.
+
+Next, we summarize RAJA CMake options and their defaults.
.. _configopt-raja-features-label:
====================================
-Available RAJA Options and Defaults
-====================================
+==========================================
+Available RAJA CMake Options and Defaults
+==========================================
RAJA uses a variety of custom variables to control how it is compiled. Many
of these are used internally to control RAJA compilation and do
@@ -113,7 +121,8 @@ build process for all of the code.
The following tables describe which variables set RAJA options and their
default settings:
-* **Examples, tests, warnings, etc.**
+Examples, tests, warnings, etc.
+--------------------------------
CMake variables can be used to control whether RAJA tests, examples,
tutorial exercises, etc. are built when RAJA is compiled.
========================= ========================================= (RAJA_)ENABLE_TESTS On (RAJA_)ENABLE_EXAMPLES On - (RAJA_)ENABLE_BENCHMARKS Off - (RAJA_)ENABLE_COVERAGE Off (supported for GNU compilers only) RAJA_ENABLE_EXERCISES On + (RAJA_)ENABLE_BENCHMARKS Off RAJA_ENABLE_REPRODUCERS Off + (RAJA_)ENABLE_COVERAGE Off (supported for GNU compilers only) ========================= ========================================= -RAJA can also be configured to build with compiler warnings reported as -errors, which may be useful to make sure your application builds cleanly: +Other configuration options are available to specialize how RAJA is compiled: - ================================ ====================== - Variable Default - ================================ ====================== - (RAJA_)ENABLE_WARNINGS_AS_ERRORS Off - ================================ ====================== + ================================== ========================= + Variable Default + ================================== ========================= + (RAJA_)ENABLE_WARNINGS_AS_ERRORS Off + RAJA_ENABLE_FORCEINLINE_RECURSIVE On (Intel compilers only) + RAJA_ALLOW_INCONSISTENT_OPTIONS Off + ================================== ========================= RAJA Views/Layouts may be configured to check for out of bounds -indexing at runtime: +indexing at run time: ========================= ====================== Variable Default @@ -147,11 +157,30 @@ indexing at runtime: RAJA_ENABLE_BOUNDS_CHECK Off ========================= ====================== -Note that RAJA bounds checking is a runtime check and will add -considerable execution time overhead. Thus, this feature should only be -used for correctness checking and should be disabled for production builds. - -* **Programming model back-end support** +.. note:: RAJA bounds checking is a run time check and will add considerable + execution time overhead. Thus, this feature should only be used for + debugging and correctness checking and should be disabled for + production builds. + +RAJA Features +------------------- + +Some RAJA features are enabled by RAJA-specific CMake variables. + + =========================== ======================================= + Variable Meaning + =========================== ======================================= + RAJA_ENABLE_RUNTIME_PLUGINS Enable support for dynamically loaded + RAJA plugins. Default is off. + RAJA_ENABLE_DESUL_ATOMICS Replace RAJA atomic implementations + with Desul variants at compile-time. + Default is off. + RAJA_ENABLE_VECTORIZATION Enable SIMD/SIMT intrinsics support. + Default is on. + =========================== ======================================= + +Programming model back-end support +------------------------------------- Variables that control which RAJA programming model back-ends are enabled are as follows (names are descriptive of what they enable): @@ -159,11 +188,11 @@ are as follows (names are descriptive of what they enable): ========================== ============================================ Variable Default ========================== ============================================ - (RAJA_)ENABLE_OPENMP On + (RAJA_)ENABLE_OPENMP Off (RAJA_)ENABLE_CUDA Off + RAJA_ENABLE_CLANG_CUDA Off (RAJA_)ENABLE_HIP Off - RAJA_ENABLE_TARGET_OPENMP Off (when on, (RAJA_)ENABLE_OPENMP must - also be on!) 
+ RAJA_ENABLE_TARGET_OPENMP Off (when on, ENABLE_OPENMP must also be on) RAJA_ENABLE_TBB Off RAJA_ENABLE_SYCL Off ========================== ============================================ @@ -180,37 +209,50 @@ Other programming model specific compilation options are also available: CUDA_ARCH sm_35 (based on hardware support) RAJA_ENABLE_EXTERNAL_ROCPRIM Off RAJA_ENABLE_ROCTX Off - RAJA_ENABLE_HIP_INDIRECT_FUNCTION_CALL Off (enables device function - pointers in HIP back-end) ====================================== ================================= Turning the ``(RAJA_)ENABLE_CLANG_CUDA`` variable on will build CUDA code with the native support in the Clang compiler. -The ``RAJA_ENABLE_EXTERNAL_CUB`` variable is used to enable use of an -external install of the NVIDIA CUB support library. When Off, the CUB -library included in the CUDA toolkit will still be used, if available. -Starting with CUDA 11, CUB is installed as part of the CUDA toolkit and -the NVIDIA Thrust library requires that install of CUB. We recommended -projects use the CUB included with the CUDA toolkit for compatibility with -Thrust and applications using Thrust. Users should take note of the CUB -install used by RAJA to ensure they use the same include directories when -configuring their applications. - -The ``RAJA_ENABLE_EXTERNAL_ROCPRIM`` variable is used to enable use of an -external install of the AMD rocPRIM support library. When Off, the -rocPRIM library included in the ROCm install will be used, when available. -We recommend projects use the rocPRIM included with the ROCm install when -available. Users should take note of the rocPRIM install used by RAJA to -ensure they use the same include directories when configuring their -applications. - .. note:: See :ref:`getting_started-label` for more information about - setting other options for RAJA back-ends. + using the ``RAJA_ENABLE_EXTERNAL_CUB`` and + ``RAJA_ENABLE_EXTERNAL_ROCPRIM`` variables, as well other + RAJA back-ends. + +Timer Options +-------------- + +RAJA provides a simple portable timer class that is used in RAJA +example codes to determine execution timing and can be used in other apps +as well. This timer can use any of three internal timers depending on +your preferences, and one should be selected by setting the 'RAJA_TIMER' +variable. -* **Data types, sizes, alignment, etc.** + ====================== ====================== + Variable Values + ====================== ====================== + RAJA_TIMER chrono (default), + gettime, + clock + ====================== ====================== + +What these variables mean: -RAJA provides type aliases that can be used to parameterize floating + ============================= ======================================== + Value Meaning + ============================= ======================================== + chrono Use the std::chrono library from the + C++ standard library + gettime Use `timespec` from the C standard + library time.h file + clock Use `clock_t` from time.h + ============================= ======================================== + +Data types, sizes, alignment, etc. +------------------------------------- + +The options discussed in this section are typically not needed by users. +They are provided for special cases when users want to parameterize floating point types in applications, which makes it easier to switch between types. .. note:: RAJA data types in this section are provided as a convenience to @@ -297,35 +339,8 @@ in units of **bytes**. 
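As a small illustration of how the aliases in this section are typically used,
the sketch below assumes the ``RAJA::Real_type``, ``RAJA::Real_ptr``, and
``RAJA::Index_type`` aliases declared in ``RAJA/include/RAJA/util/types.hpp``;
a routine written against them picks up whichever floating point type was
selected when RAJA was configured::

  #include "RAJA/RAJA.hpp"

  // Real_type resolves to float or double depending on the
  // floating point type chosen at RAJA configure time.
  void axpy(RAJA::Real_type a, RAJA::Real_ptr x, RAJA::Real_ptr y,
            RAJA::Index_type N)
  {
    RAJA::forall<RAJA::seq_exec>(
      RAJA::TypedRangeSegment<RAJA::Index_type>(0, N),
      [=] (RAJA::Index_type i) {
        y[i] += a * x[i];
      });
  }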
For details on how the options in this section are used, please see the
header file ``RAJA/include/RAJA/util/types.hpp``.
-* **Timer Options**
-
-RAJA provides a simple portable timer class that is used in RAJA
-example codes to determine execution timing and can be used in other apps
-as well. This timer can use any of three internal timers depending on
-your preferences, and one should be selected by setting the 'RAJA_TIMER'
-variable.
-
-  ====================== ======================
-  Variable               Values
-  ====================== ======================
-  RAJA_TIMER             chrono (default)
-                         gettime
-                         clock
-  ====================== ======================
-
-What these variables mean:
-
-  ============================= ========================================
-  Value                         Meaning
-  ============================= ========================================
-  chrono                        Use the std::chrono library from the
-                                C++ standard library
-  gettime                       Use `timespec` from the C standard
-                                library time.h file
-  clock                         Use `clock_t` from time.h
-  ============================= ========================================
-
-* **Other RAJA Features**
+Other RAJA Features
+-------------------
RAJA contains some features that are used mainly for development or may
not be of general interest to RAJA users. These are turned off by default.
They are described here for reference and completeness.
@@ -340,10 +355,6 @@ They are described here for reference and completeness.
                               tolerance enabled run (e.g., number of faults
                               detected, recovered from, recovery overhead,
                               etc.)
-  RAJA_ENABLE_RUNTIME_PLUGINS  Enable support for dynamically loaded
-                               RAJA plugins.
-  RAJA_ENABLE_DESUL_ATOMICS    Replace RAJA atomic implementations
-                               with desul variants at compile-time.
  ===========================  =======================================
diff --git a/docs/sphinx/user_guide/feature/atomic.rst b/docs/sphinx/user_guide/feature/atomic.rst
index 20799be569..64f3241ad5 100644
--- a/docs/sphinx/user_guide/feature/atomic.rst
+++ b/docs/sphinx/user_guide/feature/atomic.rst
@@ -6,20 +6,45 @@
.. ##
.. ## SPDX-License-Identifier: (BSD-3-Clause)
.. ##
-.. _atomics-label:
+.. _feat-atomics-label:
-========
-Atomics
-========
+===================
+Atomic Operations
+===================
RAJA provides portable atomic operations that can be used to update values
at arbitrary memory locations while avoiding data races. They are described
in this section.
-A complete working example code that shows RAJA atomic usage can be found in
-:ref:`atomichist-label`.
+.. note:: All RAJA atomic operations are in the namespace ``RAJA``.
+
+.. note:: Each RAJA atomic operation is templated on an *atomic policy*
+          type, which **must be compatible with the execution policy used by
+          the kernel in which it is used.** For example, in
+          a CUDA kernel, a CUDA atomic policy type must be used.
+
+For more information about available RAJA atomic policies, please see
+:ref:`atomicpolicy-label`.
+
+.. note:: RAJA support for CUDA atomic operations may be specific to
+          the compute architecture for which the code is compiled. Please
+          see :ref:`cudaatomics-label` for more information.
+
+RAJA currently supports two different implementations of atomic operations
+via the same basic interface. The default implementation is the original one
+developed in RAJA, which has been available for several years. Alternatively,
+one can choose an implementation based on
+`DESUL `_ at compile time. Please see
+:ref:`desul-atomics-label` for more information. Eventually, we plan to
+deprecate the original RAJA implementation and provide only the DESUL
+implementation.
The RAJA atomic interface is expected to change when we switch +over to DESUL atomic support. Specifically, the atomic policy noted above will +no longer be used. + +Please see the following tutorial sections for detailed examples that use +RAJA atomic operations: -.. note:: * All RAJA atomic operations are in the namespace ``RAJA``. + * :ref:`tut-atomichist-label`. .. _atomic-ops: @@ -27,59 +52,57 @@ A complete working example code that shows RAJA atomic usage can be found in Atomic Operations ----------------- -RAJA atomic support includes a variety of the most common atomic operations. +RAJA atomic support the most common atomic operations. -.. note:: * Each RAJA atomic operation is templated on an *atomic policy*. - * Each method described in the table below returns the value of - the potentially modified argument (i.e., \*acc) immediately before - the atomic operation is applied, in case it is needed by a user. - * See :ref:`atomics-label` for details about CUDA atomic operations. +.. note:: Each atomic method described below returns the value of + the potentially modified argument (i.e., \*acc) immediately before + the atomic operation is applied, in case a user requires it. ^^^^^^^^^^^ Arithmetic ^^^^^^^^^^^ -* ``atomicAdd< atomic_policy >(T* acc, T value)`` - Add value to \*acc. +* ``atomicAdd< atomic_policy >(T* acc, T value)`` - Add ``value`` to ``\*acc``. -* ``atomicSub< atomic_policy >(T* acc, T value)`` - Subtract value from \*acc. +* ``atomicSub< atomic_policy >(T* acc, T value)`` - Subtract ``value`` from ``\*acc``. ^^^^^^^^^^^ Min/max ^^^^^^^^^^^ -* ``atomicMin< atomic_policy >(T* acc, T value)`` - Set \*acc to min of \*acc and value. +* ``atomicMin< atomic_policy >(T* acc, T value)`` - Set ``\*acc`` to min of ``\*acc`` and ``value``. -* ``atomicMax< atomic_policy >(T* acc, T value)`` - Set \*acc to max of \*acc and value. +* ``atomicMax< atomic_policy >(T* acc, T value)`` - Set ``\*acc`` to max of ``\*acc`` and ``value``. ^^^^^^^^^^^^^^^^^^^^ Increment/decrement ^^^^^^^^^^^^^^^^^^^^ -* ``atomicInc< atomic_policy >(T* acc)`` - Add 1 to \*acc. +* ``atomicInc< atomic_policy >(T* acc)`` - Add 1 to ``\*acc``. -* ``atomicDec< atomic_policy >(T* acc)`` - Subtract 1 from \*acc. +* ``atomicDec< atomic_policy >(T* acc)`` - Subtract 1 from ``\*acc``. -* ``atomicInc< atomic_policy >(T* acc, T compare)`` - Add 1 to \*acc if \*acc < compare, else set \*acc to zero. +* ``atomicInc< atomic_policy >(T* acc, T compare)`` - Add 1 to ``\*acc`` if ``\*acc`` < ``compare``, else set ``\*acc`` to zero. -* ``atomicDec< atomic_policy >(T* acc, T compare)`` - Subtract 1 from \*acc if \*acc != 0 and \*acc <= compare, else set \*acc to compare. +* ``atomicDec< atomic_policy >(T* acc, T compare)`` - Subtract 1 from ``\*acc`` if ``\*acc`` != 0 and ``\*acc`` <= ``compare``, else set ``\*acc`` to ``compare``. ^^^^^^^^^^^^^^^^^^^^ Bitwise operations ^^^^^^^^^^^^^^^^^^^^ -* ``atomicAnd< atomic_policy >(T* acc, T value)`` - Bitwise 'and' equivalent: Set \*acc to \*acc & value. Only works with integral data types. +* ``atomicAnd< atomic_policy >(T* acc, T value)`` - Bitwise 'and' equivalent: Set ``\*acc`` to ``\*acc`` & ``value``. Only works with integral data types. -* ``atomicOr< atomic_policy >(T* acc, T value)`` - Bitwise 'or' equivalent: Set \*acc to \*acc | value. Only works with integral data types. +* ``atomicOr< atomic_policy >(T* acc, T value)`` - Bitwise 'or' equivalent: Set ``\*acc`` to ``\*acc`` | ``value``. Only works with integral data types. 
-* ``atomicXor< atomic_policy >(T* acc, T value)`` - Bitwise 'xor' equivalent: Set \*acc to \*acc ^ value. Only works with integral data types. +* ``atomicXor< atomic_policy >(T* acc, T value)`` - Bitwise 'xor' equivalent: Set ``\*acc`` to ``\*acc`` ^ ``value``. Only works with integral data types. ^^^^^^^^^^^^^^^^^^^^ Replace ^^^^^^^^^^^^^^^^^^^^ -* ``atomicExchange< atomic_policy >(T* acc, T value)`` - Replace \*acc with value. +* ``atomicExchange< atomic_policy >(T* acc, T value)`` - Replace ``\*acc`` with ``value``. -* ``atomicCAS< atomic_policy >(T* acc, Tcompare, T value)`` - Compare and swap: Replace \*acc with value if and only if \*acc is equal to compare. +* ``atomicCAS< atomic_policy >(T* acc, Tcompare, T value)`` - Compare and swap: Replace ``\*acc`` with ``value`` if and only if ``\*acc`` is equal to ``compare``. Here is a simple example that shows how to use an atomic operation to compute an integral sum on a CUDA GPU device:: @@ -95,8 +118,8 @@ an integral sum on a CUDA GPU device:: cudaDeviceSynchronize(); *sum = 0; - RAJA::forall< RAJA::cuda_exec >(RAJA::RangeSegment(0, N), - [=] RAJA_DEVICE (RAJA::Index_type i) { + RAJA::forall< RAJA::cuda_exec >(RAJA::TypedRangeSegment(0, N), + [=] RAJA_DEVICE (int i) { RAJA::atomicAdd< RAJA::cuda_atomic >(sum, 1); @@ -108,7 +131,7 @@ After this kernel executes, the value reference by 'sum' will be 'N'. AtomicRef ^^^^^^^^^^^^^^^^^^^^ -RAJA also provides an atomic interface similar to the C++20 'std::atomic_ref', +RAJA also provides an interface similar to the C++20 ``std::atomic_ref``, but which works for arbitrary memory locations. The class ``RAJA::AtomicRef`` provides an object-oriented interface to the atomic methods described above. For example, after the following operations:: @@ -122,22 +145,14 @@ atomic methods described above. For example, after the following operations:: the value of 'val' will be 5. ------------------ -Atomic Policies ------------------ - -For more information about available RAJA atomic policies, please see -:ref:`atomicpolicy-label`. - - .. _cudaatomics-label: --------------------------------------- CUDA Atomics Architecture Dependencies --------------------------------------- -The internal implementations for RAJA atomic operations may vary depending -on which CUDA architecture is available and/or specified when the RAJA +The implementations for RAJA atomic operations may vary depending +on which CUDA architecture is available and/or specified when RAJA is configured for compilation. The following rules apply when the following CUDA architecture level is chosen: @@ -155,28 +170,36 @@ CUDA architecture level is chosen: * CUDA native 64-bit double `atomicAdd` is used. +.. _desul-atomics-label: + --------------------- DESUL Atomics Support --------------------- -RAJA provides support for the use of `DESUL Atomics `_ as -an alternative backend to the default implementation of RAJA atomics. DESUL atomics are considered an **experimental** feature in RAJA at this point. DESUL atomics -may impact the performance of some atomic functions. While switching -to DESUL atomics typically yields positive or neutral performance results, some atomic +RAJA provides the ability to use +`DESUL Atomics `_ as +an alternative to the default implementation of RAJA atomics. DESUL atomics +are considered an **experimental** feature in RAJA at this point and may +impact the performance of some atomic functions. 
While DESUL atomics typically +yields better or similar performance to RAJA default atomics, some atomic operations may perform worse when using DESUL. -To enable DESUL Atomics: - -#. Ensure that RAJA and its dependencies are configured to use C++14. -#. Set ``RAJA_ENABLE_DESUL_ATOMICS=On``. - -Enabling DESUL Atomics alters RAJA atomic functions to be wrapper-functions for their -DESUL counterparts. This removes the need for user code changes to switch between -DESUL and RAJA implementations. The exception to this is when RAJA atomic helper functions -are used instead of the backwards-compatible API functions specified by :ref:`atomic-ops`. By *helper functions*, we mean the RAJA atomic methods which take a reduction policy object as the first argument, instead of specifying the reduction policy type as a template parameter. - -DESUL atomic functions are compiled with the proper back-end implementation based on the scope in which they are -called, which removes the need to specify atomic policies for -target back-ends. As a result, atomic policies such as ``cuda_atomic`` or ``omp_atomic`` -are ignored when DESUL is enabled, but are still necessary to pass in as parameters -to the RAJA API. This will likely change in the future and RAJA atomic policies will be removed. +To enable DESUL atomics, pass the option to CMake when configuring a RAJA +build: ``-DRAJA_ENABLE_DESUL_ATOMICS=On``. + +Enabling DESUL atomics alters RAJA atomic functions to be wrapper-functions +for their DESUL counterparts. This removes the need for user code changes to +switch between DESUL and RAJA implementations for the most part. The exception +to this is when RAJA atomic helper functions are used instead of the +backward-compatible API functions specified by :ref:`atomic-ops`. By +*helper functions*, we mean the RAJA atomic methods which take an atomic +policy object as the first argument, instead of specifying the atomic policy +type as a template parameter. + +DESUL atomic functions are compiled with the proper back-end implementation +based on the scope in which they are called, which removes the need to specify +atomic policies for target back-ends. As a result, atomic policies such as +``RAJA::cuda_atomic`` or ``RAJA::omp_atomic`` are ignored when DESUL is +enabled, but are still necessary to pass in as parameters to the RAJA API. +This will likely change in the future when we switch to use DESUL atomics +exclusively and remove the default RAJA atomic operations. diff --git a/docs/sphinx/user_guide/feature/iteration_spaces.rst b/docs/sphinx/user_guide/feature/iteration_spaces.rst index ef3ab9c7e9..fafb35a90e 100644 --- a/docs/sphinx/user_guide/feature/iteration_spaces.rst +++ b/docs/sphinx/user_guide/feature/iteration_spaces.rst @@ -6,7 +6,7 @@ .. ## SPDX-License-Identifier: (BSD-3-Clause) .. ## -.. _index-label: +.. _feat-index-label: ================================ Indices, Segments, and IndexSets @@ -20,12 +20,15 @@ order for loop iterates, aggregate and partition iterates, as well as other configurations. In this section, we introduce RAJA index and iteration space concepts and types. -More examples of RAJA iteration space usage can be found in the -:ref:`indexset-label` and :ref:`vertexsum-label` sections of the tutorial. - .. note:: All RAJA iteration space types described here are located in the namespace ``RAJA``. +Please see the following tutorial sections for detailed examples that use +RAJA iteration space concepts: + + * :ref:`tut-indexset-label` + * :ref:`tut-vertexsum-label` + .. 
_indices-label: ------- @@ -37,146 +40,43 @@ identify loop iterates. Any lambda expression that represents all or part of a loop body passed to a ``RAJA::forall`` or ``RAJA::kernel`` method will take at least one loop index variable argument. RAJA iteration space types are templates that allow users to use any integral type for an -index variable. The index variable type may be explicitly specified by a user. -RAJA also provides the ``RAJA::Index_type`` type, which is used as a default -in some circumstances for convenience by allowing use of a common type -alias to typed constructs without explicitly specifying the type. -The ``RAJA::Index_type`` type is an alias to the C++ type ``std::ptrdiff_t``, -which is appropriate for most compilers to generate useful loop-level -optimizations. +index variable. .. _segments-label: -------------- -Segments -------------- - -A RAJA **Segment** represents a set of loop indices that one wants to -execute as a unit. RAJA provides Segment types for contiguous index ranges, -constant (non-unit) stride ranges, and arbitrary lists of indices. - -Stride-1 Segments -^^^^^^^^^^^^^^^^^^^ - -A ``RAJA::TypedRangeSegment`` is the fundamental type for representing a -stride-1 (i.e., contiguous) range of indices. - -.. figure:: ../figures/RangeSegment.png - - A range segment defines a stride-1 index range [beg, end). - -One can create an explicitly-typed range segment or one with the default -``RAJA::Index_type`` index type. For example,:: - - // A stride-1 index range [beg, end) using type int. - RAJA::TypedRangeSegment int_range(beg, end); - - // A stride-1 index range [beg, end) using the RAJA::Index_type default type - RAJA::RangeSegment default_range(beg, end); - -.. note:: When using a RAJA range segment, no loop iterations will be run when - begin is greater-than-or-equal-to end similar to a C-style for-loop. - -Strided Segments -^^^^^^^^^^^^^^^^^^^ - -A ``RAJA::TypedRangeStrideSegment`` defines a range with a constant stride -that is given explicitly stride, including negative stride. - -.. figure:: ../figures/RangeStrideSegment.png - - A range-stride segment defines an index range with arbitrary stride [beg, end, stride). - -One can create an explicitly-typed strided range segment or one with the -default ``RAJA::Index_type`` index type. For example,:: +----------------------- +Segments and IndexSets +----------------------- - // A stride-2 index range [beg, end, 2) using type int. - RAJA::TypedRangeStrideSegment stride2_range(beg, end, 2); +A RAJA **Segment** represents a set of indices that one wants to +execute as a unit for a kernel. RAJA provides the following Segment types: - // A index range with -1 stride [0, N-1, -1) using the RAJA::Index_type default type - RAJA::RangeStrideSegment neg1_range( N-1, -1, -1); + * ``RAJA::TypedRangeSegment`` represents a stride-1 range + * ``RAJA::TypedRangeStrideSegment`` represents a (non-unit) stride range + * ``RAJA::TypedListSegment`` represents an arbitrary set of indices -Using a range with a stride of '-1' as above in a RAJA loop traversal template -will run the loop indices in reverse order. That is, using 'neg1_range' -from above:: - - RAJA::forall< RAJA::seq_exec >( neg1_range, [=] (RAJA::Index_type i) { - printf("%ld ", i); - } ); - -will print the values:: - - N-1 N-2 N-3 .... 1 0 - -RAJA strided ranges support both positive and negative stride values. The -following items are worth noting: - -.. 
note:: When using a RAJA strided range, no loop iterations will be run - under the following conditions: - * Stride > 0 and begin > end - * Stride < 0 and begin < end - * Stride == 0 - -List Segments -^^^^^^^^^^^^^^ - -A ``RAJA::TypedListSegment`` is used to define an arbitrary set of loop -indices, akin to an indirection array. - -.. figure:: ../figures/ListSegment.png - - A list segment defines an arbitrary collection of indices. Here, we have a list segment with 5 irregularly-spaced indices. - -A list segment is created by passing an array of integral values to a list -segment constructor. For example:: - - // Create a vector holding some integer index values - std::vector idx = {0, 2, 3, 4, 7, 8, 9, 53}; - - // Create list segment with these loop indices where the indices are - // stored in the host memory space - camp::resources::Resource host_res{camp::resources::Host()}; - RAJA::TypedListSegment idx_list( &idx[0], idx.size(), - host_res ); - -Using a list segment in a RAJA loop traversal template will run the loop -indices specified in the array passed to the list segment constructor. That -is, using 'idx_list' from above:: - - RAJA::forall< RAJA::seq_exec >( idx_list, [=] (RAJA::Index_type i) { - printf("%ld ", i); - } ); - -will print the values:: - - 0 2 3 4 7 8 9 53 +A ``RAJA::TypedIndexSet`` is a container that can hold an arbitrary collection +of segments to compose iteration patterns in a single kernel invocation. -Note that a ``RAJA::TypedListSegment`` constructor can take a pointer to -an array of indices and an array length, as shown above. If the indices are -in a container, such as ``std::vector`` that provides ``begin()``, ``end()``, -and ``size()`` methods, the length argument is not required. For example:: +Segment and IndexSet types are used in ``RAJA::forall`` and other RAJA kernel +execution mechanisms to define the iteration space for a kernel. - std::vector idx = {0, 2, 3, 4, 7, 8, 9, 53}; +.. note:: Iterating over the indices of all segments in a RAJA index set + requires a two-level execution policy, with two template parameters, + as shown above. The first parameter specifies how to iterate over + the segments. The second parameter specifies how each segment will + execute. See :ref:`indexsetpolicy-label` for more information about + RAJA index set execution policies. - camp::resources::Resource host_res{camp::resources::Host()}; - RAJA::TypedListSegment idx_list( idx, host_res ); +.. note:: It is the responsibility of the user to ensure that segments are + defined properly when using RAJA index sets. For example, if the + same index appears in multiple segments, the corresponding loop + iteration will be run multiple times. -Similar to range segment types, RAJA provides ``RAJA::ListSegment``, which is -a type alias to ``RAJA::TypedListSegment`` using ``RAJA::Index_type`` as the -template type parameter. - -By default, the list segment constructor copies the indices in the array -passed to it to the memory space specified by the resource argument. -The resource argument is required so that the segment index values are in the -proper memory space for the kernel to run. Since the kernel is run on -the CPU host in this example (indicated by the ``RAJA::seq_exec`` execution -policy), we pass a host resource object to the list segment constructor. 
-If, for example, the kernel was to run on a GPU using a CUDA or HIP -execution policy, then the resource type passed to the camp resource -constructor would be ``camp::resources::Cuda()`` or -``camp::resources::Hip()``, respectively. +Please see :ref:`tut-indexset-label` for a detailed discussion of how to create +and use these segment types. -Segment Types and Iteration +Segment Types and Iteration ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ It is worth noting that RAJA segment types model **C++ iterable interfaces**. @@ -192,56 +92,4 @@ and two types: * value_type Thus, any iterable type that defines these methods and types appropriately -can be used as a segment with RAJA traversal templates. - -.. _indexsets-label: - --------------- -IndexSets --------------- - -A ``RAJA::TypedIndexSet`` is a container that can hold an arbitrary collection -of segment objects of arbitrary type as illustrated in the following figure, -where we have two contiguous ranges and an irregularly-spaced list of indices. - -.. figure:: ../figures/IndexSet.png - - An index set with 2 range segments and one list segment. - -We can create an index set that describes such an iteration space:: - - // Create an index set that can hold range and list segments with the - // default index type - RAJA::TypedIndexSet< RAJA::RangeSegment, RAJA::ListSegment > iset; - - // Add two range segments and one list segment to the index set - iset.push_back( RAJA::RangeSegment( ... ) ); - iset.push_back( RAJA::ListSegment(...) ); - iset.push_back( RAJA::RangeSegment( ... ) ); - -Now that we've created this index set object, we can pass it to any RAJA -loop execution template to execute the indices defined by its segments:: - - // Define an index set execution policy type that will iterate over - // its segments in parallel (OpenMP) and execute each segment sequentially - using ISET_EXECPOL = RAJA::ExecPolicy< RAJA::omp_parallel_segit, - RAJA::seq_exec >; - - // Run a kernel with iterates defined by the index set - RAJA::forall(iset, [=] (int i) { ... }); - -In this example, the loop iterations will execute in three chunks defined by -the two range segments and one list segment. The segments will be iterated -over in parallel using OpenMP, and each segment will execute sequentially. - -.. note:: Iterating over the indices of all segments in a RAJA index set - requires a two-level execution policy, with two template parameters, - as shown above. The first parameter specifies how to iterate over - the seqments. The second parameter specifies how each segment will - execute. See :ref:`indexsetpolicy-label` for more information about - RAJA index set execution policies. - -.. note:: It is the responsibility of the user to ensure that segments are - defined properly when using RAJA index sets. For example, if the - same index appears in multiple segments, the corresponding loop - iteration will be run multiple times. +can be used as a segment with RAJA kernel execution templates. diff --git a/docs/sphinx/user_guide/feature/local_array.rst b/docs/sphinx/user_guide/feature/local_array.rst index 3dc81d3856..9716708af1 100644 --- a/docs/sphinx/user_guide/feature/local_array.rst +++ b/docs/sphinx/user_guide/feature/local_array.rst @@ -6,7 +6,7 @@ .. ## SPDX-License-Identifier: (BSD-3-Clause) .. ## -.. _local_array-label: +.. _feat-local_array-label: =========== Local Array @@ -15,7 +15,7 @@ Local Array This section introduces RAJA *local arrays*. 
A ``RAJA::LocalArray`` is an array object with one or more dimensions whose memory is allocated when a RAJA kernel is executed and only lives within the scope of the kernel -execution. To motivate the concept and usage, consider a simple C++ example +execution. To motivate the concept and usage, consider a simple C example in which we construct and use two arrays in nested loops:: for(int k = 0; k < 7; ++k) { //k loop @@ -35,9 +35,10 @@ in which we construct and use two arrays in nested loops:: } Here, two stack-allocated arrays are defined inside the outer 'k' loop and -used in both inner 'j' loops. This loop pattern may be also be expressed -using RAJA local arrays in a ``RAJA::kernel_param`` kernel. We show a -RAJA variant below, which matches the implementation above, and then discuss +used in both inner 'j' loops. + +This loop pattern may be also be written using RAJA local arrays in a +``RAJA::kernel_param`` kernel. We show this next, and then discuss its constituent parts:: // @@ -73,8 +74,8 @@ its constituent parts:: // Define the kernel // - RAJA::kernel_param ( RAJA::make_tuple(RAJA::RangeSegment(0,5), - RAJA::RangeSegment(0,7)), + RAJA::kernel_param ( RAJA::make_tuple(RAJA::TypedRangeSegment(0,5), + RAJA::TypedRangeSegment``. The local array initialization is done in the first +lambda expression, and the local array values are printed in the second lambda +expression. + +.. note:: ``RAJA::LocalArray`` types support arbitrary dimensions and extents + in each dimension. ------------------- Memory Policies ------------------- -``RAJA::LocalArray`` supports CPU stack-allocated memory and CUDA GPU shared -memory and thread private memory. See :ref:`localarraypolicy-label` for a -discussion of available memory policies. +``RAJA::LocalArray`` supports CPU stack-allocated memory and CUDA or HIP GPU +shared memory and thread private memory. See :ref:`localarraypolicy-label` +for a discussion of available memory policies. diff --git a/docs/sphinx/user_guide/feature/loop_basic.rst b/docs/sphinx/user_guide/feature/loop_basic.rst index d425a31ff5..80ef7896e4 100644 --- a/docs/sphinx/user_guide/feature/loop_basic.rst +++ b/docs/sphinx/user_guide/feature/loop_basic.rst @@ -12,52 +12,25 @@ Elements of Loop Execution ============================================== -In this section, we describe the basic elements of RAJA loop kernel -execution. ``RAJA::forall``, ``RAJA::kernel``, and ``RAJA::expt::launch`` -(aka *RAJA Teams*) template methods comprise the RAJA interface for loop -execution. ``RAJA::forall`` methods execute simple, non-nested loops, -``RAJA::kernel`` methods support nested loops and other complex loop -kernels and transformations, and ``RAJA::expt::launch`` creates an execution -space in which algorithms are expressed in terms of nested loops using -the ``RAJA::expt::loop`` method. - -.. note:: * The ``forall`` , and ``kernel`` methods are in the - namespace ``RAJA``, while ``launch`` is found under - the RAJA namespace for experimental features ``RAJA::expt``. - - * A ``RAJA::forall`` loop execution method is a template on an - *execution policy* type. A ``RAJA::forall`` method takes two - arguments: - - * an iteration space object, such as a contiguous range of loop - indices, and - * a single lambda expression representing the loop body. - - * Each ``RAJA::kernel`` method is a template on a policy that - contains statements with *execution policy* types appropriate for - the kernel structure; e.g., an execution policy for each level in a - loop nest. 
A ``RAJA::kernel`` method takes multiple arguments: - - * a *tuple* of iteration space objects, and - * one or more lambda expressions representing portions of - the loop kernel body. - - * The ``RAJA::expt::launch`` method is a template on both host and - device policies to create an execution space for kernels. - Since both host and device poilices are specified, the launch - method can be used to select at run-time whether to run a kernel - on the host or device. Algorithms are expressed inside the - execution space as nested loops using ``RAJA::loop`` methods. - - * Hierarchical parallelism can be expressed using the thread and - thread-team model with ``RAJA::expt::loop`` methods as found in - programming models such as CUDA/HIP. - -Various examples showing how to use ``RAJA::forall``, ``RAJA::kernel``, ``RAJA::launch`` -methods may be found in the :ref:`tutorial-label`. - -For more information on RAJA execution policies and iteration space constructs, -see :ref:`policies-label` and :ref:`index-label`, respectively. +The ``RAJA::forall``, ``RAJA::expt::dynamic_forall``, ``RAJA::kernel``, and ``RAJA::launch`` +template methods comprise the RAJA interface for kernel +execution. ``forall`` methods execute simple, non-nested loops, +``RAJA::kernel`` methods support nested loops and other complex loop +kernels and transformations, and ``RAJA::launch`` creates an execution +space in which kernels are written in terms of nested loops using +the ``RAJA::loop`` method. + +.. note:: The ``forall`` , ``kernel``, and ``launch`` methods are in the ``RAJA`` + namespace, while ``dynamic_forall`` is in the RAJA namespace for + experimental features ``RAJA::expt``. ``RAJA::expt::dynamic_forall`` + will be moved to the ``RAJA`` namespace in a future RAJA release. + +For more information on RAJA execution policies and iteration space constructs, +see :ref:`feat-policies-label` and :ref:`feat-index-label`, respectively. + +The following sections describe the basic aspects of these methods. +Detailed examples showing how to use ``RAJA::forall``, ``RAJA::kernel``, ``RAJA::launch`` methods may be found in the :ref:`tutorial-label`. Links to specific +RAJA tutorial sections are provided in the sections below. .. _loop_elements-forall-label: @@ -65,53 +38,93 @@ see :ref:`policies-label` and :ref:`index-label`, respectively. Simple Loops (RAJA::forall) --------------------------- -As noted earlier, a ``RAJA::forall`` template executes simple -(i.e., non-nested) loops. For example, a C-style loop that adds two vectors, -like this:: +Consider a C-style loop that adds two vectors:: for (int i = 0; i < N; ++i) { c[i] = a[i] + b[i]; } -may be written using RAJA as:: +This may be written using ``RAJA::forall`` as:: - RAJA::forall(RAJA::RangeSegment(0, N), [=] (int i) { + RAJA::forall(RAJA::TypesRangeSegment(0, N), [=] (int i) { c[i] = a[i] + b[i]; }); -A ``RAJA::forall`` method is a template on an execution policy type and takes -two arguments: an object describing the loop iteration space, such as a RAJA -range segment (shown here), and a lambda expression for the loop body. Applying -different loop execution policies enables the loop to run in different ways; -e.g., using different programming model back-ends. Different iteration space -objects enable the loop iterates to be partitioned, reordered, run in -different threads, etc. +A ``RAJA::forall`` loop execution method is a template that takes an +*execution policy* type template parameter. 
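For reference, a complete sequential version of this vector addition, with the
execution policy and index type spelled out, might look like the following
sketch (the array setup here is illustrative only)::

  #include "RAJA/RAJA.hpp"
  #include <vector>

  int main()
  {
    constexpr int N = 100;

    // Host data; raw pointers are captured by the lambda (see the note below
    // about avoiding std::vector inside lambda expressions).
    std::vector<double> a(N, 1.0), b(N, 2.0), c(N, 0.0);
    double* pa = a.data();
    double* pb = b.data();
    double* pc = c.data();

    // Swapping RAJA::seq_exec for, e.g., RAJA::omp_parallel_for_exec changes
    // how the loop runs without touching the loop body.
    RAJA::forall<RAJA::seq_exec>(RAJA::TypedRangeSegment<int>(0, N),
      [=] (int i) {
        pc[i] = pa[i] + pb[i];
      });

    return 0;
  }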
A ``RAJA::forall`` method takes +two arguments: an iteration space object, such as a contiguous range of loop +indices as shown here, and a single lambda expression representing the loop +kernel body. + +Applying different loop execution policies enables the loop to run in +different ways; e.g., using different programming model back-ends. Different +iteration space objects enable the loop iterates to be partitioned, reordered, +run in different threads, etc. Please see :ref:`feat-index-label` for details +about RAJA iteration spaces. .. note:: Changing loop execution policy types and iteration space constructs - enables loops to run in different ways by recompiling the code and + enables loops to run in different ways by recompiling the code and without modifying the loop kernel code. -While loop execution using ``RAJA::forall`` methods is a subset of -``RAJA::kernel`` functionality, described next, we maintain the -``RAJA::forall`` interface for simple loop execution because the syntax is +As an extension of ``RAJA::forall``, the ``RAJA::expt::dynamic_forall`` method enables users +to compile using a list of execution policies and choose the execution policy at run-time. +For example, a user may want to have N policies available and at run-time choose which policy to use:: + + using exec_pol_list = camp::list; + int pol = i; //run-time value + + RAJA::expt::dynamic_forall(pol, RAJA::TypedRangeSegment(0, N), [=] (int i) { + c[i] = a[i] + b[i]; + }); + + +While static loop execution using ``forall`` methods is a subset of +``RAJA::kernel`` functionality, described next, +we maintain the ``forall`` interfaces for simple loop execution because the syntax is simpler and less verbose for that use case. -.. note:: Data arrays in lambda expressions used with RAJA are typically - RAJA Views (see :ref:`view-label`) or bare pointers as shown in +.. note:: Data arrays in lambda expressions used with RAJA are typically + RAJA Views (see :ref:`feat-view-label`) or bare pointers as shown in the code snippets above. Using something like 'std::vector' is - non-portable (won't work in GPU kernels, generally) and would add + non-portable (won't work in GPU kernels, generally) and would add excessive overhead for copying data into the lambda data environment when captured by value. +Please see the following tutorial sections for detailed examples that use +``RAJA::forall``: + + * :ref:`tut-addvectors-label` + * :ref:`tut-dotproduct-label` + * :ref:`tut-reduction-label` + * :ref:`tut-atomichist-label` + * :ref:`tut-indexset-label` + * :ref:`tut-vertexsum-label` + * :ref:`tut-permutedlayout-label` + + .. _loop_elements-kernel-label: ---------------------------- Complex Loops (RAJA::kernel) ---------------------------- -A ``RAJA::kernel`` template provides ways to compose and execute arbitrary -loop nests and other complex kernels. To introduce the RAJA *kernel* interface, -consider a (N+1)-level C-style loop nest:: +A ``RAJA::kernel`` template provides ways to compose and execute arbitrary +loop nests and other complex kernels. +The ``RAJA::kernel`` interface employs similar concepts to ``RAJA::forall`` +but extends it to support much more complex kernel structures. +Each ``RAJA::kernel`` method is a template that takes an *execution policy* +type template parameter. The execution policy can be an arbitrarily complex +sequence of nested templates that define a kernel execution pattern. 
+In its simplest form, ``RAJA::kernel`` takes two arguments: +a *tuple* of iteration space objects, and a lambda expression representing +the kernel inner loop body. In more complex usage, ``RAJA::kernel`` can take +multiple lambda expressions representing different portions of the loop +kernel body. + +To introduce the RAJA *kernel* interface, consider a (N+1)-level C-style loop +nest:: for (int iN = 0; iN < NN; ++iN) { ... @@ -120,8 +133,8 @@ consider a (N+1)-level C-style loop nest:: } } -Note that we could write this by nesting ``RAJA::forall`` statements and -it would work for some execution policy choices:: +It is important to note that we do not recommend writing a RAJA version of +this by nesting ``RAJA::forall`` statements. For example:: RAJA::forall(IN, [=] (int iN) { ... @@ -131,37 +144,40 @@ it would work for some execution policy choices:: ... } -However, this approach treats each loop level as an independent entity. This +This would work for some execution policy choices, but not in general. +Also, this approach treats each loop level as an independent entity, which makes it difficult to parallelize the levels in the loop nest together. So it -may limit the amount of parallelism that can be exposed and the types of +may limit the amount of parallelism that can be exposed and the types of parallelism that may be used. For example, if an OpenMP or CUDA parallel execution policy is used on the outermost loop, then all inner loops -would be run sequentially in each thread. It also makes it difficult to perform -transformations like loop interchange and loop collapse without changing the +would be run sequentially in each thread. It also makes it difficult to perform +transformations like loop interchange and loop collapse without changing the source code, which breaks RAJA encapsulation. -.. note:: **We do not recommend nesting ``RAJA::forall`` statements.** +.. note:: **We do not recommend using nested ``RAJA::forall`` statements.** -The RAJA *kernel* interface facilitates parallel execution and compile-time -transformation of arbitrary loop nests and other complex loop structures. -It can treat a complex loop structure as a single entity, which simplifies -the ability to transform and apply different parallel execution patterns by -changing the execution policy type and *not the kernel code*. +The ``RAJA::kernel`` interface facilitates parallel execution and compile-time +transformation of arbitrary loop nests and other complex loop structures. +It can treat a complex loop structure as a single entity, which enables +the ability to transform and apply different parallel execution patterns by +changing the execution policy type and **not the kernel code**, in many cases. -The loop above nest may be written using the RAJA kernel interface as:: +The C-style loop above nest may be written using ``RAJA::kernel`` as:: - using KERNEL_POL = - RAJA::KernelPolicy< RAJA::statement::For > ... - > + > >; - + RAJA::kernel< KERNEL_POL >( - RAJA::make_tuple(RAJA::RangeSegment(0, NN), ..., RAJA::RangeSegment(0, N0), + RAJA::make_tuple(RAJA::TypedRangeSegment(0, NN), + ..., + RAJA::TypedRangeSegment(0, N0), [=] (int iN, ... , int i0) { // inner loop body @@ -169,130 +185,141 @@ The loop above nest may be written using the RAJA kernel interface as:: ); -A ``RAJA::kernel`` method takes a ``RAJA::KernelPolicy`` type template -parameter, and a tuple of iteration spaces and a sequence of lambda -expressions as arguments. 
- In the case we discuss here, the execution policy contains a nested sequence -of ``RAJA::statement::For`` statements, one for each level in the loop nest. -Each ``For`` statement takes three template parameters: +of ``RAJA::statement::For`` types, indicating an iteration over each level in +the loop nest. Each of these statement types takes three template parameters: - * an integral index parameter that binds the ``For`` statement to the item - in the iteration space tuple corresponding to that index, - * an execution policy type for the associated loop nest level, and + * an integral index parameter that binds the statement to the item + in the iteration space tuple corresponding to that index + * an execution policy type for the associated loop nest level * an *enclosed statement list* (described in :ref:`loop_elements-kernelpol-label`). .. note:: The nesting of ``RAJA::statement::For`` types is analogous to the nesting of for-statements in the C-style version of the loop nest. - One can think of the '<, >' symbols enclosing the template parameter + One can think of the '<, >' symbols enclosing the template parameter lists as being similar to the curly braces in C-style code. -Here, the innermost type in the kernel policy is a +Here, the innermost type in the kernel policy is a ``RAJA::statement::Lambda<0>`` type indicating that the first lambda expression -(argument zero of the sequence of lambdas passed to the ``RAJA::kernel`` method) -will comprise the inner loop body. We only have one lambda in this example -but, in general, we can have any number of lambdas and we can use any subset +(argument zero of a sequence of lambdas passed to the ``RAJA::kernel`` method) +will comprise the inner loop body. We only have one lambda in this example +but, in general, we can have any number of lambdas and we can use any subset of them, with ``RAJA::statement::Lambda`` types placed appropriately in the -execution policy, to construct a loop kernel. For example, placing -``RAJA::statement::Lambda`` types between ``RAJA::statement::For`` statements +execution policy, to construct a loop kernel. For example, placing +``RAJA::statement::Lambda`` types between ``RAJA::statement::For`` statements enables non-perfectly nested loops. -RAJA offers two types of lambda statements. The first as illustratated -above, requires that each lambda expression passed to a ``RAJA::kernel`` method -**must take an index argument for each iteration space in the tuple**. -With this type of lambda statement, the entire iteration space must be active -in a containing ``For`` construct. A compile time ``static_assert`` will be -triggered if any of the arguments are undefined, indicating that something -is not correct. +RAJA offers two types of ``RAJA::statement::Lambda`` statements. The simplest +form, shown above, requires that each lambda expression passed to a +``RAJA::kernel`` method **must take an index argument for each iteration +space.** With this type of lambda statement, the entire iteration space must +be active in a surrounding ``For`` construct. A compile time ``static_assert`` +will be triggered if any of the arguments are undefined, indicating that +something is not correct. -The second type of lambda statement, an extension of the first, takes additional -template parameters which specify which iteration space indices are passed -as lambda arguments. The result is that a kernel lambda only needs to accept -iteration space index arguments that are used in the lambda body. 
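To make this concrete, here is a minimal sketch of a two-level nest using the
basic lambda statement form. The extents ``Ni``/``Nj`` and the arrays ``A``,
``B``, ``C`` are hypothetical, and sequential policies are assumed on both
levels::

  using EXEC_POL =
    RAJA::KernelPolicy<
      RAJA::statement::For<1, RAJA::seq_exec,    // outer loop over segment 1 (j)
        RAJA::statement::For<0, RAJA::seq_exec,  // inner loop over segment 0 (i)
          RAJA::statement::Lambda<0>
        >
      >
    >;

  RAJA::kernel<EXEC_POL>(
    RAJA::make_tuple(RAJA::TypedRangeSegment<int>(0, Ni),
                     RAJA::TypedRangeSegment<int>(0, Nj)),
    [=] (int i, int j) {   // one index argument per iteration space tuple entry
      C[j*Ni + i] = A[j*Ni + i] + B[j*Ni + i];
    });

Reordering the two ``For`` statements in ``EXEC_POL`` interchanges the loop
nest without changing the lambda expression.
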
+A second ``RAJA::statement::Lambda`` type, which is an extension of the first, +takes additional template parameters which specify which iteration spaces +are passed as lambda arguments. The result is that a kernel lambda only needs +to accept iteration space index arguments that are used in the lambda body. The kernel policy list with lambda arguments may be written as:: - using KERNEL_POL = - RAJA::KernelPolicy< RAJA::statement::For> > ... - > + > >; -The template parameter ``RAJA::Segs`` is used to specify which elements in the -segment tuple are used to pass arguments to a lambda. RAJA offers other -types such as ``RAJA::Offsets``, and ``RAJA::Params`` to identify offsets and -parameters in segments and param tuples respectively to be used as lambda -argumentsx. See :ref:`matrixmultiply-label` and -:ref:`matrixtransposelocalarray-label` for detailed examples. +The template parameter ``RAJA::Segs`` is used to specify indices from which +elements in the segment tuple are passed as arguments to the lambda, and in +which argument order. Here, we pass all segment indices so the lambda kernel +body definition could be identical to on passed to the previous RAJA version. +RAJA offers other types such as ``RAJA::Offsets``, and ``RAJA::Params`` to +identify offsets and parameters in segments and parameter tuples that could be +passed to ``RAJA::kernel`` methods. See :ref:`tut-matrixmultiply-label` +for an example. .. note:: Unless lambda arguments are specified in RAJA lambda statements, the loop index arguments for each lambda expression used in a RAJA - kernel loop body **must match** the contents of the - *iteration space tuple* in number, order, and type. Not all index - arguments must be used in a lambda, but they **all must appear** - in the lambda argument list and **all must be in active loops** to be - well-formed. In particular, your code will not compile if this is - not done correctly. If an argument is unused in a lambda expression, - you may include its type and omit its name in the argument list to - avoid compiler warnings just as one would do for a regular C++ + kernel loop body **must match** the contents of the + *iteration space tuple* in number, order, and type. Not all index + arguments must be used in a lambda, but they **all must appear** + in the lambda argument list and **all must be in active loops** to be + well-formed. In particular, your code will not compile if this is + not done correctly. If an argument is unused in a lambda expression, + you may include its type and omit its name in the argument list to + avoid compiler warnings just as one would do for a regular C++ method with unused arguments. -For RAJA nested loops implemented with ``RAJA::kernel``, as shown here, the -loop nest ordering is determined by the order of the nested policies, starting -with the outermost loop and ending with the innermost loop. +For RAJA nested loops implemented with ``RAJA::kernel``, as shown here, the +loop nest ordering is determined by the order of the nested policies, starting +with the outermost loop and ending with the innermost loop. -.. note:: The integer value that appears as the first parameter in each - ``RAJA::statement::For`` template indicates which iteration space - tuple entry or lambda index argument it corresponds to. **This - allows loop nesting order to be changed simply by changing the - ordering of the nested policy statements**. This is analogous to - changing the order of 'for-loop' statements in C-style nested loop +.. 
note:: The integer value that appears as the first parameter in each + ``RAJA::statement::For`` template indicates which iteration space + tuple entry or lambda index argument it corresponds to. **This + allows loop nesting order to be changed simply by changing the + ordering of the nested policy statements**. This is analogous to + changing the order of 'for-loop' statements in C-style nested loop code. -See :ref:`matmultkernel-label` for a complete example showing RAJA nested -loop functionality and :ref:`nestedreorder-label` for a detailed example -describing nested loop reordering. +.. note:: In general, RAJA execution policies for ``RAJA::forall`` and + ``RAJA::kernel`` are different. A summary of all RAJA execution + policies that may be used with ``RAJA::forall`` or ``RAJA::kernel`` + may be found in :ref:`feat-policies-label`. -.. note:: In general, RAJA execution policies for ``RAJA::forall`` and - ``RAJA::kernel`` are different. A summary of all RAJA execution - policies that may be used with ``RAJA::forall`` or ``RAJA::kernel`` - may be found in :ref:`policies-label`. - -Finally, a discussion of how to construct ``RAJA::KernelPolicy`` types and -available ``RAJA::statement`` types can be found in +A discussion of how to construct ``RAJA::KernelPolicy`` types and +available ``RAJA::statement`` types can be found in :ref:`loop_elements-kernelpol-label`. --------------------------------- -Team based loops (RAJA::launch) --------------------------------- +Please see the following tutorial sections for detailed examples that use +``RAJA::kernel``: + + * :ref:`tut-kernelnestedreorder-label` + * :ref:`tut-kernelexecpols-label` + * :ref:`tut-matrixtranspose-label` + * :ref:`tut-offsetlayout-label` + * :ref:`tut-matrixmultiply-label` -The *RAJA Teams* framework aims to unify thread/block based +------------------------------------------ +Hierarchical loops (RAJA::launch) +------------------------------------------ + +The ``RAJA::launch`` template is an alternative interface to +``RAJA::kernel`` that may be preferred for certain types of complex kernels +or based on coding style preferences. + +``RAJA::launch`` optionally allows either host or device execution +to be chosen at run time. The method takes an execution policy type that +will define the execution environment inside a lambda expression for a kernel +to be run on a host, device, or either. Kernel algorithms are written inside +main lambda expression using ``RAJA::loop`` methods. + +The ``RAJA::launch`` framework aims to unify thread/block based programming models such as CUDA/HIP/SYCL while maintaining portability on -host backends (OpenMP, sequential). When using the ``RAJA::kernel`` -interface, developers express all aspects of nested loop execution in the -execution policy type on which the ``RAJA::kernel`` method is templated. -In contrast, the ``RAJA::launch`` interface allows users to express +host back-ends (OpenMP, sequential). As we showed earlier, when using the +``RAJA::kernel`` interface, developers express all aspects of nested loop +execution in an execution policy type on which the ``RAJA::kernel`` method +is templated. +In contrast, the ``RAJA::launch`` interface allows users to express nested loop execution in a manner that more closely reflects how one would -write conventional nested C-style for-loop code. Additionally, *RAJA Teams* -introduces run-time host or device selectable kernel execution. The main -application of *RAJA Teams* is imperfectly nested loops. 
Using the -``RAJA::expt::launch method`` developers are provided with an execution -space enabling them to express algorithms in terms of nested -``RAJA::expt::loop`` statements:: +write conventional nested C-style for-loop code. For example, here is an +example of a ``RAJA::launch`` kernel that copies values from an array in +into a *shared memory* array:: - RAJA::expt::launch(select_CPU_or_GPU) - RAJA::expt::Grid(RAJA::expt::Teams(NE), RAJA::expt::Threads(Q1D)), - [=] RAJA_HOST_DEVICE (RAJA::expt::Launch ctx) { + RAJA::launch(select_CPU_or_GPU) + RAJA::LaunchParams(RAJA::Teams(NE), RAJA::Threads(Q1D)), + [=] RAJA_HOST_DEVICE (RAJA::Launch ctx) { - RAJA::expt::loop (ctx, RAJA::RangeSegment(0, teamRange), [&] (int bx) { + RAJA::loop (ctx, RAJA::RAJA::TypedRangeSegment(0, teamRange), [&] (int bx) { RAJA_TEAM_SHARED double s_A[SHARE_MEM_SIZE]; - RAJA::expt::loop (ctx, RAJA::RangeSegment(0, threadRange), [&] (int tx) { + RAJA::loop (ctx, RAJA::RAJA::TypedRangeSegment(0, threadRange), [&] (int tx) { s_A[tx] = tx; }); @@ -301,41 +328,55 @@ space enabling them to express algorithms in terms of nested )}; }); - -The underlying idea of *RAJA Teams* is to enable developers to express nested -parallelism in terms of teams and threads. Similar to the CUDA programming model, -development is done using a collection of threads, threads are grouped into teams. -Using the ``RAJA::expt::loop`` methods iterations of the loop may be executed by threads -or teams (depending on the execution policy). The launch context serves to synchronize -threads within the same team. The *RAJA Teams* abstraction consist of three main concepts. - - * *Launch Method*: creates an execution space in which developers may express - their algorithm in terms of nested ``RAJA::expt::loop`` statements. The loops are then - executed by threads or thread-teams. The method is templated on both a host - and device execution space and enables run-time selection of the execution environment. - - * *Resources*: holds a number of teams and threads (akin to CUDA blocks/threads). - - * *Loops*: are used to express hierarchical parallelism. Work within a loop is mapped to either teams or threads. Team shared memory - is available by using the ``RAJA_TEAM_SHARED`` macro. Team shared memory enables - threads in a given team to share data. In practice, team policies are typically - aliases for RAJA GPU block policies in the x,y,z dimensions (for example cuda_block_direct), - while thread policies are aliases for RAJA GPU thread policies (for example cuda_thread_direct) - x,y,z dimensions. On the host, teams and threads may be mapped to sequential - loop execution or OpenMP threaded regions. - -The team loop interface combines concepts from ``RAJA::forall`` and ``RAJA::kernel``. -Various policies from ``RAJA::kernel`` are compatible with the ``RAJA Teams`` -framework. + +The idea underlying ``RAJA::launch`` is to enable developers to express +hierarchical parallelism in terms of teams and threads. Similar to the CUDA +programming model, development is done using a collection of threads, and +threads are grouped into teams. Using the ``RAJA::loop`` methods +iterations of the loop may be executed by threads or teams depending on the +execution policy type. The launch context serves to synchronize threads within +the same team. The ``RAJA::launch`` interface has three main concepts: + + * ``RAJA::launch`` template. This creates an execution environment in + which a kernel implementation is written using nested ``RAJA::loop`` + statements. 
The launch policy template parameter used with the + ``RAJA::launch`` method enables specification of both a host and + device execution environment, which enables run time selection of + kernel execution. + + * ``RAJA::LaunchParams`` type. This type takes a number of teams and and a + number of threads as arguments. + + * ``RAJA::loop`` template. These are used to define hierarchical + parallel execution of a kernel. Operations within a loop are mapped to + either teams or threads based on the execution policy template parameter + provided. + +Team shared memory is available by using the ``RAJA_TEAM_SHARED`` macro. Team +shared memory enables threads in a given team to share data. In practice, +team policies are typically aliases for RAJA GPU block policies in the +x,y,z dimensions, while thread policies are aliases for RAJA GPU thread +policies in the x,y,z dimensions. In a host execution environment, teams and +threads may be mapped to sequential loop execution or OpenMP threaded regions. +Often, the ``RAJA::LaunchParams`` method can take an empty argument list for +host execution. + +Please see the following tutorial sections for detailed examples that use +``RAJA::launch``: + + * :ref:`tut-launchintro-label` + * :ref:`tut-launchexecpols-label` + * :ref:`tut-matrixtranspose-label` .. _loop_elements-CombiningAdapter-label: --------------------------------- -MultiDimensional loops using Simple loop APIs (RAJA::CombiningAdapter) --------------------------------- +------------------------------------------------------------------------ +Multi-dimensional loops using simple loop APIs (RAJA::CombiningAdapter) +------------------------------------------------------------------------ A ``RAJA::CombiningAdapter`` object provides ways to run perfectly nested loops -with simple loop APIs like ``RAJA::forall`` and ``RAJA::WorkGroup`` :ref:`workgroup-label`. +with simple loop APIs like ``RAJA::forall`` and those described in +:ref:`workgroup-label`. To introduce the ``RAJA ::CombiningAdapter`` interface, consider a (N+1)-level C-style loop nest:: @@ -357,13 +398,13 @@ loops and pass the adapter to a ``RAJA::forall`` statement to execute them:: RAJA::forall(adapter.getRange(), adapter); A ``RAJA::CombiningAdapter`` object is a template combining a loop body and -iteration spaces. The maker function template takes a lambda expression for the -loop body and an arbitrary number of segment arguments. It provides a flattened -index space via the ``getRange`` method that can be passed as the iteration space -to the simple loop API. The object itself can be passed into the loop API as the -loop body. The object's call operator does the conversion of the flat single -dimensional index into the multi-dimensional index space, calling the provided -lambda with the appropriate indices. - -.. note:: CombiningAdapter currently only supports ``RAJA::RangeSegment`` and +iteration spaces. The ``RAJA::make_CombingingAdapter`` template method takes +a lambda expression for the loop body and an arbitrary number of index +arguments. It provides a *flattened* iteration space via the ``getRange`` +method that can be passed as the iteration space to the ``RAJA::forall`` +method, for example. The object's call operator does the conversion of the +flat single dimensional index into the multi-dimensional index space, calling +the provided lambda with the appropriate indices. + +.. note:: CombiningAdapter currently only supports ``RAJA::TypedRangeSegment`` segments. 
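As a hedged sketch, flattening a two-level nest could look like the following.
The array and extent names are hypothetical, and the maker function is assumed
to be spelled ``RAJA::make_CombiningAdapter`` and to take the loop body lambda
followed by the segments, as described above::

  // Build an adapter that flattens the (i, j) iteration space.
  auto adapter = RAJA::make_CombiningAdapter(
    [=] (int i, int j) {
      C[i*Nj + j] = A[i*Nj + j] + B[i*Nj + j];
    },
    RAJA::TypedRangeSegment<int>(0, Ni),
    RAJA::TypedRangeSegment<int>(0, Nj));

  // Run the flattened loop with a simple loop API; the adapter recovers (i, j).
  RAJA::forall<RAJA::omp_parallel_for_exec>(adapter.getRange(), adapter);
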
diff --git a/docs/sphinx/user_guide/feature/plugins.rst b/docs/sphinx/user_guide/feature/plugins.rst index 4773c2bf17..c7cd3c63b0 100644 --- a/docs/sphinx/user_guide/feature/plugins.rst +++ b/docs/sphinx/user_guide/feature/plugins.rst @@ -6,7 +6,7 @@ .. ## SPDX-License-Identifier: (BSD-3-Clause) .. ## -.. _plugins-label: +.. _feat-plugins-label: ======== Plugins @@ -36,50 +36,41 @@ can be added to a project as easily as making a shared object file and setting ``RAJA_PLUGINS`` to the appropriate path. ^^^^^^^^^^^^^^^^^^^ -Quick Start Guide +Plugins Quick Start ^^^^^^^^^^^^^^^^^^^ **Static Plugins** -1. Build RAJA normally. - -2. Either use an ``#include`` statement within the code or compiler flags to load your plugin file with your project at compile time. A brief example of this would be something like ``g++ project.cpp plugin.cpp -lRAJA -fopenmp -ldl -o project``. - -3. When you run your project, your plugin should work. +#. Build RAJA normally. +#. Use an ``#include`` statement in your code or pass options to the compiler to load your plugin file with your project at compile time. For example: ``g++ project.cpp plugin.cpp -lRAJA -ldl -o project``. +#. When you run your project, your plugin should work. **Dynamic Plugins** -1. Build RAJA normally. - -2. Compile your plugin to be a shared object file with a .so extension. A brief -example of this would be something like ``g++ plugin.cpp -lRAJA -fopenmp -fPIC -shared -o plugin.so``. - -3. Set the environment variable ``RAJA_PLUGINS`` to be the path of your .so file. -This can either be the path to its directory or to the shared object file itself. -If the path is to a directory, it will attempt to load all .so files in that -directory. - -4. When you run your project, your plugins should work. +#. Build RAJA normally. +#. Compile your plugin to be a shared object file with ``.so`` extension. For example: ``g++ plugin.cpp -lRAJA -fPIC -shared -o plugin.so``. +#. Set the environment variable ``RAJA_PLUGINS`` to the path of your ``.so`` file. This can either be the path to its directory or to the shared object file itself. If the path is a directory, all ``.so`` files in that directory will be loaded. +#. When you run your project, your plugins should work. ^^^^^^^^^^^^^^^^^^^^^^^^^^^ Interfacing with Plugins ^^^^^^^^^^^^^^^^^^^^^^^^^^^ The RAJA plugin API allows for limited interfacing between a project and a -plugin. There are a couple of functions that allow for this to take place, +plugin. There are a couple of methods to call in your code: ``init_plugins`` and ``finalize_plugins``. These will call the corresponding -``init`` and ``finalize`` functions, respectively, of *every* currently loaded +``init`` and ``finalize`` methods, respectively, of *every* currently loaded plugin. It's worth noting that plugins don't require either an init or finalize -function by default. +method by default. -* ``RAJA::util::init_plugins();`` - Will call the ``init`` function of every +* ``RAJA::util::init_plugins();`` will call the ``init`` method of every currently loaded plugin. -* ``RAJA::util::init_plugins("path/to/plugins");`` - Does the same as the above - call to ``init_plugins``, but will also dynamically load plugins located at - the path specified. +* ``RAJA::util::init_plugins("path/to/plugins");`` will call the ``init`` + method of every currently loaded plugin and, in addition, will also + dynamically load plugins located at the given path. 
-* ``RAJA::util::finalize_plugins();`` - Will call the ``finalize`` function of +* ``RAJA::util::finalize_plugins();`` will call the ``finalize`` method of every currently loaded plugin. @@ -88,51 +79,56 @@ Creating Plugins For RAJA -------------------------- Plugins are classes derived from the ``RAJA::util::PluginStrategy`` base class -and implement the required functions for the API. An example implementation -can be found at the bottom of this page. +and implement the required virtual methods for the API. An example +implementation can be found at the bottom of this page. -^^^^^^^^^^^ -Functions -^^^^^^^^^^^ +^^^^^^^^^^^^^^^^^^^^ +Plugin API methods +^^^^^^^^^^^^^^^^^^^^ -The ``preLaunch`` and ``postLaunch`` functions are automatically called by -RAJA before and after executing a kernel that uses ``RAJA::forall`` or -``RAJA::kernel`` methods. +The following list summarizes the virtual methods in the +``RAJA::util::PluginStrategy`` base class. -* ``void init(const PluginOptions& p) override {}`` - runs on all plugins when - a user calls ``init_plugins`` +* ``void init(const PluginOptions& p) override {}`` is called on all plugins + when a user calls ``init_plugins()`` -* ``void preCapture(const PluginContext& p) override {}`` - is called before - lambda capture in ``RAJA::forall`` or ``RAJA::kernel``. +* ``void preCapture(const PluginContext& p) override {}`` is called before + lambda capture in RAJA kernel execution methods. -* ``void postCapture(const PluginContext& p) override {}`` - is called after - lambda capture in ``RAJA::forall`` or ``RAJA::kernel``. +* ``void postCapture(const PluginContext& p) override {}`` is called after + lambda capture in RAJA kernel execution methods. -* ``void preLaunch(const PluginContext& p) override {}`` - is called before - ``RAJA::forall`` or ``RAJA::kernel`` runs a kernel. +* ``void preLaunch(const PluginContext& p) override {}`` is called before + a RAJA kernel execution method runs a kernel. -* ``void postLaunch(const PluginContext& p) override {}`` - is called after - ``RAJA::forall`` or ``RAJA::kernel`` runs a kernel. +* ``void postLaunch(const PluginContext& p) override {}`` is called after + a RAJA kernel execution method runs a kernel. -* ``void finalize() override {}`` - Runs on all plugins when a user calls +* ``void finalize() override {}`` is called on all plugins when a user calls ``finalize_plugins``. This will also unload all currently loaded plugins. -``init`` and ``finalize`` are never called by RAJA by default and are only -called when a user calls ``RAJA::util::init_plugins()`` or -``RAJA::util::finalize_plugin()``, respectively. +.. note:: The pre/post methods above are automatically called + before and after executing a kernel with ``RAJA::forall`` or + ``RAJA::kernel`` kernel execution methods. + +.. note:: The ``init`` and ``finalize`` methods are never called by + default and are only called when a user calls + ``RAJA::util::init_plugins()`` or ``RAJA::util::finalize_plugin()``, + respectively. ^^^^^^^^^^^^^^^^^ Static Loading ^^^^^^^^^^^^^^^^^ -If a plugin is to be loaded into a project at compile time, adding the -following method call will add the plugin to the RAJA ``PluginRegistry`` and will -be loaded every time the compiled executable is run. 
This requires the plugin -to be loaded with either an ``#include`` statement within a project or with -source code line such as:: +If a plugin is to be loaded into a project at compile time, it must be +loaded with either an ``#include`` statement in the project source code or +by calling the following method in the project source code, which adds the +plugin to the RAJA ``PluginRegistry`::` static RAJA::util::PluginRegistry::add P("Name", "Description"); +In either case, the plugin will be loaded every time the compiled +project executable is run. ^^^^^^^^^^^^^^^^^ Dynamic Loading @@ -142,39 +138,40 @@ If a plugin is to be dynamically loaded in a project at run time, the RAJA plugin API requires a few conditions to be met. The following must be true about the plugin, not necessarily of the project using it. -1. **The plugin must have the following factory function.** This will return - a pointer to an instance of your plugin. Note that using ``extern "C"`` is - required to search for the ``getPlugin()`` method call for the dynamically - loaded plugin correctly:: +#. The plugin must have the following factory method that returns + a pointer to an instance of your plugin:: - extern "C" RAJA::util::PluginStrategy *getPlugin () + extern "C" RAJA::util::PluginStrategy* getPlugin() { return new MyPluginName; } + Note that using ``extern "C"`` is required to search for the ``getPlugin()`` + method call for the dynamically loaded plugin correctly. -2. **The plugin must be compiled to be a shared object with a .so extension.** - A simple example containing required flags would be: ``g++ plugin.cpp -lRAJA -fopenmp -fPIC -shared -o plugin.so``. +#. The plugin must be compiled to be a shared object with a ``.so`` extension. + For example: ``g++ plugin.cpp -lRAJA -fPIC -shared -o plugin.so``. - At the moment, RAJA will only attempt to load files with .so extensions. + At the moment, RAJA will only attempt to load files with ``.so`` extensions. It's worth noting why these flags (or their equivalents) are important. - * ``-lRAJA -fopenmp`` are standard flags for compiling the RAJA library. + * ``-lRAJA`` is a standard flag for linking the RAJA library. * ``-fPIC`` tells the compiler to produce *position independent code*, which prevents conflicts in the address space of the executable. * ``-shared`` will let the compiler know that you want the resulting object file to be shared, removing the need for a *main* as well as - giving dynamically loaded executables access to functions flagged + giving dynamically loaded executables access to methods flagged with ``extern "C"``. -3. **The** ``RAJA_PLUGINS`` **environment variable has been set**, or a user - has made a call to ``RAJA::util::init_plugins("path");`` with a path - specified to either a directory or a .so file. It's worth noting that these - are not mutually exclusive. RAJA will look for plugins based on the - environment variable on program startup and new plugins may be loaded after - that by calling the ``init_plugins()`` method. +#. The ``RAJA_PLUGINS`` environment variable must be set, or the project code + must call ``RAJA::util::init_plugins("path");``. Either of these approaches + is required to supply the path to either a directory containing the plugin + or its ``.so`` file. It's worth noting that these are not mutually + exclusive. RAJA will look for plugins based on the environment variable on + program startup and new plugins may be loaded after that by calling the + ``init_plugins()`` method. 
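To tie these pieces together, here is a minimal sketch of a plugin that can be
registered statically or compiled into a shared object for dynamic loading.
The class name, counter, and include path are hypothetical/assumed; only the
``PluginStrategy`` methods and registration calls summarized above are taken
from the API::

  #include "RAJA/util/PluginStrategy.hpp"   // header path assumed
  #include <iostream>

  class CounterPlugin : public RAJA::util::PluginStrategy
  {
  public:
    void preLaunch(const RAJA::util::PluginContext&) override {
      ++num_launches;                       // called before each RAJA kernel runs
    }
    void postLaunch(const RAJA::util::PluginContext&) override {
      std::cout << "RAJA kernels launched: " << num_launches << std::endl;
    }
  private:
    int num_launches = 0;
  };

  // Static loading: add the plugin to the RAJA PluginRegistry.
  static RAJA::util::PluginRegistry::add<CounterPlugin>
    P("Counter", "Counts RAJA kernel launches");

  // Dynamic loading: factory method looked up in the .so file.
  extern "C" RAJA::util::PluginStrategy* getPlugin()
  {
    return new CounterPlugin;
  }
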
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -195,16 +192,16 @@ CHAI Plugin ^^^^^^^^^^^^^^^^^^^^^ RAJA provides abstractions for parallel execution, but does not support -a memory model for managing data in heterogeneous memory spaces. -The `CHAI library `_ provides an array abstraction -that integrates with RAJA to enable automatic copying of data at runtime to the -proper execution memory space for a RAJA-based kernel based on the -RAJA exection policy used to execute the kernel. Then, the data can be accessed -inside the kernel as needed. +a memory model for managing data in heterogeneous memory spaces. One +option for managing such data is to use `CHAI `_, +which provides an array abstraction that integrates with RAJA to enable +automatic copying of data at runtime to the proper execution memory space for +a RAJA-based kernel determined by the RAJA execution policy used to execute the +kernel. Then, the data can be accessed inside the kernel as needed. To build CHAI with RAJA integration, you need to download and install CHAI with -the ``ENABLE_RAJA_PLUGIN`` option turned on. Please see the `CHAI project -`_ for details. +the ``ENABLE_RAJA_PLUGIN`` option turned on. Please see +`CHAI `_ for details. After CHAI has been built with RAJA support enabled, applications can use CHAI ``ManangedArray`` objects to access data inside a RAJA kernel. For example:: @@ -215,7 +212,7 @@ After CHAI has been built with RAJA support enabled, applications can use CHAI array[i] = i * 2.0f; }); - RAJA::forall(0, 1000, [=] (int i) { + RAJA::forall(0, 1000, [=] (int i) { std::cout << "array[" << i << "] is " << array[i] << std::endl; }); @@ -223,8 +220,8 @@ Here, the data held by ``array`` is allocated on the host CPU. Then, it is initialized on a CUDA GPU device. CHAI sees that the data lives on the CPU and is needed in a GPU device data environment since it is used in a kernel that will run with a RAJA CUDA execution policy. So it copies the data from -CPU to GPU, making it available for access in the RAJA kernel. Next, -it is printed in the second kernel which runs on the CPU (indicated by the +CPU memory to GPU memory, making it available for access in the RAJA kernel. +The data is printed in the second kernel which runs on the CPU (indicated by the RAJA sequential execution policy). So CHAI copies the data back to the host CPU. All necessary data copies are done transparently on demand for each kernel. diff --git a/docs/sphinx/user_guide/feature/policies.rst b/docs/sphinx/user_guide/feature/policies.rst index 37bd07f769..0588c5a900 100644 --- a/docs/sphinx/user_guide/feature/policies.rst +++ b/docs/sphinx/user_guide/feature/policies.rst @@ -6,21 +6,21 @@ .. ## SPDX-License-Identifier: (BSD-3-Clause) .. ## -.. _policies-label: +.. _feat-policies-label: ================== Policies ================== -This section describes RAJA policies for loop kernel execution, -scans, sorts, reductions, atomics, etc. Each policy is a type that is passed to -a RAJA template method or class to specialize its behavior. Typically, the -policy indicates which programming model back-end to use and sometimes -specifies additional information about the execution pattern, such as -number of CUDA threads per thread block, whether execution is synchronous -or asynchronous, etc. +RAJA kernel execution methods take an execution policy type template parameter +to specialize execution behavior. 
Typically, the policy indicates which +programming model back-end to use and other information about the execution +pattern, such as number of CUDA threads per thread block, whether execution is +synchronous or asynchronous, etc. This section describes RAJA policies for +loop kernel execution, scans, sorts, reductions, atomics, etc. Please +detailed examples in :ref:`tutorial-label` for a variety of use cases. -As RAJA functionality evolves, new policies will be added and some may +As RAJA functionality evolves, new policies are added and some may be redefined and to work in new ways. .. note:: * All RAJA policies are in the namespace ``RAJA``. @@ -81,11 +81,11 @@ policies. Typically, they work by providing an *outer policy* and an flexibility to create more complex execution patterns. -.. note:: To control the number of threads used by OpenMP policies +.. note:: To control the number of threads used by OpenMP policies, set the value of the environment variable 'OMP_NUM_THREADS' (which is fixed for duration of run), or call the OpenMP routine 'omp_set_num_threads(nthreads)' in your application, which allows - one to change the number of threads at runtime. + one to change the number of threads at run time. The full policies are described in the following table. Partial policies are described in other tables below. @@ -167,26 +167,24 @@ a template argument as described above. omp_for_runtime_exec forall, Same as applying kernel (For) 'omp for schedule(runtime)' + omp_parallel_collapse_exec kernel Use in Collapse statement + (Collapse + to parallelize multiple + ArgList) loop levels in loop nest + indicated using ArgList ====================================== ============= ========================== -.. important:: **RAJA only provides a nowait policy option for static schedule** - since that is the only schedule case that can be used with - nowait and be correct in general when chaining multiple loops - in a single parallel region. Paraphrasing the OpenMP standard: +.. important:: **RAJA only provides a nowait policy option for static + scheduling** since that is the only schedule case that can be + used with nowait and be correct in general when executing + multiple loops in a single parallel region. Paraphrasing the + OpenMP standard: *programs that depend on which thread executes a particular loop iteration under any circumstance other than static schedule are non-conforming.* .. note:: As in the RAJA full policies for OpenMP scheduling, the ``ChunkSize`` is optional. If not provided, the default chunk size that the OpenMP - implementation applies will be used. For this case, - the RAJA policy syntax is - ``omp_for_{static|dynamic|guided}_exec< >``, which will result - in the OpenMP pragma - ``omp for schedule({static|dynamic|guided})`` being applied. - Similarly, for ``nowait`` static policy, the RAJA policy syntax is - ``omp_for_nowait_static_exec< >``, which will result in the OpenMP - pragma ``omp for schedule(static) nowait`` being applied. + implementation applies will be used. .. note:: As noted above, RAJA inner OpenMP policies must only be used within an **existing** parallel region to work properly. Embedding an inner @@ -230,8 +228,8 @@ a template argument as described above. Threading Building Block (TBB) Parallel CPU Policies ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RAJA provides a basic set of TBB execution policies for users who would like -to try it. 
+RAJA provides a basic set of TBB execution policies for use with the +RAJA TBB back-end, which supports a subset of RAJA features. ====================================== ============= ========================== Threading Building Blocks Policies Works with Brief description @@ -261,7 +259,7 @@ to try it. // do some more parallel work - This allows changing number of workers at runtime. + This allows changing number of workers at run time. GPU Policies for CUDA and HIP @@ -301,7 +299,7 @@ policies have the prefix ``hip_``. threads in y-dimension cuda/hip_thread_z_loop kernel (For) Same as above, but for threads in z-dimension - cuda/hip_flatten_block_threads_{xyz} Teams (Loop) Reshapes threads in a + cuda/hip_flatten_block_threads_{xyz} Launch (Loop) Reshapes threads in a multi-dimensional thread team into one-dimension, accepts any permutation @@ -323,14 +321,14 @@ policies have the prefix ``hip_``. blocks in y-dimension cuda/hip_block_z_loop kernel (For) Same as above, but use blocks in z-dimension - cuda/hip_global_thread_x Teams (Loop) Creates a unique thread - id for each thread on the - x dimension of the grid + cuda/hip_global_thread_x Launch (Loop) Creates a unique thread + id for each thread on + x-dimension of the grid (expt namespace) - cuda/hip_global_thread_y Teams (Loop) Same as above, but uses + cuda/hip_global_thread_y Launch (Loop) Same as above, but uses threads in y-dimension (expt namespace) - cuda/hip_global_thread_z Teams (Loop) Same as above, but uses + cuda/hip_global_thread_z Launch (Loop) Same as above, but uses threads in z-dimension (expt namespace) cuda/hip_warp_direct kernel (For) Map work to threads @@ -391,8 +389,8 @@ Several notable constraints apply to RAJA CUDA/HIP *thread-direct* policies. different thread dimensions), the product of sizes of the corresponding iteration spaces cannot be greater than the maximum allowable threads per block. Typically, this is - equ:math:`\leq` 1024; e.g., attempting to launch a CUDA kernel - with more than 1024 threads per block will cause the CUDA runtime + 1024 threads per block. Attempting to execute a kernel with more + than the maximum allowed the CUDA runtime to complain about *illegal launch parameters.* * **Thread-direct policies are recommended only for certain loop patterns, such as tiling.** @@ -521,16 +519,14 @@ device, for example. They are summarized in the following table. RAJA IndexSet Execution Policies ----------------------------------------------------- -When an IndexSet iteration space is used in RAJA, such as passing an IndexSet -to a ``RAJA::forall`` method, an index set execution policy is required. An -index set execution policy is a **two-level policy**: an 'outer' policy for -iterating over segments in the index set, and an 'inner' policy used to -execute the iterations defined by each segment. An index set execution policy -type has the form:: - - RAJA::ExecPolicy< segment_iteration_policy, segment_execution_policy> +When an IndexSet iteration space is used in RAJA by passing an IndexSet +to a ``RAJA::forall`` method, for example, an index set execution policy is +required. An index set execution policy is a **two-level policy**: an 'outer' +policy for iterating over segments in the index set, and an 'inner' policy +used to execute the iterations defined by each segment. An index set execution +policy type has the form:: -See :ref:`indexsets-label` for more information. 
+ RAJA::ExecPolicy< segment_iteration_policy, segment_execution_policy > In general, any policy that can be used with a ``RAJA::forall`` method can be used as the segment execution policy. The following policies are @@ -559,7 +555,7 @@ tbb_segit Iterate over index set segments in Parallel Region Policies ------------------------- -Earlier, we discussed an example using the ``RAJA::region`` construct to +Earlier, we discussed using the ``RAJA::region`` construct to execute multiple kernels in an OpenMP parallel region. To support source code portability, RAJA provides a sequential region concept that can be used to surround code that uses execution back-ends other than OpenMP. For example:: @@ -625,7 +621,8 @@ sycl_reduce any SYCL Reduction in a SYCL kernel (device ======================= ============= ========================================== .. note:: RAJA reductions used with SIMD execution policies are not - guaranteed to generate correct results at present. + guaranteed to generate correct results. So they should not be used + for kernels containing reductions. .. _atomicpolicy-label: @@ -640,34 +637,37 @@ type. Atomic policy types are distinct from loop execution policy types. policy for the kernel in which the atomic operation is used. The following table summarizes RAJA atomic policies and usage. -========================= ============= ======================================== -Atomic Policy Loop Policies Brief description - to Use With -========================= ============= ======================================== -seq_atomic seq_exec, Atomic operation performed in a - loop_exec non-parallel (sequential) kernel. -omp_atomic any OpenMP Atomic operation performed in an OpenMP. - policy multithreading or target kernel; i.e., - apply ``omp atomic`` pragma. -cuda/hip_atomic any CUDA/HIP Atomic operation performed in a CUDA/HIP - policy kernel. -cuda/hip_atomic_explicit any CUDA/HIP Atomic operation performed in a CUDA/HIP - policy kernel that may also be used in a host - execution context. The atomic policy - takes a host atomic policy template - argument. See additional explanation - and example below. -builtin_atomic seq_exec, Compiler *builtin* atomic operation. - loop_exec, - any OpenMP - policy -auto_atomic seq_exec, Atomic operation *compatible* with loop - loop_exec, execution policy. See example below. - any OpenMP Can not be used inside cuda/hip - policy, explicit atomic policies. - any CUDA/HIP - policy -========================= ============= ======================================== +============================= ============= ======================================== +Atomic Policy Loop Policies Brief description + to Use With +============================= ============= ======================================== +seq_atomic seq_exec, Atomic operation performed in a + loop_exec non-parallel (sequential) kernel. +omp_atomic any OpenMP Atomic operation performed in an OpenMP. + policy multithreading or target kernel; i.e., + apply ``omp atomic`` pragma. +cuda/hip/sycl_atomic any Atomic operation performed in a + CUDA/HIP/SYCL CUDA/HIP/SYCL kernel. + policy + +cuda/hip_atomic_explicit any CUDA/HIP Atomic operation performed in a CUDA/HIP + policy kernel that may also be used in a host + execution context. The atomic policy + takes a host atomic policy template + argument. See additional explanation + and example below. +builtin_atomic seq_exec, Compiler *builtin* atomic operation. 
+ loop_exec, + any OpenMP + policy +auto_atomic seq_exec, Atomic operation *compatible* with loop + loop_exec, execution policy. See example below. + any OpenMP Can not be used inside cuda/hip + policy, explicit atomic policies. + any + CUDA/HIP/SYCL + policy +============================= ============= ======================================== .. note:: The ``cuda_atomic_explicit`` and ``hip_atomic_explicit`` policies take a host atomic policy template parameter. They are intended to @@ -680,10 +680,9 @@ Here is an example illustrating use of the ``cuda_atomic_explicit`` policy:: RAJA::atomicAdd< RAJA::cuda_atomic_explicit >(&sum, 1); }; - RAJA::forall< RAJA::cuda_exec >(RAJA::RangeSegment seg(0, N), kernel); + RAJA::forall< RAJA::cuda_exec >(RAJA::TypedRangeSegment seg(0, N), kernel); - RAJA::forall< RAJA::omp_parallel_for_exec >(RAJA::RangeSegment seg(0, N), - kernel); + RAJA::forall< RAJA::omp_parallel_for_exec >(RAJA::TypedRangeSegment seg(0, N), kernel); In this case, the atomic operation knows when it is compiled for the device in a CUDA kernel context and the CUDA atomic operation is applied. Similarly @@ -692,7 +691,7 @@ used and the OpenMP version of the atomic operation is applied. Here is an example illustrating use of the ``auto_atomic`` policy:: - RAJA::forall< RAJA::cuda_exec >(RAJA::RangeSegment seg(0, N), + RAJA::forall< RAJA::cuda_exec >(RAJA::TypedRangeSegment seg(0, N), [=] RAJA_DEVICE (RAJA::Index_type i) { RAJA::atomicAdd< RAJA::auto_atomic >(&sum, 1); @@ -705,7 +704,8 @@ execution policy was used, the OpenMP version of the atomic operation would be used. .. note:: * There are no RAJA atomic policies for TBB (Intel Threading Building - Blocks) execution contexts at present. + Blocks) execution contexts since reductions are not supported + for the RAJA TBB back-end. * The ``builtin_atomic`` policy may be preferable to the ``omp_atomic`` policy in terms of performance. @@ -717,7 +717,7 @@ Local Array Memory Policies ``RAJA::LocalArray`` types must use a memory policy indicating where the memory for the local array will live. These policies are described -in :ref:`local_array-label`. +in :ref:`feat-local_array-label`. The following memory policies are available to specify memory allocation for ``RAJA::LocalArray`` objects: @@ -743,18 +743,20 @@ of Statements that are composed in the order that they appear in the kernel policy to construct a kernel. A Statement may contain an enclosed StatmentList. Thus, a ``RAJA::KernelPolicy`` type is really just a StatementList. The main Statement types provided by RAJA are ``RAJA::statement::For`` and -``RAJA::statement::Lambda``, that we have shown above. A 'For' Statement -indicates a for-loop structure and takes three template arguments: -'ArgId', 'ExecPolicy', and 'EnclosedStatements'. The ArgID identifies the -position of the item it applies to in the iteration space tuple argument to the -``RAJA::kernel`` method. The ExecPolicy is the RAJA execution policy to -use on that loop/iteration space (similar to ``RAJA::forall``). -EnclosedStatements contain whatever is nested within the template parameter -list to form a StatementList, which will be executed for each iteration of -the loop. The ``RAJA::statement::Lambda`` invokes the lambda -corresponding to its position (LambdaID) in the sequence of lambda expressions -in the ``RAJA::kernel`` argument list. For example, a simple sequential -for-loop:: +``RAJA::statement::Lambda``, that we discussed in +:ref:`loop_elements-kernel-label`. 
+A ``RAJA::statement::For`` type +indicates a for-loop structure. The ``ArgID`` parameter is an integral constant +that identifies the position of the iteration space in the iteration space +tuple passed to the ``RAJA::kernel`` method to be used for the loop. The +``ExecPolicy`` is the RAJA execution policy to use on the loop, which is +similar to ``RAJA::forall`` usage. The ``EnclosedStatements`` type is a +nested template parameter that contains whatever is needed to execute the +kernel and which forms a valid StatementList. The +``RAJA::statement::Lambda`` +type invokes the lambda expression corresponding to its position 'LambdaID' +in the sequence of lambda expressions in the ``RAJA::kernel`` argument list. +For example, a simple sequential for-loop:: for (int i = 0; i < N; ++i) { // loop body @@ -770,7 +772,7 @@ can be represented using the RAJA kernel interface as:: >; RAJA::kernel( - RAJA::make_tuple(N_range), + RAJA::make_tuple(range), [=](int i) { // loop body } @@ -787,15 +789,16 @@ RAJA::kernel Statement Types The list below summarizes the current collection of statement types that can be used with ``RAJA::kernel`` and ``RAJA::kernel_param``. More detailed explanation along with examples of how they are used can be found in -:ref:`tutorial-label`. +the ``RAJA::kernel`` examples in :ref:`tutorial-label`. -.. note:: * ``RAJA::kernel_param`` functions similar to ``RAJA::kernel`` - except that the second argument is a *tuple of parameters* used - in a kernel for local arrays, thread local variables, tiling - information, etc. +.. note:: All of the statement types described below are in the namespace + ``RAJA::statement``. For brevity, we omit the namespaces in + the discussion in this section. -.. note:: * All of the statement types described below are in the namespace - ``RAJA::statement``. For breavity, we omit the namespaces. +.. note:: ``RAJA::kernel_param`` functions similarly to ``RAJA::kernel`` + except that the second argument is a *tuple of parameters* used + in a kernel for local arrays, thread local variables, tiling + information, etc. Several RAJA statements can be specialized with auxilliary types, which are described in :ref:`auxilliarypolicy_label`. @@ -814,11 +817,11 @@ There is one statement specific to OpenMP kernels. * ``OmpSyncThreads`` applies the OpenMP ``#pragma omp barrier`` directive. -Statement types that lauch CUDA or HIP GPU kernels are listed next. They work +Statement types that launch CUDA or HIP GPU kernels are listed next. They work similarly for each back-end and their names are distinguished by the prefix ``Cuda`` or ``Hip``. For example, ``CudaKernel`` or ``HipKernel``. -* ``Cuda/HipKernel< EnclosedStatements>`` launches ``EnclosedStatements' as a GPU kernel; e.g., a loop nest where the iteration spaces of each loop level are associated with threads and/or thread blocks as described by the execution policies applied to them. This kernel launch is synchronous. +* ``Cuda/HipKernel< EnclosedStatements>`` launches ``EnclosedStatements`` as a GPU kernel; e.g., a loop nest where the iteration spaces of each loop level are associated with threads and/or thread blocks as described by the execution policies applied to them. This kernel launch is synchronous. * ``Cuda/HipKernelAsync< EnclosedStatements>`` asynchronous version of Cuda/HipKernel. @@ -838,11 +841,11 @@ similarly for each back-end and their names are distinguished by the prefix * ``Cuda/HipKernelExpAsync`` asynchronous version of Cuda/HipKernelExp. 
-* ``Cuda/HipSyncThreads`` invokes CUDA or HIP '__syncthreads()' barrier. +* ``Cuda/HipSyncThreads`` invokes CUDA or HIP ``__syncthreads()`` barrier. -* ``Cuda/HipSyncWarp`` invokes CUDA '__syncwarp()' barrier. **Note: warp sync is not supported, so the HIP variant is a no-op. +* ``Cuda/HipSyncWarp`` invokes CUDA ``__syncwarp()`` barrier. Warp sync is not supported in HIP, so the HIP variant is a no-op. -Statement types that lauch SYCL kernels are listed next. +Statement types that launch SYCL kernels are listed next. * ``SyclKernel`` launches ``EnclosedStatements`` as a SYCL kernel. This kernel launch is synchronous. @@ -858,14 +861,14 @@ e.g., by allowing CPU cache blocking or use of GPU shared memory. * ``ForICount< ArgId, ParamId, ExecPolicy, EnclosedStatements >`` abstracts an inner for-loop within an outer tiling loop **where it is necessary to obtain the local iteration index in each tile**. The ``ArgId`` indicates which entry in the iteration space tuple to which the loop applies and the ``ParamId`` indicates the position of the tile index parameter in the parameter tuple. The ``ExecPolicy`` and ``EnclosedStatements`` are similar to what they represent in a ``statement::For`` type. It is often advantageous to use local arrays for data accessed in tiled loops. -RAJA provides a statement for allocating data in a :ref:`local_array-label` +RAJA provides a statement for allocating data in a :ref:`feat-local_array-label` object according to a memory policy. See :ref:`localarraypolicy-label` for more information about such policies. * ``InitLocalMem< MemPolicy, ParamList<...>, EnclosedStatements >`` allocates memory for a ``RAJA::LocalArray`` object used in kernel. The ``ParamList`` entries indicate which local array objects in a tuple will be initialized. The ``EnclosedStatements`` contain the code in which the local array will be accessed; e.g., initialization operations. RAJA provides some statement types that apply in specific kernel scenarios. -* ``Reduce< ReducePolicy, Operator, ParamId, EnclosedStatements >`` reduces a value across threads in a multi-threaded code region to a single thread. The ``ReducePolicy`` is similar to what it represents for RAJA reduction types. ``ParamId`` specifies the position of the reduction value in the parameter tuple passed to the ``RAJA::kernel_param`` method. ``Operator`` is the binary operator used in the reduction; typically, this will be one of the operators that can be used with RAJA scans (see :ref:`scanops-label`). After the reduction is complete, the ``EnclosedStatements`` execute on the thread that received the final reduced value. +* ``Reduce< ReducePolicy, Operator, ParamId, EnclosedStatements >`` reduces a value across threads in a multithreaded code region to a single thread. The ``ReducePolicy`` is similar to what it represents for RAJA reduction types. ``ParamId`` specifies the position of the reduction value in the parameter tuple passed to the ``RAJA::kernel_param`` method. ``Operator`` is the binary operator used in the reduction; typically, this will be one of the operators that can be used with RAJA scans (see :ref:`feat-scanops-label`). After the reduction is complete, the ``EnclosedStatements`` execute on the thread that received the final reduced value. * ``If< Conditional >`` chooses which portions of a policy to run based on run-time evaluation of conditional statement; e.g., true or false, equal to some value, etc. @@ -877,10 +880,10 @@ RAJA provides some statement types that apply in specific kernel scenarios. 
Auxilliary Types ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -The following list summarizes auxillary types used in the above statments. These +The following list summarizes auxilliary types used in the above statements. These types live in the ``RAJA`` namespace. - * ``tile_fixed`` tile policy argument to a ``Tile`` or ``TileTCount`` statement; partitions loop iterations into tiles of a fixed size specified by ``TileSize``. This statement type can be used as the ``TilePolicy`` template paramter in the ``Tile`` statements above. + * ``tile_fixed`` tile policy argument to a ``Tile`` or ``TileTCount`` statement; partitions loop iterations into tiles of a fixed size specified by ``TileSize``. This statement type can be used as the ``TilePolicy`` template parameter in the ``Tile`` statements above. * ``tile_dynamic`` TilePolicy argument to a Tile or TileTCount statement; partitions loop iterations into tiles of a size specified by a ``TileSize{}`` positional parameter argument. This statement type can be used as the ``TilePolicy`` template paramter in the ``Tile`` statements above. @@ -892,6 +895,5 @@ types live in the ``RAJA`` namespace. * ``ValuesT`` argument to a Lambda statement; used to specify compile time constants, of type T, that will be used as lambda arguments. - Examples that show how to use a variety of these statement types can be found -in :ref:`tutorialcomplex-label`. +in :ref:`loop_elements-kernel-label`. diff --git a/docs/sphinx/user_guide/feature/reduction.rst b/docs/sphinx/user_guide/feature/reduction.rst index 4e69d087d0..808669f03f 100644 --- a/docs/sphinx/user_guide/feature/reduction.rst +++ b/docs/sphinx/user_guide/feature/reduction.rst @@ -6,7 +6,7 @@ .. ## SPDX-License-Identifier: (BSD-3-Clause) .. ## -.. _reductions-label: +.. _feat-reductions-label: ==================== Reduction Operations @@ -15,12 +15,10 @@ Reduction Operations RAJA does not provide separate loop execution methods for loops containing reduction operations like some other C++ loop programming abstraction models. Instead, RAJA provides reduction types that allow users to perform reduction -operations in ``RAJA::forall`` and ``RAJA::kernel`` kernels in a portable, -thread-safe manner. Users may use as many reduction objects in a loop kernel -as they need. Available RAJA reduction types are described in this section. - -A detailed example of RAJA reduction usage can be found in -:ref:`reductions-label`. +operations in kernels launched using ``RAJA::forall``, ``RAJA::kernel``, +and ``RAJA::launch`` methods in a portable, thread-safe manner. Users may +use as many reduction objects in a loop kernel as they need. Available RAJA +reduction types are described in this section. .. note:: All RAJA reduction types are located in the namespace ``RAJA``. @@ -28,14 +26,19 @@ Also .. note:: * Each RAJA reduction type is templated on a **reduction policy** and a **reduction value type** for the reduction variable. The - **reduction policy type must be compatibe with the execution - policy used by the kernel.** For example, in a CUDA kernel, - a CUDA reduction policy must be used. + **reduction policy type must be compatible with the execution + policy used by the kernel in which it is used.** For example, in + a CUDA kernel, a CUDA reduction policy must be used. * Each RAJA reduction type accepts an **initial reduction value or values** at construction (see below). * Each RAJA reduction type has a 'get' method to access reduced values after kernel execution completes. 
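For example, a minimal sum reduction sketch, assuming sequential execution and
reduction policies and a bare pointer ``a`` to the data being summed, might
look like::

  RAJA::ReduceSum<RAJA::seq_reduce, double> vsum(0.0);   // initial value given at construction

  RAJA::forall<RAJA::seq_exec>(RAJA::TypedRangeSegment<int>(0, N),
    [=] (int i) {
      vsum += a[i];
    });

  double total = vsum.get();   // access the reduced value after the kernel completes
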
+Please see the following tutorial sections for detailed examples that use +RAJA reductions: + + * :ref:`tut-reduction-label`. + ---------------- Reduction Types @@ -156,3 +159,153 @@ Reduction Policies For more information about available RAJA reduction policies and guidance on which to use with RAJA execution policies, please see :ref:`reducepolicy-label`. + +-------------------------------- +Experimental Reduction Interface +-------------------------------- + +An experimental reduction interface is now available that offers several +usability and performance advantages over the current reduction model in RAJA. +The new interface allows ``RAJA::forall`` to take optional "plugin-like" +objects to extend the execution behavior of a ``RAJA::forall`` execution +context. + +The new interface passes ``RAJA::expt::Reduce`` objects as function +arguments to ``RAJA::forall`` and provides users with thread-local variables +of the reduction data type to be updated inside the lambda. This differs +from the current reduction model in which ``RAJA::ReduceOP`` +objects are captured by the user-supplied kernel body lambda expression. + + +RAJA::expt::Reduce +.................. +:: + + double* a = ...; + + double rs = 0.0; + double rm = 1e100; + + RAJA::forall<EXEC_POL> ( Res, Seg, + RAJA::expt::Reduce<RAJA::operators::plus>(&rs), + RAJA::expt::Reduce<RAJA::operators::minimum>(&rm), + [=] (int i, double& _rs, double& _rm) { + _rs += a[i]; + _rm = RAJA_MIN(a[i], _rm); + } + ); + + std::cout << rs ... + std::cout << rm ... + +* Each ``RAJA::expt::Reduce`` argument to ``RAJA::forall`` is templated on + a reduction operator, and takes a pointer to a target variable to write + the final reduction result to, ``&rs`` and ``&rm`` in the example code + above. The reduction operation will include the existing value of + the given target variable. +* The kernel body lambda expression passed to ``RAJA::forall`` must have a + parameter corresponding to each ``RAJA::expt::Reduce`` argument, ``_rs`` and + ``_rm`` in the example code. These parameters refer to a local target for each + reduction operation. It is important to note that the parameters follow the + kernel iteration variable, ``i`` in this case, and appear in the same order as the + corresponding ``RAJA::expt::Reduce`` arguments to ``RAJA::forall``. The + parameters' types must be references to the types used in the + ``RAJA::expt::Reduce`` arguments. +* The local variables referred to by ``_rs`` and ``_rm`` are initialized with the + *identity* of the reduction operation to be performed. +* The local variables are updated in the user-supplied lambda. +* The local variables are reduced to a single value, combining their values across all + threads participating in the ``RAJA::forall`` execution. +* Finally, the target variable is updated with the result of the ``RAJA::forall`` reduction + by performing the reduction operation to combine the existing value of the target + variable and the result of the ``RAJA::forall`` reduction. +* The final reduction value is accessed by referencing the target variable + passed to ``RAJA::expt::Reduce`` in the ``RAJA::forall`` method. + +.. note:: In the above example ``Res`` is a resource object that must be + compatible with the ``EXEC_POL``. ``Seg`` is the iteration space + object for ``RAJA::forall``. + +.. important:: The order and types of the local reduction variables in the + kernel body lambda expression must match exactly with the + corresponding ``RAJA::expt::Reduce`` arguments to the + ``RAJA::forall`` to ensure that the correct result is obtained.
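To make the fragment above concrete, here is a minimal, self-contained sketch that fills in the elided pieces with a sequential execution policy, a ``RAJA::resources::Host`` resource, and a simple array; these particular choices are illustrative assumptions and not part of the documented example::

   const int N = 100;
   double a[N];
   for (int i = 0; i < N; ++i) { a[i] = 0.5 * i; }   // example data

   double rs = 0.0;     // target variable for the sum reduction
   double rm = 1e100;   // target variable for the min reduction

   RAJA::resources::Host res;                // assumed resource, compatible with seq_exec
   RAJA::TypedRangeSegment<int> seg(0, N);   // assumed iteration space

   RAJA::forall<RAJA::seq_exec>( res, seg,
     RAJA::expt::Reduce<RAJA::operators::plus>(&rs),
     RAJA::expt::Reduce<RAJA::operators::minimum>(&rm),
     [=] (int i, double& _rs, double& _rm) {
       _rs += a[i];
       _rm = RAJA_MIN(a[i], _rm);
     }
   );

   // rs now holds the sum of a[], and rm holds its minimum value.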
+ +RAJA::expt::ValLoc +.................. + +As with the current RAJA reduction interface, the new interface supports *loc* +reductions, which provide the ability to get a kernel/loop index at which the +final reduction value was found. With this new interface, *loc* reductions +are performed using ``ValLoc`` types. Since they are strongly typed, they +provide ``min()`` and ``max()`` operations that are equivalent to using +``RAJA_MIN()`` or ``RAJA_MAX`` macros as demonstrated in the code example below. +Users must use the ``getVal()`` and ``getLoc()`` methods to access the reduction +results:: + + double* a = ...; + + using VL_DOUBLE = RAJA::expt::ValLoc; + VL_DOUBLE rm_loc; + + RAJA::forall ( Res, Seg, + RAJA::expt::Reduce(&rm_loc), + [=] (int i, VL_DOUBLE& _rm_loc) { + _rm_loc = RAJA_MIN(VL_DOUBLE(a[i], i), _rm_loc); + //_rm_loc.min(VL_DOUBLE(a[i], i)); // Alternative to RAJA_MIN + } + ); + + std::cout << rm_loc.getVal() ... + std::cout << rm_loc.getLoc() ... + +Lambda Arguments +................ + +This interface takes advantage of C++ parameter packs to allow users to pass +any number of ``RAJA::expt::Reduce`` objects to the ``RAJA::forall`` method:: + + double* a = ...; + + using VL_DOUBLE = RAJA::expt::ValLoc; + VL_DOUBLE rm_loc; + double rs; + double rm; + + RAJA::forall ( Res, Seg, + RAJA::expt::Reduce(&rs), // --> 1 double added + RAJA::expt::Reduce(&rm), // --> 1 double added + RAJA::expt::Reduce(&rm_loc), // --> 1 VL_DOUBLE added + RAJA::expt::KernelName("MyFirstRAJAKernel"), // --> NO args added + [=] (int i, double& _rs, double& _rm, VL_DOUBLE& _rm_loc) { + _rs += a[i]; + _rm = RAJA_MIN(a[i], _rm); + _rm_loc.min(VL_DOUBLE(a[i], i)); + } + ); + + std::cout << rs ... + std::cout << rm ... + std::cout << rm_loc.getVal() ... + std::cout << rm_loc.getLoc() ... + +Again, the lambda expression parameters are in the same order as +the ``RAJA::expt::Reduce`` arguments to ``RAJA::forall``. Both the types and +order of the parameters must match to get correct results and to compile +successfully. Otherwise, a static assertion will be triggered:: + + LAMBDA Not invocable w/ EXPECTED_ARGS. + +.. note:: This static assert is only enabled when passing an undecorated C++ + lambda. Meaning, this check will not happen when passing + extended-lambdas (i.e. DEVICE tagged lambdas) or other functor like + objects. + +.. note:: The experimental ``RAJA::forall`` interface is more flexible than the + current implementation, other optional arguments besides + ``RAJA::expt::Reduce`` can be passed to a ``RAJA::forall`` to extend + its behavior. In the above example we demonstrate using + ``RAJA::expt::KernelName``, which wraps a ``RAJA::forall`` executing + under a ``HIP`` or ``CUDA`` policy in a named region. Use of + ``RAJA::expt::KernelName`` does not require an additional + parameter in the lambda expression. diff --git a/docs/sphinx/user_guide/feature/resource.rst b/docs/sphinx/user_guide/feature/resource.rst index 1a28720553..284fdd560f 100644 --- a/docs/sphinx/user_guide/feature/resource.rst +++ b/docs/sphinx/user_guide/feature/resource.rst @@ -6,175 +6,232 @@ .. ## SPDX-License-Identifier: (BSD-3-Clause) .. ## -.. _resource-label: +.. _feat-resource-label: ========= Resources ========= -This section describes the basic concepts of Resource types and their -functionality in ``RAJA::forall``. Resources are used as an interface to -various backend constructs and their respective hardware. Currently there -exists Resource types for ``Cuda``, ``Hip``, ``Omp`` (target) and ``Host``. 
-Resource objects allow the user to execute ``RAJA::forall`` calls -asynchronously on a respective thread/stream. The underlying concept of each -individual Resource is still under development and it should be considered -that functionality / behaviour may change. +This section describes the basic concepts of resource types and how to use +them with RAJA-based kernels using ``RAJA::forall``, ``RAJA::kernel``, `` +RAJA::launch``, etc. Resources are used as an interface to various RAJA +back-end constructs and their respective hardware. Currently there +exist resource types for ``Cuda``, ``Hip``, ``Omp`` (target) and ``Host``. +Resource objects allow one to allocate and deallocate storage in memory spaces +associated with RAJA back-ends and copy data between memory spaces. They also +allow one to execute RAJA kernels asynchronously on a respective thread/stream. +Resource support in RAJA is rudimentary at this point and its functionality / +behavior may change as it is developed. .. note:: * Currently feature complete asynchronous behavior and streamed/threaded support is available only for ``Cuda`` and - ``Hip`` resources. - * The ``RAJA::resources`` namespace aliases the ``camp::resources`` + ``Hip`` resources. + * RAJA resource support is based on camp resource support. The + ``RAJA::resources`` namespace aliases the ``camp::resources`` namespace. Each resource has a set of underlying functionality that is synonymous across -all resource types. +each resource type. ===================== =============================================== Methods Brief description ===================== =============================================== get_platform Returns the underlying camp platform - the resource is associated with. - get_event Return an Event object for the resource from + associated with the resource. + get_event Return an event object for the resource from the last resource call. - allocate Allocate data per the resource's given - backend. - deallocate Deallocate data per the resource's given - backend. - memcpy Perform a memory copy from a src location - to a destination location from the - resource's backend. - memset Set memory value per the resourse's - given backend. - wait_for Enqueue a wait on the resource's stream/thread - for a user passed event to occur. + allocate Allocate data on a resource back-end. + deallocate Deallocate data on a resource back-end. + memcpy Perform a memory copy from a source location + to a destination location on a resource back-end. + memset Set memory value in an allocation on a resource + back-end. + wait Wait for all operations enqueued on a resource to + complete before proceeding. + wait_for Enqueue a wait on a resource stream/thread + for a user passed event to complete. ===================== =============================================== -.. note:: ``deallocate``, ``memcpy`` and ``memset`` will only work with +.. note:: ``deallocate``, ``memcpy`` and ``memset`` operations only work with pointers that correspond to memory locations that have been allocated on the resource's respective device. -Each resource type also defines specific backend information/functionality. +Each resource type also defines specific back-end information/functionality. For example, each CUDA resource contains a ``cudaStream_t`` value with an -associated get method. See the individual functionality for each resource -in ``raja/tpl/camp/include/resource/``. +associated get method. The basic interface for each resource type is +summarized in `Camp resource `_. .. 
note:: Stream IDs are assigned to resources in a round robin fashion. The - number of independent streams for a given backend is limited to the + number of independent streams for a given back-end is limited to the maximum number of concurrent streams that the back-end supports. ------------ Type-Erasure ------------ -Resources can be declared in two formats: An erased resource, and a concrete -resource. The underlying runtime functionality is the same for both formats. -An erased resource allows a user the ability to change the resource backend -at runtime. +Resources can be declared in two ways, as a type-erased resource or as a +concrete resource. The underlying run time functionality is the same for both. -Concrete CUDA resource:: +Here is one way to construct a concrete CUDA resource type:: - RAJA::resources::Cuda my_cuda_res; + RAJA::resources::Cuda my_cuda_res; -Erased resource:: +A type-erased resource allows a user the ability to change the resource +back-end at run time. For example, to choose a CUDA GPU device resource or +host resource at run time, one could do the following:: - if (use_gpu) - RAJA::resources::Resource my_res{RAJA::resources::Cuda()}; - else - RAJA::resources::Resource my_res{RAJA::resources::Host()}; + RAJA::resources::Resource* my_res = nullptr; + if (use_gpu) + my_res = new RAJA::resources::Resource{RAJA::resources::Cuda()}; + else + my_res = new RAJA::resources::Resource{RAJA::resources::Host()}; -Memory allocation on resources:: +When ``use_gpu`` is true, ``my_res`` will be a CUDA GPU device resource. +Otherwise, it will be a host CPU resource. - int* a1 = my_cuda_res.allocate(ARRAY_SIZE); - int* a2 = my_res.allocate(ARRAY_SIZE); +------------------- +Memory Operations +------------------- -If ``use_gpu`` is ``true``, then the underlying type of ``my_res`` is a CUDA -resource. Therefore ``a1`` and ``a2`` will both be allocated on the GPU. If -``use_gpu`` is ``false``, then only ``a1`` is allocated on the GPU, and -``a2`` is allocated on the host. +The example discussed in this section illustrates most of the memory +operations that can be performed with resource objects. +A common use case for a resource is to manage arrays in the appropriate +memory space to use in a kernel. Consider the following code example:: + // create a resource for a host CPU and a CUDA GPU device + RAJA::resources::Resource host_res{RAJA::resources::Host()}; + RAJA::resources::Resource cuda_res{RAJA::resources::Cuda()}; ------- -Forall ------- + // allocate arrays in host memory and device memory + int N = 100; + + int* host_array = host_res.allocate<int>(N); + int* gpu_array = cuda_res.allocate<int>(N); + + // initialize values in host_array.... + + // initialize gpu_array values to zero + cuda_res.memset(gpu_array, 0, sizeof(int) * N); + + // copy host_array values to gpu_array + cuda_res.memcpy(gpu_array, host_array, sizeof(int) * N); + + // execute a CUDA kernel that uses gpu_array data + RAJA::forall<RAJA::cuda_exec<256>>(RAJA::TypedRangeSegment<int>(0, N), + [=] RAJA_DEVICE(int i) { + // modify values of gpu_array... + } + ); + + // copy gpu_array values to host_array + cuda_res.memcpy(host_array, gpu_array, sizeof(int) * N); + + // do something with host_array on CPU... + + // de-allocate array storage + host_res.deallocate(host_array); + cuda_res.deallocate(gpu_array); + +Here, we create a CUDA GPU device resource and a host CPU resource and use +them to allocate an array in GPU memory and one in host memory, respectively.
+Then, after initializing the host array, we use the CUDA resource to copy the +host array to the GPU array storage. Next, we run a CUDA device kernel +which modifies the GPU array. After using the CUDA resource to copy the GPU +array values into the host array, we can do something with the values +generated in the GPU kernel on the CPU host. Lastly, we de-allocate the +arrays. + +-------------------------------- +Kernel Execution and Resources +-------------------------------- + +Resources can be used with the following RAJA kernel execution interfaces: -A resource is an optional argument to a ``RAJA::forall`` call. When used, -it is passed as the first argument to the method:: + * ``RAJA::forall`` + * ``RAJA::kernel`` + * ``RAJA::launch`` + * ``RAJA::sort`` + * ``RAJA::scan`` - RAJA::forall(my_gpu_res, .... ) +Although we show examples using mainly ``RAJA::forall`` in the following +discussion, resource usage with the other methods listed is similar and +provides similar behavior. -When specifying a CUDA or HIP resource, the ``RAJA::forall`` is executed -aynchronously on a stream. Currently, CUDA and HIP are the only Resources -that enable asynchronous threading with a ``RAJA::forall``. All other calls -default to using the ``Host`` resource until further support is added. +Usage +^^^^^ + +Specifically, a resource can be passed optionally as the first argument in +a call to one of these methods. For example:: -The Resource type that is passed to a ``RAJA::forall`` call must be a concrete -type. This is to allow for a compile-time assertion that the resource is not -compatible with the given execution policy. For example:: + RAJA::forall(my_res, .... ); + +.. note:: When a resource is not passed when calling one of the methods listed + above, the *default* resource type associated with the execution + policy is used in the internal implementation. + +When passing a CUDA or HIP resource, the method will execute asynchronously +on a GPU stream. Currently, CUDA and HIP are the only resource types that +enable asynchronous threading. + +.. note:: Support for OpenMP CPU multithreading, which would use the + ``RAJA::resources::Host`` resource type, and OpenMP target offload + which would use the ``RAJA::resources::Omp`` resource type, + is incomplete and under development. + +The resource type passed to one of the methods listed above must be a +concrete type; i.e., not type erased. The reason is that this allows +consistency checking via a compile-time assertion to ensure that the passed +resource is compatible with the given execution policy. For example:: - using ExecPol = RAJA::cuda_exec_async; - RAJA::resources::Cuda my_cuda_res; - RAJA::resources::Resource my_res{RAJA::resources::Cuda()}; - RAJA::resources::Host my_host_res; - - RAJA::forall(my_cuda_res, .... ) // Compiles. - RAJA::forall(my_res, .... ) // Compilation Error. Not Concrete. - RAJA::forall(my_host_res, .... ) // Compilation Error. Mismatched Resource and Exec Policy. - -Below is a list of the currently available concrete resource types and their -execution policy suport. 
- - ======== ============================== - Resource Policies supported - ======== ============================== - Cuda | cuda_exec - | cuda_exec_async - | cuda_exec_explicit - Hip | hip_exec - | hip_exec_async - Omp* | omp_target_parallel_for_exec - | omp_target_parallel_for_exec_n - Host | loop_exec - | seq_exec - | openmp_parallel_exec - | omp_for_schedule_exec - | omp_for_nowait_schedule_exec - | simd_exec - | tbb_for_dynamic - | tbb_for_static - ======== ============================== - -.. note:: The ``RAJA::resources::Omp`` resource is still under development. - -IndexSet policies require two execution policies (see :ref:`indexsets-label`). -Currently, users may only pass a single resource to a forall method taking -an IndexSet argument. This resource is used for the inner execution of -each Segment in the IndexSet:: - - using ExecPol = RAJA::ExecPolicy>; - RAJA::forall(my_cuda_res, iset, .... ); - - -When a resource is not provided by the user, a *default* resource is assigned, -which can be accessed in a number of ways. It can be accessed directly from + using ExecPol = RAJA::cuda_exec_async<256>; + + RAJA::resources::Cuda my_cuda_res; + RAJA::forall<ExecPol>(my_cuda_res, .... ); // Successfully compiles + + RAJA::resources::Resource my_res{RAJA::resources::Cuda()}; + RAJA::forall<ExecPol>(my_res, .... ) // Compilation error since resource type is not concrete + + RAJA::resources::Host my_host_res; + RAJA::forall<ExecPol>(my_host_res, .... ) // Compilation error since resource type is incompatible with the execution policy + +IndexSet Usage +^^^^^^^^^^^^^^^ + +Recall that a kernel that uses a RAJA IndexSet to describe the kernel iteration +space requires two execution policies (see :ref:`indexsetpolicy-label`). +Currently, a user may only pass a single resource to a method taking +an IndexSet argument. The resource is used for the *inner* execution over +each segment in the IndexSet, not for the *outer* iteration over segments. +For example:: + + using ExecPol = RAJA::ExecPolicy<RAJA::seq_segit, RAJA::cuda_exec<256>>; + RAJA::forall<ExecPol>(my_cuda_res, iset, .... ); + +Default Resources +^^^^^^^^^^^^^^^^^^^^^^ + +When a resource is not provided by the user, a *default* resource that +corresponds to the execution policy is used. The default resource +can be accessed in multiple ways. It can be accessed directly from the concrete resource type:: - RAJA::resources::Cuda my_default_cuda = RAJA::resources::Cuda::get_default(); + RAJA::resources::Cuda my_default_cuda = RAJA::resources::Cuda::get_default(); -The resource type can also be deduced from an execution policy:: +The resource type can also be deduced in two different ways from an execution +policy:: - using Res = RAJA::resources::get_resource::type; - Res r = Res::get_default(); + using Res = RAJA::resources::get_resource<ExecPol>::type; + Res r = Res::get_default(); -Finally, the resource type can be deduced from an execution policy:: +Or:: - auto my_resource = RAJA::resources::get_default_resource(); + auto my_resource = RAJA::resources::get_default_resource<ExecPol>(); -.. note:: For CUDA and HIP, the default resource is *NOT* the CUDA or HIP - default stream. It is its own stream defined in - ``camp/include/resource/``. This is an attempt to break away - from some of the issues that arise from the synchronization behaviour +.. note:: For CUDA and HIP, the default resource is *NOT* associated with the + default CUDA or HIP stream. It is its own stream defined by the + underlying camp resource.
This is intentional to break away + from some issues that arise from the synchronization behavior of the CUDA and HIP default streams. It is still possible to use the CUDA and HIP default streams as the default resource. This can be enabled by defining the environment variable @@ -185,34 +242,40 @@ Finally, the resource type can be deduced from an execution policy:: Events ------ -Event objects allow users to wait or query the status of a resource's action. An -event can be returned from a resource:: +Event objects allow users to wait or query the status of a resource's action. +An event can be returned from a resource:: - RAJA::resources::Event e = my_res.get_event(); + RAJA::resources::Event e = my_res.get_event(); Getting an event like this enqueues an event object for the given back-end. Users can call the *blocking* ``wait`` function on the event:: - e.wait(); + e.wait(); -Preferably, users can enqueue the event on a specific resource, forcing only -that resource to wait for the event:: +This wait call will block all execution until all operations enqueued on a +resource complete. - my_res.wait_for(&e); +Alternatively, a user can enqueue the event on a specific resource, forcing +only the resource to wait for the operation associated with the event to +complete:: -The usage allows one to set up dependencies between resource objects and -``RAJA::forall`` calls. + my_res.wait_for(&e); + +All methods listed above near the beginning of the RAJA resource discussion +return an event object so users can access the event associated with the +method call. This allows one to set up dependencies between resource objects +and operations, as well as define and control asynchronous execution patterns. .. note:: An Event object is only created if a user explicitly sets the event returned by the ``RAJA::forall`` call to a variable. This avoids unnecessary event objects being created when not needed. For example:: - forall>(my_cuda_res, ... + RAJA::forall>(my_cuda_res, ...); will *not* generate a cudaStreamEvent, whereas:: - RAJA::resources::Event e = forall>(my_cuda_res, ... + RAJA::resources::Event e = RAJA::forall>(my_cuda_res, ...); will generate a cudaStreamEvent. @@ -220,66 +283,69 @@ The usage allows one to set up dependencies between resource objects and Example ------- -This example executes three kernels across two cuda streams on the GPU with -a requirement that the first and second kernel finish execution before -launching the third. It also demonstrates copying memory from the device -to host on a resource: +The example presented here executes three kernels across two CUDA streams on +a GPU with a requirement that the first and second kernel finish execution +before the third is launched. It also shows copying memory from the device +to host on a resource that we described earlier. -First, define two concrete CUDA resources and one host resource: +First, we define two concrete CUDA resources and one concrete host resource, +and define an asynchronous CUDA execution policy type: .. literalinclude:: ../../../../examples/resource-forall.cpp :start-after: _raja_res_defres_start :end-before: _raja_res_defres_end :language: C++ -Next, allocate data for two device arrays and one host array: +Next, we allocate data for two GPU arrays and one host array, all of length 'N': .. 
literalinclude:: ../../../../examples/resource-forall.cpp :start-after: _raja_res_alloc_start :end-before: _raja_res_alloc_end :language: C++ -Then, Execute a kernel on CUDA stream 1 ``res_gpu1``: +Then, we launch a GPU kernel on the CUDA stream associated with the resource +``res_gpu1``, without keeping a handle to the associated event: .. literalinclude:: ../../../../examples/resource-forall.cpp :start-after: _raja_res_k1_start :end-before: _raja_res_k1_end :language: C++ - -and execute another kernel on CUDA stream 2 ``res_gpu2`` storing a handle to -an ``Event`` object to a local variable: + +Next, we execute another GPU kernel on the CUDA stream associated with the +resource ``res_gpu2`` and keep a handle to the corresponding event object +by assigning it to a local variable ``e``: .. literalinclude:: ../../../../examples/resource-forall.cpp :start-after: _raja_res_k2_start :end-before: _raja_res_k2_end :language: C++ - -The next kernel on ``res_gpu1`` requires that the last kernel on ``res_gpu2`` -finish first. Therefore, we enqueue a wait on ``res_gpu1`` that enforces -this: + +We require that the next kernel we launch to wait for the kernel launched on +the stream associated with the resource ``res_gpu2`` to complete. Therefore, +we enqueue a wait on that event on the ``res_gpu1`` resource: .. literalinclude:: ../../../../examples/resource-forall.cpp :start-after: _raja_res_wait_start :end-before: _raja_res_wait_end :language: C++ - -Execute the second kernel on ``res_gpu1`` now that the two previous kernels -have finished: + +Now that the second GPU kernel is complete, we launch a second kernel on the +stream associated with the resource ``res_gpu1``: .. literalinclude:: ../../../../examples/resource-forall.cpp :start-after: _raja_res_k3_start :end-before: _raja_res_k3_end :language: C++ - -We can enqueue a memcpy operation on ``res_gpu1`` to move data from the device -to the host: + +Next, we enqueue a memcpy operation on the resource ``res_gpu1`` to copy +the GPU array ``d_array`` to the host array ``h_array``: .. literalinclude:: ../../../../examples/resource-forall.cpp :start-after: _raja_res_memcpy_start :end-before: _raja_res_memcpy_end :language: C++ -Lastly, we use the copied data on the host side: +Lastly, we use the copied data in a kernel executed on the host: .. literalinclude:: ../../../../examples/resource-forall.cpp :start-after: _raja_res_k4_start diff --git a/docs/sphinx/user_guide/feature/scan.rst b/docs/sphinx/user_guide/feature/scan.rst index 76c6eb6688..c74b444189 100644 --- a/docs/sphinx/user_guide/feature/scan.rst +++ b/docs/sphinx/user_guide/feature/scan.rst @@ -6,10 +6,10 @@ .. ## SPDX-License-Identifier: (BSD-3-Clause) .. ## -.. _scan-label: +.. _feat-scan-label: ================ -Scans +Scan Operations ================ RAJA provides portable parallel scan operations, which are basic @@ -20,7 +20,8 @@ A few important notes: .. note:: * All RAJA scan operations are in the namespace ``RAJA``. * Each RAJA scan operation is a template on an *execution policy* parameter. The same policy types used for ``RAJA::forall`` methods - may be used for RAJA scans. + may be used for RAJA scans. Please see :ref:`feat-policies-label` + for more information. * RAJA scan operations accept an optional *operator* argument so users can perform different types of scan operations. If no operator is given, the default is a 'plus' operation and @@ -28,20 +29,18 @@ A few important notes: Also: -.. 
note:: For scans using the CUDA back-end, RAJA uses the NVIDIA CUB library - internally. The CMake variable ``CUB_DIR`` will be automatically - set to the location of the CUB library when CUDA is enabled. Details - for using a different version of the CUB library are available in - the :ref:`getting_started-label` section. +.. note:: For scans using the CUDA or HIP back-end, RAJA implementation uses + the NVIDIA CUB library or AMD rocPRIM library, respectively. + Typically, the CMake variable ``CUB_DIR`` or ``ROCPRIM_DIR`` will + be automatically set to the location of the CUB or rocPRIM library + for the CUDA or rocPRIM installation specified when either back-end + is enabled. More details for configuring the CUB or rocPRIM library + for a RAJA build can be found in :ref:`getting_started_depend-label`. -.. note:: For scans using the HIP back-end, RAJA uses the AMD rocPRIM library - internally. The CMake variable ``ROCPRIM_DIR`` will be automatically - set to the location of the rocPRIM library when HIP is enabled. - Details for using a different version of the rocPRIM library are - available in the :ref:`getting_started-label` section. +Please see the following tutorial sections for detailed examples that use +RAJA scan operations: -Please see the :ref:`scan-label` tutorial section for usage examples of RAJA -scan operations. + * :ref:`tut-scan-label`. ----------------- Scan Operations @@ -97,11 +96,12 @@ scan operation above will be a *prefix-sum* since there is no operator argument given; i.e., the output array will contain partial sums of the input array. The second scan will apply the operator that is passed. Note that container arguments can be generated from iterators using ``RAJA::make_span(begin, len)``. +This is shown in the examples in :ref:`tut-scan-label`. RAJA also provides *in-place* scans: * ``RAJA::inclusive_scan_inplace< exec_policy >(in_container)`` - * ``RAJA::inclusive_scan_inplace< exec_policy >(in_container, )`` + * ``RAJA::inclusive_scan_inplace< exec_policy >(in_container, operator)`` An in-place scan generates the same output sequence as a non-inplace scan. However, an in-place scan does not take separate input and output arrays and @@ -121,7 +121,7 @@ and * ``RAJA::exclusive_scan_inplace< exec_policy >(in_container)`` * ``RAJA::exclusive_scan_inplace< exec_policy >(in_container, )`` -.. _scanops-label: +.. _feat-scanops-label: -------------------- RAJA Scan Operators @@ -139,11 +139,3 @@ types of scans, such as: .. note:: * All RAJA scan operators are in the namespace ``RAJA::operators``. -------------------- -Scan Policies -------------------- - -For information about RAJA execution policies to use with scan operations, -please see :ref:`policies-label`. - - diff --git a/docs/sphinx/user_guide/feature/sort.rst b/docs/sphinx/user_guide/feature/sort.rst index 115959a4dd..8d4db5032f 100644 --- a/docs/sphinx/user_guide/feature/sort.rst +++ b/docs/sphinx/user_guide/feature/sort.rst @@ -6,60 +6,61 @@ .. ## SPDX-License-Identifier: (BSD-3-Clause) .. ## -.. _sort-label: +.. _feat-sort-label: ================ -Sorts +Sort Operations ================ -RAJA provides portable parallel sort operations, which are basic -parallel algorithm building blocks. They are described in this section. +RAJA provides portable parallel sort operations, which are described in this +section. A few important notes: .. note:: * All RAJA sort operations are in the namespace ``RAJA``. * Each RAJA sort operation is a template on an *execution policy* parameter. 
The same policy types used for ``RAJA::forall`` methods - may be used for RAJA sorts. + may be used for RAJA sorts. Please see :ref:`feat-policies-label` + for more information. * RAJA sort operations accept an optional *comparator* argument so users can perform different types of sort operations. If no operator is given, the default is a *less than* operation and - the result is **non-decreasing**. + the result is a sequence sorted in **non-decreasing** order. Also: -.. note:: * For sorts using the CUDA back-end, RAJA uses the implementations - provided by the NVIDIA CUB library. For information please see - :ref:`build-external-tpl `. - * For sorts using the HIP back-end, RAJA uses the implementations - provided by the AMD rocPRIM library. For information please see - :ref:`build-external-tpl `. - * The RAJA CUDA and HIP back-ends only support sorting - arithmetic types using RAJA operators 'less than' and - 'greater than'. +.. note:: For sorts using the CUDA or HIP back-end, the RAJA implementation uses + the NVIDIA CUB library or AMD rocPRIM library, respectively. + Typically, the CMake variable ``CUB_DIR`` or ``ROCPRIM_DIR`` will + be automatically set to the location of the CUB or rocPRIM library + for the CUDA or rocPRIM installation specified when either back-end + is enabled. More details for configuring the CUB or rocPRIM library + for a RAJA build can be found in :ref:`getting_started_depend-label`. + +Please see the following tutorial sections for detailed examples that use +RAJA sort operations: -Please see the :ref:`sort-label` tutorial section for usage examples of RAJA -sort operations. + * :ref:`tut-sort-label` ----------------- Sort Operations ----------------- -In general, a sort operation takes a sequence of numbers ``x`` and a binary -comparison operator ``op`` that forms a strict weak ordering of elements in -input sequence ``x`` and produces a sequence of numbers ``y`` as output. The +In general, a sort operation takes a sequence of numbers 'x' and a binary +comparison operator 'op' to form a strict weak ordering of elements in +input sequence 'x' and produce a sequence of numbers 'y' as output. The output sequence is a permutation of the input sequence where each pair of -elements ``a`` and ``b``, where ``a`` is before ``b`` in the output sequence, -satisfies ``!(b op a)``. Sorts are stable if they always preserve the order of -equivalent elements, where equivalent elements satisfy ``!(a op b) && !(b op a)``. +elements 'a' and 'b', where 'a' is before 'b' in the output sequence, +satisfies '!(b op a)'. Sorts are stable if they always preserve the order of +equivalent elements, where equivalent means '!(a op b) && !(b op a)' is true. -A **stable sort** takes an input sequence ``x`` where a\ :sub:`i` appears -before a\ :sub:`j` if i < j when a\ :sub:`i` and a\ :sub:`j` are equivalent for -any i != j. +A **stable sort** takes an input sequence 'x' where a\ :sub:`i` appears +before a\ :sub:`j` if i < j when a\ :sub:`i` and a\ :sub:`j` are equivalent +for any i != j. x = { a\ :sub:`0`\, b\ :sub:`0`\, a\ :sub:`1`\, ... } -and calculates the stably sorted output sequence ``y`` that preserves the +and calculates the stably sorted output sequence 'y' that preserves the
That is, the sorted sequence where element a\ :sub:`i` appears before the equivalent element a\ :sub:`j` if i < j: @@ -83,13 +84,13 @@ RAJA unstable sort operations look like the following: * ``RAJA::sort< exec_policy >(container)`` * ``RAJA::sort< exec_policy >(container, comparator)`` -For example, sorting an array with this sequence of values:: +For example, sorting an integer array with this sequence of values:: 6 7 2 1 0 9 4 8 5 3 4 9 6 3 7 0 1 8 2 5 with a sequential unstable sort operation: -.. literalinclude:: ../../../../examples/tut_sort.cpp +.. literalinclude:: ../../../../exercises/sort_solution.cpp :start-after: _sort_seq_start :end-before: _sort_seq_end :language: C++ @@ -98,14 +99,24 @@ produces the ``out`` array with this sequence of values:: 0 0 1 1 2 2 3 3 4 4 5 5 6 6 7 7 8 8 9 9 -Note that the syntax is essentially the same as :ref:`scan-label`. +Note that the syntax is essentially the same as :ref:`feat-scan-label`. Here, ``container`` is a random access range of elements. ``container`` provides access to the input sequence and contains the output sequence at the end of -sort. The first sort operation listed above will be a *non-decreasing* sort +sort. The sort operation listed above will be a *non-decreasing* sort since there is no comparator argument given; i.e., the sequences will be -reordered *in-place* using operator::less. The second sort will apply the -comparator that is passed into the function. Note that the container argument -can be generated from iterators using ``RAJA::make_span(begin, len)``. +reordered *in-place* using the default RAJA less-than comparator. + +Equivalently, the ``RAJA::operators::less`` comparator operator could be +passed as the second argument to the sort routine to produce the same result: + +.. literalinclude:: ../../../../exercises/sort_solution.cpp + :start-after: _sort_seq_less_start + :end-before: _sort_seq_less_end + :language: C++ + +Note that container arguments can be generated from iterators using +``RAJA::make_span(out, N)``, where we pass the base pointer for the array +and its length. RAJA also provides sort operations that operate on key-value pairs stored separately: @@ -117,7 +128,8 @@ separately: ``keys_container`` as ``RAJA::sort`` does in ``container`` and reorders the sequence of values in ``vals_container`` by permuting the sequence of values in the same manner as the sequence of keys; i.e. the sequence of pairs is sorted -based on comparing their keys. +based on comparing their keys. Detailed examples are provided in +:ref:`tut-sort-label`. .. note:: The comparator used in ``RAJA::sort_pairs`` only compares keys. @@ -125,7 +137,7 @@ based on comparing their keys. RAJA Stable Sorts --------------------- -RAJA stable sorts are essentially the same as unstable sorts: +RAJA stable sort operations are used essentially the same as unstable sorts: * ``RAJA::stable_sort< exec_policy >(container)`` * ``RAJA::stable_sort< exec_policy >(container, comparator)`` @@ -136,11 +148,11 @@ separately: * ``RAJA::stable_sort_pairs< exec_policy >(keys_container, vals_container)`` * ``RAJA::stable_sort_pairs< exec_policy >(keys_container, vals_container, comparator)`` -.. _sortops-label: +.. _feat-sortops-label: --------------------- +-------------------------- RAJA Comparison Operators --------------------- +-------------------------- RAJA provides two operators that can be used to produce different ordered sorts: @@ -149,11 +161,3 @@ RAJA provides two operators that can be used to produce different ordered sorts: .. 
note:: All RAJA comparison operators are in the namespace ``RAJA::operators``. -------------------- -Sort Policies -------------------- - -For information about RAJA execution policies to use with sort operations, -please see :ref:`policies-label`. - - diff --git a/docs/sphinx/user_guide/feature/tiling.rst b/docs/sphinx/user_guide/feature/tiling.rst index 8b2e18d501..e803590e08 100644 --- a/docs/sphinx/user_guide/feature/tiling.rst +++ b/docs/sphinx/user_guide/feature/tiling.rst @@ -6,26 +6,27 @@ .. ## SPDX-License-Identifier: (BSD-3-Clause) .. ## -.. _tiling-label: +.. _feat-tiling-label: =========== Loop Tiling =========== In this section, we discuss RAJA statements that can be used to tile nested -for-loops. Typical loop tiling involves partitioning an iteration space into +loops. Typical loop tiling involves partitioning an iteration space into a collection of "tiles" and then iterating over tiles in outer loops and -entries within each tile in inner loops. Many scientific computing algorithms +indices within each tile in inner loops. Many scientific computing algorithms can benefit from loop tiling due to more efficient cache usage on a CPU or use of GPU shared memory. -For example, an operation performed using a for-loop with a range of [0, 10):: +For example, consider an operation performed using a C-style for-loop with +a range of [0, 10):: for (int i=0; i<10; ++i) { // loop body using index 'i' } -May be expressed as a loop nest that iterates over five tiles of size two:: +This may be written as a loop nest that iterates over five tiles of size two:: int numTiles = 5; int tileDim = 2; @@ -36,11 +37,10 @@ May be expressed as a loop nest that iterates over five tiles of size two:: } } -Next, we show how this tiled loop can be represented using RAJA. Then, we -present variations on it that illustrate the usage of different RAJA kernel -statement types. +Next, we show how loop tiling can be written using RAJA with variations that +use different ``RAJA::kernel`` execution policy statement types. -.. code-block:: cpp +Here is a way to write the tiled loop kernel above using ``RAJA::kernel``:: using KERNEL_EXEC_POL = RAJA::KernelPolicy< @@ -51,28 +51,30 @@ statement types. > >; - RAJA::kernel(RAJA::make_tuple(RAJA::RangeSegment(0,10)), + RAJA::kernel<KERNEL_EXEC_POL>( + RAJA::make_tuple(RAJA::TypedRangeSegment<int>(0,10)), [=] (int i) { - // loop body using index 'i' - }); - -In RAJA, the simplest way to tile an iteration space is to use RAJA -``statement::Tile`` and ``statement::For`` statement types. A -``statement::Tile`` type is similar to a ``statement::For`` type, but takes -a tile size as the second template argument. The ``statement::Tile`` -construct generates the outer loop over tiles and the ``statement::For`` -statement iterates over each tile. Nested together, as in the example, these -statements will pass the global index 'i' to the loop body in the lambda -expression as in the non-tiled version above. - -.. note:: When using ``statement::Tile`` and ``statement::For`` types together - to define a tiled loop structure, the integer passed as the first - template argument to each statement type must be the same. This - indicates that they both apply to the same item in the iteration - space tuple passed to the ``RAJA::kernel`` methods.
- -RAJA also provides alternative tiling and for statements that provide the tile -number and local tile index, if needed inside the kernel body, as shown below:: + // kernel body using index 'i' + } + ); + +In RAJA, the simplest way to tile an iteration space is to use +``RAJA::statement::Tile`` and ``RAJA::statement::For`` statement types. A +``RAJA::statement::Tile`` type is similar to a ``RAJA::statement::For`` type, +but takes a tile size as the second template argument. The +``RAJA::statement::Tile`` type generates the outer loop over tiles and +the ``RAJA::statement::For`` type iterates over each tile. Nested together, +these statements will pass the global index ('i' in the example) to the +lambda expression (kernel body), just as in the non-tiled version above. + +.. note:: When using ``RAJA::statement::Tile`` and ``RAJA::statement::For`` + types together to define a tiled loop structure, the integer passed + as the first template argument to each statement type must be the + same. This indicates that they both apply to the same iteration space + in the space tuple passed to the ``RAJA::kernel`` method. + +RAJA also provides alternative statements that provide the tile number and +local tile index, if needed inside the kernel body, as shown below:: using KERNEL_EXEC_POL2 = RAJA::KernelPolicy< @@ -86,8 +88,9 @@ number and local tile index, if needed inside the kernel body, as shown below:: >; - RAJA::kernel_param(RAJA::make_tuple(RAJA::RangeSegment(0,10)), - RAJA::make_tuple((int)0, (int)0), + RAJA::kernel_param<KERNEL_EXEC_POL2>( + RAJA::make_tuple(RAJA::TypedRangeSegment<int>(0,10)), + RAJA::make_tuple((int)0, (int)0), [=](int i, int t, int j) { // i - global index @@ -95,20 +98,22 @@ number and local tile index, if needed inside the kernel body, as shown below:: // j - index within tile // Then, i = j + 2*t (2 is tile size) - }); - -The ``statement::TileTCount`` type allows the tile number to be accessed as a -lambda argument and the ``statement::ForICount`` type allows the local tile -loop index to be accessed as a lambda argument. These values are specified in -the tuple, which is the second argument passed to the ``RAJA::kernel_param`` -method above. The ``statement::Param<#>`` type appearing as the second + } + ); + +The ``RAJA::statement::TileTCount`` type indicates that the tile number will +be passed to the lambda expression and the ``RAJA::statement::ForICount`` type +indicates that the local tile loop index will be passed to the lambda +expression. Storage for these values is specified in the parameter tuple, the +second argument passed to the ``RAJA::kernel_param`` method. The +``RAJA::statement::Param<#>`` type appearing as the second template parameter for each statement type indicates which parameter tuple -entry the tile number or local tile loop index is passed to the lambda, and +entry, the tile number or local tile loop index, is passed to the lambda and in which order. Here, the tile number is the second lambda argument (tuple parameter '0') and the local tile loop index is the third lambda argument (tuple parameter '1'). .. note:: The global loop indices always appear as the first lambda expression arguments. Then, the parameter tuples identified by the integers - in the ``Param`` statement types given for the loop statement - types follow. + in the ``RAJA::statement::Param`` statement types given for the loop statement + types follow.
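For readers who want a complete, compilable picture of the basic tiled kernel described above, here is a minimal sketch; the sequential policies, the tile size of two, and the policy name ``TILED_EXEC_POL`` are illustrative assumptions rather than the exact policy used in the documentation::

   using TILED_EXEC_POL =
     RAJA::KernelPolicy<
       RAJA::statement::Tile<0, RAJA::tile_fixed<2>, RAJA::seq_exec,  // outer loop over tiles
         RAJA::statement::For<0, RAJA::seq_exec,                      // inner loop within each tile
           RAJA::statement::Lambda<0>
         >
       >
     >;

   RAJA::kernel<TILED_EXEC_POL>(
     RAJA::make_tuple(RAJA::TypedRangeSegment<int>(0, 10)),
     [=] (int i) {
       // 'i' is the global loop index, the same as in the non-tiled loop
     }
   );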
diff --git a/docs/sphinx/user_guide/feature/vectorization.rst b/docs/sphinx/user_guide/feature/vectorization.rst index 081ebf9028..057c0695d6 100644 --- a/docs/sphinx/user_guide/feature/vectorization.rst +++ b/docs/sphinx/user_guide/feature/vectorization.rst @@ -14,86 +14,94 @@ Vectorization (SIMD/SIMT) .. warning:: **This section describes an initial draft of an incomplete, experimental RAJA capability. It is not considered ready - for production. A basic description is provided here so - that (potentially) interested users can take a look, try it - out, and provide input if they wish to do so.** + for production, but it is ready for interested users to try.** -The RAJA team is experimenting with an API for SIMD/SIMT programming. -The goal is to make the implementation perform as well as if one used -vectorization intrinsics directly in their code, but without the + * We provide a basic description here so that interested users + can take a look, try it out, and provide input if they wish to + do so. The RAJA team values early feedback from users on new + capabilities. + + * There are no usage examples available in RAJA yet, except for + tests. Examples will be made available as they are developed. + +The aim of the RAJA API for SIMD/SIMT programming described in this section +is to make an implementation perform as well as if one used +SIMD/SIMT intrinsics directly in her code, but without the software complexity and maintenance burden associated with doing that. -In particular, our goal is to *guarantee* that specified vectorization -occurs without needing to explicitly use intrinsics in user code or +In particular, we want to *guarantee* that specified vectorization +occurs without requiring users to manually insert intrinsics in their code or rely on compiler auto-vectorization implementations. -.. note:: All RAJA vectorization types are in the namespace ``RAJA::expt``. +.. note:: All RAJA vectorization types described here are in the namespace + ``RAJA::expt``. -Currently, the main abstractions developed in RAJA so far are: +Currently, the main abstractions in RAJA for SIMD/SIMT programming are: - * ``Register`` wraps underlying SIMD/SIMT hardware registers and - provides consistent uniform access to them, using intrinsics under the - API when possible. The RAJA register abstraction currently supports the - following hardware-specific ISAs : AVX, AVX2, AVX512, CUDA, and HIP. - * ``Vector`` builds on ``Register`` to provide arbitrary length + * ``Register`` which wraps underlying SIMD/SIMT hardware registers and + provides consistent uniform access to them, using intrinsics behind the + API when possible. The register abstraction currently supports the + following hardware-specific ISAs (instruction set architectures): + AVX, AVX2, AVX512, CUDA, and HIP. + * ``Vector`` which builds on ``Register`` to provide arbitrary length vectors and operations on them. - * ``Matrix`` builds on ``Register`` to provide arbitrary-sized - matrices, column-major and row-major layouts, and operations on them. + * ``Matrix`` which builds on ``Register`` to provide arbitrary-sized + matrices and operations on them, including support for column-major and + row-major data layouts. 
-Finally, these capabilities integrate with RAJA :ref:`view-label` -capabilities, which implements am expression-template system that allows -a user to write linear algebra expressions on arbitrarily sized scalars, +Using these abstractions, RAJA provides an expression-template system that +allows users to write linear algebra expressions on arbitrarily sized scalars, vectors, and matrices and have the appropriate SIMD/SIMT instructions -performed during expression evaluation. +performed during expression evaluation. These capabilities integrate with +RAJA :ref:`feat-view-label` capabilities, which insulate load/store and other +operations from user code. ------------------------ Why Are We Doing This? ------------------------ -Quoting Tim Foley in `Matt Pharr's blog `_: "Auto-vectorization is not a programming model". Unless, of -course, you consider "hope for the best" to be a sound plan. - -Auto-vectorization is problematic for multiple reasons. First, vectorization -is not explicit in the source code and so compilers must divine correctness -when attempting to apply vectorization optimizations. Since most compilers -are very conservative in this regard, many vectorization opportunities are -typically missed when one relies solely on compiler auto-vectorization. -Second, every compiler will treat your code differently since compiler -implementations use different heuristics, even for different versions of the -same compiler. So performance portability is not just an issue with respect to -hardware, but also across compilers. Third, it is impossible in general for -most application developers to clearly understand the decisions made by a -compiler during its optimization process. +Quoting Tim Foley in `Matt Pharr's blog `_ -- "Auto-vectorization is not a programming model". This is +true, of course, unless you consider "hope for the best" that the compiler +optimizes the way you want to be a sound code development strategy. + +Compiler auto-vectorization is problematic for multiple reasons. First, when +vectorization is not explicit in source code, compilers must divine correctness +when attempting to apply vectorization optimizations. Most compilers are very +conservative in this regard, due to the possibility of data aliasing in C and +C++ and prioritizing correctness over performance. Thus, many vectorization +opportunities are usually missed when one relies solely on compiler +auto-vectorization. Second, every compiler will treat your code differently +since compiler implementations use different optimization heuristics, even in +different versions of the same compiler. So performance portability is not +just an issue with respect to hardware, but also for compilers. Third, it is +generally impossible for most application developers to clearly understand +the choices made by compilers during optimization processes. Using vectorization intrinsics in application source code is also problematic because different processors support different instruction set architectures (ISAs) and so source code portability requires a mechanism that insulates it from architecture-specific code. -GPU programming makes us be explicit about parallelization, and SIMD +Writing GPU code makes a programmer be explicit about parallelization, and SIMD is really no different. RAJA enables single-source portable code across a variety of programming model back-ends. 
The RAJA vectorization abstractions -introduced here are an attempt to bring a level of convergence between SIMD +introduced here are an attempt to bring some convergence between SIMD and GPU programming by providing uniform access to hardware-specific acceleration. -.. note:: **Auto-vectorization is not a programming model.** --Tim Foley +.. important:: **Auto-vectorization is not a programming model.** --Tim Foley --------------------- Register --------------------- -``RAJA::expt::Register`` is a class template that takes a -a data type parameter ``T`` and a register policy ``REGISTER_POLICY`` that -indicates the hardware register type. The ``RAJA::expt::Register`` interface -provides uniform access to register-level operations. It is intended as a -building block for higher level abstractions. A ``RAJA::expt::Register`` type -represents one SIMD register on a CPU architecture and 1 value/SIMT lane on -a GPU architecture. - -.. note:: A user can use the ``RAJA::expt::Register`` type directly in their - code. However, we do not recommend this. Instead, we want users to - employ higher level abstractions that RAJA provides. +``RAJA::expt::Register`` is a class template with +parameters for a data type ``T`` and a register policy ``REGISTER_POLICY``, +which specifies the hardware register type. It is intended as a building block +for higher level abstractions. The ``RAJA::expt::Register`` interface provides +uniform access to register-level operations for different hardware features +and ISA models. A ``RAJA::expt::Register`` type represents one SIMD register +on a CPU architecture and 1 value/SIMT lane on a GPU architecture. ``RAJA::expt::Register`` supports four scalar element types, ``int32_t``, ``int64_t``, ``float``, and ``double``. These are the only types that are @@ -101,15 +109,20 @@ portable across all SIMD/SIMT architectures. ``Bfloat``, for example, is not portable, so we don't provide support for that type. ``RAJA::expt::Register`` supports the following SIMD/SIMT hardware-specific -ISAs: AVX, AVX2, and AVX512 for SIMD CPU vectorization, and CUDA warp, -HIP wavefront for GPUs. Scalar support is provided for all hardware for -portability and experimentation/analysis. Extensions to support other -architectures may be forthcoming and should be straightforward to implement. +ISAs: AVX, AVX2, and AVX512 for SIMD CPU vectorization, and CUDA warp and +HIP wavefront for NVIDIA and AMD GPUs, respectively. Scalar support is +provided for all hardware for portability and experimentation/analysis. +Extensions to support other architectures may be forthcoming as they are +needed and requested by users. + +.. note:: One can use the ``RAJA::expt::Register`` type directly in her + code. However, we do not recommend it. Instead, we want users to + employ higher level abstractions that RAJA provides. Register Operations ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -``RAJA::expt::Register`` provides various operations, including: +``RAJA::expt::Register`` provides various operations which include: * Basic SIMD handling: get element, broadcast * Memory operations: load (packed, strided, gather) and store (packed, strided, scatter) @@ -124,41 +137,43 @@ Register Operations Register DAXPY Example ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -The following is a code example that shows using the ``RAJA::expt::Register`` -class to perform a DAXPY kernel with AVX2 CPU SIMD instructions. -Again, we do not recommend that you write code directly using the Register -class, but use the higher level VectorRegister abstraction. 
-However, this example demonstrates how the higher level abstractions are -using the Register class:: +The following code example shows how to use the ``RAJA::expt::Register`` +class to perform a DAXPY kernel with AVX2 SIMD instructions. +While we do not recommend that you write code directly using the Register +class, and instead suggest using the higher level VectorRegister abstraction, we use +the Register type here to illustrate the basic mechanics of SIMD +vectorization:: - // define array length + // Define array length int len = ...; - // data used in kernel + // Define data used in kernel double a = ...; double const *X = ...; double const *Y = ...; double *Z = ...; + // Define an avx2 register, which has width of 4 doubles using reg_t = RAJA::expt::Register<double, RAJA::expt::avx2_register>; - int reg_width = reg_t::s_num_elem; // width of avx2 register is 4 doubles + int reg_width = reg_t::s_num_elem; - // Compute daxpy in chunks of 4 values at one time + // Compute daxpy in chunks of 4 values (register width) at a time for (int i = 0;i < len; i += reg_width){ reg_t x, y; - // load 4 consecutive values of X, Y arrays into registers + // Load 4 consecutive values of X, Y arrays into registers x.load_packed( X+i ); y.load_packed( Y+i ); - // perform daxpy on 4 values simultaneously (store in register) + // Perform daxpy on 4 values simultaneously and store in a register reg_t z = a * x + y; - // store register result in Z array + // Store register result in Z array z.store_packed( Z+i ); } - // loop postamble code + // Loop postamble code to complete daxpy operation when array length + // is not an integer multiple of the register width int remainder = len % reg_width; if (remainder) { reg_t x, y; @@ -166,119 +181,114 @@ using the Register class:: // 'i' is the starting array index of the remainder int i = len - remainder; - // load remainder values of X, Y arrays into registers + // Load remainder values of X, Y arrays into registers x.load_packed_n( X+i, remainder ); y.load_packed_n( Y+i, remainder ); - // perform daxpy on remainder values simultaneously (store in register) + // Perform daxpy on remainder values simultaneously and store in register reg_t z = a * x + y; - // store register result in Z array + // Store register result in Z array z.store_packed_n(Z+i, remainder); } This code is guaranteed to vectorize since the ``RAJA::expt::Register`` -operations insert the appropriate SIMD intrinsic operations into the method -calls. Note that ``RAJA::expt::Register`` provides overloads of basic -arithmetic operations so that the DAXPY operation itself (z = a * x + y) looks +operations insert the appropriate SIMD intrinsics into the operation +calls. Since ``RAJA::expt::Register`` provides overloads of basic +arithmetic operations, the SIMD DAXPY operation ``z = a * x + y`` looks like vanilla scalar code. -Note that since we are using bare pointers to the data, load and store +Because we are using bare pointers to the data, load and store operations are performed by explicit method calls in the code. Also, we must -write (duplicate) postamble code to handle cases where the array length -(len) is not an integer multiple of the register width. The postamble code -perform the DAXPY operation on the *remainder* of the array that remains after -the for-loop.
- -**These extra lines of code should make it clear why we do not recommend -using ``RAJA::Register`` directly in application code.** +write explicit *postamble* code to handle cases where the array length +``len`` is not an integer multiple of the register width ``reg_width``. The +postamble code performs the DAXPY operation on the *remainder* of the array +that is excluded from the for-loop, which is strided by the register width. +**The need to write extra postamble code should make clear one reason why we +do not recommend using ``RAJA::Register`` directly in application code.** -------------------- -Tensor Register -------------------- - -``RAJA::expt::TensorRegister< >`` is a class template that provides a -higher-level interface on top of the ``RAJA::expt::Register`` class. -``RAJA::expt::TensorRegister< >`` wraps one or more -``RAJA::expt::Register< >`` objects to create a tensor-like object. - -.. note:: As with ``RAJA::expt::Register``, we don't recommend using - ``RAJA::expt::TensorRegister`` directly. Rather, we recommend using - use-case specific types that RAJA provides and which are described - below. +------------------ +Vector Register +------------------ **To make code cleaner and more readable, the specific types are intended to be used with ``RAJA::View`` and ``RAJA::expt::TensorIndex`` objects.** -Vector Register -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - ``RAJA::expt::VectorRegister`` provides an abstraction for a vector of arbitrary length. It is implemented using one or more ``RAJA::expt::Register`` objects. The vector length is independent of the -underlying register width. The template parameters are: ``T`` data type, -``REGISTER_POLICY`` vector register policy, and ``NUM_ELEM`` number of -data elements of type ``T`` that fit in a register. The last two of these -have defaults for all cases, so they do not usually need to be provided by -a user. - -Earlier, we said that we do not recommended using ``RAJA::expt::Register`` -directly. The reason for this is that it is good to decouple -vector length from hardware register size since it allows one to write +underlying register width. The template parameters are: data type ``T``, +vector register policy ``REGISTER_POLICY``, and ``NUM_ELEM`` which +is the number of data elements of type ``T`` that fit in a register. The last +two of these template parameters have defaults for all cases, so a user +need not provide them in most cases. + +Recall that we said earlier that we do not recommend using +``RAJA::expt::Register`` directly. One important reason for this is that +decoupling the vector length from hardware register size allows one to write simpler, more readable code that is easier to get correct. This should be -clear from the code example below. +clear from the code example below, when compared to the previous code example. Vector Register DAXPY Example ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -The following code example shows the DAXPY computation shown above written -using ``RAJA::expt::VectorRegister``, ``RAJA::expt::VectorIndex``, and -``RAJA::View`` classes, which obviate the need for the extra lines of code -discussed earlier:: +The following code example shows the DAXPY computation discussed above, +but written using ``RAJA::expt::VectorRegister``, ``RAJA::expt::VectorIndex``, +and ``RAJA::View`` types. Using these types, we can write cleaner, more +concise code that is easier to get correct because it is simpler.
For example, +we do not have to write the postamble code discussed earlier:: - // define array length and data used in kernel (as before) + // Define array length and data used in kernel (as before) int len = ...; double a = ...; double const *X = ...; double const *Y = ...; double *Z = ...; - // define vector register and index types + // Define vector register and index types using vec_t = RAJA::expt::VectorRegister; using idx_t = RAJA::expt::VectorIndex; - // wrap array pointers in RAJA View objects + // Wrap array pointers in RAJA View objects auto vX = RAJA::make_view( X, len ); auto vY = RAJA::make_view( Y, len ); auto vZ = RAJA::make_view( Z, len ); - // 'all' knows the length of vX, vY, and vZ from the View objects - // and it encodes the vector type + // The 'all' variable gets the length of the arrays from the vX, vY, and + // vZ View objects and encodes the vector register type auto all = idx_t::all(); - // compute the complete array daxpy in one line of code - // this produces a vectorized loop, and the loop postamble + // Compute the complete array daxpy in one line of code + // this produces a vectorized loop and the loop postamble + // in the executable vZ( all ) = a * vX( all ) + vY( all ); -This code has several advantages over the previous example. It is guaranteed -to vectorize and is much easier to read, get correct, and maintain since -the ``RAJA::View`` class handles the looping and postamble code automatically -to allow arrays of arbitrary size. The ``RAJA::View`` class provides overloads -of the arithmetic operations based on the 'all' type and inserts the -appropriate SIMD instructions and load/store operations to vectorize the -operations as in the earlier example. It may be considered by some to be -inconvenient to have to use the ``RAJA::View`` class, but it is easy to wrap -bare pointers as can is shown in the example. +It should be clear that this code has several advantages over the previous +code example. It is guaranteed to vectorize as before, but it is much easier +to read, get correct, and maintain since the ``RAJA::View`` class handles the +looping and postamble code automatically for arrays of arbitrary size. The +``RAJA::View`` class provides overloads of the arithmetic operations based on +the ``all`` variable and inserts the appropriate SIMD instructions and +load/store operations to vectorize the operations that were explicit in the +earlier example. It may be considered by some to be inconvenient to have to +use the ``RAJA::View`` class, but it is easy to wrap bare pointers as is shown +here. Expression Templates -^^^^^^^^^^^^^^^^^^^^^^^^ +^^^^^^^^^^^^^^^^^^^^^ -The figure below shows the sequence of SIMD operations, in the form of an -*abstract syntax tree (AST)*, applied in the DAXPY code by the RAJA constructs -used in the code example. During compilation, a tree of *expression template* -objects is constructed based on the order of operations that appear in the -kernel. Specifically, the operation sequence is the following: +The figure below shows the sequence of SIMD operations, as they are parsed to +form of an *abstract syntax tree (AST)*, for the DAXPY code in the vector +register code example above. + +.. figure:: ../figures/vectorET.png + + An AST illustration of the SIMD operations in the DAXPY code. + +During compilation, a tree of *expression template* objects is constructed +based on the order of operations that appear in the DAXPY kernel. Specifically, +the operation sequence is the following: #. 
Load a chunk of values in 'vX' into a register. #. Broadcast the scalar value 'a' to each slot in a vector register. @@ -289,26 +299,21 @@ kernel. Specifically, the operation sequence is the following: #. Write the result in the register to the 'vZ' array. ``RAJA::View`` objects indexed by ``RAJA::TensorIndex`` objects -(``RAJA::VectorIndex`` in this case) return *LoadStore* expression +(``RAJA::VectorIndex`` in this case) return *Load/Store* expression template objects. Each expression template object is evaluated on assignment and a register chunk size of values is loaded into another register object. Finally, the left-hand side of the expression is evaluated by storing the -chunk of values in the right-hand side result register into the array on the -left-hand side of the equal sign. - -.. figure:: ../figures/vectorET.png - - An AST illustration of the SIMD operations in the DAXPY code. - +chunk of values in the right-hand side result register into the array associated +with the view ``vZ`` on the left-hand side of the equal sign. CPU/GPU Portability ^^^^^^^^^^^^^^^^^^^^^ -It is important to note that the code in the example in the previous section is -*not* portable to run on a GPU because it does not include a way to launch a -GPU kernel. The following code example shows how to enable the code to run on -either a CPU or GPU via a run time choice:: +It is important to note that the code in the example above can only run on a +CPU; i.e., it is *not* portable to run on either a CPU or GPU because it does +not include a way to launch a GPU kernel. The following code example shows +how to enable the code to run on either a CPU or GPU via a run time choice:: // array lengths and data used in kernel same as above @@ -339,53 +344,69 @@ either a CPU or GPU via a run time choice:: This version of the kernel can be run on a CPU or GPU depending on the run time chosen value of the variable ``cpu_or_gpu``. When compiled, the code will -generate versions of the kernel for the CPU and GPU based on the parameters -in the ``pol_t`` loop policy. The CPU version will be the same as the version -in the previous section. The GPU version is essentially the same but will -run in a GPU kernel. Note that there is only one template argument passed to -the register when ``vec_t`` is defined. ``RAJA::expt::VectorRegister`` -uses defaults for the register policy, based on the system hardware, and -number of data elements of type double that will fit in a register. +generate versions of the kernel for a CPU and a CUDA GPU based on the +parameters in the ``pol_t`` loop policy. The CPU version will be the same +as the version described earlier. The GPU version is essentially the same +but will run in a GPU kernel. Note that there is only one template argument +passed to the register when ``vec_t`` is defined. +``RAJA::expt::VectorRegister<double>`` uses defaults for the register policy, +based on the system hardware, and the number of data elements of type double that +will fit in a register. + +------------------- +Tensor Register +------------------- + +``RAJA::expt::TensorRegister< >`` is a class template that provides a +higher-level interface on top of ``RAJA::expt::Register``. +``RAJA::expt::TensorRegister< >`` wraps one or more +``RAJA::expt::Register< >`` objects to create a tensor-like object. + +.. note:: As with ``RAJA::expt::Register``, we don't recommend using + ``RAJA::expt::TensorRegister`` directly. Rather, we recommend using + higher-level abstraction types that RAJA provides and which are + described below.
+----------------------- Matrix Registers -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +----------------------- RAJA provides ``RAJA::expt::TensorRegister`` type aliases to support matrices of arbitrary size and shape. These are: - * ``RAJA::expt::SquaretMatrixRegister`` which + * ``RAJA::expt::SquareMatrixRegister`` which abstracts operations on an N x N square matrix. * ``RAJA::expt::RectMatrixRegister`` - which abstracts operations on an N x M rectangular matrix. + which abstracts operations on an N x M rectangular matrix. Matrices are implemented using one or more ``RAJA::expt::Register`` objects. Data layout can be row-major or column major. Matrices are intended to be used with ``RAJA::View`` and ``RAJA::expt::TensorIndex`` objects, -similar to what was shown above with ``RAJA::expt::VectorRegister`` example. +similar to what was shown above in the ``RAJA::expt::VectorRegister`` example. -Matrix operations support matrix-matrix, matrix-vector, and vector-matrix +Matrix operations support matrix-matrix, matrix-vector, vector-matrix multiplication, and transpose operations. Rows or columns can be represented with one or more registers, or a power-of-two fraction of a single register. -This is important for CUDA GPU warp/wavefront registers, which are 32-wide for +This is important for GPU warp/wavefront registers, which are 32-wide for CUDA and 64-wide for HIP. -Here is a simple code example that performs the matrix-analogue of the -vector DAXPY operation presented above using square matrices:: +Here is a code example that performs the matrix-analogue of the +vector DAXPY operation using square matrices:: - // define matrix size and data used in kernel (similar to before) + // Define matrix size and data used in kernel (similar to before) int N = ...; double a = ...; double const *X = ...; double const *Y = ...; double *Z = ...; - // define matrix register and row/column index types + // Define matrix register and row/column index types using mat_t = RAJA::expt::SquareMatrixRegister; using row_t = RAJA::expt::RowIndex; using col_t = RAJA::expt::ColIndex; - // wrap array pointers in RAJA View objects (similar to before) + // Wrap array pointers in RAJA View objects (similar to before) auto mX = RAJA::make_view( X, N, N ); auto mY = RAJA::make_view( Y, N, N ); auto mZ = RAJA::make_view( Z, N, N ); @@ -409,9 +430,10 @@ vector DAXPY operation presented above using square matrices:: ); Conceptually, as well as implementation-wise, this is similar to the previous -vector example except the operations are in two dimensions. The kernel code is -easy to read, it is guaranteed to vectorize, and iterating over the data is -handled by RAJA (register width sized chunk, plus postamble scalar operations). -Again, the ``RAJA::View`` arithmetic operation overloads insert the +vector example except the operations are on two-dimensional matrices. The +kernel code is easy to read, it is guaranteed to vectorize, and iterating +over the data is handled by RAJA view objects (register-width sized chunk, +plus postamble scalar operations), and it can run on a CPU or NVIDIA GPU. As +before, the ``RAJA::View`` arithmetic operation overloads insert the appropriate vector instructions in the code. diff --git a/docs/sphinx/user_guide/feature/view.rst b/docs/sphinx/user_guide/feature/view.rst index da10c83a25..79814cd348 100644 --- a/docs/sphinx/user_guide/feature/view.rst +++ b/docs/sphinx/user_guide/feature/view.rst @@ -6,7 +6,7 @@ .. ## SPDX-License-Identifier: (BSD-3-Clause) .. ## -.. _view-label: +.. 
_feat-view-label: =============== View and Layout =============== @@ -31,7 +31,16 @@ to access a matrix entry in row `r` and column `c`. However, this solution has limitations; e.g., additional macro definitions may be needed when adopting a different matrix data layout or when using other matrices. To facilitate multi-dimensional indexing and different indexing layouts, RAJA provides -``RAJA::View`` and ``RAJA::Layout`` classes. +``RAJA::View``, ``RAJA::Layout``, and ``RAJA::OffsetLayout`` classes. + +Please see the following tutorial sections for detailed examples that use +RAJA Views and Layouts: + + * :ref:`tut-view_layout-label` + * :ref:`tut-offsetlayout-label` + * :ref:`tut-permutedlayout-label` + * :ref:`tut-kernelexecpols-label` + * :ref:`tut-launchexecpols-label` ---------- RAJA Views ---------- @@ -167,7 +176,7 @@ stride, the third index (index 2 - extent 11) has stride 5, and the second index (index 1 - extent 7) has stride 55 (= 5*11). .. note:: If a permuted layout is created with the *identity permutation* - (e.g., {0,1,2}, the layout is the same as if it were created by + (e.g., {0,1,2}), the layout is the same as if it were created by calling the Layout constructor directly with no permutation. The first argument to ``RAJA::make_permuted_layout`` is a C++ array whose @@ -210,16 +219,16 @@ Offset Layout The ``RAJA::make_offset_layout`` method creates a ``RAJA::OffsetLayout`` object with offsets applied to the indices. For example,:: - double* C = new double[11]; + double* C = new double[10]; RAJA::Layout<1> layout = RAJA::make_offset_layout<1>( {{-5}}, {{5}} ); RAJA::View > Cview(C, layout); creates a one-dimensional view with a layout that allows one to index into -it using indices in :math:`[-5, 5]`. In other words, one can use the loop:: +it using indices in :math:`[-5, 5)`. In other words, one can use the loop:: - for (int i = -5; i < 6; ++i) { + for (int i = -5; i < 5; ++i) { CView(i) = ...; } @@ -228,21 +237,22 @@ to an array offset index by subtracting the lower offset from it; i.e., in the loop, each 'i' value has '-5' subtracted from it to properly access the array entry. That is, the sequence of indices generated by the for-loop:: - -5 -4 -3 ... 5 + -5 -4 -3 ... 4 will index into the data array as:: - 0 1 2 ... 10 + 0 1 2 ... 9 The arguments to the ``RAJA::make_offset_layout`` method are C++ arrays that -hold the start and end values of the indices. RAJA offset layouts support -any number of dimensions; for example:: +hold the begin and end values of indices in the half-open interval +:math:`[begin, end)`. RAJA offset layouts support any number of dimensions; +for example:: RAJA::OffsetLayout<2> layout = RAJA::make_offset_layout<2>({{-1, -5}}, {{2, 5}}); defines a two-dimensional layout that enables one to index into a view using -indices :math:`[-1, 2]` in the first dimension and indices :math:`[-5, 5]` in +indices :math:`[-1, 2)` in the first dimension and indices :math:`[-5, 5)` in the second dimension. As noted earlier, double braces are needed to properly initialize the internal data in the layout object. @@ -257,10 +267,10 @@ indices. For example,:: RAJA::OffsetLayout<2> layout = RAJA::make_permuted_offset_layout<2>( {{-1, -5}}, {{2, 5}}, perm ); -Here, the two-dimensional index space is :math:`[-1, 2] \times [-5, 5]`, the +Here, the two-dimensional index space is :math:`[-1, 2) \times [-5, 5)`, the same as above.
However, the index strides are permuted so that the first -index (index 0) has unit stride and the second index (index 1) has stride 4, -which is the extent of the first index (:math:`[-1, 2]`). +index (index 0) has unit stride and the second index (index 1) has stride 3, +which is the extent of the first index (:math:`[-1, 2)`). .. note:: It is important to note some facts about RAJA layout types. All layouts have a permutation. So a permuted layout and @@ -272,7 +282,7 @@ which is the extent of the first index (:math:`[-1, 2]`). ``RAJA::View`` data access operator when they are not needed. Complete examples illustrating ``RAJA::Layouts`` and ``RAJA::Views`` may -be found in the :ref:`offset-label` and :ref:`permuted-layout-label` +be found in the :ref:`tut-offsetlayout-label` and :ref:`tut-permutedlayout-label` tutorial sections. Typed Layouts @@ -282,7 +292,7 @@ RAJA provides typed variants of ``RAJA::Layout`` and ``RAJA::OffsetLayout`` that enable users to specify integral index types. Usage requires specifying types for the linear index and the multi-dimensional indicies. The following example creates two two-dimensional typed layouts where the -linear index is of type TIL and the '(x, y)' indices for accesingg the data +linear index is of type TIL and the '(x, y)' indices for accessing the data have types TIX and TIY:: RAJA_INDEX_VALUE(TIX, "TIX"); @@ -404,5 +414,5 @@ runtime bounds checking for RAJA views. This may be a useful debugging aid for users. When attempting to use an index value that is out of bounds, RAJA will abort the program and print the index that is out of bounds and the value of the index and bounds for it. Since the bounds checking is a runtime -operation, it incurs non-negligible overhead. When bounds checkoing is turned +operation, it incurs non-negligible overhead. When bounds checking is turned off (default case), there is no additional run time overhead incurred. diff --git a/docs/sphinx/user_guide/feature/workgroup.rst b/docs/sphinx/user_guide/feature/workgroup.rst index fba8e19310..25087d64dd 100644 --- a/docs/sphinx/user_guide/feature/workgroup.rst +++ b/docs/sphinx/user_guide/feature/workgroup.rst @@ -21,16 +21,15 @@ represents an executable form of those loops and when run makes a ``RAJA::WorkSi that the RAJA workgroup constructs API is still being developed and may change in later RAJA releases. -.. note:: * All **workgroup** constructs are in the namespace ``RAJA``. - * The ``RAJA::WorkPool``, ``RAJA::WorkGroup``, and ``RAJA::WorkSite`` class templates - are templated on: +.. note:: * All workgroup constructs are in the namespace ``RAJA``. + * The ``RAJA::WorkPool``, ``RAJA::WorkGroup``, and ``RAJA::WorkSite`` class templates are templated on: * a WorkGroup policy which is composed of: - * a work execution policy. - * a work ordering policy. - * a work storage policy. + * a work execution policy + * a work ordering policy + * a work storage policy + * a work dispatch policy * an index type that is the first argument to the loop bodies. - * a list of extra argument types that are the rest of the arguments to - the loop bodies. + * a list of extra argument types that are the rest of the arguments to the loop bodies. * an allocator type to be used for the memory used to store and manage the loop bodies. * The ``RAJA::WorkPool::enqueue`` method takes two arguments: @@ -43,7 +42,7 @@ Examples showing how to use RAJA workgroup methods may be found in the :ref:`tutorial-label`. 
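To make the enqueue/run workflow concrete before the policy details that follow, here is a minimal sketch. It assumes the sequential, ordered, function-pointer-dispatch policy and the ``std::allocator<char>`` allocator discussed in this section; the type aliases, array names, and loop bodies are illustrative only, not part of the RAJA API::

   using workgroup_policy = RAJA::WorkGroupPolicy < RAJA::seq_work,
                                                    RAJA::ordered,
                                                    RAJA::ragged_array_of_objects,
                                                    RAJA::indirect_function_call_dispatch >;

   using Allocator = std::allocator<char>;

   // WorkPool, WorkGroup, and WorkSite take the same template arguments:
   // policy, index type, extra loop-body argument types, and allocator.
   using pool_t  = RAJA::WorkPool < workgroup_policy, int, RAJA::xargs<>, Allocator >;
   using group_t = RAJA::WorkGroup< workgroup_policy, int, RAJA::xargs<>, Allocator >;
   using site_t  = RAJA::WorkSite < workgroup_policy, int, RAJA::xargs<>, Allocator >;

   pool_t pool( Allocator{} );

   // Enqueue two loops; nothing runs yet, the pool only stores them.
   pool.enqueue( RAJA::TypedRangeSegment<int>(0, N), [=] (int i) { a[i] += b[i]; } );
   pool.enqueue( RAJA::TypedRangeSegment<int>(0, M), [=] (int i) { c[i] *= 2.0; } );

   // Package the stored loops into a runnable WorkGroup, then run them in order.
   group_t group = pool.instantiate();
   site_t  site  = group.run();

Nothing executes until ``run`` is called on the ``RAJA::WorkGroup`` object produced by ``instantiate``; the policy components described next control where and in what order the enqueued loops run.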
For more information on RAJA work policies and iteration space constructs, -see :ref:`policies-label` and :ref:`index-label`, respectively. +see :ref:`feat-policies-label` and :ref:`feat-index-label`, respectively. .. _workgroup-Policies-label: -------- Policies -------- The behavior of the RAJA workgroup constructs is determined by a policy. -The ``RAJA::WorkGroupPolicy`` has three components, a work execution policy, -a work ordering policy, and a work storage policy. ``RAJA::WorkPool``, -``RAJA::WorkGroup``, and ``RAJA::WorkSite`` class templates all -take the same policy and template arguments. For example:: +The ``RAJA::WorkGroupPolicy`` has four components, a work execution policy, +a work ordering policy, a work storage policy, and a work dispatch policy. +``RAJA::WorkPool``, ``RAJA::WorkGroup``, and ``RAJA::WorkSite`` class templates +all take the same policy and template arguments. For example:: using workgroup_policy = RAJA::WorkGroupPolicy < RAJA::seq_work, RAJA::ordered, - RAJA::ragged_array_of_objects >; + RAJA::ragged_array_of_objects, + RAJA::indirect_function_call_dispatch >; is a workgroup policy that will run loops sequentially on the host in the order -they were enqueued and store the loop bodies sequentially in single buffer in -memory. +they were enqueued, store the loop bodies sequentially in a single buffer in +memory, and dispatch each loop using a function pointer. The work execution policy acts like the execution policies used with ``RAJA::forall`` and determines the backend used to run the loops and the parallelism within each @@ -97,25 +97,24 @@ The work ordering policy acts like the segment iteration execution policies when ``RAJA::forall`` is used with a ``RAJA::IndexSet`` and determines the backend used when iterating over the loops and the parallelism between each loop. - ====================================== ======================================== - Work Execution Policies Brief description - ====================================== ======================================== - ordered Execute loops sequentially in the order - they were enqueued using forall. - reverse_ordered Execute loops sequentially in the - reverse of the order order they were - enqueued using forall. - unordered_cuda_loop_y_block_iter_x_threadblock_average - Execute loops in parallel by mapping - each loop to a set of cuda blocks with - the same index in the y direction in - a cuda kernel. Each loop is given a - number of threads over one of more - blocks in the x direction equal to the - average number of iterations of all the - loops rounded up to a multiple of the - block size. - ====================================== ======================================== + ======================================================= ======================================== + Work Ordering Policies Brief description + ======================================================= ======================================== + ordered Execute loops sequentially in the order + they were enqueued using forall. + reverse_ordered Execute loops sequentially in the + reverse of the order they were + enqueued using forall. + unordered_cuda_loop_y_block_iter_x_threadblock_average Execute loops in parallel by mapping + each loop to a set of cuda blocks with + the same index in the y direction in + a cuda kernel. Each loop is given a + number of threads over one or more + blocks in the x direction equal to the + average number of iterations of all the + loops rounded up to a multiple of the + block size.
+ ======================================================= ======================================== The work storage policy determines the strategy used to allocate and layout the storage used to store the ranges, loop bodies, and other data necessary to @@ -140,6 +139,23 @@ implement the workstorage constructs. the loop data items as needed. ====================================== ======================================== +The work dispatch policy determines the technique used to dispatch from type +erased storage to the loops or iterations of each range and loop body pair. + + ====================================== ======================================== + Work Dispatch Policies Brief description + ====================================== ======================================== + indirect_function_call_dispatch Dispatch using function pointers. + indirect_virtual_function_dispatch Dispatch using virtual functions in a + class hierarchy. + direct_dispatch< Dispatch using a switch statement like + camp::list...> coding to pick the right pair of + Range and Callable types from the + template parameter pack. You may only + enqueue a range and callable pair that + is in the list of types in the policy. + ====================================== ======================================== + .. _workgroup-Arguments-label: @@ -187,16 +203,17 @@ policies:: using Allocator = std::allocator<char>; -.. note:: * The allocator type must use template argument char. +.. note:: * The allocator type must use template argument ``char``. * Allocators must provide memory that is accessible where it is used. * Ordered work order policies only require memory that is accessible where loop bodies are enqueued. * Unordered work order policies require memory that is accessible from both where the loop bodies are enqueued and from where the loop is executed based on the work execution policy. - * For example when using cuda work exeution policies with cuda - unordered work order policies pinned memory is a good choice - because it is always accessible on the host and device. + + For example, when using cuda work execution policies with CUDA + unordered work order policies, pinned memory is a good choice + because it is always accessible on the host and device. .. _workgroup-WorkPool-label: diff --git a/docs/sphinx/user_guide/features.rst b/docs/sphinx/user_guide/features.rst index 8752b546c9..d65384d782 100644 --- a/docs/sphinx/user_guide/features.rst +++ b/docs/sphinx/user_guide/features.rst @@ -12,7 +12,10 @@ RAJA Features ************************ -The following sections describe key aspects of the main RAJA features. +The following sections describe the main RAJA features. They are intended +to introduce users to the features and basic usage and also to provide +a syntax reference guide. The sections contain links to RAJA tutorial +materials that provide detailed examples of usage. .. toctree:: :maxdepth: 2
feature/iteration_spaces feature/view feature/reduction - feature/resource feature/atomic feature/scan feature/sort + feature/resource feature/local_array feature/tiling - feature/plugins feature/workgroup feature/vectorization + feature/plugins diff --git a/docs/sphinx/user_guide/figures/vertexsum.jpg b/docs/sphinx/user_guide/figures/vertexsum.jpg index ea61476db0..967c6aec06 100644 Binary files a/docs/sphinx/user_guide/figures/vertexsum.jpg and b/docs/sphinx/user_guide/figures/vertexsum.jpg differ diff --git a/docs/sphinx/user_guide/figures/vertexsum_color.png b/docs/sphinx/user_guide/figures/vertexsum_color.png new file mode 100644 index 0000000000..3071d526ad Binary files /dev/null and b/docs/sphinx/user_guide/figures/vertexsum_color.png differ diff --git a/docs/sphinx/user_guide/getting_started.rst b/docs/sphinx/user_guide/getting_started.rst index 0d66ada68c..b9f868d10a 100644 --- a/docs/sphinx/user_guide/getting_started.rst +++ b/docs/sphinx/user_guide/getting_started.rst @@ -13,145 +13,206 @@ Getting Started With RAJA ************************* -This section will help get you up and running with RAJA quickly. +This section should help get you up and running with RAJA quickly. ============ Requirements ============ -The primary requirement for using RAJA is a C++14 compliant compiler. -Accessing various programming model back-ends requires that they be supported -by the compiler you chose. Available options and how to enable or disable -them are described in :ref:`configopt-label`. To build RAJA in its most basic -form and use its simplest features: +The primary requirement for using RAJA is a C++14 standard compliant compiler. +Certain features, such as various programming model back-ends like CUDA or HIP, +must be supported by the compiler you choose in order to use them. Available RAJA +configuration options and how to enable or disable features are described +in :ref:`configopt-label`. + +To build RAJA and use its most basic features, you will need: - C++ compiler with C++14 support -- `CMake `_ version 3.14.5 or greater. +- `CMake `_ version 3.23 or greater when building the HIP back-end, and version 3.20 or greater otherwise. ================== Get the Code ================== -The RAJA project is hosted on `GitHub `_. -To get the code, clone the repository into a local working space using -the command:: +The RAJA project is hosted on GitHub: +`GitHub RAJA project `_. To get the code, clone +the repository into a local working space using the command:: $ git clone --recursive https://github.com/LLNL/RAJA.git -The ``--recursive`` argument above is needed to pull in necessary RAJA -dependencies as Git *submodules*. Current RAJA dependencies are: +The ``--recursive`` option above is used to pull RAJA Git *submodules*, on +which RAJA depends, into your local copy of the RAJA repository. + +After running the clone command, a copy of the RAJA repository will reside in +the ``RAJA`` subdirectory where you ran the clone command. You will be on the +``develop`` branch, which is the default RAJA branch. + +If you do not pass the ``--recursive`` argument to the ``git clone`` +command, you can also type the following commands after cloning:: + + $ cd RAJA + $ git submodule update --init --recursive + +Either way, the end result is the same and you should be good to configure the +code and build it. + ..
note:: * If you switch branches in a RAJA repo (e.g., you are on a branch, + with everything up-to-date, and you run the command + ``git checkout ``, you may need to run + the command ``git submodule update`` to set the Git submodule + versions to what is used by the new branch. + * If the set of submodules in a new branch is different than the + previous branch you were on, you may need to run the command + ``git submodule update --init --recursive`` to pull in the + correct set of submodule and versions. + +.. _getting_started_depend-label: + +================== +Dependencies +================== +RAJA has several dependencies that are required based on how you want to +build and use it. The RAJA Git repository has submodules that contain +most of these dependencies. + +RAJA includes other submodule dependencies, which are used to support our +Gitlab CI testing. These are described in the RAJA Developer Guide. + +Dependencies that are required to build the RAJA code are: + +- A C++ 14 standard compliant compiler - `BLT build system `_ +- `CMake `_ version 3.23 or greater when building the HIP back-end, and version 3.20 or greater otherwise. - `Camp compiler agnostic metaprogramming library `_ -- `CUB CUDA utilities library `_ -- `rocPRIM HIP parallel primitives library `_ -You probably don't need to know much about these other projects to start -using RAJA. But, if you want to know more about them, click on the links above. +Other dependencies that users should be aware of that support certain +features are: -After running the clone command, a copy of the RAJA repository will reside in -a ``RAJA`` subdirectory where you ran the clone command. You will be on the -``develop`` branch of RAJA, which is our default branch. +- `CUB CUDA utilities library `_, which is required for using the RAJA CUDA back-end. +- `rocPRIM HIP parallel primitives library `_, which is required for using the RAJA HIP back-end. +- `Desul `_, which is required if you want to use Desul atomics in RAJA instead of our current default atomics. Note that we plan to switch over to Desul atomics exclusively at some point. -If you do not pass the ``--recursive`` argument to the ``git clone`` -command, you can type the following commands after cloning:: +.. note:: You may want or need to use external versions of camp, CUB, or + rocPRIM instead of the RAJA submodules. This is usually the case + when you are using RAJA along with some other library that also + needs one of these. To do so, you need to use CMake variables to + pass a path to a valid installation of each library. Specifically: - $ cd RAJA - $ git submodule init - $ git submodule update + * External camp:: + + cmake \ + ... \ + -Dcamp_DIR=path/to/camp/install \ + ... + + * External CUB:: + + cmake \ + ... \ + -DRAJA_ENABLE_EXTERNAL_CUB=On \ + -DCUB_DIR=path/to/cub \ + ... + + * External rocPRIM:: + + cmake \ + ... \ + -DRAJA_ENABLE_EXTERNAL_ROCPRIM=On \ + -DROCPRIM_DIR=path/to/rocPRIM \ + ... + +More information about configuring GPU builds with CUDA or HIP is provided +in :ref:`getting_started_build_gpu-label` -Either way, the end result is the same and you should be good to go. +Additional discussion of these dependencies, with respect to building RAJA, is +provided in :ref:`getting_started_build-label`. Other than that, you probably +don't need to know much about them. If you are curious and want to know more, +please click on the link to the library you want to know about in the above +list. -.. 
note:: Any time you switch branches in RAJA, you need to re-run the - 'git submodule update' command to set the Git submodules to - what is used by the new branch. +.. _getting_started_build-label: ================== Build and Install ================== -Building and installing RAJA can be very easy or more complicated, depending -on which features you want to use and how easy it is to use your system. +The complexity of building and installing RAJA depends on which features you +want to use and how easy it is to do this on your system. --------------- -Building RAJA --------------- +.. note:: RAJA builds must be *out-of-source*. In particular, RAJA does not + allow building in its source directory. You must create a build + directory and run CMake in it. -RAJA uses CMake to configure a build. A "bare bones" configuration looks like:: +RAJA uses CMake to configure a build. To create a "bare bones" configuration, +build, and install it, you can do the following:: $ mkdir build-dir && cd build-dir $ cmake -DCMAKE_INSTALL_PREFIX=/path/to/install ../ + $ make (or make -j for a parallel build) + $ make install -.. note:: * RAJA requires a minimum CMake version of 3.14.5. - * Builds must be *out-of-source*. RAJA does not allow building in - the source directory, so you must create a build directory and - run CMake in it. - -When you run CMake, it will generate output about the build environment -(compiler and version, options, etc.). Some RAJA features, -like OpenMP support are enabled by default if, for example, the compiler -supports OpenMP. These can be disabled if desired. For a summary of -RAJA configuration options, please see :ref:`configopt-label`. - -After CMake successfully completes, you compile RAJA by executing the ``make`` -command in the build directory; i.e.,:: - - $ make +Running ``cmake`` generates the RAJA build configuration. Running ``make`` +compiles the code. Running ``make install`` copies RAJA header files +to an ``include`` directory and installs the RAJA library in a ``lib`` +directory, both in the directory location specified with the +``-DCMAKE_INSTALL_PREFIX`` CMake option. -If you have access to a multi-core system, you can compile in parallel by -running ``make -j`` (to build with all available cores) or ``make -j N`` to -build using N cores. +Other build configurations are accomplished by passing other options to CMake. +For example, if you want to use a C++ compiler other than the default on +your system, you would pass a path to the compiler using the standard +CMake option ``-DCMAKE_CXX_COMPILER=path/to/compiler``. +When you run CMake, it will generate output about the build configuration +(compiler and version, options, etc.), which is helpful to make sure CMake +is doing what you want. For a summary of RAJA configuration +options, please see :ref:`configopt-label`. -.. note:: * RAJA is configured to build its unit tests by default. If you do not - disable them with the appropriate CMake option (please see - :ref:`configopt-label`), you can run them after the build completes - to check if everything is built properly. +.. note:: RAJA is configured to build its tests, examples, and tutorial + exercises by default. If you do not disable them with the + appropriate CMake option (see :ref:`configopt-label`), + you can run them after the build completes to check if everything + is built properly. 
- The easiest way to run the full set of RAJA tests is to type:: + The easiest way to run the full set of RAJA tests is to type:: - $ make test + $ make test - in the build directory after the build completes. + in the build directory after the build completes. - You can also run individual tests by invoking test - executables directly. They will be located in the ``test`` - subdirectory in the build space directory. RAJA tests use the - `Google Test framework `_, - so you can also run tests via Google Test commands. + You can also run individual tests by invoking the corresponding + test executables directly. They will be located in the ``test`` + subdirectory in your build space. RAJA tests use the + `Google Test framework `_, + so you can also run and filter tests via Google Test commands. - * RAJA also contains example and tutorial exercise - programs you can run if you wish. Similar to the RAJA tests, - the examples and exercises are built by default and can be - disabled with CMake options (see :ref:`configopt-label`). The - source files for these are located in the ``RAJA/examples`` and - ``RAJA/exercises`` directories, respectively. When built, the - executables for the examples and exercises will be located in - the ``bin`` subdirectory in the build space directory. Feel free to - experiment by editing the source files and recompiling. + The source files for RAJA examples and exercises are located in + the ``RAJA/examples`` and ``RAJA/exercises`` directories, + respectively. When built, the executables for the examples and + exercises will be located in the ``bin`` subdirectory in your build + space. -.. _build-external-tpl-label: +.. _getting_started_build_gpu-label: -.. note:: You may use externally-supplied versions of the camp, CUB, and rocPRIM - libraries with RAJA if you wish. To do so, pass the following - options to CMake: - * External camp: -DEXTERNAL_CAMP_SOURCE_DIR= - * External CUB: -DRAJA_ENABLE_EXTERNAL_CUB=On -DCUB_DIR= - * External rocPRIM: -DRAJA_ENABLE_EXTERNAL_ROCPRIM=On - -DROCPRIM_DIR= +------------------------------------------- +Additional RAJA Back-end Build Information +------------------------------------------- ------------------ -GPU Builds, etc. ------------------ +Configuring a RAJA build to support a GPU back-end, such as CUDA, HIP, or +OpenMP target offload, typically requires additional CMake options, which +we describe next. CUDA ^^^^^^ To run RAJA code on NVIDIA GPUs, one typically must have a CUDA compiler -installed on your system, in addition to a host code compiler. You may need +installed on the system, in addition to a host code compiler. You may need to specify both when you run CMake. The host compiler is specified using the -``CMAKE_CXX_COMPILER`` CMake variable. The CUDA compiler is specified with -the ``CMAKE_CUDA_COMPILER`` variable. +``CMAKE_CXX_COMPILER`` CMake variable as described earlier. The CUDA software +stack and compiler are specified using the following CMake options: + + * -DCUDA_TOOLKIT_ROOT_DIR=path/to/cuda/toolkit + * -DCMAKE_CUDA_COMPILER=path/to/nvcc When using the NVIDIA nvcc compiler for RAJA CUDA functionality, the variables: @@ -159,21 +220,19 @@ When using the NVIDIA nvcc compiler for RAJA CUDA functionality, the variables: * CMAKE_CUDA_FLAGS_DEBUG * CMAKE_CUDA_FLAGS_RELWITHDEBINFO -which corresponding to the standard CMake build types are used to pass flags -to nvcc. +correspond to the standard CMake build types and are used to pass additional +compiler options to nvcc. -.. 
note:: When nvcc must pass options to the host compiler, the arguments - can be included using these CMake variables. Host compiler - options must be prepended with the `-Xcompiler` directive. +.. note:: Often, nvcc must pass options to the host compiler. These arguments + can be included using the ``CMAKE_CUDA_FLAGS...`` CMake variables + listed above. Host compiler options must be prepended with the + ``-Xcompiler`` directive to properly propagate. -To set the CUDA compute architecture for the nvcc compiler, which should be -chosen based on the NVIDIA GPU hardware you are using, you can use the -``CUDA_ARCH`` CMake variable. For example, the CMake option:: - - -DCUDA_ARCH=sm_60 - -will tell the compiler to use the `sm_60` SASS architecture in its second -stage of compilation. It will pick the PTX architecture to use in the first +To set the CUDA compute architecture, which should be chosen based on the +NVIDIA GPU hardware you are using, you can use the ``CUDA_ARCH`` CMake +variable. For example, the CMake option ``-DCUDA_ARCH=sm_70`` will tell the +compiler to use the `sm_70` SASS architecture in its second stage of +compilation. The compiler will pick the PTX architecture to use in the first stage of compilation that is suitable for the SASS architecture you specify. Alternatively, you may specify the PTX and SASS architectures, using @@ -182,29 +241,37 @@ appropriate nvcc options in the ``CMAKE_CUDA_FLAGS_*`` variables. .. note:: **RAJA requires a minimum CUDA architecture level of `sm_35` to use all supported CUDA features.** Mostly, the architecture level affects which RAJA CUDA atomic operations are available and how they are - implemented inside RAJA. This is described in :ref:`atomics-label`. + implemented inside RAJA. This is described in + :ref:`feat-atomics-label`. * If you do not specify a value for ``CUDA_ARCH``, it will be set to `sm_35` by default and CMake will emit a status message - indicatting this choice was made. + indicating this choice was made. * If you give a ``CUDA_ARCH`` value less than `sm_35` (e.g., `sm_30`), - CMake will report this and stop processing. - -Also, RAJA relies on the CUB CUDA utilities library for some CUDA functionality. -The CUB included in the CUDA toolkit is used by default if available. RAJA -includes a CUB submodule that is used if it is not available. To use -an external CUB install provide the following option to CMake: -``-DRAJA_ENABLE_EXTERNAL_CUB=On -DCUB_DIR=``. - -.. note:: **It is important to note that the CUDA toolkit version of cub is + CMake will report this as an error and stop processing. + +Also, RAJA relies on the CUB CUDA utilities library, mentioned earlier, for +some CUDA back-end functionality. The CUB version included in the CUDA toolkit +installation is used by default when available. This is the case for CUDA +version 11 and later. RAJA includes a CUB submodule that is used by default +with older versions of CUDA. To use an external CUB installation, provide the +following options to CMake:: + + cmake \ + ... \ + -DRAJA_ENABLE_EXTERNAL_CUB=On \ + -DCUB_DIR= \ + ... + +.. note:: The CUDA toolkit version of CUB is required for compatibility with the CUDA toolkit version of thrust - starting with CUDA toolkit version v11.0.0. So, if you build - RAJA with CUDA version 11 or higher you must use the CUDA - toolkit version of CUB to use Thrust and be compatible with libraries - that use Thrust. + starting with CUDA version 11.0.0.
So, if you build + RAJA with CUDA version 11 or higher, you should use the version of + CUB contained in the CUDA toolkit version you are using to use + Thrust and to be compatible with libraries that use Thrust. - *It is important to note that the version of Googletest that +.. note:: The version of Googletest that is used in RAJA version v0.11.0 or newer requires CUDA version 9.2.x or newer when compiling with nvcc. Thus, if you build RAJA with CUDA enabled and want to also enable RAJA tests, you @@ -213,74 +280,88 @@ an external CUB install provide the following option to CMake: HIP ^^^^ -To run RAJA code on AMD GPUs, one typically uses the HIP compiler and tool -chain (which can also be used to compile code for NVIDIA GPUs). +To run RAJA code on AMD GPUs, one typically uses a ROCm compiler and tool +chain (which can also be used to compile code for NVIDIA GPUs, which is not +covered in detail in RAJA user documentation). .. note:: RAJA requires version 3.5 or newer of the ROCm software stack to use the RAJA HIP back-end. -Also, RAJA relies on the rocPRIM HIP utilities library for some HIP +Unlike CUDA, you do not specify a host compiler and a device compiler when +using the AMD ROCm software stack. Typical CMake options to use when building +with a ROCm stack are: + + * -DROCM_ROOT_DIR=path/to/rocm + * -DHIP_ROOT_DIR=path/to/hip + * -DHIP_PATH=path/to/hip/binaries + * -DCMAKE_CXX_COMPILER=path/to/rocm/compiler + +Additionally, you use the CMake variable ``CMAKE_HIP_ARCHITECTURES`` to set +the target compute architecture. For example:: + + -DCMAKE_HIP_ARCHITECTURES=gfx908 + +RAJA relies on the rocPRIM HIP utilities library for some HIP functionality. The rocPRIM included in the ROCm install is used by default if available. RAJA includes a rocPRIM submodule that is used if it is not -available. To use an external rocPRIM install provide the following option to CMake: -``-DRAJA_ENABLE_EXTERNAL_ROCPRIM=On -DROCPRIM_DIR=``. +available. To use an external rocPRIM install provide the following options +to CMake:: + + cmake \ + ... \ + -DRAJA_ENABLE_EXTERNAL_ROCPRIM=On \ + -DROCPRIM_DIR= \ + ... -.. note:: When using HIP and targeting NVIDIA GPUs RAJA uses CUB instead of - rocPRIM. In this case you must use an external CUB install using the - CMake variables described in the CUDA section. +.. note:: When using HIP and targeting NVIDIA GPUs, RAJA uses CUB instead of + rocPRIM. In this case, you must configure with an external CUB + install using the CMake variables described in the CUDA section above. OpenMP ^^^^^^^ -To use OpenMP target offlad GPU execution, additional options may need to be +To use OpenMP target offload GPU execution, additional options may need to be passed to the compiler. The variable ``OpenMP_CXX_FLAGS`` is used for this. Option syntax follows the CMake *list* pattern. For example, to specify OpenMP target options for NVIDIA GPUs using a clang-based compiler, one may do something like:: cmake \ - .... - -DOpenMP_CXX_FLAGS="-fopenmp;-fopenmp-targets=nvptx64-nvidia-cuda" + ... \ + -DOpenMP_CXX_FLAGS="-fopenmp;-fopenmp-targets=nvptx64-nvidia-cuda" \ + ... ---------------------------------------- RAJA Example Build Configuration Files ---------------------------------------- -The ``RAJA/scripts`` directory contains subdirectories with a variety of -build scripts we use to build and test RAJA on various platforms with -various compilers. These scripts pass files (*CMake cache files*) located in -the ``RAJA/host-configs`` directory to CMake using the '-C' option. 
+The RAJA repository has subdirectories ``RAJA/scripts/*-builds`` that contain +a variety of build scripts we use to build and test RAJA on various platforms +with various compilers. These scripts pass files (*CMake cache files*) +located in the ``RAJA/host-configs`` directory to CMake using the '-C' option. These files serve as useful examples of how to configure RAJA prior to compilation. ----------------- -Installing RAJA ----------------- - -To install RAJA as a library, run the following command in your build -directory:: - - $ make install - -This will copy RAJA header files to the ``include`` directory and the RAJA -library will be installed in the ``lib`` directory you specified using the -``-DCMAKE_INSTALL_PREFIX`` CMake option. - - ====================== Learning to Use RAJA ====================== -If you want to view and run a very simple RAJA example code, a good place to -start is located in the file: ``RAJA/examples/daxpy.cpp``. After building -RAJA with the options you select, the executable for this code will reside -in the file: ``/examples/bin/daxpy``. Simply type the name -of the executable in your build directory to run it; i.e.,:: - - $ ./examples/bin/daxpy - -The ``RAJA/examples`` directory also contains many other RAJA example codes -you can run and experiment with. +The RAJA repository contains a variety of example source codes that you are +encouraged to view and run to learn about how to use RAJA: + + * The ``RAJA/examples`` directory contains various examples that illustrate + algorithm patterns. + * The ``RAJA/exercises`` directory contains exercises for users to work + through along with complete solutions. These are described in detail + in the :ref:`tutorial-label` section. + * Other examples can also be found in the ``RAJA/test`` directories. + +We mentioned earlier that RAJA examples, exercises, and tests are built by +default when RAJA is compiled. So, unless you explicitly disable them when +you run CMake to configure a RAJA build, you can run them after compiling RAJA. +Executables for the examples and exercises will be located in the +``/bin`` directory in your build space. Test executables will +be located in the ``/test`` directory. For an overview of all the main RAJA features, see :ref:`features-label`. A full tutorial with a variety of examples showing how to use RAJA features diff --git a/docs/sphinx/user_guide/index.rst b/docs/sphinx/user_guide/index.rst index 692bf6dd9f..bb4eb6d2db 100644 --- a/docs/sphinx/user_guide/index.rst +++ b/docs/sphinx/user_guide/index.rst @@ -26,11 +26,11 @@ Additional information about things to think about when considering whether to use RAJA in an application can be found in :ref:`app-considerations-label`. .. toctree:: - :maxdepth: 2 + :maxdepth: 3 getting_started + using_raja + config_options features app_considerations tutorial - using_raja - config_options diff --git a/docs/sphinx/user_guide/tutorial.rst b/docs/sphinx/user_guide/tutorial.rst index 3e2bb25f5d..a468c86984 100644 --- a/docs/sphinx/user_guide/tutorial.rst +++ b/docs/sphinx/user_guide/tutorial.rst @@ -8,38 +8,54 @@ .. _tutorial-label: -********************** +**************************** +RAJA Tutorial and Examples +**************************** + +The following sections contain tutorial material and examples that describe +how to use RAJA features. 
+ +=============== RAJA Tutorial -********************** - -In addition to the tutorial portion of this RAJA User Guide, we maintain -a repository of tutorial presentation materials here `RAJA Tutorials Repo `_. - -This RAJA tutorial introduces RAJA concepts and capabilities via a -sequence of examples of increasing complexity. Complete working codes for -the examples are located in the ``RAJA``examples`` directory. The RAJA -tutorial evolves as we add new features to RAJA, so refer to it periodically -if you are interested in learning about them. - -To understand the discussion and code examples, a working knowledge of C++ -templates and lambda expressions is required. So, before we begin, we provide -a bit of background discussion of basic aspects of how RAJA use employs C++ -templates and lambda expressions, which is essential to using RAJA successfully. - -To understand the GPU examples (e.g., CUDA), it is also important to know the -difference between CPU (host) and GPU (device) memory allocations and how -transfers between those memory spaces work. For a detailed discussion, see -`Device Memory `_. - -RAJA does not provide a memory model. This is by design as developers of many -of applications that use RAJA prefer to manage memory themselves. Thus, users -are responsible for ensuring that data is properly allocated and initialized -on a GPU device when running GPU code. This can be done using explicit host -and device allocation and copying between host and device memory spaces or via -unified memory (UM), if available. RAJA developers also support a library -called `CHAI `_ which complements RAJA by -providing a alternative to manual host-device memory copy calls or UM. -For more information, see :ref:`plugins-label`. +=============== + +This section contains a self-paced tutorial that shows how to use many RAJA +features by way of a sequence of examples and exercises. Each exercise is +located in files in the ``RAJA/exercises`` directory, one *exercise* file with +code sections removed and comments containing instructions to fill in the +missing code parts and one *solution* file containing complete working code to +compare with and for guidance if you get stuck working on the exercise file. +You are encouraged to build and run the exercises and modify them to try out +different variations. + +We also maintain a repository of tutorial slide presentations +`RAJA Tutorials Repo `_ which we use +when we give in-person or virtual online tutorials in various venues. The +presentations complement the material found here. The tutorial material +evolves as we add new features to RAJA, so refer to it periodically if you +are interested in learning about new things in RAJA. + +To understand the GPU examples (e.g., CUDA), it is also important to know the +difference between CPU (host) and GPU (device) memory allocations and how +transfers between those memory spaces work. For a detailed discussion, see +`Device Memory `_. + +It is important to note that RAJA does not provide a memory model. This is by +design as application developers who use RAJA prefer to manage memory +in different ways. Thus, users are responsible for ensuring that data is +properly allocated and initialized on a GPU device when running GPU code. +This can be done using explicit host and device allocation and copying between +host and device memory spaces or via unified memory (UM), if available. 
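As a concrete illustration of the point above, the following minimal sketch uses CUDA unified memory around a ``RAJA::forall`` kernel with a ``RAJA::cuda_exec`` policy. The array name, length, and block size are illustrative assumptions, and a real application would typically wrap such calls in its own memory manager or use CHAI/Umpire as noted below::

   int N = ...;
   double* a = nullptr;

   // Allocate unified memory, accessible from both host and device.
   cudaMallocManaged(&a, N * sizeof(double));

   // Initialize on the host.
   for (int i = 0; i < N; ++i) { a[i] = 1.0; }

   // Run a RAJA kernel on the device using the same pointer.
   RAJA::forall< RAJA::cuda_exec<256> >( RAJA::TypedRangeSegment<int>(0, N),
     [=] __device__ (int i) {
       a[i] *= 2.0;
   });

   // Wait for the device before the host reads the results, then free.
   cudaDeviceSynchronize();
   cudaFree(a);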
+The RAJA Portability Suite contains other libraries, namely +`CHAI `_ and +`Umpire `_, that complement RAJA by +providing alternatives to manual programming model specific memory operations. + +.. note:: Most of the CUDA GPU exercises use unified memory (UM) via a simple + memory manager capability provided in a file in the ``RAJA/exercises`` + directory. HIP GPU exercises use explicit host and device memory + allocations and explicit memory copy operations to move data between + the two. .. _tutorial-lambda-label: @@ -47,20 +63,26 @@ For more information, see :ref:`plugins-label`. =============================== A Little C++ Background =============================== -RAJA makes heavy use of C++ templates and using RAJA most easily and -effectively is done by representing the bodies of loop kernels as C++ lambda -expressions. Alternatively, C++ functors can be used, but they make -application source code more complex, potentially placing a significant -negative burden on source code readability and maintainability. +To understand the discussion and code examples, a working knowledge of C++ +templates and lambda expressions is required. So, before we begin, we provide +a bit of background discussion of basic aspects of how RAJA employs C++ +templates and lambda expressions, which is essential to using RAJA successfully. + +RAJA is almost entirely a header-only library that makes heavy use of +C++ templates. Using RAJA most easily and effectively is done by representing +the bodies of loop kernels as C++ lambda expressions. Alternatively, C++ +functors can be used, but they make application source code more complex, +potentially placing a significant negative burden on source code readability +and maintainability. ----------------------------------- C++ Templates ----------------------------------- -C++ templates enable one to write generic code and have the compiler generate -a specific implementation for each set of template parameter types you use. -For example, the ``RAJA::forall`` method to execute loop kernels is a -template method defined as:: +C++ templates enable one to write type-generic code and have the compiler +generate an implementation for each set of template parameter types specified. +For example, the ``RAJA::forall`` method to execute loop kernels is +essentially a template method defined as:: template ( RAJA::RangeSegment(0, N), [=](int i) { + RAJA::forall< RAJA::loop_exec >( RAJA::TypedRangeSegment<int>(0, N), [=](int i) { a[i] = b[i] + c[i]; }); -The "IdxType" and "LoopBody" types are deduced by the compiler based on what -arguments are passed to the ``RAJA::forall`` method. Here, the loop body type -is defined by the lambda expression:: +is a sequential CPU RAJA kernel that performs an element-by-element vector sum. +The C-style analogue of this kernel is:: + + for (int i = 0; i < N; ++i) { + a[i] = b[i] + c[i]; + } + +The execution policy type ``RAJA::loop_exec`` template argument +is used to choose a specific implementation of the +``RAJA::forall`` method. The ``IdxType`` and ``LoopBody`` types are deduced by +the compiler based on the arguments passed to the ``RAJA::forall`` method; +i.e., the ``IdxType`` is the stride-1 index range:: + + RAJA::TypedRangeSegment<int>(0, N) + +and the ``LoopBody`` type is the lambda expression:: [=](int i) { a[i] = b[i] + c[i]; } @@ -88,11 +124,11 @@ Elements of C++ Lambda Expressions Here, we provide a brief description of the basic elements of C++ lambda expressions.
A more technical and detailed discussion is available here: -`Lambda Functions in C++11 - the Definitive Guide `_ +`Lambda Functions in C++11 - the Definitive Guide `_ -Lambda expressions were introduced in C++ 11 to provide a lexical-scoped -name binding; specifically, a *closure* that stores a function with a data -environment. That is, a lambda expression can *capture* variables from an +Lambda expressions were introduced in C++ 11 to provide a lexical-scoped +name binding; specifically, a *closure* that stores a function with a data +environment. That is, a lambda expression can *capture* variables from an enclosing scope for use within the local scope of the function expression. A C++ lambda expression has the following form:: @@ -100,36 +136,39 @@ A C++ lambda expression has the following form:: [capture list] (parameter list) {function body} The ``capture list`` specifies how variables outside the lambda scope are pulled -into the lambda data environment. The ``parameter list`` defines arguments +into the lambda data environment. The ``parameter list`` defines arguments passed to the lambda function body -- for the most part, lambda arguments -are just like arguments in a regular C++ method. Variables in the capture list -are initialized when the lambda expression is created, while those in the -parameter list are set when the lambda expression is called. The body of a +are just like arguments in a regular C++ method. Variables in the capture list +are initialized when the lambda expression is created, while those in the +parameter list are set when the lambda expression is called. The body of a lambda expression is similar to the body of an ordinary C++ method. -RAJA templates, such as ``RAJA::forall`` and ``RAJA::kernel`` pass arguments -to lambdas based on usage and context; e.g., loop iteration indices. +RAJA kernel execution templates, such as ``RAJA::forall`` and ``RAJA::kernel`` +that we will describe in detail later, pass arguments +to lambdas based on usage and context such as loop iteration indices. -A C++ lambda expression can capture variables in the capture list by value -or by reference. This is similar to how arguments to C++ methods are passed; -i.e., *pass-by-reference* or *pass-by-value*. However, there are some subtle +A C++ lambda expression can capture variables in the capture list *by value* +or *by reference*. This is similar to how arguments to C++ methods are passed; +i.e., *pass-by-reference* or *pass-by-value*. However, there are some subtle differences between lambda variable capture rules and those for ordinary -methods. Variables mentioned in the capture list with no extra symbols are -captured by value. Capture-by-reference is accomplished by using the -reference symbol '&' before the variable name; for example:: +methods. **Variables included in the capture list with no extra symbols are +captured by value.** Variables captured by value are effectively *const* +inside the lambda expression body and cannot be written to. +Capture-by-reference is accomplished by using the reference symbol '&' before +the variable name similar to C++ method arguments. For example:: int x; int y = 100; [&x, &y](){ x = y; }; -generates a lambda expression that captures both 'x' and 'y' by reference -and assigns the value of 'y' to 'x' when called. The same outcome would be +generates a lambda expression that captures both 'x' and 'y' by reference +and assigns the value of 'y' to 'x' when called. 
The same outcome would be achieved by writing:: [&](){ x = y; }; // capture all lambda arguments by reference... or:: - [=, &x](){ x = y; }; // capture 'x' by reference and 'y' by value... + [=, &x](){ x = y; }; // capture 'x' by reference and 'y' by value... Note that the following two attempts will generate compilation errors:: @@ -138,80 +177,90 @@ Note that the following two attempts will generate compilation errors:: [x, &y](){ x = y; }; // error: cannot assign to 'x' since it is captured // by value. -**Specifically, a variable hat is captured by value is read-only.** +.. note:: A variable that is captured by value in a lambda expression is + **read-only.** ---------------------------------------- -A Few Notes About Lambda Usage With RAJA +A Few Notes About Lambda Usage With RAJA ---------------------------------------- -There are several issues to note about C++ lambda expressions; in particular, -with respect to RAJA usage. We describe them here. +There are several issues to note about using C++ lambda expressions to +represent kernel bodies with RAJA. We describe them here. - * **Prefer by-value lambda capture.** + * **Prefer by-value lambda capture.** - We recommended `capture by-value` for all lambda loop bodies passed to - RAJA execution methods. To execute a RAJA loop on a non-CPU device, such - as a GPU, all variables accessed in the loop body must be passed into the - GPU device data environment. Using capture by-value for all RAJA-based - lambda usage will allow your code to be portable for either CPU or GPU - execution. In addition, the read-only nature of variables captured - by-value can help avoid incorrect CPU code since the compiler will report + We recommend `capture by-value` for all lambda kernel bodies passed to + RAJA execution methods. To execute a RAJA loop on a non-CPU device, such + as a GPU, all variables accessed in the loop body must be passed into the + GPU device data environment. Using capture by-value for all RAJA-based + lambda usage will allow your code to be portable for either CPU or GPU + execution. In addition, the read-only nature of variables captured + by-value can help avoid incorrect CPU code since the compiler will report incorrect usage. +|br| - * **Must use 'device' annotation for CUDA device execution.** + * **The '__device__' annotation is required for device execution using CUDA or HIP.** - Any lambda passed to a CUDA execution context (or function called from a - CUDA device kernel, for that matter) must be decorated with + Any lambda passed to a CUDA or HIP execution context (or function called from a + device kernel, for that matter) must be decorated with the ``__device__`` annotation; for example:: - + RAJA::forall>( range, [=] __device__ (int i) { ... } ); Without this, the code will not compile and generate compiler errors - indicating that a 'host' lambda cannot be called from 'device' code. + indicating that a 'host' lambda cannot be called in 'device' code. RAJA provides the macro ``RAJA_DEVICE`` that can be used to help switch - between host-only or device-only CUDA compilation. - + between host-only or device-only compilation. + +|br| * **Use 'host-device' annotation on a lambda carefully.** RAJA provides the macro ``RAJA_HOST_DEVICE`` to support the dual - CUDA annotation ``__ host__ __device__``. This makes a lambda or function - callable from CPU or CUDA device code. 
However, when CPU performance is - important, **the host-device annotation should be applied carefully on a - lambda that is used in a host (i.e., CPU) execution context**. - Unfortunately, a loop kernel containing a lambda annotated in this way - may run noticeably slower on a CPU than the same lambda with no annotation - depending on the version of the nvcc compiler you are using. - + annotation ``__ host__ __device__``, which makes a lambda or function + callable from CPU or GPU device code. However, when CPU performance is + important, **the host-device annotation should be applied carefully on a + lambda that is used in a host (i.e., CPU) execution context**. Although + compiler improvements in recent years have significantly + improved support for host-device lambda expressions, a loop kernel + containing a lambda annotated in this way may run noticeably slower on + a CPU than the same lambda with no annotation depending on the version of + the compiler (e.g., nvcc) you are using. To be sure that your code does not + suffer in performance, we recommend comparing CPU execution timings of + important kernels with and without the ``__host__ __device__`` annotation. + +|br| + + * **Cannot use 'break' and 'continue' statements in a lambda.** - * **Cannot use 'break' and 'continue' statements in a lambda.** + In this regard, a lambda expression is similar to a function. So, if you + have loops in your code with these statements, they should be rewritten. - In this regard, a lambda expression is similar to a function. So, if you - have loops in your code with these statements, they should be rewritten. - +|br| - * **Global variables are not captured in a lambda.** + * **Global variables are not captured in a lambda.** - This fact is due to the C++ standard. If you need (read-only) access to a - global variable inside a lambda expression, one solution is to make a local + This fact is due to the C++ standard. If you need access to a + global variable inside a lambda expression, one solution is to make a local reference to it; for example:: double& ref_to_global_val = global_val; - RAJA::forall>( range, [=] __device__ (int i) { + RAJA::forall>( range, [=] __device__ (int i) { // use ref_to_global_val } ); - - * **Local stack arrays may not be captured by CUDA device lambdas.** +|br| + + * **Local stack arrays may not be captured by CUDA device lambdas.** Although this is inconsistent with the C++ standard (local stack arrays - are properly captured in lambdas for code that will execute on a CPU), - attempting to access elements in a local stack array in a CUDA device - lambda may generate a compilation error depending on the version of the - nvcc compiler you are using. One solution to this problem is to wrap the + are properly captured in lambdas for code that will execute on a CPU), + attempting to access elements in a local stack array in a CUDA device + lambda may generate a compilation error depending on the version of the + device compiler you are using. One solution to this problem is to wrap the array in a struct; for example:: struct array_wrapper { @@ -224,33 +273,32 @@ with respect to RAJA usage. We describe them here. // access entries of bounds.array } ); - This issue appears to be resolved in in the 10.1 release of CUDA. If you - are using an earlier version of nvcc, an implementation - similar to the one above will be required. - - -================ -RAJA Examples -================ + This issue was resolved in the 10.1 release of CUDA. 
If you are using an + earlier version, an implementation similar to the one above will be required. + +.. |br| raw:: html + +
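+To tie the notes above together, here is a minimal sketch (not one of the
+exercise codes; the execution policy, block size, and variable names are
+illustrative) of a by-value capture lambda decorated for CUDA device
+execution with the ``RAJA_DEVICE`` macro::
+
+   const int N = 256;
+   double* x = nullptr;
+   cudaMallocManaged(&x, N * sizeof(double));   // device-accessible memory
+   double c = 2.0;                              // captured by value below
+
+   RAJA::forall< RAJA::cuda_exec<256> >( RAJA::TypedRangeSegment<int>(0, N),
+     [=] RAJA_DEVICE (int i) {
+       x[i] = c * i;    // fine: 'x' and 'c' are captured by value
+       // c = 3.0;      // compile error: 'c' is read-only inside the lambda
+     } );
+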
+ +=========================== +RAJA Examples and Exercises +=========================== The remainder of this tutorial illustrates how to use RAJA features with -working code examples that are located in the ``RAJA/examples`` -directory. Additional information about the RAJA features -used can be found in :ref:`features-label`. - -The examples demonstrate CPU execution (sequential, SIMD, OpenMP -multithreading) and CUDA GPU execution. Examples that show how to use -RAJA with other parallel programming model back-ends that are in -development will appear in future RAJA releases. For adventurous users who -wish to try experimental features, usage is similar to what is shown in the +working code examples and interactive exercises. Files containing the +exercise source code are located in the ``RAJA/exercises`` directory. +Additional information about the RAJA features used can be found +in :ref:`features-label`. + +The examples demonstrate CPU execution (sequential and OpenMP +multithreading) and GPU execution (CUDA and/or HIP). Examples that show how +to use RAJA with other parallel programming model back-ends will appear in +future RAJA releases. For adventurous users who wish to try experimental +RAJA back-end support, usage is similar to what is shown in the examples here. All RAJA programming model support features are enabled via CMake options, -which are described in :ref:`configopt-label`. - -For the purposes of discussion of each example, we assume that any and all -data used has been properly allocated and initialized. This is done in the -example code files, but is not discussed further here. +which are described in :ref:`configopt-label`. .. _tutorialbasic-label: @@ -260,52 +308,147 @@ Simple Loops and Basic RAJA Features The examples in this section illustrate how to use ``RAJA::forall`` methods to execute simple loop kernels; i.e., non-nested loops. It also describes -iteration spaces, reductions, atomic operations, scans, and sorts. +iteration spaces, reductions, atomic operations, scans, sorts, and RAJA +data views. .. toctree:: :maxdepth: 1 tutorial/add_vectors.rst - tutorial/dot_product.rst tutorial/indexset_segments.rst tutorial/vertexsum_coloring.rst + tutorial/dot_product.rst tutorial/reductions.rst tutorial/atomic_histogram.rst tutorial/scan.rst tutorial/sort.rst + tutorial/view_layout.rst + tutorial/permuted-layout-batch-matrix-multiply.rst .. _tutorialcomplex-label: ================================================================= -Complex Loops: Transformations and Advanced RAJA Features +Complex Loops and Advanced RAJA Features ================================================================= -The examples in this section illustrate how to use ``RAJA::kernel`` methods -to execute complex loop kernels, such as nested loops. It also describes -how to construct kernel execution policies, use different view types and -tiling mechanisms to transform loop patterns. +RAJA provides two APIs for writing complex kernels involving nested +loops: ``RAJA::kernel`` that has been available for several years and +``RAJA::expt::launch``, which is more recent and which will be moved out of +the ``expt`` namespace soon. We briefly introduce both interfaces here. +The tutorial sections that follow provide much more detailed descriptions. + +``RAJA::kernel`` is analogous to ``RAJA::forall`` in that it involves +kernel execution templates, execution policies, iteration spaces, and lambda +expression kernel bodies. 
The main differences between ``RAJA::kernel`` and +``RAJA::forall`` are: + + * ``RAJA::kernel`` requires a tuple of iteration spaces, one for each level + in a loop nest, whereas ``RAJA::forall`` takes exactly one iteration + space. + * ``RAJA::kernel`` can accept multiple lambda expressions to express + different parts of a kernel body, whereas ``RAJA::forall`` accepts + exactly one lambda expression for a kernel body. + * ``RAJA::kernel`` execution policies are more complicated than those + for ``RAJA::forall``. ``RAJA::forall`` policies essentially represent + the kernel execution back-end only. ``RAJA::kernel`` execution policies + enable complex compile time algorithm transformations to be done without + changing the kernel code. + +The following exercises illustrate the common usage of ``RAJA::kernel`` +and ````RAJA::expt::launch``. Please see :ref:`loop_elements-kernelpol-label` +for more information about other execution policy constructs ``RAJA::kernel`` +provides. ``RAJA::expt::launch`` takes a ``RAJA::expt::Grid`` type argument for +representing a teams-thread launch configuration, and a lambda expression +which takes a ``RAJA::expt::LaunchContext`` argument. ``RAJA::expt::launch`` +allows an optional run time choice of execution environment, either CPU or GPU. +Code written inside the lambda expression body will execute in the chosen +execution environment. Within that environment, a user executes +kernel operations using ``RAJA::expt::loop`` method calls, which +take lambda expressions to express loop body operations. + +.. note:: A key difference between the ``RAJA::kernel`` and + ``RAJA::expt::launch`` approaches is that almost all of the + kernel execution pattern is expressed in the execution policy + when using ``RAJA::kernel``, whereas with ``RAJA::expt::launch`` the + kernel execution pattern is expressed mostly in the lambda + expression kernel body. + +One may argue that ``RAJA::kernel`` is more portable and flexible in that +the execution policy enables compile time code transformations without +changing kernel body code. On the other hand, ``RAJA::expt::launch`` is +less opaque and more intuitive, but may require kernel body code changes for +algorithm changes. Which interface to use depends on personal preference +and other concerns, such as portability requirements, the need for run time +execution selection, etc. Kernel structure is more explicit in application +source code with ``RAJA::expt::launch``, and more concise and arguably more +opaque with ``RAJA::kernel``. There is a large overlap of algorithms that can +be expressed with either interface. However, there are things that one can do +with one or the other but not both. + +In the following sections, we introduce the basic mechanics and features +of both APIs with examples and exercises. We also present a sequence of +execution policy examples and matrix transpose examples using both +``RAJA::kernel`` and ``RAJA::expt::launch`` to compare and contrast the +two interfaces. + +=========================================================================== +Nested Loops with ``RAJA::kernel`` +=========================================================================== + +The examples in this section illustrate various features of the +``RAJA::kernel`` API used to execute nested loop kernels. It describes how to +construct kernel execution policies and use different view types and tiling +mechanisms to transform loop patterns. More information can be found in +:ref:`loop_elements-kernel-label`. .. 
toctree:: :maxdepth: 1 - tutorial/matrix_multiply.rst - tutorial/nested_loop_reorder.rst - tutorial/permuted-layout.rst - tutorial/offset-layout.rst - tutorial/tiled_matrix_transpose.rst - tutorial/matrix_transpose_local_array.rst - tutorial/halo-exchange.rst + tutorial/kernel_nested_loop_reorder.rst + tutorial/kernel_exec_pols.rst + tutorial/offset-layout-5pt-stencil.rst ================================================================= -Team based Loops: Nested loops with a thread/team model +Nested Loops with ``RAJA::expt::launch`` ================================================================= The examples in this section illustrate how to use ``RAJA::expt::launch`` -to create an run-time selectable execution space for expressing algorithms -in terms of threads and teams. +to create an run time selectable execution space for expressing algorithms +as nested loops. + +.. toctree:: + :maxdepth: 1 + + tutorial/launch_basic.rst + tutorial/launch_exec_pols.rst + tutorial/launch_naming_kernels.rst + +.. _tutorialmatrixtranspose-label: + +=============================================================================== +Comparing ``RAJA::kernel`` and ``RAJA::expt::launch``: Matrix-Transpose +=============================================================================== + +In this section, we compare ``RAJA::kernel`` and ``RAJA::expt::launch`` +implementations of a matrix transpose algorithm. We illustrate +implementation differences of the two interfaces as we build upon each +example with more complex features. .. toctree:: :maxdepth: 1 - tutorial/teams_basic.rst - tutorial/naming_kernels.rst + tutorial/matrix_transpose.rst + tutorial/matrix_transpose_tiled.rst + tutorial/matrix_transpose_local_array.rst + +========================================== +Other RAJA Features and Usage Examples +========================================== + +.. toctree:: + :maxdepth: 1 + + tutorial/halo-exchange.rst + tutorial/matrix_multiply.rst + + diff --git a/docs/sphinx/user_guide/tutorial/add_vectors.rst b/docs/sphinx/user_guide/tutorial/add_vectors.rst index 73e7ed13d9..9b15a1a34d 100644 --- a/docs/sphinx/user_guide/tutorial/add_vectors.rst +++ b/docs/sphinx/user_guide/tutorial/add_vectors.rst @@ -6,23 +6,29 @@ .. ## SPDX-License-Identifier: (BSD-3-Clause) .. ## -.. _addvectors-label: +.. _tut-addvectors-label: -------------------------------------- -Vector Addition (Basic Loop Execution) +Basic Loop Execution: Vector Addition -------------------------------------- -Key RAJA features shown in this example: +This section contains an exercise file ``RAJA/exercises/vector-addition.cpp`` +for you to work through if you wish to get some practice with RAJA. The +file ``RAJA/exercises/vector-addition_solution.cpp`` contains complete +working code for the examples discussed in this section. You can use the +solution file to check your work and for guidance if you get stuck. To build +the exercises execute ``make vector-addition`` and ``make vector-addition_solution`` +from the build directory. - * ``RAJA::forall`` loop execution template - * ``RAJA::RangeSegment`` iteration space construct - * RAJA execution policies +Key RAJA features shown in this example are: + * ``RAJA::forall`` loop execution template and execution policies + * ``RAJA::TypedRangeSegment`` iteration space construct In the example, we add two vectors 'a' and 'b' of length N and store the result in vector 'c'. A simple C-style loop that does this is: -.. literalinclude:: ../../../../examples/tut_add-vectors.cpp +.. 
literalinclude:: ../../../../exercises/vector-addition_solution.cpp :start-after: _cstyle_vector_add_start :end-before: _cstyle_vector_add_end :language: C++ @@ -31,28 +37,24 @@ store the result in vector 'c'. A simple C-style loop that does this is: RAJA Variants ^^^^^^^^^^^^^^^^^^^^^ -The RAJA variants of the vector addition operation illustrate how the -same kernel can be run with a variety of different programming model -back-ends by simply swapping out the execution policy. This can be done -by defining type aliases in a header file so that execution policy types -can be easily switched, and the code can be compiled to run differently, -without changing the loop kernel code. In the example code, we -make all execution policy types explicit for clarity. - -For the RAJA variants, we replace the C-style for-loop with a call to the -``RAJA::forall`` loop execution template method. +For the RAJA variants of the vector addition kernel, we replace the C-style +for-loop with a call to the ``RAJA::forall`` loop execution template method. The method takes an iteration space and the vector addition loop body as -a C++ lambda expression. We pass a ``RAJA::RangeSegment`` object, which -describes a contiguous sequence of integral values [0, N) for the iteration -space (for more information about RAJA loop indexing concepts, -see :ref:`index-label`). The loop execution template method requires an +a C++ lambda expression. We pass the object:: + + RAJA::TypedRangeSegment(0, N) + +for the iteration space, which is contiguous sequence of integral +values [0, N) (for more information about RAJA loop indexing concepts, +see :ref:`feat-index-label`). The loop execution template method requires an execution policy template type that specifies how the loop is to run -(for more information about RAJA execution policies, see :ref:`policies-label`). +(for more information about RAJA execution policies, +see :ref:`feat-policies-label`). -For the RAJA sequential variant, we use the ``RAJA::seq_exec`` execution +For a RAJA sequential variant, we use the ``RAJA::seq_exec`` execution policy type: -.. literalinclude:: ../../../../examples/tut_add-vectors.cpp +.. literalinclude:: ../../../../exercises/vector-addition_solution.cpp :start-after: _rajaseq_vector_add_start :end-before: _rajaseq_vector_add_end :language: C++ @@ -65,59 +67,66 @@ execution policy:: RAJA::simd_exec -Alternatively, RAJA provides a *loop execution* policy:: +An alternative RAJA policy is:: RAJA::loop_exec -This policy allows the compiler to generate optimizations, such as SIMD if -compiler heuristics suggest that it is safe to do so and potentially +which allows the compiler to generate optimizations based on how its internal +heuristics suggest that it is safe to do so and potentially beneficial for performance, but the optimizations are not forced. To run the kernel with OpenMP multithreaded parallelism on a CPU, we use the ``RAJA::omp_parallel_for_exec`` execution policy: -.. literalinclude:: ../../../../examples/tut_add-vectors.cpp +.. literalinclude:: ../../../../exercises/vector-addition_solution.cpp :start-after: _rajaomp_vector_add_start :end-before: _rajaomp_vector_add_end :language: C++ This will distribute the loop iterations across CPU threads and run the -loop over threads in parallel. +loop over threads in parallel. 
In particular, this is what you would get if +you wrote the kernel using a C-style loop with an OpenMP pragma directly:: + + #pragma omp parallel for + for (int i = 0; i < N; ++i) { + c[i] = a[i] + b[i]; + } To run the kernel on a CUDA GPU device, we use the ``RAJA::cuda_exec`` policy: -.. literalinclude:: ../../../../examples/tut_add-vectors.cpp +.. literalinclude:: ../../../../exercises/vector-addition_solution.cpp :start-after: _rajacuda_vector_add_start :end-before: _rajacuda_vector_add_end :language: C++ -Note that the CUDA execution policy type accepts a template argument -``CUDA_BLOCK_SIZE``, which specifies that each CUDA thread block launched -to execute the kernel will have the given number threads in the block. +Since the lambda defining the loop body will be passed to a device kernel, +it must be decorated with the ``__device__`` attribute. +This can be done directly or by using the ``RAJA_DEVICE`` macro. + +Note that the CUDA execution policy type requires a template argument +``CUDA_BLOCK_SIZE``, which specifies the number of threads to run in each +CUDA thread block launched to run the kernel. -For performance tuning, the ``RAJA::cuda_exec_explicit`` policy is also -provided. This allows the user to specify the number of blocks allocated -per streaming multiprocessor (SM) to allow additional block level -parallelism. Note that the third boolean argument representing asynchronous -execution can be omitted, and is ``false`` by default: +For additional performance tuning options, the ``RAJA::cuda_exec_explicit`` +policy is also provided, which allows a user to specify the minimum number +of thread blocks to launch at a time on each streaming multiprocessor (SM): -.. literalinclude:: ../../../../examples/tut_add-vectors.cpp +.. literalinclude:: ../../../../exercises/vector-addition_solution.cpp :start-after: _rajacuda_explicit_vector_add_start :end-before: _rajacuda_explicit_vector_add_end :language: C++ -Since the lambda defining the loop body will be passed to a device kernel, -it must be decorated with the ``__device__`` attribute when it is defined. -This can be done directly or by using the ``RAJA_DEVICE`` macro. +Note that the third boolean template argument is used to express whether the +kernel launch is synchronous or asynchronous. This is optional and is +'false' by default. A similar defaulted optional argument is supported for +other RAJA GPU (e.g., CUDA or HIP) policies. -Similarly, to run the kernel on a GPU using the RAJA HIP back-end, +Lastly, to run the kernel on a GPU using the RAJA HIP back-end, we use the ``RAJA::hip_exec`` policy: -.. literalinclude:: ../../../../examples/tut_add-vectors.cpp +.. literalinclude:: ../../../../exercises/vector-addition_solution.cpp :start-after: _rajahip_vector_add_start :end-before: _rajahip_vector_add_end :language: C++ -The file ``RAJA/examples/tut_add-vectors.cpp`` contains the complete -working example code. diff --git a/docs/sphinx/user_guide/tutorial/atomic_histogram.rst b/docs/sphinx/user_guide/tutorial/atomic_histogram.rst index 72b136a269..970271c875 100644 --- a/docs/sphinx/user_guide/tutorial/atomic_histogram.rst +++ b/docs/sphinx/user_guide/tutorial/atomic_histogram.rst @@ -6,96 +6,108 @@ .. ## SPDX-License-Identifier: (BSD-3-Clause) .. ## -.. _atomichist-label: +.. 
_tut-atomichist-label: -------------------------------------------------- -Computing a Histogram with Atomic Operations +Atomic Operations: Computing a Histogram -------------------------------------------------- -Key RAJA features shown in this example: +This section contains an exercise file ``RAJA/exercises/atomic-histogram.cpp`` +for you to work through if you wish to get some practice with RAJA. The +file ``RAJA/exercises/atomic-histogram_solution.cpp`` contains complete +working code for the examples discussed in this section. You can use the +solution file to check your work and for guidance if you get stuck. To build +the exercises execute ``make atomic-histogram`` and ``make atomic-histogram_solution`` +from the build directory. - * ``RAJA::forall`` loop execution template - * ``RAJA::RangeSegment`` iteration space construct - * RAJA atomic add operation +Key RAJA features shown in this exercise are: + + * ``RAJA::forall`` kernel execution template and execution policies + * ``RAJA::TypedRangeSegment`` iteration space construct + * RAJA atomic add operation and RAJA atomic operation policies The example uses an integer array of length 'N' randomly initialized with -values in the interval [0, M). While iterating over the array, the kernel -accumulates the number of occurrences of each value in the array using atomic -add operations. Atomic operations allow one to update a memory location +values in the interval [0, M). + +.. literalinclude:: ../../../../exercises/atomic-histogram_solution.cpp + :start-after: _array_atomic_histogram_start + :end-before: _array_atomic_histogram_end + :language: C++ + +Each kernel iterates over the array and accumulates the number of occurrences +of each value in [0, M) in another array named 'hist'. The kernels use atomic +operations for the accumulation, which allow one to update a memory location referenced by a specific address in parallel without data races. The example shows how to use RAJA portable atomic operations and that they are used similarly for different programming model back-ends. -.. note:: Each RAJA reduction operation requires an atomic policy type +.. note:: Each RAJA atomic operation requires an atomic policy type parameter that must be compatible with the execution policy for - the kernel in which it is used. + the kernel in which it is used. This is similar to the reduction + policies we described in :ref:`tut-dotproduct-label`. For a complete description of supported RAJA atomic operations and -atomic policies, please see :ref:`atomics-label`. +atomic policies, please see :ref:`feat-atomics-label`. -All code snippets described below use the loop range: +All code snippets described below use the stride-1 iteration space range: -.. literalinclude:: ../../../../examples/tut_atomic-histogram.cpp +.. literalinclude:: ../../../../exercises/atomic-histogram_solution.cpp :start-after: _range_atomic_histogram_start :end-before: _range_atomic_histogram_end :language: C++ -and the integer array 'bins' of length 'M' to accumulate the number of -occurrences of each value in the array. - Here is the OpenMP version: -.. literalinclude:: ../../../../examples/tut_atomic-histogram.cpp +.. literalinclude:: ../../../../exercises/atomic-histogram_solution.cpp :start-after: _rajaomp_atomic_histogram_start :end-before: _rajaomp_atomic_histogram_end :language: C++ -Each slot in the 'bins' array is incremented by one when a value associated +One is added to a slot in the 'bins' array when a value associated with that slot is encountered. 
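+For orientation, the OpenMP variant has roughly the following form (a sketch
+only; the ``array`` and ``bins`` names and the ``array_range`` segment follow
+the surrounding text and may differ slightly from the exercise file)::
+
+   RAJA::forall< RAJA::omp_parallel_for_exec >( array_range, [=](int i) {
+     RAJA::atomicAdd< RAJA::omp_atomic >( &bins[array[i]], 1 );
+   } );
+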
Note that the ``RAJA::atomicAdd`` operation uses an OpenMP atomic policy, which is compatible with the OpenMP -loop execution policy. +kernel execution policy. The CUDA and HIP versions are similar: -.. literalinclude:: ../../../../examples/tut_atomic-histogram.cpp +.. literalinclude:: ../../../../exercises/atomic-histogram_solution.cpp :start-after: _rajacuda_atomic_histogram_start :end-before: _rajacuda_atomic_histogram_end :language: C++ and: -.. literalinclude:: ../../../../examples/tut_atomic-histogram.cpp +.. literalinclude:: ../../../../exercises/atomic-histogram_solution.cpp :start-after: _rajahip_atomic_histogram_start :end-before: _rajahip_atomic_histogram_end :language: C++ Here, the atomic add operations uses CUDA and HIP atomic policies, which are -compatible with the CUDA and HIP loop execution policies. +compatible with the CUDA and HIP kernel execution policies. Note that RAJA provides an ``auto_atomic`` policy for easier usage and -improved portability. This policy will do the right thing in most -circumstances. If OpenMP is enabled, the OpenMP atomic policy will be used, -which is correct in a sequential execution context as well. Otherwise, the -sequential atomic policy will be applied. Similarly, if it is encountered in -a CUDA or HIP execution context, the corresponding GPU back-end atomic policy +improved portability. This policy will choose the proper atomic operation +for the execution policy used to run the kernel. Specifically, when OpenMP +is enabled, the OpenMP atomic policy will be used, which is correct in a +sequential or OpenMP execution context. Otherwise, the sequential atomic +policy will be applied. Similarly, if it is encountered in a CUDA or HIP +execution context, the corresponding GPU back-end atomic policy will be applied. For example, here is the CUDA version that uses the 'auto' atomic policy: -.. literalinclude:: ../../../../examples/tut_atomic-histogram.cpp +.. literalinclude:: ../../../../exercises/atomic-histogram_solution.cpp :start-after: _rajacuda_atomicauto_histogram_start :end-before: _rajacuda_atomicauto_histogram_end :language: C++ and the HIP version: -.. literalinclude:: ../../../../examples/tut_atomic-histogram.cpp +.. literalinclude:: ../../../../exercises/atomic-histogram_solution.cpp :start-after: _rajahip_atomicauto_histogram_start :end-before: _rajahip_atomicauto_histogram_end :language: C++ -The same CUDA and HIP loop execution policies as in the previous examples +The same CUDA and HIP kernel execution policies as in the previous examples are used. -The file ``RAJA/examples/tut_atomic-histogram.cpp`` contains the complete -working example code. diff --git a/docs/sphinx/user_guide/tutorial/dot_product.rst b/docs/sphinx/user_guide/tutorial/dot_product.rst index 9c10217edb..feac719918 100644 --- a/docs/sphinx/user_guide/tutorial/dot_product.rst +++ b/docs/sphinx/user_guide/tutorial/dot_product.rst @@ -6,31 +6,37 @@ .. ## SPDX-License-Identifier: (BSD-3-Clause) .. ## -.. _dotproduct-label: +.. _tut-dotproduct-label: ----------------------------------- -Vector Dot Product (Sum Reduction) +Sum Reduction: Vector Dot Product ----------------------------------- -Key RAJA features shown in this example: +This section contains an exercise file ``RAJA/exercises/dot-product.cpp`` +for you to work through if you wish to get some practice with RAJA. The +file ``RAJA/exercises/dot-product_solution.cpp`` contains complete +working code for the examples discussed in this section. 
You can use the +solution file to check your work and for guidance if you get stuck. To build +the exercises execute ``make dot-product`` and ``make dot-product_solution`` +from the build directory. - * ``RAJA::forall`` loop execution template - * ``RAJA::RangeSegment`` iteration space construct - * RAJA execution policies - * ``RAJA::ReduceSum`` sum reduction template - * RAJA reduction policies +Key RAJA features shown in this example are: + * ``RAJA::forall`` loop execution template and execution policies + * ``RAJA::TypedRangeSegment`` iteration space construct + * ``RAJA::ReduceSum`` sum reduction template and reduction policies In the example, we compute a vector dot product, 'dot = (a,b)', where -'a' and 'b' are two vectors length N and 'dot' is a scalar. Typical +'a' and 'b' are two vectors of length N and 'dot' is a scalar. Typical C-style code to compute the dot product and print its value afterward is: -.. literalinclude:: ../../../../examples/tut_dot-product.cpp +.. literalinclude:: ../../../../exercises/dot-product_solution.cpp :start-after: _csytle_dotprod_start :end-before: _csytle_dotprod_end :language: C++ -Note that this operation performs a *reduction*, a computational pattern that +Although this kernel is serial, it is representative of a *reduction* +operation which is a common algorithm pattern that produces a single result from a set of values. Reductions present a variety of issues that must be addressed to operate properly in parallel. @@ -39,65 +45,69 @@ RAJA Variants ^^^^^^^^^^^^^^^^^^^^^ Different programming models support parallel reduction operations differently. -Some models, such as CUDA, do not provide support for reductions at all and +Some models, such as CUDA, do not provide direct support for reductions and so such operations must be explicitly coded by users. It can be challenging to generate a correct and high performance implementation. RAJA provides portable reduction types that make it easy to perform reduction operations -in loop kernels. The RAJA variants of the dot product computation show how +in kernels. The RAJA variants of the dot product computation show how to use the ``RAJA::ReduceSum`` sum reduction template type. RAJA provides -other reduction types and also allows multiple reduction operations to be -performed in a single kernel along with other computation. Please see -:ref:`reductions-label` for an example that does this. +other reduction types and allows multiple reduction operations to be +performed in a single kernel alongside other computations. Please see +:ref:`feat-reductions-label` for more information. Each RAJA reduction type takes a `reduce policy` template argument, which **must be compatible with the execution policy** applied to the kernel in which the reduction is used. Here is the RAJA sequential variant of the dot product computation: -.. literalinclude:: ../../../../examples/tut_dot-product.cpp - :start-after: _rajaseq_atomic_histogram_start - :end-before: _rajaseq_atomic_histogram_end +.. literalinclude:: ../../../../exercises/dot-product_solution.cpp + :start-after: _rajaseq_dotprod_start + :end-before: _rajaseq_dotprod_end :language: C++ The sum reduction object is defined by specifying the reduction -policy ``RAJA::seq_reduce``, which matches the loop execution policy, and -a reduction value type (i.e., 'double'). An initial value of zero for the -sum is passed to the reduction object constructor. After the kernel executes, -we use the 'get' method to retrieve the reduced value. 
+policy ``RAJA::seq_reduce`` matching the kernel execution policy +``RAJA::seq_exec``, and a reduction value type (i.e., 'double'). An initial +value of zero for the sum is passed to the reduction object constructor. After +the kernel executes, we use the 'get' method to retrieve the reduced value. -The OpenMP multithreaded variant of the loop is implemented similarly: +The OpenMP multithreaded variant of the kernel is implemented similarly: -.. literalinclude:: ../../../../examples/tut_dot-product.cpp +.. literalinclude:: ../../../../exercises/dot-product_solution.cpp :start-after: _rajaomp_dotprod_start :end-before: _rajaomp_dotprod_end :language: C++ Here, we use the ``RAJA::omp_reduce`` reduce policy to match the OpenMP -loop execution policy. +kernel execution policy. -The RAJA CUDA variant is achieved by using appropriate loop execution and +The RAJA CUDA variant is achieved by using appropriate kernel execution and reduction policies: -.. literalinclude:: ../../../../examples/tut_dot-product.cpp +.. literalinclude:: ../../../../exercises/dot-product_solution.cpp :start-after: _rajacuda_dotprod_start :end-before: _rajacuda_dotprod_end :language: C++ Here, the CUDA reduce policy ``RAJA::cuda_reduce`` matches the CUDA -loop execution policy. Note that the CUDA thread block size is not +kernel execution policy. Note that the CUDA thread block size is not specified in the reduce policy as it will use the same value as the loop execution policy. Similarly, for the RAJA HIP variant: -.. literalinclude:: ../../../../examples/tut_dot-product.cpp +.. literalinclude:: ../../../../exercises/dot-product_solution.cpp :start-after: _rajahip_dotprod_start :end-before: _rajahip_dotprod_end :language: C++ -It is worth noting how similar the code looks for each of these variants. +It is worth repeating how similar the code looks for each of these variants. The loop body is identical for each and only the loop execution policy and reduce policy types change. -The file ``RAJA/examples/tut_dot-product.cpp`` contains the complete -working example code. +.. note:: Currently available reduction capabilities in RAJA require a + *reduction policy* type that is compatible with the execution + policy for the kernel in which the reduction is used. We + are developing a new reduction interface for RAJA that will + provide an alternative for which the reduction policy is not + required. diff --git a/docs/sphinx/user_guide/tutorial/gaussSeidel.rst-KEEP b/docs/sphinx/user_guide/tutorial/gaussSeidel.rst-KEEP index 255bf0abdd..10b0dc44d7 100644 --- a/docs/sphinx/user_guide/tutorial/gaussSeidel.rst-KEEP +++ b/docs/sphinx/user_guide/tutorial/gaussSeidel.rst-KEEP @@ -50,7 +50,7 @@ index range, has been used in all the other examples. A ``RAJA::ListSegment`` represents an arbitrary collection of indices, similar to an indirection array that is common in unstructured mesh applications. In the example, we use two ``RAJA::ListSegment`` objects to hold these two sets of indices. See -:ref:`index-label` for more information about RAJA segments and index sets. +:ref:`feat-index-label` for more information about RAJA segments and index sets. The code in the example that constructs the segments and index set is: diff --git a/docs/sphinx/user_guide/tutorial/halo-exchange.rst b/docs/sphinx/user_guide/tutorial/halo-exchange.rst index babc04a6e9..c451340ae2 100644 --- a/docs/sphinx/user_guide/tutorial/halo-exchange.rst +++ b/docs/sphinx/user_guide/tutorial/halo-exchange.rst @@ -6,26 +6,31 @@ .. 
## SPDX-License-Identifier: (BSD-3-Clause) .. ## -.. _halo_exchange-label: +.. _tut-halo_exchange-label: ------------------------------------ -Halo Exchange (Workgroup Constructs) +Workgroup Constructs: Halo Exchange ------------------------------------ +The example code discussed in this section can be found in the file +``RAJA/examples/tut_halo-exchange.cpp``. The file contains complete working +code for multiple OpenMP, CUDA, and HIP RAJA variants. Here, we describe +a subset of these variants. + Key RAJA features shown in this example: * ``RAJA::WorkPool`` workgroup construct * ``RAJA::WorkGroup`` workgroup construct * ``RAJA::WorkSite`` workgroup construct - * ``RAJA::RangeSegment`` iteration space construct + * ``RAJA::TypedRangeSegment`` iteration space construct * RAJA workgroup policies In this example, we show how to use the RAJA workgroup constructs to implement buffer packing and unpacking for data halo exchange on a computational grid, -a common MPI communication operation. This may not provide a performance gain -on a CPU system, but it can significantly speedup halo exchange on a GPU -system compared to using ``RAJA::forall`` to run individual packing/unpacking -kernels. +a common MPI communication operation for distributed memory applications. +This technique may not provide a performance gain on a CPU system, but it can +significantly speedup halo exchange on a GPU system compared to running +many individual packing/unpacking kernels, for example. .. note:: Using an abstraction layer over RAJA can make it easy to switch between using individual ``RAJA::forall`` loops or the RAJA workgroup @@ -33,16 +38,17 @@ kernels. compile time or run time. We start by setting the parameters for the halo exchange by using default -values or values provided via command line input. These parameters determine -the size of the grid, the width of the halo, the number of grid variables -and the number of cycles. +values or values provided via command line input to the example code. These +parameters determine the size of the grid, the width of the halo, the number +of grid variables to pack/unpack, and the number of cycles; (iterations +to run). .. literalinclude:: ../../../../examples/tut_halo-exchange.cpp :start-after: _halo_exchange_input_params_start :end-before: _halo_exchange_input_params_end :language: C++ -Next, we allocate the variables array (the memory manager in +Next, we allocate the variable data arrays (the memory manager in the example uses CUDA Unified Memory if CUDA is enabled). These grid variables are reset each cycle to allow checking the results of the packing and unpacking. @@ -89,19 +95,24 @@ into the adjacent halo cells: | 7 | 7 | 8 | 9 | 9 | +---+---+---+---+---+ +Although the example code does not use MPI and multiple domains (one per +MPI rank, for example), as would be the case in a real distributed memory +parallel application, the data copy operations represent the spirit of how +data communication would be done. + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Packing and Unpacking (Basic Loop Execution) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -A sequential non-RAJA example of packing: +A sequential non-RAJA example of data packing and unpacking would look like: .. literalinclude:: ../../../../examples/tut_halo-exchange.cpp :start-after: _halo_exchange_sequential_cstyle_packing_start :end-before: _halo_exchange_sequential_cstyle_packing_end :language: C++ -and unpacking: +and: .. 
literalinclude:: ../../../../examples/tut_halo-exchange.cpp :start-after: _halo_exchange_sequential_cstyle_unpacking_start @@ -135,7 +146,7 @@ and unpack the buffer data into the grid variable array: :language: C++ -For parallel multi-threading execution via OpenMP, the example can be run +For parallel multithreading execution via OpenMP, the example can be run by replacing the execution policy with: .. literalinclude:: ../../../../examples/tut_halo-exchange.cpp @@ -151,6 +162,9 @@ policy: :end-before: _halo_exchange_cuda_forall_policies_end :language: C++ +Note that we can use an asynchronous execution policy because there are +no data races due to the intermediate buffer usage. + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ RAJA Variants using workgroup constructs @@ -165,8 +179,8 @@ policies and types: :language: C++ which are used in a slightly rearranged version of packing. See how the comment -indicating where a message could be sent has been moved down after the call to -run on the workgroup: +indicating where messages are sent has been moved down after the call to +run the operations enqueued on the workgroup: .. literalinclude:: ../../../../examples/tut_halo-exchange.cpp :start-after: _halo_exchange_loop_workgroup_packing_start @@ -184,7 +198,7 @@ unpacking the data: This reorganization has the downside of not overlapping the message sends with packing and the message receives with unpacking. -For parallel multi-threading execution via OpenMP, the example using workgroup +For parallel multithreading execution via OpenMP, the example using workgroup can be run by replacing the policies and types with: .. literalinclude:: ../../../../examples/tut_halo-exchange.cpp @@ -192,6 +206,10 @@ can be run by replacing the policies and types with: :end-before: _halo_exchange_openmp_workgroup_policies_end :language: C++ +The main differences between these types and the ones defined for the sequential +case above are the ``forall_policy`` and the ``workgroup_policy``, which use +OpenMP execution policy types. + Similarly, to run the loops in parallel on a CUDA GPU use these policies and types, taking note of the unordered work ordering policy that allows the enqueued loops to all be run using a single CUDA kernel: @@ -201,21 +219,28 @@ enqueued loops to all be run using a single CUDA kernel: :end-before: _halo_exchange_cuda_workgroup_policies_end :language: C++ +The main differences between these types and the ones defined for the +sequential and OpenMP cases above are the ``forall_policy`` and the +``workgroup_policy``, which use different template parameters, and the +``workpool``, ``workgroup``, and ``worksite`` types which use 'pinned' +memory allocation. + The packing is the same as the previous workgroup packing examples with the -exception of added synchronization after calling run and before sending the -messages. The previous CUDA example used forall to launch -``num_neighbors * num_vars`` CUDA kernels and performed ``num_neighbors`` -synchronizations to send each message in turn. Here, the reorganization to pack -all messages before sending lets us use an unordered CUDA work ordering policy -in the workgroup constructs that reduces the number of CUDA kernel launches to -one. It also allows us to synchronize once before sending all of the messages: +exception of added synchronization after calling the workgroup run method +and before sending the messages. 
In the example code, there is a CUDA version +that uses forall to launch ``num_neighbors * num_vars`` CUDA kernels and +performs ``num_neighbors`` synchronizations to send each message in turn. +Here, the reorganization to pack all messages before sending lets us use an +unordered CUDA work ordering policy in the ``workgroup_policy`` that reduces +the number of CUDA kernel launches to one. It also allows us to need to +synchronize only once before sending all of the messages: .. literalinclude:: ../../../../examples/tut_halo-exchange.cpp :start-after: _halo_exchange_cuda_workgroup_packing_start :end-before: _halo_exchange_cuda_workgroup_packing_end :language: C++ -After waiting to receive all of the messages we use workgroup constructs using +After waiting to receive all of the messages we use workgroup constructs with a CUDA unordered work ordering policy to unpack all of the messages using a single kernel launch: @@ -228,6 +253,3 @@ Note that the synchronization after unpacking is done to ensure that ``group_unpack`` and ``site_unpack`` survive until the unpacking loop has finished executing. - -The file ``RAJA/examples/tut_halo-exchange.cpp`` contains a complete -working example code, with OpenMP, CUDA, and HIP variants. diff --git a/docs/sphinx/user_guide/tutorial/indexset_segments.rst b/docs/sphinx/user_guide/tutorial/indexset_segments.rst index 729df37632..d651bd1a03 100644 --- a/docs/sphinx/user_guide/tutorial/indexset_segments.rst +++ b/docs/sphinx/user_guide/tutorial/indexset_segments.rst @@ -6,200 +6,349 @@ .. ## SPDX-License-Identifier: (BSD-3-Clause) .. ## -.. _indexset-label: +.. _tut-indexset-label: ------------------------------------------ -Iteration Spaces: IndexSets and Segments ------------------------------------------ +------------------------------------------------- +Iteration Spaces: Segments and IndexSets +------------------------------------------------- -Key RAJA features shown in this example: +This section contains an exercise file ``RAJA/exercises/segment-indexset-basics.cpp`` for you to work through if you wish to get some practice with RAJA. The +file ``RAJA/exercises/segment-indexset-basics_solution.cpp`` contains complete +working code for the examples discussed in this section. You can use the +solution file to check your work and for guidance if you get stuck. To build +the exercises execute ``make segment-indexset-basics`` and ``make segment-indexset-basics_solution`` +from the build directory. + +Key RAJA features shown in this example are: * ``RAJA::forall`` loop execution template - * ``RAJA::RangeSegment`` (i.e., ``RAJA::TypedRangeSegment``) iteration space construct - * ``RAJA::TypedListSegment`` iteration space construct - * ``RAJA::IndexSet`` iteration construct and associated execution policies - -The example uses a simple daxpy kernel and its usage of RAJA is similar to -previous simple loop examples. The example -focuses on how to use RAJA index sets and iteration space segments, such -as index ranges and lists of indices. These features are important for -applications and algorithms that use indirection arrays for irregular array + * ``RAJA::TypedRangeSegment``, ``RAJA::TypedRangeStrideSegment``, and + ``RAJA::TypedListSegment`` iteration space constructs + * ``RAJA::TypedIndexSet`` container and associated execution policies + +The concepts of iteration spaces and associated Loop variables are central to +writing kernels in RAJA. 
RAJA provides basic iteration space types +that serve as flexible building blocks that can be used to form a variety +of loop iteration patterns. These types can be used to define a particular +order for loop iterates, aggregate and partition iterates, as well as other +configurations. + +The examples in this section focus on how to use RAJA index sets and iteration +space segments, such as index ranges and lists of indices. Lists of indices +are important for algorithms that use indirection arrays for irregular array accesses. Combining different segment types, such as ranges and lists in an index set allows a user to launch different iteration patterns in a single loop execution construct (i.e., one kernel). This is something that is not supported by other programming models and abstractions and is unique to RAJA. -Applying these concepts judiciously can increase performance by allowing +Applying these concepts judiciously can help improve performance by allowing compilers to optimize for specific segment types (e.g., SIMD for range segments) while providing the flexibility of indirection arrays for general indexing patterns. -.. note:: For the following examples, it is useful to remember that all - RAJA segment types are templates, where the type of the index - value is the template argument. So for example, the basic RAJA - range segment type is ``RAJA::TypedRangeSegment``. The type - ``RAJA::RangeSegment`` used here (for convenience) is a type alias - for ``RAJA::TypedRangeSegment``, where the - template parameter is a default index type that RAJA defines. - -For a summary discussion of RAJA segment and index set concepts, please -see :ref:`index-label`. +Although the constructs described in the section are +useful in numerical computations and parallel execution, the examples only +contain print statements and sequential execution. The goal is to show you +how to use RAJA iteration space constructs. ^^^^^^^^^^^^^^^^^^^^^ RAJA Segments ^^^^^^^^^^^^^^^^^^^^^ -In previous examples, we have seen how to define a contiguous range of loop -indices [0, N) with a ``RAJA::RangeSegment`` object and use it in a RAJA -loop execution template to run a loop kernel over the range. For example: +A RAJA *Segment* represents a set of indices that one wants to execute as a +unit for a kernel. RAJA provides the following Segment types: -.. literalinclude:: ../../../../examples/tut_indexset-segments.cpp - :start-after: _rajaseq_daxpy_range_start - :end-before: _rajaseq_daxpy_range_end - :language: C++ + * ``RAJA::TypedRangeSegment`` represents a stride-1 range + * ``RAJA::TypedRangeStrideSegment`` represents a (non-unit) stride range + * ``RAJA::TypedListSegment`` represents an arbitrary set of indices + +These segment types are used in ``RAJA::forall`` and other RAJA kernel +execution mechanisms to define the iteration space for a kernel. + +After we briefly introduce these types, we will present several examples using +them. + +TypedRangeSegment +^^^^^^^^^^^^^^^^^^^ + +A ``RAJA::TypedRangeSegment`` is the fundamental type for defining a +stride-1 (i.e., contiguous) range of indices. This is illustrated in the +figure below. + +.. figure:: ../figures/RangeSegment.png + + A range segment defines a stride-1 index range [beg, end). + +One creates a range segment object as follows:: + + // A stride-1 index range [beg, end) using type int. + RAJA::TypedRangeSegment my_range(beg, end); + +Any integral type can be given as the template parameter. + +.. 
note:: When using a RAJA range segment, no loop iterations will be run when + begin >= end. + +TypedRangeStrideSegment +^^^^^^^^^^^^^^^^^^^^^^^^^ + +A ``RAJA::TypedRangeStrideSegment`` defines a range with a constant stride, +including negative stride values if needed. This is illustrated in the +figure below. + +.. figure:: ../figures/RangeStrideSegment.png + + A range-stride segment defines an index range with arbitrary stride [beg, end, stride). In the figure the stride is 2. + +One creates a range stride segment object as follows:: + + // A stride-2 index range [beg, end) using type int. + RAJA::TypedRangeStrideSegment my_stride2_range(beg, end, 2); + + // An index range with stride -1 covering [0, N-1] in reverse, using type int + RAJA::TypedRangeStrideSegment my_neg1_range( N-1, -1, -1); + +Any integral type can be given as the template parameter. + +When the negative-stride segment above is passed to a ``RAJA::forall`` method, +for example, the loop will run in reverse order with iterates:: + + N-1 N-2 N-3 ... 1 0 + +.. note:: When using a RAJA strided range, no loop iterations will be run + under the following conditions: + + * Stride > 0 and begin > end + * Stride < 0 and begin < end + * Stride == 0 + +TypedListSegment +^^^^^^^^^^^^^^^^^^ + +A ``RAJA::TypedListSegment`` is used to define an arbitrary set of +indices, akin to an indirection array. This is illustrated in the figure below. + +.. figure:: ../figures/ListSegment.png + + A list segment defines an arbitrary collection of indices. Here, we have a list segment with 5 irregularly-spaced indices. + +One creates a list segment object by passing a container of integral values to +a list segment constructor. For example:: + + // Create a vector holding some integer index values + std::vector idx = {0, 2, 3, 4, 7, 8, 9, 53}; + + // Create a list segment with these indices, where the indices are + // stored in the CUDA device memory space + camp::resources::Resource cuda_res{camp::resources::Cuda()}; + RAJA::TypedListSegment idx_list( idx, cuda_res ); + + // Alternatively + RAJA::TypedListSegment idx_list( &idx[0], idx.size(), + cuda_res ); + +When the list segment above is passed to a ``RAJA::forall`` method, +for example, the kernel will execute with iterates:: -We can accomplish the same result by enumerating the indices in a -``RAJA::TypedListSegment`` object. Here, we assemble the indices in a standard -vector, create a list segment from that, and then pass the list segment to the -forall execution template: + 0 2 3 4 7 8 9 53 -.. literalinclude:: ../../../../examples/tut_indexset-segments.cpp - :start-after: _rajaseq_daxpy_list_start - :end-before: _rajaseq_daxpy_list_end +Note that a ``RAJA::TypedListSegment`` constructor can take a pointer to +an array of indices and an array length. If the indices are +in a container, such as ``std::vector`` that provides ``begin()``, ``end()``, +and ``size()`` methods, the container can be passed to the constructor and +the length argument is not required. + +.. note:: Currently, a camp resource object must be passed to a list segment + constructor to copy the indices into the proper + memory space for a kernel to execute (as shown above). In the future, + this will change and the user will be responsible for providing + the indices in the proper memory space. + +^^^^^^^^^^^ +IndexSets +^^^^^^^^^^^ + +A ``RAJA::TypedIndexSet`` is a container that can hold an arbitrary collection +of segment objects. 
+
+.. note:: It is the responsibility of the user to ensure that segments are
+          defined properly when using RAJA index sets. For example, if the
+          same index appears in multiple segments, the corresponding loop
+          iteration will be run multiple times.
+
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+Segment and IndexSet Examples
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The examples in this section illustrate how the segment types that RAJA
+provides can be used to define kernel iteration spaces. We use the following
+type aliases to make the code more compact:
+
+.. literalinclude:: ../../../../exercises/segment-indexset-basics_solution.cpp
+   :start-after: _raja_segment_type_start
+   :end-before: _raja_segment_type_end
    :language: C++
-Note that we are using the following type aliases:
-.. literalinclude:: ../../../../examples/tut_indexset-segments.cpp
-   :start-after: _raja_list_segment_type_start
-   :end-before: _raja_list_segment_type_end
+
+Stride-1 Indexing
+^^^^^^^^^^^^^^^^^^^
+
+Consider a simple C-style kernel that prints a contiguous sequence of values:
+
+.. literalinclude:: ../../../../exercises/segment-indexset-basics_solution.cpp
+   :start-after: _cstyle_range1_start
+   :end-before: _cstyle_range1_end
    :language: C++
-Recall from discussion in :ref:`index-label` that ``RAJA::Index_type`` is
-a default index type that RAJA defines and which is used in some RAJA
-constructs as a convenience for users who want a simple mechanism to apply
-index types consistently.
-
-It is important to understand what happens when using list segments.
-During loop execution, indices stored in the list segment are passed to the
-loop body one-by-one, effectively mimicking an indirection array except that
-the indirection does not appear in the loop body. For example, we
-can reverse the order of the indices, run the loop with a new list segment
-object, and get the same result since the loop is `data-parallel`:
-
-.. literalinclude:: ../../../../examples/tut_indexset-segments.cpp
-   :start-after: _raja_list_segment_daxpy_reverse_start
-   :end-before: _raja_list_segment_daxpy_reverse_end
+
+When run, the kernel prints the following sequence, as expected::
+
+   0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
+
+Three RAJA variants of the kernel using a ``RAJA::TypedRangeSegment``, a
+``RAJA::TypedRangeStrideSegment``, and a ``RAJA::TypedListSegment`` are:
+
+.. literalinclude:: ../../../../exercises/segment-indexset-basics_solution.cpp
+   :start-after: _raja_range1_start
+   :end-before: _raja_range1_end
    :language: C++
-Alternatively, we can also use a RAJA strided range segment to run the loop
-in reverse by giving it a stride of -1. For example:
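+
+For readers following along without the exercise files, the first (stride-1
+range) variant referenced above is essentially of the following form -- a
+simplified sketch, not the exact exercise code::
+
+   // Hypothetical sketch of the stride-1 range variant; the exercise file
+   // contains the actual code.
+   RAJA::forall<RAJA::seq_exec>(RAJA::TypedRangeSegment<int>(0, 20),
+     [=] (int i) {
+       std::cout << i << "  ";
+   });
+
+.. 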
literalinclude:: ../../../../exercises/segment-indexset-basics_solution.cpp + :start-after: _raja_striderange1_start + :end-before: _raja_striderange1_end + :language: C++ -.. literalinclude:: ../../../../examples/tut_indexset-segments.cpp - :start-after: _raja_range_segment_daxpy_negstride_start - :end-before: _raja_range_segment_daxpy_negstride_end +.. literalinclude:: ../../../../exercises/segment-indexset-basics_solution.cpp + :start-after: _raja_list1_start + :end-before: _raja_list1_end :language: C++ -The fact that RAJA always passes loop index values to lambdas in a kernel -explains why we can run a kernel with multiple segment types in a single -RAJA construct as we discuss next. +Each of these variants prints the same integer sequence shown above. -^^^^^^^^^^^^^^^^^^^^^ -RAJA IndexSets -^^^^^^^^^^^^^^^^^^^^^ +One interesting thing to note is that with ``RAJA::TypedListSegment`` and +``RAJA::forall``, the actual iteration value is passed to the lambda loop body. +So the indirection array concept is not visible. In contrast, in C-style code, +one has to manually retrieve the index value from the indirection array to +achieve the desired result. For example: -The ``RAJA::TypedIndexSet`` template is a container that can hold -any number of segments, of the same or different types. An index set object -can be passed to a RAJA loop execution method, just like a segment, to -run a loop kernel. When the loop is run, the execution method iterates -over the segments and the loop indices in each segment. Thus, the loop -iterates can be grouped into different segments to partition the iteration -space and iterate over the loop kernel chunks (defined by segments), in -serial, in parallel, or in some specific dependency ordering. Individual -segments can be executed in serial or parallel. - -When an index set is defined, the segment types it may hold must be specified -as template arguments. For example, here we create an index set that can -hold list segments. Then, we add the list segment we created earlier to it -and run the loop: - -.. literalinclude:: ../../../../examples/tut_indexset-segments.cpp - :start-after: _raja_indexset_list_daxpy_start - :end-before: _raja_indexset_list_daxpy_end +.. literalinclude:: ../../../../exercises/segment-indexset-basics_solution.cpp + :start-after: _cstyle_list1_start + :end-before: _cstyle_list1_end :language: C++ -You are probably wondering: What is the 'SEQ_ISET_EXECPOL' type used for the -execution policy? +Non-unit Stride Indexing +^^^^^^^^^^^^^^^^^^^^^^^^^ -Well, it is similar to execution policy types we have seen up to this point, -except that it specifies a two-level policy -- one for iterating over the -segments and one for executing the iterates defined by each segment. In the -example, we specify that we should do each of these operations sequentially -by defining the policy as follows: +Consider the following C-style kernel that prints the integer sequence +discussed earlier in reverse order: -.. literalinclude:: ../../../../examples/tut_indexset-segments.cpp - :start-after: _raja_seq_indexset_policy_daxpy_start - :end-before: _raja_seq_indexset_policy_daxpy_end +.. literalinclude:: ../../../../exercises/segment-indexset-basics_solution.cpp + :start-after: _cstyle_negstriderange1_start + :end-before: _cstyle_negstriderange1_end :language: C++ -Next, we perform the daxpy operation by partitioning the iteration space into -two range segments: +We can accomplish the same result using a ``RAJA::TypedRangeStrideSegment``: -.. 
literalinclude:: ../../../../examples/tut_indexset-segments.cpp - :start-after: _raja_indexset_2ranges_daxpy_start - :end-before: _raja_indexset_2ranges_daxpy_end +.. literalinclude:: ../../../../exercises/segment-indexset-basics_solution.cpp + :start-after: _raja_negstriderange1_start + :end-before: _raja_negstriderange1_end :language: C++ -The first range segment is used to run the index range [0, N/2) and the -second is used to run the range [N/2, N). +Alternatively, we can use a ``RAJA::TypedListSegment``, where we reverse the +index array we used earlier to define the appropriate list segment: + +.. literalinclude:: ../../../../exercises/segment-indexset-basics_solution.cpp + :start-after: _raja_negstridelist1_start + :end-before: _raja_negstridelist1_end + :language: C++ + +The more common use of the ``RAJA::TypedRangeStrideSegment`` type is to run +constant strided loops with a positive non-unit stride. For example: + +.. literalinclude:: ../../../../exercises/segment-indexset-basics_solution.cpp + :start-after: _raja_range2_start + :end-before: _raja_range2_end + :language: C++ -We can also break up the iteration space into three segments, 2 ranges -and 1 list: +The C-style equivalent of this is: -.. literalinclude:: ../../../../examples/tut_indexset-segments.cpp - :start-after: _raja_indexset_2ranges_1list_daxpy_start - :end-before: _raja_indexset_2ranges_1list_daxpy_end +.. literalinclude:: ../../../../exercises/segment-indexset-basics_solution.cpp + :start-after: _cstyle_range2_start + :end-before: _cstyle_range2_end :language: C++ -The first range segment runs the index range [0, N/3), the list segment -enumerates the indices in the interval [N/3, 2*N/3), and the second range -segment runs the range [2*N/3, N). Note that we use the same execution -policy as before. +IndexSets: Complex Iteration Spaces +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +We noted earlier that ``RAJA::TypedIndexSet`` objects can be used to partition +iteration spaces into disjoint parts. Among other things, this can be useful to +expose parallelism in algorithms that would otherwise require significant +code transformation to do so. Please see :ref:`tut-vertexsum-label` for +discussion of an example that illustrates this. -Before we end the discussion of these examples, we demonstrate a few more -index set execution policy variations. To run the previous three segment -code by iterating over the segments sequentially and executing each -segment in parallel using OpenMP multithreading, we would use this policy -definition: +Here is an example that uses two ``RAJA::TypedRangeSegment`` objects in an +index set to represent an iteration space broken into two disjoint +contiguous intervals: -.. literalinclude:: ../../../../examples/tut_indexset-segments.cpp - :start-after: _raja_indexset_ompinnerpolicy_daxpy_start - :end-before: _raja_indexset_ompinnerpolicy_daxpy_end +.. literalinclude:: ../../../../exercises/segment-indexset-basics_solution.cpp + :start-after: _raja_indexset_2ranges_start + :end-before: _raja_indexset_2ranges_end :language: C++ -If we wanted to iterate over the segments in parallel using OpenMP -multi-threading and execute each segment sequentially, we would use the -following policy: - -.. 
literalinclude:: ../../../../examples/tut_indexset-segments.cpp - :start-after: _raja_indexset_ompouterpolicy_daxpy_start - :end-before: _raja_indexset_ompouterpolicy_daxpy_end - :language: C++ - -Finally, to iterate over the segments sequentially and execute each segment in -parallel on a GPU using either CUDA or HIP kernel, we would use a policy, -such as: - -.. literalinclude:: ../../../../examples/tut_indexset-segments.cpp - :start-after: _raja_indexset_cudapolicy_daxpy_start - :end-before: _raja_indexset_cudapolicy_daxpy_end +The integer sequence that is printed is:: + + 0 1 2 3 4 5 6 7 8 9 15 16 17 18 19 + +as we expect. + +The execution policy type when using a RAJA index set is a +*two-level* policy. The first level specifies how to iterate over the segments +in the index set, such as sequentially or in parallel using OpenMP. The second +level is the execution policy used to execute each segment. + +.. note:: Iterating over the indices of all segments in a RAJA index set + requires a two-level execution policy, with two template parameters, + as shown above. The first parameter specifies how to iterate over + the segments. The second parameter specifies how the kernel will + execute each segment over each segment. + See :ref:`indexsetpolicy-label` for more information about + RAJA index set execution policies. + +It is worth noting that a C-style version of this kernel requires either +an indirection array to run in one loop or two for-loops. For example: + +.. literalinclude:: ../../../../exercises/segment-indexset-basics_solution.cpp + :start-after: _cstyle_2ranges_start + :end-before: _cstyle_2ranges_end :language: C++ -or: +Finally, we show an example that uses an index set holding two range segments +and one list segment to partition an iteration space into three parts: -.. literalinclude:: ../../../../examples/tut_indexset-segments.cpp - :start-after: _raja_indexset_hippolicy_daxpy_start - :end-before: _raja_indexset_hippolicy_daxpy_end +.. literalinclude:: ../../../../exercises/segment-indexset-basics_solution.cpp + :start-after: _raja_indexset_3segs_start + :end-before: _raja_indexset_3segs_end :language: C++ -The file ``RAJA/examples/tut_indexset-segments.cpp`` contains working code -for these examples. +The integer sequence that is printed is:: + + 0 1 2 3 4 5 6 7 10 11 14 20 22 24 25 26 27 diff --git a/docs/sphinx/user_guide/tutorial/kernel_exec_pols.rst b/docs/sphinx/user_guide/tutorial/kernel_exec_pols.rst new file mode 100644 index 0000000000..3271e8a584 --- /dev/null +++ b/docs/sphinx/user_guide/tutorial/kernel_exec_pols.rst @@ -0,0 +1,249 @@ +.. ## +.. ## Copyright (c) 2016-22, Lawrence Livermore National Security, LLC +.. ## and RAJA project contributors. See the RAJA/LICENSE file +.. ## for details. +.. ## +.. ## SPDX-License-Identifier: (BSD-3-Clause) +.. ## + +.. _tut-kernelexecpols-label: + +----------------------------------------------------------- +``RAJA::kernel`` Execution Policies +----------------------------------------------------------- + +This section contains an exercise file +``RAJA/exercises/kernelintro-execpols.cpp`` for you to work through if you +wish to get some practice with RAJA. The file +``RAJA/exercises/kernelintro-execpols_solution.cpp`` contains +complete working code for the examples discussed in this section. You can use +the solution file to check your work and for guidance if you get stuck. To build +the exercises execute ``make kernelintro-execpols`` and +``make kernelintro-execpols_solution`` from the build directory. 
+ +Key RAJA features shown in this section are: + + * ``RAJA::kernel`` kernel execution template and execution policies + +The examples in this section illustrate various execution policies for +``RAJA::kernel``. The goal is for you to gain an understanding of how +execution policies are constructed and used to perform various nested +loop execution patterns. All examples use the same simple kernel, which +is a three-level loop nest to initialize the entries in an array. +The C++ lambda expression representing the kernel inner loop body is identical +for all kernel variants described here, whether we are executing the kernel +on a CPU sequentially or in parallel with OpenMP, or in parallel on a GPU +(CUDA or HIP). The kernels perform the same operations as the examples in the +:ref:`tut-launchexecpols-label` tutorial section, which uses +``RAJA::expt::launch``. By comparing the two sets of examples, you will gain +an understanding of the differences between the ``RAJA::kernel`` and the +``RAJA::expt::launch`` interfaces. + +We begin by defining some constants used throughout the examples and +allocating two arrays: + +.. literalinclude:: ../../../../exercises/kernelintro-execpols_solution.cpp + :start-after: _init_define_start + :end-before: _init_define_end + :language: C++ + +Note that we use the 'memory manager' routines contained in the exercise +directory to simplify the allocation process. In particular, CUDA unified +memory is used when CUDA is enabled to simplify accessing the data on the +host or device. + +Next, we execute a C-style nested for-loop version of the kernel to initialize +the entries in the 'reference' array that we will use to compare the results +of other variants for correctness: + +.. literalinclude:: ../../../../exercises/kernelintro-execpols_solution.cpp + :start-after: _cstyle_tensorinit_seq_start + :end-before: _cstyle_tensorinit_seq_end + :language: C++ + +Note that we manually compute pointer offsets for the (i,j,k) indices. +To simplify the remaining kernel variants we introduce a ``RAJA::View`` +object, which wraps the tensor data pointer and simplifies the multi-dimensional +indexing: + +.. literalinclude:: ../../../../exercises/kernelintro-execpols_solution.cpp + :start-after: _3D_raja_view_start + :end-before: _3D_raja_view_end + :language: C++ + +Here 'aView' is a three-dimensional View with extent 'N' in each +coordinate based on a three-dimensional ``RAJA::Layout`` object where the +array entries will be accessed using indices of type 'int'. Please see +:ref:`feat-view-label` for more information about the View and Layout types that +RAJA provides for various indexing patterns and data layouts. + +Using the View, the C-style kernel now looks like: + +.. literalinclude:: ../../../../exercises/kernelintro-execpols_solution.cpp + :start-after: _cstyle_tensorinit_view_seq_start + :end-before: _cstyle_tensorinit_view_seq_end + :language: C++ + +Notice how accessing each (i,j,k) entry in the array is more natural, +and less error prone, using the View. + +The corresponding RAJA sequential version using ``RAJA::kernel`` is: + +.. literalinclude:: ../../../../exercises/kernelintro-execpols_solution.cpp + :start-after: _raja_tensorinit_seq_start + :end-before: _raja_tensorinit_seq_end + :language: C++ + +This should be familiar to the reader who has read the preceding +:ref:`tut-kernelnestedreorder-label` section of this tutorial. + +Suppose we wanted to parallelize the outer 'k' loop using OpenMP multithreading. +A C-style version of this is: + +.. 
literalinclude:: ../../../../exercises/kernelintro-execpols_solution.cpp + :start-after: _cstyle_tensorinit_omp_outer_start + :end-before: _cstyle_tensorinit_omp_outer_end + :language: C++ + +where we have placed the OpenMP directive ``#pragma omp parallel for`` before +the outer loop of the kernel. + +To parallelize all iterations in the entire loop nest, we can apply the OpenMP +``collapse(3)`` clause to map the iterations for all loop levels to OpenMP +threads: + +.. literalinclude:: ../../../../exercises/kernelintro-execpols_solution.cpp + :start-after: _cstyle_tensorinit_omp_collapse_start + :end-before: _cstyle_tensorinit_omp_collapse_end + :language: C++ + +The corresponding RAJA versions of these two OpenMP variants are, +respectively: + +.. literalinclude:: ../../../../exercises/kernelintro-execpols_solution.cpp + :start-after: _raja_tensorinit_omp_outer_start + :end-before: _raja_tensorinit_omp_outer_end + :language: C++ + +and + +.. literalinclude:: ../../../../exercises/kernelintro-execpols_solution.cpp + :start-after: _raja_tensorinit_omp_collapse_start + :end-before: _raja_tensorinit_omp_collapse_end + :language: C++ + +The first of these, in which we parallelize the outer 'k' loop, replaces +the ``RAJA::loop_exec`` loop execution policy with the +``RAJA::omp_parallel_for_exec`` policy, which applies the same OpenMP +directive to the outer loop used in the C-style variant. + +The RAJA OpenMP collapse variant introduces the ``RAJA::statement::Collapse`` +statement type. We use the ``RAJA::omp_parallel_collapse_exec`` execution +policy type and indicate that we want to collapse all three loop levels +in the second template argument ``RAJA::ArgList<2, 1, 0>``. The integer +values in the list indicate the order of the loops in the collapse operation: +'k' (2) outer, 'j' (1) middle, and 'i' (0) inner. The integers represent +the order of the lambda arguments and the order of the range segments in the +iteration space tuple. + +The first RAJA-based kernel for parallel GPU execution using the RAJA CUDA +back-end we introduce is: + +.. literalinclude:: ../../../../exercises/kernelintro-execpols_solution.cpp + :start-after: _raja_tensorinit_cuda_start + :end-before: _raja_tensorinit_cuda_end + :language: C++ + +Here, we use the ``RAJA::statement::CudaKernel`` statement type to +indicate that we want a CUDA kernel to be launched. The 'k', 'j', 'i' +iteration variables are mapped to CUDA threads using the CUDA execution +policy types ``RAJA::cuda_thread_z_loop``, ``RAJA::cuda_thread_y_loop``, +and ``RAJA::cuda_thread_x_loop``, respectively. Thus, we use a +a three-dimensional CUDA thread-block to map the loop iterations to CUDA +threads. The ``_loop`` part of each execution policy name indicates that +the indexing in the associated portion of the mapping will use a block-stride +loop. This is useful to guarantee that the policy will work for any +array regardless of size in each coordinate dimension. + +To execute the kernel with a prescribed mapping of iterations to a +thread-block using RAJA, we could do the following: + +.. literalinclude:: ../../../../exercises/kernelintro-execpols_solution.cpp + :start-after: _raja_tensorinit_cuda_tiled_direct_start + :end-before: _raja_tensorinit_cuda_tiled_direct_end + :language: C++ + +where we have defined the CUDA thread-block dimensions as: + +.. 
literalinclude:: ../../../../exercises/kernelintro-execpols_solution.cpp + :start-after: _cuda_blockdim_start + :end-before: _cuda_blockdim_end + :language: C++ + +The ``RAJA::statement::CudaKernelFixed`` statement indicates that we want to +use a fixed thread-block size of 256. To ensure that we are mapping the kernel +iterations properly in chunks of 256 threads to each thread-block, we use RAJA +tiling statements in which we specify the tile size for each dimension/loop +index so that each tile has dimensions (32, 8, 1). For example, the statement +``RAJA::statement::Tile<1, RAJA::tile_fixed`` is used on the +'j' loop, which has a tile size of 8 associated with that dimension. Note that +we do not tile the 'k' loop, since the block size is one in that dimension. + +The other main difference with the previous block-stride loop kernel +version is that we map iterations within each tile directly to threads in +a block; for example, using a ``RAJA::cuda_block_y_direct`` policy type +for the 'j' loop. RAJA *direct* policy types eliminate the block-stride looping, +which is not necessary here since we prescribe a block-size of 256 which +fits within the thread-block size limitation of the CUDA device programming +model. + +For context and comparison, here is the same kernel implementation using +CUDA directly: + +.. literalinclude:: ../../../../exercises/kernelintro-execpols_solution.cpp + :start-after: _cuda_tensorinit_tiled_direct_start + :end-before: _cuda_tensorinit_tiled_direct_end + :language: C++ + +The ``nested_init`` device kernel used here is: + +.. literalinclude:: ../../../../exercises/kernelintro-execpols_solution.cpp + :start-after: _cuda_tensorinit_kernel_start + :end-before: _cuda_tensorinit_kernel_end + :language: C++ + +A few differences between the CUDA and RAJA-CUDA versions are worth noting. +First, the CUDA version uses the CUDA ``dim3`` construct to express the +threads-per-block and number of thread-blocks to use: i.e., the +``nthreads_per_block`` and ``nblocks`` variable definitions. Note that +RAJA provides a macro ``RAJA_DIVIDE_CEILING_INT`` to perform the proper +integer arithmetic to calculate the number of blocks based on the size of the +array and the block size in each dimension. Second, the mapping of thread +identifiers to the (i,j,k) indices is explicit in the device kernel. Third, +an explicit check of the (i,j,k) values is required in the CUDA implementation +to avoid addressing memory out-of-bounds; i.e., +``if ( i < N && j < N && k < N )...``. The RAJA kernel variants set similar +definitions internally and **mask out indices that would be out-of-bounds.** +Note that we also inserted additional error checking with ``static_assert`` +and ``cudaErrchk``, which is a RAJA macro, for printing CUDA device error +codes, to catch device errors if there are any. + +Lastly, we show the RAJA HIP variants of the kernel, which are semantically +identical to the RAJA CUDA variants we just described. First, the RAJA-HIP +block-stride loop variant: + +.. literalinclude:: ../../../../exercises/kernelintro-execpols_solution.cpp + :start-after: _raja_tensorinit_hip_start + :end-before: _raja_tensorinit_hip_end + :language: C++ + +and then the RAJA-HIP fixed thread-block size, tiled, direct thread mapping +version: + +.. 
literalinclude:: ../../../../exercises/kernelintro-execpols_solution.cpp + :start-after: _raja_tensorinit_hip_tiled_direct_start + :end-before: _raja_tensorinit_hip_tiled_direct_end + :language: C++ + +The only differences are that type names are changed to replace 'CUDA' types +with 'HIP' types to use the RAJA HIP back-end. diff --git a/docs/sphinx/user_guide/tutorial/kernel_nested_loop_reorder.rst b/docs/sphinx/user_guide/tutorial/kernel_nested_loop_reorder.rst new file mode 100644 index 0000000000..757b153ce2 --- /dev/null +++ b/docs/sphinx/user_guide/tutorial/kernel_nested_loop_reorder.rst @@ -0,0 +1,237 @@ +.. ## +.. ## Copyright (c) 2016-22, Lawrence Livermore National Security, LLC +.. ## and RAJA project contributors. See the RAJA/LICENSE file +.. ## for details. +.. ## +.. ## SPDX-License-Identifier: (BSD-3-Clause) +.. ## + +.. _tut-kernelnestedreorder-label: + +----------------------------------------------------------- +Basic ``RAJA::kernel`` Mechanics and Nested Loop Ordering +----------------------------------------------------------- + +This section contains an exercise file ``RAJA/exercises/kernelintro-nested-loop-reorder.cpp`` +for you to work through if you wish to get some practice with RAJA. The +file ``RAJA/exercises/kernelintro-nested-loop-reorder_solution.cpp`` contains +complete working code for the examples discussed in this section. You can use +the solution file to check your work and for guidance if you get stuck. To build +the exercises execute ``make kernelintro-nested-loop-reorder`` and ``make kernelintro-nested-loop-reorder_solution`` +from the build directory. + +Key RAJA features shown in this section are: + + * ``RAJA::kernel`` loop iteration templates and execution policies + * Nested loop reordering + * RAJA strongly-types indices + +The examples in this +section show the nested loop reordering process in more detail. +Specifically, we describe how to reorder execution policy statements, which +is conceptually analogous to how one would reorder for-loops in a C-style loop +nest. We also introduce strongly-typed index variables that can help users +write correct nested loop code with RAJA. The examples do not perform any +computation; each kernel simply prints out the loop indices in the +order that the iteration spaces are traversed. Thus, only sequential execution +policies are used to avoid complications resulting from print statements +used in parallel programs. The mechanics shown here work the same way for +parallel RAJA execution policies. + +Before we dive into code, we reiterate important features that +represent the main differences between nested-loop RAJA and the +``RAJA::forall`` construct for simple, non-nested loop kernels: + + * An index space (e.g., range segment) and lambda index argument are + required for each level in a loop nest. This example contains + triply-nested loops, so there will be three ranges and three index + arguments. + + * The index spaces for the nested loop levels are specified in a RAJA tuple + object. The order of spaces in the tuple must match the order of index + arguments to the lambda for this to be correct in general. RAJA provides + strongly-typed indices to help with this, which we show below. + + * An execution policy is required for each level in a loop nest. These + are specified as nested statements in the ``RAJA::KernelPolicy`` type. 
+ + * The loop nest ordering is specified in the nested kernel policy -- + the first ``statement::For`` type identifies the outermost loop, the + second ``statement::For`` type identifies the loop nested inside the + outermost loop, and so on. + +We begin by defining three named **strongly-typed** variables for the loop +index variables (i, j, k): + +.. literalinclude:: ../../../../exercises/kernelintro-nested-loop-reorder_solution.cpp + :start-after: _raja_typed_indices_start + :end-before: _raja_typed_indices_end + :language: C++ + +Specifically, the 'i' index variable type is ``IIDX``, the 'j' index variable +is ``JIDX``, and the 'k' variable is ``KIDX``, which are aliases to +``int`` type. + +We also define [min, max) intervals for each loop index: + +.. literalinclude:: ../../../../exercises/kernelintro-nested-loop-reorder_solution.cpp + :start-after: _range_min_max_start + :end-before: _range_min_max_end + :language: C++ + +and three corresponding **typed** range segments which bind the ranges to the +index variable types via template specialization: + +.. literalinclude:: ../../../../exercises/kernelintro-nested-loop-reorder_solution.cpp + :start-after: _raja_typed_index_ranges_start + :end-before: _raja_typed_index_ranges_end + :language: C++ + +When these features are used as in this example, the compiler will +generate error messages if the lambda expression index argument ordering +and types do not match the index ordering in the tuple. This is illustrated +at the end of this section. + +We begin with a C-style loop nest with 'i' in the inner loop, 'j' in the +middle loop, and 'k' in the outer loop, which prints the (i, j, k) triple +in the inner loop body: + +.. literalinclude:: ../../../../exercises/kernelintro-nested-loop-reorder_solution.cpp + :start-after: _cstyle_kji_loops_start + :end-before: _cstyle_kji_loops_end + :language: C++ + +The ``RAJA::kernel`` version of this is: + +.. literalinclude:: ../../../../exercises/kernelintro-nested-loop-reorder_solution.cpp + :start-after: _raja_kji_loops_start + :end-before: _raja_kji_loops_end + :language: C++ + +The integer template parameters in the ``RAJA::statement::For`` types +represent the lambda expression index argument and the range types in the +iteration space tuple argument to ``RAJA::kernel``. + +Both kernels generate the same output, as expected:: + + (I, J, K) + --------- + (0, 1, 2) + (1, 1, 2) + (0, 2, 2) + (1, 2, 2) + (0, 1, 3) + (1, 1, 3) + (0, 2, 3) + (1, 2, 3) + +which you can see by running the exercise code. + +Here, the ``RAJA::kernel`` execution template takes two arguments: a tuple of +ranges, one for each of the three levels in the loop nest, and the lambda +expression loop body. Note that the lambda has an index argument for each +range and that their order and types match. This is required for the code to +compile. + +.. note:: RAJA provides mechanisms to explicitly specify which loop variables, + for example, and in which order they appear in a lambda expression + argument list. Please refer to :ref:`loop_elements-kernel-label` + for more information. + +The execution policy for the loop nest is specified in the +``RAJA::KernelPolicy`` type. The policy uses two statement types: +``RAJA::statement::For`` and ``RAJA::statement::Lambda``. + +The ``RAJA::statement::Lambda`` is used to generate code that invokes the +lambda expression. The '0' template parameter refers to the index of the +lambda expression in the ``RAJA::kernel`` argument list following the +iteration space tuple. 
Since there is only one lambda expression, we reference +it with the '0' identifier. Sometimes more complicated kernels require multiple +lambda expressions, so we need a way to specify where they will appear in the +generated executable code. We show examples of this in the matrix transpose +discussion later in the tutorial. + +Each level in the loop nest is identified by a +``RAJA::statement::For`` type, which identifies the iteration space and +execution policy for the level. Here, each level uses a +sequential execution policy, which is for illustration purposes. +The integer that appears as the first template argument to each +``RAJA::statement::For`` type corresponds to the index of a range in the tuple +and also to the associated lambda index argument; i.e., '0' for 'i', +'1' for 'j', and '2' for 'k'. + +The integer argument to each ``RAJA::statement::For`` type is needed so +that the levels in the loop nest can be reordered by changing the policy +while the kernel remains the same. To illustrate, we permute the loop nest +ordering so that the 'j' loop is the outermost, the 'i' loop is in the middle, +and the 'k' loop is the innermost with the following policy: + +.. literalinclude:: ../../../../exercises/kernelintro-nested-loop-reorder_solution.cpp + :start-after: _raja_jik_loops_start + :end-before: _raja_jik_loops_end + :language: C++ + +This generates the following output:: + + (I, J, K) + --------- + (0, 1, 2) + (0, 1, 3) + (1, 1, 2) + (1, 1, 3) + (0, 2, 2) + (0, 2, 3) + (1, 2, 2) + (1, 2, 3) + +which is the same as the corresponding C-style version: + +.. literalinclude:: ../../../../exercises/kernelintro-nested-loop-reorder_solution.cpp + :start-after: _cstyle_jik_loops_start + :end-before: _cstyle_jik_loops_end + :language: C++ + +Note that we have simply reordered the nesting of the ``RAJA::statement::For`` +types in the execution policy. This is analogous to reordering the for-loops +in C-style version. + +For completeness, we permute the loops again so that the 'i' loop +is the outermost, the 'k' loop is in the middle, and the 'j' loop is the +innermost with the following policy: + +.. literalinclude:: ../../../../exercises/kernelintro-nested-loop-reorder_solution.cpp + :start-after: _raja_ikj_loops_start + :end-before: _raja_ikj_loops_end + :language: C++ + +The analogous C-style loop nest is: + +.. literalinclude:: ../../../../exercises/kernelintro-nested-loop-reorder_solution.cpp + :start-after: _cstyle_ikj_loops_start + :end-before: _cstyle_ikj_loops_end + :language: C++ + +The output generated by these two kernels is:: + + (I, J, K) + --------- + (0, 1, 2) + (0, 2, 2) + (0, 1, 3) + (0, 2, 3) + (1, 1, 2) + (1, 2, 2) + (1, 1, 3) + (1, 2, 3) + +Finally, we show an example that will generate a compilation error because +there is a type mismatch in the ordering of the range segments in the tuple +and the lambda expression argument list. + +.. literalinclude:: ../../../../exercises/kernelintro-nested-loop-reorder_solution.cpp + :start-after: _raja_compile_error_start + :end-before: _raja_compile_error_end + :language: C++ + +Do you see the problem? The last kernel is included in the exercise source +file, so you can see what happens when you attempt to compile it. diff --git a/docs/sphinx/user_guide/tutorial/launch_basic.rst b/docs/sphinx/user_guide/tutorial/launch_basic.rst new file mode 100644 index 0000000000..19f6f153ca --- /dev/null +++ b/docs/sphinx/user_guide/tutorial/launch_basic.rst @@ -0,0 +1,99 @@ +.. ## +.. 
## Copyright (c) 2016-20, Lawrence Livermore National Security, LLC
+.. ## and RAJA project contributors. See the RAJA/LICENSE file
+.. ## for details.
+.. ##
+.. ## SPDX-License-Identifier: (BSD-3-Clause)
+.. ##
+
+.. _tut-launchintro-label:
+
+------------------------------
+``RAJA::Launch`` Basics
+------------------------------
+
+There are no exercise files to work through for this section. Instead, there
+is an example source file ``RAJA/examples/tut_launch_basic.cpp`` which
+contains complete code examples of the concepts described here.
+
+Key RAJA features shown in the following examples are:
+
+  * ``RAJA::launch`` method to create a run-time
+    selectable host/device execution space.
+  * ``RAJA::loop`` methods to express algorithms
+    in terms of nested for loops.
+
+In this example, we introduce the RAJA Launch framework and discuss
+hierarchical loop-based parallelism. Kernel execution details
+with RAJA Launch occur inside the lambda expression
+passed to the ``RAJA::launch`` method, which defines an execution
+space::
+
+    RAJA::launch<launch_policy>(RAJA::ExecPlace ,
+       RAJA::LaunchParams(RAJA::Teams(Nteams,Nteams),
+                          RAJA::Threads(Nthreads,Nthreads)),
+       [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) {
+
+         /* Kernel code goes here */
+
+       });
+
+The ``RAJA::launch`` method accepts a ``RAJA::LaunchPolicy``
+template parameter that can be defined using up to two policies
+(host and device). For example, the following creates an execution space
+for a sequential and CUDA kernel dispatch::
+
+   using launch_policy = RAJA::LaunchPolicy
+     <RAJA::seq_launch_t, RAJA::cuda_launch_t<false>>;
+
+Whether a kernel executes on the host or device is determined by the first
+argument passed to the ``RAJA::launch`` method, which is a
+``RAJA::ExecPlace`` enum value, either ``HOST`` or ``DEVICE``.
+Similar to GPU thread and block programming models, RAJA Launch carries out
+computation in a predefined compute grid made up of threads which are
+then grouped into teams when executing on the device. The execution space is
+then enclosed by a host/device lambda which takes a
+``RAJA::LaunchContext`` object, which may be used to control the flow
+within the kernel, for example by creating thread-team synchronization points.
+
+Inside the execution space, developers write a kernel using nested
+``RAJA::loop`` methods. The manner in which each loop is executed
+is determined by a template parameter type, which
+indicates how the corresponding iterates are mapped to the Teams/Threads
+configuration defined by the ``RAJA::LaunchParams`` type passed as the second
+argument to the ``RAJA::launch`` method. Following the CUDA and HIP
+programming models, this defines a hierarchical structure in which outer loops
+are executed by thread-teams and inner loops are executed by threads in a team.
+
+.. literalinclude:: ../../../../examples/tut_launch_basic.cpp
+   :start-after: // _team_loops_start
+   :end-before: // _team_loops_end
+   :language: C++
+
+The mapping of teams and threads to the underlying programming
+model depends on how the ``RAJA::loop`` template parameter types are
+defined. For example, we may define host and device mapping strategies as::
+
+   using teams_x = RAJA::LoopPolicy<RAJA::loop_exec,
+                                    RAJA::cuda_block_x_direct>;
+   using thread_x = RAJA::LoopPolicy<RAJA::loop_exec,
+                                     RAJA::cuda_thread_x_direct>;
+
+Here, the ``RAJA::LoopPolicy`` type holds both the host (CPU) and
+device (CUDA GPU) loop mapping strategies. On the host, both the team/thread
+strategies expand out to standard C-style loops for execution:
+
+.. 
literalinclude:: ../../../../examples/tut_launch_basic.cpp + :start-after: // _c_style_loops_start + :end-before: // _c_style_loops_end + :language: C++ + +On the device the ``teams_x/y`` policies will map loop iterations directly to +CUDA (or HIP) thread blocks, while the ``thread_x/y`` policies will map loop +iterations directly to threads in a CUDA (or HIP) thread block. The direct CUDA +equivalent of the kernel body using the policy shown above is: + +.. literalinclude:: ../../../../examples/tut_launch_basic.cpp + :start-after: // _device_loop_start + :end-before: // _device_loop_end + :language: C++ diff --git a/docs/sphinx/user_guide/tutorial/launch_exec_pols.rst b/docs/sphinx/user_guide/tutorial/launch_exec_pols.rst new file mode 100644 index 0000000000..76866804c1 --- /dev/null +++ b/docs/sphinx/user_guide/tutorial/launch_exec_pols.rst @@ -0,0 +1,225 @@ +.. ## +.. ## Copyright (c) 2016-22, Lawrence Livermore National Security, LLC +.. ## and RAJA project contributors. See the RAJA/LICENSE file +.. ## for details. +.. ## +.. ## SPDX-License-Identifier: (BSD-3-Clause) +.. ## + +.. _tut-launchexecpols-label: + +----------------------------------------------------------- +``RAJA::Launch`` Execution Policies +----------------------------------------------------------- + +This section contains an exercise file +``RAJA/exercises/launchintro-execpols.cpp`` for you to work through if you +wish to get some practice with RAJA. The file +``RAJA/exercises/launchintro-execpols_solution.cpp`` contains complete working +code for the examples discussed in this section. You can use the solution file +to check your work and for guidance if you get stuck. To build the exercises +execute ``make launchintro-execpols`` and ``make launchintro-execpols_solution`` +from the build directory. + +Key RAJA features shown in this section are: + + * ``RAJA::launch`` kernel execution environment template + * ``RAJA::loop`` loop execution template and execution policies + +The examples in this section illustrate how to construct nested loop kernels +inside an ``RAJA::launch`` execution environment. In particular, +the goal is for you to gain an understanding of how to use execution policies +with nested ``RAJA::loop`` method calls to perform various nested +loop execution patterns. All examples use the same simple kernel, which +is a three-level loop nest to initialize the entries in an array. The kernels +perform the same operations as the examples in :ref:`tut-kernelexecpols-label`. +By comparing the two sets of examples, you will gain an understanding of the +differences between the ``RAJA::kernel`` and the ``RAJA::launch`` +interfaces. + +We begin by defining some constants used throughout the examples and allocating +arrays to represent the array data: + +.. literalinclude:: ../../../../exercises/launchintro-execpols_solution.cpp + :start-after: _init_define_start + :end-before: _init_define_end + :language: C++ + +Note that we use the 'memory manager' routines contained in the exercise +directory to simplify the allocation process. In particular, CUDA unified +memory is used when CUDA is enabled to simplify accessing the data on the +host or device. + +Next, we execute a C-style nested for-loop version of the kernel to initialize +the entries in the 'reference' array that we will use to compare the results +of other variants for correctness: + +.. 
literalinclude:: ../../../../exercises/launchintro-execpols_solution.cpp + :start-after: _cstyle_tensorinit_seq_start + :end-before: _cstyle_tensorinit_seq_end + :language: C++ + +Note that we manually compute the pointer offsets for the (i,j,k) indices. +To simplify the remaining kernel variants we introduce a ``RAJA::View`` +object, which wraps the array data pointer and simplifies the multi-dimensional +indexing: + +.. literalinclude:: ../../../../exercises/launchintro-execpols_solution.cpp + :start-after: _3D_raja_view_start + :end-before: _3D_raja_view_end + :language: C++ + +Here 'aView' is a three-dimensional View with extent 'N' in each +coordinate based on a three-dimensional ``RAJA::Layout`` object where the +array entries will be accessed using indices of type 'int'. +indices of type ``int``. Please see :ref:`feat-view-label` for more information +about the View and Layout types that RAJA provides for various indexing +patterns and data layouts. + +Using the View, the C-style kernel looks like: + +.. literalinclude:: ../../../../exercises/launchintro-execpols_solution.cpp + :start-after: _cstyle_tensorinit_view_seq_start + :end-before: _cstyle_tensorinit_view_seq_end + :language: C++ + +Notice how accessing each (i,j,k) entry in the array is more natural, +and less error prone, using the View. + +The corresponding RAJA sequential version using ``RAJA::launch`` is: + +.. literalinclude:: ../../../../exercises/launchintro-execpols_solution.cpp + :start-after: _raja_tensorinit_seq_start + :end-before: _raja_tensorinit_seq_end + :language: C++ + +This should be familiar to the reader who has read through the preceding +:ref:`tut-launchintro-label` section of this tutorial. As the +``RAJA::launch`` method is templated on a host execution policy, the +``RAJA::LaunchParams`` object can be defined without arguments as loop methods +will get dispatched as standard C-Style for-loops. + +Suppose we wanted to parallelize the outer 'k' loop using OpenMP multithreading. +A C-style version of this is: + +.. literalinclude:: ../../../../exercises/launchintro-execpols_solution.cpp + :start-after: _cstyle_tensorinit_omp_outer_start + :end-before: _cstyle_tensorinit_omp_outer_end + :language: C++ + +where we have placed the OpenMP directive ``#pragma omp parallel for`` before +the outer loop of the kernel. + +The corresponding RAJA versions of the C-style OpenMP variant is: + +.. literalinclude:: ../../../../exercises/launchintro-execpols_solution.cpp + :start-after: _raja_tensorinit_omp_outer_start + :end-before: _raja_tensorinit_omp_outer_end + :language: C++ + +With the OpenMP version above, ``RAJA::launch`` method is templated on +a ``RAJA::omp_launch_t`` execution policy. The policy is used +to create an OpenMP parallel region, loop iterations may then be distributed +using ``RAJA::loop`` methods templated on ``RAJA::omp_for_exec`` +execution policies. As before, the ``RAJA::LaunchParams`` object may be +initialized without grid dimensions as the CPU does not require specifying a +compute grid. + +The first RAJA-based kernel for parallel GPU execution using the RAJA CUDA +back-end we introduce is: + +.. literalinclude:: ../../../../exercises/launchintro-execpols_solution.cpp + :start-after: _raja_tensorinit_cuda_start + :end-before: _raja_tensorinit_cuda_end + :language: C++ + +where we have defined the CUDA thread-block dimensions as: + +.. 
literalinclude:: ../../../../exercises/launchintro-execpols_solution.cpp + :start-after: _cuda_blockdim_start + :end-before: _cuda_blockdim_end + :language: C++ + +Here, we use the ``RAJA::cuda_launch_t`` policy type to +indicate that we want a CUDA kernel to be launched. The 'k', 'j', 'i' +iteration variables are mapped to CUDA threads and blocks using the CUDA +execution policy types ``RAJA::cuda_block_z_direct``, +``RAJA::cuda_global_thread_y``, and ``RAJA::cuda_global_thread_x``, +respectively. Thus, we use a two-dimensional CUDA thread-block and +three-dimensional compute grid to map the loop iterations to CUDA threads. In +comparison to the RAJA CUDA example in :ref:`tut-kernelexecpols-label` , +``RAJA::loop`` methods support execution policies, which enable mapping +directly to the global thread ID of a compute grid. + +Using a combination of ``RAJA::tile`` and ``RAJA::loop`` methods, +we can create a loop tiling platform portable implementation. Here, is a +CUDA variant: + +.. literalinclude:: ../../../../exercises/launchintro-execpols_solution.cpp + :start-after: _raja_tensorinit_cuda_tiled_direct_start + :end-before: _raja_tensorinit_cuda_tiled_direct_end + :language: C++ + +We consider the kernel to be portable, because all of the execution policy types +and execution parameters can be replaced by other types and values without +changing the kernel code directly. + +The ``RAJA::tile`` methods are used to partition an iteration space into +tiles to be used within a ``RAJA::loop`` method. The '{i,j,k}_block_sz' +arguments passed to the ``RAJA::tile`` function specify the tile size +for each loop. In the case of GPU programming models, we define the tile size +to correspond to the number of threads in a given dimension. Execution tile +and loop execution policies are chosen to have CUDA blocks and threads map +directly to tiles and entries in a tile. + +For context and comparison, here is the same kernel implementation using +CUDA directly: + +.. literalinclude:: ../../../../exercises/launchintro-execpols_solution.cpp + :start-after: _cuda_tensorinit_tiled_direct_start + :end-before: _cuda_tensorinit_tiled_direct_end + :language: C++ + +The ``nested_init`` device kernel used here is: + +.. literalinclude:: ../../../../exercises/launchintro-execpols_solution.cpp + :start-after: _cuda_tensorinit_kernel_start + :end-before: _cuda_tensorinit_kernel_end + :language: C++ + +A few differences between the CUDA and RAJA-CUDA versions are worth noting. +First, the CUDA version uses the CUDA ``dim3`` construct to express the +threads-per-block and number of thread-blocks to use: i.e., the +``nthreads_per_block`` and ``nblocks`` variable definitions. The +``RAJA::launch`` interface takes compute dimensions through a +``RAJA::LaunchParams`` object. RAJA provides a macro ``RAJA_DIVIDE_CEILING_INT`` +to perform the proper integer arithmetic to calculate the number of blocks +based on the size of the array and the block size in each dimension. Second, the +mapping of thread identifiers to the (i,j,k) indices is explicit in the device +kernel. Third, an explicit check of the (i,j,k) values is required in the CUDA +implementation to avoid addressing memory out-of-bounds; i.e., +``if ( i < N && j < N && k < N )...``. 
The RAJA variants set similar +definitions internally and **mask out indices that would be out-of-bounds.** +Note that we also inserted additional error checking with ``static_assert`` +and ``cudaErrchk``, which is a RAJA macro, for printing CUDA device error +codes, to catch device errors if there are any. + +Lastly, we show the RAJA HIP variants of the kernel, which are semantically +identical to the RAJA CUDA variants. First, the RAJA-HIP global-thread +variant: + +.. literalinclude:: ../../../../exercises/launchintro-execpols_solution.cpp + :start-after: _raja_tensorinit_hip_start + :end-before: _raja_tensorinit_hip_end + :language: C++ + +and then the RAJA Launch HIP fixed thread-block size, tiled, direct thread +mapping version: + +.. literalinclude:: ../../../../exercises/launchintro-execpols_solution.cpp + :start-after: _raja_tensorinit_hip_tiled_direct_start + :end-before: _raja_tensorinit_hip_tiled_direct_end + :language: C++ + +The only differences are that type names are changed to replace 'CUDA' types +with 'HIP' types to use the RAJA HIP back-end. diff --git a/docs/sphinx/user_guide/tutorial/launch_naming_kernels.rst b/docs/sphinx/user_guide/tutorial/launch_naming_kernels.rst new file mode 100644 index 0000000000..7fa02a0fcb --- /dev/null +++ b/docs/sphinx/user_guide/tutorial/launch_naming_kernels.rst @@ -0,0 +1,65 @@ +.. ## +.. ## Copyright (c) 2016-20, Lawrence Livermore National Security, LLC +.. ## and RAJA project contributors. See the RAJA/LICENSE file +.. ## for details. +.. ## +.. ## SPDX-License-Identifier: (BSD-3-Clause) +.. ## + +.. _tut-teamsbasic-label: + +------------------------------------ +Naming kernels for NVTX/ROCTX tools +------------------------------------ + +There are no exercise files to work through for this section. Instead, there +is an example source file ``RAJA/examples/teams_reductions.cpp`` which +contains complete code examples of the concepts described here. + +Key RAJA feature shown in the following example: + + * Naming kernels using an optional argument in ``RAJA::launch`` methods. + +In this example, we illustrate kernel naming capabilities within the RAJA Launch +framework for use with NVTX or ROCTX region naming capabilities. + +To name a ``RAJA::launch`` kernel, a string name is passed as an argument +before the lambda :: + + RAJA::launch(RAJA::ExecPlace , + RAJA::LaunchParams(RAJA::Teams(Nteams,Nteams), + RAJA::Threads(Nthreads,Nthreads)), + "myKernel", + [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { + + /* Kernel body code goes here */ + + } + ); + +The kernel name is used to create NVTX (NVIDIA) or ROCTX (AMD) ranges enabling +developers to identify kernels using NVIDIA `Nsight `_ +and NVIDIA `nvprof `_ profiling +tools or `ROCm `_ +profiling tools when using ROCTX. As an illustration, nvprof +kernels are identified as ranges of GPU activity using the provided kernel +name:: + + ==73220== NVTX result: + ==73220== Thread "" (id = 290832) + ==73220== Domain "" + ==73220== Range "myKernel" + Type Time(%) Time Calls Avg Min Max Name + Range: 100.00% 32.868us 1 32.868us 32.868us 32.868us myKernel + GPU activities: 100.00% 2.0307ms 1 2.0307ms 2.0307ms 2.0307ms _ZN4RAJA4expt17launch_global_fcnIZ4mainEUlNS0_13LaunchContextEE_EEvS2_T_ + API calls: 100.00% 27.030us 1 27.030us 27.030us 27.030us cudaLaunchKernel + +Similarly, ROCm tools can be used to generate traces of a profile and +the resulting json file can be viewed using tools such as `Perfetto +`_. + +In future work, we plan to add support to other profiling tools. 
Thus, API +changes may occur based on user feedback and integration with other tools. +Enabling NVTX profiling with RAJA Launch requires RAJA to be configured with +RAJA_ENABLE_NV_TOOLS_EXT=ON. +or RAJA_ENABLE_ROCTX=ON for ROCTX profiling on AMD platforms platforms. diff --git a/docs/sphinx/user_guide/tutorial/matrix_multiply.rst b/docs/sphinx/user_guide/tutorial/matrix_multiply.rst index 74afc5fc4b..5760627743 100644 --- a/docs/sphinx/user_guide/tutorial/matrix_multiply.rst +++ b/docs/sphinx/user_guide/tutorial/matrix_multiply.rst @@ -6,18 +6,23 @@ .. ## SPDX-License-Identifier: (BSD-3-Clause) .. ## -.. _matrixmultiply-label: +.. _tut-matrixmultiply-label: ------------------------------------ -Matrix Multiplication (Nested Loops) +Matrix Multiplication: RAJA::kernel ------------------------------------ +The file ``RAJA/examples/tut_matrix-multiply.cpp`` contains the complete +working code for all examples described in this section, plus others that +show a variety of ``RAJA::kernel`` execution policy types. It also contains +raw CUDA and HIP versions of the kernel for comparison. + Key RAJA features shown in the following examples: * ``RAJA::kernel`` template for nested-loop execution * RAJA kernel execution policies * ``RAJA::View`` multi-dimensional data access - * Basic RAJA nested-loop interchange + * RAJA nested-loop interchange * Specifying lambda arguments through statements In this example, we present different ways to perform multiplication of two @@ -31,8 +36,8 @@ C-version: :end-before: _matmult_macros_end :language: C++ -Then, a typical C-style sequential matrix multiplication operation looks like -this: +Then, a typical C-style sequential matrix multiplication operation might +look like this: .. literalinclude:: ../../../../examples/tut_matrix-multiply.cpp :start-after: _matmult_cstyle_start @@ -43,7 +48,7 @@ For the RAJA variants of the matrix multiple operation presented below, we use ``RAJA::View`` objects, which allow us to access matrix entries in a multi-dimensional manner similar to the C-style version that uses macros. We create a two-dimensional N x N 'view' -for each of the three matrices: +for each matrix: .. literalinclude:: ../../../../examples/tut_matrix-multiply.cpp :start-after: _matmult_views_start @@ -53,70 +58,16 @@ for each of the three matrices: We show the most basic RAJA view usage here -- to simplify multi-dimensional array indexing. RAJA views can be used to abstract a variety of different data layouts and access patterns, including stride permutations, offsets, etc. -For more information about RAJA views, see :ref:`view-label`. +For more information about RAJA views, see :ref:`feat-view-label`. -We also use the following ``RAJA::RangeSegment`` objects to define the matrix -row and column and dot product iteration spaces: +We also use the following ``RAJA::TypedRangeSegment`` objects to define the +matrix row and column and dot product iteration spaces: .. literalinclude:: ../../../../examples/tut_matrix-multiply.cpp :start-after: _matmult_ranges_start :end-before: _matmult_ranges_end :language: C++ -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Should I Use RAJA::forall For Nested Loops? -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -We begin by walking through some RAJA variants of the matrix multiplication -operation that show RAJA usage that **we do not recommend**, but which helps -to motivate the ``RAJA::kernel`` interface. We noted some rationale behind -this preference in :ref:`loop_elements-kernel-label`. 
Here, we discuss this -in more detail. - -Starting with the C-style kernel above, we first convert the outermost -'row' loop to a ``RAJA::forall`` method call with a sequential execution policy: - -.. literalinclude:: ../../../../examples/tut_matrix-multiply.cpp - :start-after: _matmult_outerforall_start - :end-before: _matmult_outerforall_end - :language: C++ - -Here, the lambda expression for the loop body contains the inner -'col' and 'k' loops. - -Note that changing the RAJA execution policy to an OpenMP or CUDA policy -enables the outer 'row' loop to run in parallel. When this is done, -each thread executes the lambda expression body, which contains the 'col' -and 'k' loops. Although this enables some parallelism, there is still more -available. In a bit, we will how the ``RAJA::kernel`` interface helps us to -expose all available parallelism. - -Next, we nest a ``RAJA::forall`` method call for the 'column' loop inside the -outer lambda expression: - -.. literalinclude:: ../../../../examples/tut_matrix-multiply.cpp - :start-after: _matmult_nestedforall_start - :end-before: _matmult_nestedforall_end - :language: C++ - -Here, the innermost lambda expression contains the row-column dot product -initialization, the inner 'k' loop for the dot product, and the operation -that assigns the dot product to the proper location in the result matrix. - -Note that we can replace either RAJA execution policy with an OpenMP -execution policy to parallelize either the 'row' or 'col' loop. For example, -we can use an OpenMP execution policy on the outer 'row' loop and the result -will be the same as using an OpenMP execution policy in the earlier case that -used a ``RAJA::forall`` statement for the outer loop. - -We do not recommend using a parallel execution policy for both loops in -this type of kernel as the results may not be what is expected and RAJA -provides better mechanisms for parallelizing nested loops. Also, changing -the outer loop policy to a CUDA policy will not compile. This is by design -in RAJA since nesting forall statements inside lambdas in this way has limited -utility, is inflexible, and can hinder performance when compared to -``RAJA::kernel`` constructs, which we describe next. - .. _matmultkernel-label: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -142,14 +93,15 @@ second argument is the lambda loop body. Unlike ``RAJA::forall``, the iteration space for ``RAJA::kernel`` is defined as a *tuple* of ranges (created via the ``RAJA::make_tuple`` method), one for the 'col' loop and one for the 'row' loop. Also, the lambda expression takes an iteration index -argument for entry in the iteration space tuple. +argument for each entry in the iteration space tuple. .. note :: The number and order of lambda arguments must match the number and - order of the elements in the tuple for this to be correct. + order of the elements in the tuple for this type of ``RAJA::kernel`` + usage to be correct. Another important difference between ``RAJA::forall`` and ``RAJA::kernel`` involves the execution policy template parameter. The execution policy defined -by the ``RAJA::KernelPolicy`` type used here specifies a policy for each level +by the ``RAJA::KernelPolicy`` type shown here specifies a policy for each level in the loop nest via nested ``RAJA::statement::For`` types. Here, the row and column loops will both execute sequentially. 
The integer that appears as the first template parameter to each 'For' statement corresponds to the position of @@ -157,18 +109,14 @@ a range in the iteration space tuple and also to the associated iteration index argument to the lambda. Here, '0' is the 'col' range and '1' is the 'row' range because that is the order those ranges appear in the tuple. The innermost type ``RAJA::statement::Lambda<0>`` indicates that the first lambda -expression (the only one in this case!) argument passed to the -``RAJA::kernel`` method will be invoked inside the nested loops. +expression (the only one in this case) argument passed to the +``RAJA::kernel`` method will be invoked inside the inner loop. The integer arguments to the ``RAJA::statement::For`` types are needed to -enable a variety of kernel execution patterns and transformations. Since the -kernel policy is a single unified construct, it can be used to parallelize -the nested loop iterations together, which we will show later. Also, the -levels in the loop nest can be permuted by reordering the policy arguments; -this is analogous to how one would reorder C-style nested loops; i.e., -reorder for-statements for each loop nest level. These execution patterns -and transformations can be achieved by changing only the policy and leaving the -loop kernel code as is. +enable the desired kernel execution pattern and potential transformations, +without changing the kernel code. Since the kernel policy is a single unified +construct, it can be used to parallelize the nested loop iterations together, +which we show next. If we want to execute the row loop using OpenMP multithreaded parallelism and keep the column loop sequential, the policy we would use is: @@ -181,7 +129,7 @@ and keep the column loop sequential, the policy we would use is: To swap the loop nest ordering and keep the same execution policy on each loop, we would use the following policy, which swaps the ``RAJA::statement::For`` types. The inner loop is now the 'row' loop and is run in parallel; -the outer loop is now the 'col' loop and is still sequential: +the outer loop is now the 'col' loop and is run sequentially: .. literalinclude:: ../../../../examples/tut_matrix-multiply.cpp :start-after: _matmult_ompkernel_swap_start @@ -192,8 +140,8 @@ the outer loop is now the 'col' loop and is still sequential: and others, can be done by switching the ``RAJA::KernelPolicy`` type with no changes to the loop kernel code. -In :ref:`nestedreorder-label`, we provide a more detailed discussion of the -mechanics of loop nest reordering. Next, we show other variations of the +In :ref:`tut-kernelnestedreorder-label`, we provide a more detailed discussion +of the mechanics of loop nest ordering. Next, we show other variations of the matrix multiplication kernel that illustrate other ``RAJA::kernel`` features. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -214,13 +162,13 @@ The first example uses sequential execution for all loops: Note that we use a ``RAJA::kernel_param`` method to execute the kernel. It is similar to ``RAJA::kernel`` except that it accepts a tuple as the second -argument (between the iteration space tuple and the lambda expressions). The -tuple is a set of *parameters* that can be used in the kernel to pass data -into lambda expressions. Here, the parameter tuple holds a single scalar -variable for the dot product. +argument (between the iteration space tuple and the lambda expressions). 
In +general, the tuple is a set of *parameters* that can be used in the lambda +expressions comprising the kernel. Here, the parameter tuple holds a single +scalar variable for the dot product of each row-column pair. The remaining arguments include a sequence of lambda expressions representing -different parts of the inner loop body. We use three lambda expressions that: +different parts of the kernel body. We use three lambda expressions that: initialize the dot product variable (lambda 0), define the 'k' inner loop row-col dot product operation (lambda 1), and store the computed row-col dot product in the proper location in the result matrix (lambda 2). Note that @@ -228,39 +176,47 @@ all lambdas take the same arguments in the same order, which is required for the kernel to be well-formed. In addition to the loop index variables, we pass the scalar dot product variable into each lambda. This enables the same variables to be used in all three lambda expressions. However, observe that -not all lambda expressions use all three index variables. They are declared, -but left unnamed to prevent compiler warnings. - -Alternatively, the lambda statements in the execution policy may be used -to specify which arguments each lambda takes and in which order. For example: +not all lambda expressions use all three index variables. This is the +result of using the ``RAJA::Params`` and ``RAJA::Segs`` template parameter +types in the ``RAJA::statement::Lambda`` types for lambdas '0' and '2'. +Specifically, ``RAJA::statement::Lambda<0, RAJA::Params<0>>`` indicates that +lambda '0' will take only the scalar parameter as an argument, and +``RAJA::statement::Lambda<2, RAJA::Segs<0, 1>, RAJA::Params<0>>`` indicates +that lambda '2' will take index values for the column and row ranges and +the scalar parameter as arguments, in that order. Since lambda '1' takes all +arguments, we do not specify them. + +Alternatively, the statement to invoke lambda '1' could be augmented to +specify the arguments it takes: .. literalinclude:: ../../../../examples/tut_matrix-multiply.cpp :start-after: _matmult_3lambdakernel_args_seq_start :end-before: _matmult_3lambdakernel_args_seq_end :language: C++ +The result is the same. + By using ``RAJA::statement::Lambda`` parameters in this way, the code -potentially indicates more clearly which areguments are used. Of course, this +potentially indicates more clearly which arguments are used. Of course, this makes the execution policy more verbose, but that is typically hidden away -in a header file. Statements such as ``RAJA::Segs``, and -``RAJA::Params`` identify the positions of the segments and params -in the tuples to be used as arguments to the lambda expressions. +in a header file, so it need not make the code harder to read. + +.. note::: ``RAJA::Segs`` and ``RAJA::Params`` types can be used in a + ``RAJA::statement::Lambda`` type to identify which segment + indices and params are passed as arguments to a lambda expression. As we noted earlier, the execution policy type passed to the ``RAJA::kernel_param`` method as a template parameter describes how the statements and lambda expressions are assembled to form the complete kernel. To illustrate this, we describe various policies that enable the kernel to run in different ways. In each case, the ``RAJA::kernel_param`` method call, -including its arguments is the same. The curious reader will inspect the -example code in the file listed below to see that this is indeed the case. 
-In the interest of simplicity, the remaining matrix multiplication examples -do not use ``RAJA::statement::Lambda`` parameters to specify arguments to -the lambda expressions. +including its arguments is the same. The curious reader may inspect the +example code in the file noted above to see that this is indeed the case. Next, we show how to collapse nested loops in an OpenMP parallel region using a ``RAJA::statement::Collapse`` type in the execution policy. This allows one to parallelize multiple levels in a loop nest using OpenMP -directives, for instance. The following policy will collapse the two outer +directives. The following policy will collapse the two outer loops into one OpenMP parallel region: .. literalinclude:: ../../../../examples/tut_matrix-multiply.cpp @@ -271,9 +227,9 @@ loops into one OpenMP parallel region: The ``RAJA::ArgList`` type indicates which loops in the nest are to be collapsed and their nesting order within the collapse region. The integers passed to ``ArgList`` are indices of entries in the tuple of iteration spaces -and indicate inner to outer loop levels when read from right to left (i.e., -here '1, 0' indicates the column loop is the inner loop and the row loop is -the outer). For this transformation there are no ``statement::For`` types +and indicate inner to outer loop levels when read from right to left. Here, +'1, 0' indicates that the column loop is the inner loop and the row loop is +the outer loop. For this transformation there are no ``statement::For`` types and policies for the individual loop levels inside the OpenMP collapse region. Lastly, we show how to use ``RAJA::statement::CudaKernel`` and @@ -320,18 +276,14 @@ Note that the tiling mechanism requires a ``RAJA::statement::Tile`` type, with a tile size and a tiling execution policy, plus a ``RAJA::statement::For`` type with an execution execution policy for each tile dimension. -The analogous HIP policy is: +The analogous HIP execution policy is: .. literalinclude:: ../../../../examples/tut_matrix-multiply.cpp :start-after: _matmult_3lambdakernel_hiptiled_start :end-before: _matmult_3lambdakernel_hiptiled_end :language: C++ -In :ref:`tiledmatrixtranspose-label` and :ref:`matrixtransposelocalarray-label`, -we will discuss loop tiling in more detail including how it can be used to -improve performance of certain algorithms. +In :ref:`tut-tiledmatrixtranspose-label` and +:ref:`tut-matrixtransposelocalarray-label`, +we discuss loop tiling in more detail. -The file ``RAJA/examples/tut_matrix-multiply.cpp`` contains the complete -working code for all examples described in this section, plus others that -show a variety of ``RAJA::kernel`` execution policy types. It also contains -a raw CUDA version of the kernel for comparison. diff --git a/docs/sphinx/user_guide/tutorial/matrix_transpose.rst b/docs/sphinx/user_guide/tutorial/matrix_transpose.rst new file mode 100644 index 0000000000..5a36ccbee9 --- /dev/null +++ b/docs/sphinx/user_guide/tutorial/matrix_transpose.rst @@ -0,0 +1,123 @@ +.. ## +.. ## Copyright (c) 2016-22, Lawrence Livermore National Security, LLC +.. ## and RAJA project contributors. See the RAJA/LICENSE file +.. ## for details. +.. ## +.. ## SPDX-License-Identifier: (BSD-3-Clause) +.. ## + +.. 
_tut-matrixtranspose-label: + +---------------------- +Matrix Transpose +---------------------- + +In :ref:`tut-kernelexecpols-label` and :ref:`tut-launchexecpols-label`, +we presented a simple array initialization kernel using ``RAJA::kernel`` and +``RAJA::launch`` interfaces, respectively, and compared the two. This +section describes the implementation of a matrix transpose kernel using both +``RAJA::kernel`` and ``RAJA::launch`` interfaces. The intent is to +compare and contrast the two, as well as introduce additional features of the +interfaces. + +There are exercise files +``RAJA/exercises/kernel-matrix-transpose.cpp`` and +``RAJA/exercises/launch-matrix-transpose.cpp`` for you to work through if you +wish to get some practice with RAJA. The files +``RAJA/exercises/kernel-matrix-transpose_solution.cpp`` and +``RAJA/exercises/launch-matrix-transpose_solution.cpp`` contain +complete working code for the examples. You can use the solution files to +check your work and for guidance if you get stuck. To build +the exercises execute ``make (kernel/launch)-matrix-transpose`` and ``make (kernel/launch)-matrix-transpose_solution`` +from the build directory. + +Key RAJA features shown in this example are: + + * ``RAJA::kernel`` method and kernel execution policies + * ``RAJA::launch`` method and kernel execution interface + +In the example, we compute the transpose of an input matrix +:math:`A` of size :math:`N_r \times N_c` and store the result in a second +matrix :math:`At` of size :math:`N_c \times N_r`. + +First we define our matrix dimensions + +.. literalinclude:: ../../../../exercises/kernel-matrix-transpose_solution.cpp + :start-after: // _mattranspose_dims_start + :end-before: // _mattranspose_dims_end + :language: C++ + +and wrap the data pointers for the matrices in ``RAJA::View`` objects to +simplify the multi-dimensional indexing: + +.. literalinclude:: ../../../../exercises/kernel-matrix-transpose_solution.cpp + :start-after: // _mattranspose_views_start + :end-before: // _mattranspose_views_end + :language: C++ + +Then, a C-style for-loop implementation looks like this: + +.. literalinclude:: ../../../../exercises/kernel-matrix-transpose_solution.cpp + :start-after: // _cstyle_mattranspose_start + :end-before: // _cstyle_mattranspose_end + :language: C++ + +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +``RAJA::kernel`` Implementation +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +For ``RAJA::kernel`` variants, we use ``RAJA::statement::For`` and +``RAJA::statement::Lambda`` statement types in the execution policies. +The complete sequential ``RAJA::kernel`` variant is: + +.. literalinclude:: ../../../../exercises/kernel-matrix-transpose_solution.cpp + :start-after: // _raja_mattranspose_start + :end-before: // _raja_mattranspose_end + :language: C++ + +A CUDA ``RAJA::kernel`` variant for the GPU is similar with different policies +in the ``RAJA::statement::For`` statements: + +.. literalinclude:: ../../../../exercises/kernel-matrix-transpose_solution.cpp + :start-after: // _raja_mattranspose_cuda_start + :end-before: // _raja_mattranspose_cuda_end + :language: C++ + +A notable difference between the CPU and GPU execution policy is the insertion +of the ``RAJA::statement::CudaKernel`` type in the GPU version, which indicates +that the execution will launch a CUDA device kernel. 
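To make the comparison concrete without opening the exercise files, a hedged sketch of the CUDA variant follows. The view and range names (``Aview``, ``Atview``, ``col_range``, ``row_range``) stand in for the objects described above, and the particular thread-mapping policies shown are illustrative; the solution file may use a different mapping.

.. code-block:: cpp

   // Illustrative CUDA RAJA::kernel policy for the matrix transpose.
   // CudaKernel wraps the enclosed loop nest in a CUDA device kernel launch.
   using KERNEL_EXEC_POL_CUDA =
     RAJA::KernelPolicy<
       RAJA::statement::CudaKernel<
         RAJA::statement::For<1, RAJA::cuda_thread_y_loop,    // 'row' loop
           RAJA::statement::For<0, RAJA::cuda_thread_x_loop,  // 'col' loop
             RAJA::statement::Lambda<0>
           >
         >
       >
     >;

   RAJA::kernel<KERNEL_EXEC_POL_CUDA>(
     RAJA::make_tuple(col_range, row_range),
     [=] RAJA_DEVICE (int col, int row) {
       Atview(col, row) = Aview(row, col);
     });
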
+ +In the CUDA ``RAJA::kernel`` variant above, the thread-block size +and number of blocks to launch are determined by the implementation of the +``RAJA::kernel`` execution policy constructs using the sizes of the +``RAJA::TypedRangeSegment`` objects in the iteration space tuple. + +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +``RAJA::launch`` Implementation +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +For ``RAJA::launch`` variants, we use ``RAJA::loop`` methods +to write a loop hierarchy within the kernel execution space. For a sequential +implementation, we pass the ``RAJA::seq_launch_t`` template parameter +to the launch method and pass the ``RAJA::loop_exec`` parameter to the loop +methods. The complete sequential ``RAJA::launch`` variant is: + +.. literalinclude:: ../../../../exercises/launch-matrix-transpose_solution.cpp + :start-after: // _raja_mattranspose_start + :end-before: // _raja_mattranspose_end + :language: C++ + +A CUDA ``RAJA::launch`` variant for the GPU is similar with CUDA +policies in the ``RAJA::loop`` methods. The complete +``RAJA::launch`` variant is: + +.. literalinclude:: ../../../../exercises/launch-matrix-transpose_solution.cpp + :start-after: // _raja_mattranspose_cuda_start + :end-before: // _raja_mattranspose_cuda_end + :language: C++ + +A notable difference between the CPU and GPU ``RAJA::launch`` +implementations is the definition of the compute grid. For the CPU +version, the argument list is empty for the ``RAJA::LaunchParams`` constructor. +For the CUDA GPU implementation, we define a 'Team' of one two-dimensional +thread-block with 16 x 16 = 256 threads. diff --git a/docs/sphinx/user_guide/tutorial/matrix_transpose_local_array.rst b/docs/sphinx/user_guide/tutorial/matrix_transpose_local_array.rst index 7fb10a7299..3d5aa4f316 100644 --- a/docs/sphinx/user_guide/tutorial/matrix_transpose_local_array.rst +++ b/docs/sphinx/user_guide/tutorial/matrix_transpose_local_array.rst @@ -6,60 +6,72 @@ .. ## SPDX-License-Identifier: (BSD-3-Clause) .. ## -.. _matrixtransposelocalarray-label: +.. _tut-matrixtransposelocalarray-label: ---------------------------------- -Matrix Transpose with Local Array ---------------------------------- +----------------------------------------- +Tiled Matrix Transpose with Local Array +----------------------------------------- -This section extends the discussion in :ref:`tiledmatrixtranspose-label`, -where only loop tiling is considered. Here, we combine loop tiling with -``RAJA::LocalArray`` objects which enable us to store data for each tile in +This section extends the discussion in :ref:`tut-tiledmatrixtranspose-label` +by adding *local array* objects which are used to store data for each tile in CPU stack-allocated arrays or GPU thread local and shared memory to be used -within kernels. For more information about ``RAJA::LocalArray``, please -see :ref:`local_array-label`. +within kernels. -Key RAJA features shown in this example include: +There are exercise files +``RAJA/exercises/kernel-matrix-transpose-local-array.cpp`` and +``RAJA/exercises/launch-matrix-transpose-local-array.cpp`` for you to work +through if you wish to get some practice with RAJA. The files +``RAJA/exercises/kernel-matrix-transpose-local-array_solution.cpp`` and +``RAJA/exercises/launch-matrix-transpose-local-array_solution.cpp`` contain +complete working code for the examples. You can use the solution files to +check your work and for guidance if you get stuck.
To build +the exercises execute ``make (kernel/launch)-matrix-transpose-local-array`` and ``make (kernel/launch)-matrix-transpose-local-array_solution`` +from the build directory. - * ``RAJA::kernel_param`` method with multiple lambda expressions - * ``RAJA::statement::Tile`` type - * ``RAJA::statement::ForICount`` type - * ``RAJA::LocalArray`` - * Specifying lambda arguments through statements +Key RAJA features shown in this example are: -As in :ref:`tiledmatrixtranspose-label`, this example computes the transpose -of an input matrix :math:`A` of size :math:`N_r \times N_c` and stores the -result in a second matrix :math:`At` of size :math:`N_c \times N_r`. The -operation uses a local memory tiling algorithm. The algorithm tiles the outer + * ``RAJA::kernel_param`` method and execution policy usage with multiple lambda expressions + * ``RAJA::statement::Tile`` type for loop tiling + * ``RAJA::statement::ForICount`` type for generating local tile indices + * ``RAJA::LocalArray`` type for thread-local tile memory arrays + * ``RAJA::launch`` kernel execution interface + * ``RAJA::expt::tile`` type for loop tiling + * ``RAJA::expt::loop_icount`` method to generate local tile indices for Launch + * ``RAJA_TEAM_SHARED`` macro for thread-local tile memory arrays + +As in :ref:`tut-tiledmatrixtranspose-label`, this example computes the +transpose of an input matrix :math:`A` of size :math:`N_r \times N_c` and +stores the result in a second matrix :math:`At` of size :math:`N_c \times N_r`. +The operation uses a local memory tiling algorithm, which tiles the outer loops and iterates over tiles in inner loops. The algorithm first loads input matrix entries into a local two-dimensional array for a tile, and then reads from the tile swapping the row and column indices to generate the output matrix. -We start with a non-RAJA C++ implementation to show the algorithm pattern. We choose tile dimensions smaller than the dimensions of the matrix and note that it is not necessary for the tile dimensions to divide evenly the number -of rows and columns in the matrix A. As in the :ref:`tiledmatrixtranspose-label` -example, we start by defining the number of rows and columns in the matrices, -the tile dimensions, and the number of tiles. +of rows and columns in the matrix. As in the +:ref:`tut-tiledmatrixtranspose-label` example, we start by defining the number +of rows and columns in the matrices, the tile dimensions, and the number of +tiles. -.. literalinclude:: ../../../../examples/tut_matrix-transpose-local-array.cpp +.. literalinclude:: ../../../../exercises/kernel-matrix-transpose-local-array_solution.cpp :start-after: // _mattranspose_localarray_dims_start :end-before: // _mattranspose_localarray_dims_end :language: C++ We also use RAJA View objects to simplify the multi-dimensional indexing -as in the :ref:`tiledmatrixtranspose-label` example. +as in the :ref:`tut-tiledmatrixtranspose-label` example. -.. literalinclude:: ../../../../examples/tut_matrix-transpose-local-array.cpp +.. literalinclude:: ../../../../exercises/kernel-matrix-transpose-local-array_solution.cpp :start-after: // _mattranspose_localarray_views_start :end-before: // _mattranspose_localarray_views_end :language: C++ -The complete sequential C++ implementation of the tiled transpose operation +The complete sequential C-style implementation of the tiled transpose operation using a stack-allocated local array for the tiles is: -.. literalinclude:: ../../../../examples/tut_matrix-transpose-local-array.cpp +.. 
literalinclude:: ../../../../exercises/kernel-matrix-transpose-local-array_solution.cpp :start-after: // _mattranspose_localarray_cstyle_start :end-before: // _mattranspose_localarray_cstyle_end :language: C++ @@ -72,15 +84,16 @@ using a stack-allocated local array for the tiles is: stride-1 data access. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RAJA::kernel Version of Tiled Loops with Local Array +``RAJA::kernel`` Variants ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RAJA provides mechanisms to tile loops and use *local arrays* -in kernels so that algorithm patterns like we just described can be -implemented with RAJA. A ``RAJA::LocalArray`` type specifies an object whose -memory is created inside a kernel using a ``RAJA::statement`` type in a RAJA -kernel execution policy. The local array data is only usable within the kernel. -See :ref:`local_array-label` for more information. +The ``RAJA::kernel`` interface provides mechanisms to tile loops and use +*local arrays* in kernels so that algorithm patterns like the C-style kernel +above can be implemented with RAJA. When using ``RAJA::kernel``, a +``RAJA::LocalArray`` type specifies an object whose memory is created inside +a kernel using a statement type in a RAJA kernel execution policy. The local +array data is only usable within the kernel. See :ref:`feat-local_array-label` +for more information. ``RAJA::kernel`` methods also support loop tiling statements which determine the number of tiles needed to perform an operation based on tile size and @@ -94,91 +107,98 @@ For the RAJA version of the matrix transpose kernel above, we define the type of the ``RAJA::LocalArray`` used for matrix entries in a tile and create an object to represent it: -.. literalinclude:: ../../../../examples/tut_matrix-transpose-local-array.cpp +.. literalinclude:: ../../../../exercises/kernel-matrix-transpose-local-array_solution.cpp :start-after: // _mattranspose_localarray_start :end-before: // _mattranspose_localarray_end :language: C++ -The template parameters that define the type are: array data type, data stride -permutation for the array indices (here the identity permutation is given, so -the default RAJA conventions apply; i.e., the rightmost array index will be -stride-1), and the array dimensions. Next, we compare two RAJA implementations -of matrix transpose with RAJA. +The template parameters that define the type are: the array data type, the +data stride permutation for the array indices (here the identity permutation +is given, so the default RAJA conventions apply; i.e., the rightmost array +index will be stride-1), and the array dimensions. Next, we compare two +``RAJA::kernel`` implementations of the matrix transpose operation. The complete RAJA sequential CPU variant with kernel execution policy and kernel is: -.. literalinclude:: ../../../../examples/tut_matrix-transpose-local-array.cpp +.. literalinclude:: ../../../../exercises/kernel-matrix-transpose-local-array_solution.cpp :start-after: // _mattranspose_localarray_raja_start :end-before: // _mattranspose_localarray_raja_end :language: C++ -The ``RAJA::statement::Tile`` types in the execution policy define +In the execution policy, the ``RAJA::statement::Tile`` types define tiling of the outer 'row' (iteration space tuple index '1') and 'col' -(iteration space tuple index '0') loops, including tile sizes +(iteration space tuple index '0') loops, as well as tile sizes (``RAJA::tile_fixed`` types) and loop execution policies. 
Next, -the ``RAJA::statement::InitLocalMem`` type initializes the local stack array +the ``RAJA::statement::InitLocalMem`` type allocates the local tile array based on the memory policy type (here, we use ``RAJA::cpu_tile_mem`` for a CPU stack-allocated array). The ``RAJA::ParamList<2>`` parameter indicates that the local array object is associated with position '2' in the parameter tuple argument passed to the ``RAJA::kernel_param`` method. The first two entries in the parameter tuple indicate storage for the local tile indices -which can be used in multiple lambdas in the kernel. Finally, we have two sets -of nested inner loops for reading the input matrix entries into the local -array and writing them out to the output matrix transpose. The inner bodies of -each of these loop nests are identified by lambda expression arguments -'0' and '1', respectively. - -Note that the loops over tiles use ``RAJA::statement::ForICount`` types -rather than ``RAJA::statement::For`` types that we have seen in other -nested loop examples. The ``RAJA::statement::ForICount`` type generates -local tile indices that are passed to lambda loop body expressions. As -the reader will observe, there is no local tile index computation -needed in the lambdas for the RAJA version of the kernel as a result. The -first integer template parameter for each ``RAJA::statement::ForICount`` type -indicates the item in the iteration space tuple passed to the -``RAJA::kernel_param`` method to which it applies; this is similar to -``RAJA::statement::For`` usage. The second template parameter for each +that are used in the two lambda expressions that comprise the kernel body. +Finally, we have two sets of nested inner loops for reading the input matrix +entries into the local tile array and writing them out to the output matrix +transpose. The inner bodies of each of these loop nests are identified by +lambda expression invocation statements ``RAJA::statement::Lambda<0>`` for +the first lambda passed as an argument to the ``RAJA::kernel_param`` method +and ``RAJA::statement::Lambda<1>`` for the second lambda argument. + +Note that the loops within tiles use ``RAJA::statement::ForICount`` types +rather than ``RAJA::statement::For`` types that we saw in the +tiled matrix transpose example in :ref:`tut-tiledmatrixtranspose-label`. +The ``RAJA::statement::ForICount`` type generates local tile indices that +are passed to lambda loop body expressions to index into the local tile +memory array. As the reader will observe, there is no local tile index +computation needed in the lambdas for the RAJA version of the kernel as a +result. The first integer template parameter for each +``RAJA::statement::ForICount`` type indicates the item in the iteration space +tuple passed to the ``RAJA::kernel_param`` method to which it applies. +The second template parameter for each ``RAJA::statement::ForICount`` type indicates the position in the parameter tuple passed to the ``RAJA::kernel_param`` method that will hold the -associated local tile index. The loop execution policy template -argument that follows works the same as in ``RAJA::statement::For`` usage. -For more detailed discussion of RAJA loop tiling statement types, please see -:ref:`tiling-label`. +associated local tile index. For more detailed discussion of RAJA loop tiling +statement types, please see :ref:`feat-tiling-label`. 
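Since the policy just described has several moving parts, here is a hedged skeleton of what it looks like assembled, together with the corresponding ``RAJA::kernel_param`` call. The names (``TILE_DIM``, ``TILE_MEM``, ``Tile_Array``, ``Aview``, ``Atview``, ``col_range``, ``row_range``) stand in for the types and objects defined earlier in the example; the exercise solution file is the authoritative version.

.. code-block:: cpp

   // Skeleton of a sequential tiled, local-array kernel policy.
   using SEQ_EXEC_POL =
     RAJA::KernelPolicy<
       RAJA::statement::Tile<1, RAJA::tile_fixed<TILE_DIM>, RAJA::seq_exec,    // 'row' tiles
         RAJA::statement::Tile<0, RAJA::tile_fixed<TILE_DIM>, RAJA::seq_exec,  // 'col' tiles
           RAJA::statement::InitLocalMem<RAJA::cpu_tile_mem, RAJA::ParamList<2>,

             // Loop nest that reads an input tile into the local array (lambda 0).
             RAJA::statement::ForICount<1, RAJA::statement::Param<1>, RAJA::seq_exec,
               RAJA::statement::ForICount<0, RAJA::statement::Param<0>, RAJA::seq_exec,
                 RAJA::statement::Lambda<0>
               >
             >,

             // Loop nest that writes the local array to the output matrix (lambda 1).
             RAJA::statement::ForICount<0, RAJA::statement::Param<0>, RAJA::seq_exec,
               RAJA::statement::ForICount<1, RAJA::statement::Param<1>, RAJA::seq_exec,
                 RAJA::statement::Lambda<1>
               >
             >

           >
         >
       >
     >;

   RAJA::kernel_param<SEQ_EXEC_POL>(
     RAJA::make_tuple(col_range, row_range),
     RAJA::make_tuple((int)0, (int)0, Tile_Array),

     // lambda 0: global (col, row) indices, local (tx, ty) tile indices.
     [=](int col, int row, int tx, int ty, TILE_MEM& Tile_Array) {
       Tile_Array(ty, tx) = Aview(row, col);
     },

     // lambda 1: write the transposed entries out of the local array.
     [=](int col, int row, int tx, int ty, TILE_MEM& Tile_Array) {
       Atview(col, row) = Tile_Array(ty, tx);
     });
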
Now that we have described the execution policy in some detail, let's pull everything together by briefly walking though the call to the -``RAJA::kernel_param`` method. The first argument is a tuple of iteration -spaces that define the iteration ranges for the level in the loop nest. -Again, the first integer parameters given to the ``RAJA::statement::Tile`` and -``RAJA::statement::ForICount`` types identify the tuple entry they apply to. -The second argument is a tuple of data parameters that will hold the local -tile indices and ``RAJA::LocalArray`` tile memory. The tuple entries are +``RAJA::kernel_param`` method, which is similar to ``RAJA::kernel`` but takes +additional arguments needed to execute the operations involving local +tile indices and the local memory array. The first argument is a tuple of +iteration spaces that define the iteration ranges for the levels in the loop +nest. Again, the first integer parameters given to the ``RAJA::statement::Tile`` +and ``RAJA::statement::ForICount`` types identify the tuple entry to which +they apply. The second argument:: + + RAJA::make_tuple((int)0, (int)0, Tile_Array) + +is a tuple of data parameters that will hold the local tile indices and +``RAJA::LocalArray`` tile memory. The tuple entries are associated with various statements in the execution policy as we described earlier. Next, two lambda expression arguments are passed to the ``RAJA::kernel_param`` method for reading and writing the input and output matrix entries, respectively. -Note that each lambda expression takes five arguments. The first two are -the matrix column and row indices associated with the iteration space tuple. -The next three arguments correspond to the parameter tuple entries. The first -two of these are the local tile indices used to access entries in the +.. note:: ``RAJA::kernel_param`` accepts a parameter tuple argument after + the iteration space tuple, which enables the parameters to be + used in multiple lambda expressions in a kernel. + +In the kernel, both lambda expressions take the same five arguments. The first +two are the matrix global column and row indices associated with the iteration +space tuple. The next three arguments correspond to the parameter tuple entries. +The first two of these are the local tile indices used to access entries in the ``RAJA::LocalArray`` object memory. The last argument is a reference to the ``RAJA::LocalArray`` object itself. -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -RAJA::kernel Version of Tiled Loops with Local Array Specifying Lambda Arguments -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +The next ``RAJA::kernel_param`` variant we present works the same as the one +above. It is different from the previous version since we include +additional template parameters in the ``RAJA::statement::Lambda`` types to +indicate which arguments each lambda expression takes and in which order. +Here is the complete version including execution policy and kernel: -The second RAJA variant works the same as the one above. The main differences -between the two variants is due to the fact that in this second one, we use -``RAJA::statement::Lambda`` types to indicate which arguments each lambda -takes and in which order. Here is the complete version including -execution policy and kernel: - -.. 
literalinclude:: ../../../../examples/tut_matrix-transpose-local-array.cpp - :start-after: // _mattranspose_localarray_raja_lambdaargs_start - :end-before: // _mattranspose_localarray_raja_lambdaargs_end +.. literalinclude:: ../../../../exercises/kernel-matrix-transpose-local-array_solution.cpp + :start-after: // _raja_mattranspose_lambdaargs_start + :end-before: // _raja_mattranspose_lambdaargs_end :language: C++ Here, the two ``RAJA::statement::Lambda`` types in the execution policy show @@ -193,11 +213,55 @@ As a consequence of specifying lambda arguments, there are two main differences. The local tile indices are properly computed and passed to the lambda expressions as a result of the ``RAJA::Offsets`` types that appear in the lambda statement types. The ``RAJA::statement::Lambda`` type for each -lambda shows the two ways to specify the local tile index args; we can use an -``Offsets`` statement for each argument, or include multiple segment ids in one -statement. Lastly, there is only one entry in the parameter -tuple in this case, the local tile array. The placeholders are not needed. - -The file ``RAJA/examples/tut_matrix-transpose-local-array.cpp`` contains the -complete working example code for the examples described in this section along -with OpenMP, CUDA, and HIP variants. +lambda shows the two ways to specify the local tile index arguments; we can +use an ``Offsets`` statement for each argument, or include multiple segment +ids in one statement. Lastly, there is only one entry in the parameter +tuple in this case, the local tile array. The placeholders in the +previous example are not needed. + +.. note:: In this example, we need all five arguments in each lambda + expression so the lambda expression argument lists are + the same. Another use case for the template parameter argument + specification described here is to be able to pass only the + arguments used in a lambda expression. In particular, when we use + multiple lambda expressions to represent a kernel, each lambda + can have a different argument list from the others. + +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +``RAJA::expt::launch`` Variants +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The ``RAJA::expt::launch`` interface provides mechanisms to tile loops and use +*local arrays* in kernels to support algorithm patterns like the C-style kernel +above. When using ``RAJA::expt::launch``, the ``RAJA_TEAM_SHARED`` macro is +used to create a GPU shared memory array or a CPU stack memory array inside +a kernel. + +``RAJA::expt::launch`` supports methods for tiling over an iteration space +using ``RAJA::expt::tile`` and ``RAJA::expt::loop_icount`` methods to tile +loops and generate global iteration indices and local tile offsets. +Moreover, lambda expressions for these methods will not be invoked for +iterations outside the bounds of an iteration space when tile dimensions +do not divide evenly the size of the iteration space; thus, no conditional +checks on loop bounds are needed inside inner loops. + +A complete RAJA sequential CPU variant with kernel execution policy and +kernel is: + +.. literalinclude:: ../../../../exercises/launch-matrix-transpose-local-array_solution.cpp + :start-after: // _mattranspose_localarray_raja_start + :end-before: // _mattranspose_localarray_raja_end + :language: C++ + +Here, the ``RAJA::expt::tile`` method is used to create tilings of the outer +'row' and 'col' iteration spaces.
The ``RAJA::expt::tile`` method +takes an additional argument specifying the tile size for the corresponding +loop. To traverse the tile, we use the ``RAJA::expt::loop_icount`` method, +which is similar to the ``RAJA::ForICount`` statement used in a +``RAJA::kernel`` execution policy as shown above. A +``RAJA::expt::loop_icount`` method call +will generate the local tile index associated with the outer global index. +The local tile index is necessary because we use it to read entries from +global memory into the ``RAJA_TEAM_SHARED`` memory array and to write them +back out. + + diff --git a/docs/sphinx/user_guide/tutorial/matrix_transpose_tiled.rst b/docs/sphinx/user_guide/tutorial/matrix_transpose_tiled.rst new file mode 100644 index 0000000000..46d92b4c66 --- /dev/null +++ b/docs/sphinx/user_guide/tutorial/matrix_transpose_tiled.rst @@ -0,0 +1,152 @@ +.. ## +.. ## Copyright (c) 2016-22, Lawrence Livermore National Security, LLC +.. ## and RAJA project contributors. See the RAJA/LICENSE file +.. ## for details. +.. ## +.. ## SPDX-License-Identifier: (BSD-3-Clause) +.. ## + +.. _tut-tiledmatrixtranspose-label: + +---------------------- +Tiled Matrix Transpose +---------------------- + +This section describes the implementation of a tiled matrix transpose kernel +using both ``RAJA::kernel`` and ``RAJA::launch`` interfaces. The intent +is to compare and contrast the two. The discussion builds on +:ref:`tut-matrixtranspose-label` by adding tiling to the matrix transpose +implementation. + +There are exercise files +``RAJA/exercises/kernel-matrix-transpose-tiled.cpp`` and +``RAJA/exercises/launch-matrix-transpose-tiled.cpp`` for you to work through +if you wish to get some practice with RAJA. The files +``RAJA/exercises/kernel-matrix-transpose-tiled_solution.cpp`` and +``RAJA/exercises/launch-matrix-transpose-tiled_solution.cpp`` contain +complete working code for the examples. You can use the solution files to +check your work and for guidance if you get stuck. To build +the exercises execute ``make (kernel/launch)-matrix-transpose-tiled`` and +``make (kernel/launch)-matrix-transpose-tiled_solution`` +from the build directory. + +Key RAJA features shown in this example are: + + * ``RAJA::kernel`` method and execution policies, and the ``RAJA::statement::Tile`` type + * ``RAJA::launch`` method and execution policies, and the ``RAJA::tile`` type + +As in :ref:`tut-matrixtranspose-label`, we compute the transpose of an input +matrix :math:`A` of size :math:`N_r \times N_c` and store the result in a +second matrix :math:`At` of size :math:`N_c \times N_r`. + +We will compute the matrix transpose using a tiling algorithm, which iterates +over tiles and transposes the matrix entries in each tile. +The algorithm involves outer and inner loops to iterate over the tiles and +matrix entries within each tile, respectively. + +As in :ref:`tut-matrixtranspose-label`, we start by defining the matrix +dimensions. Additionally, we define a tile size smaller than the matrix +dimensions and determine the number of tiles in each dimension. Note that we +do not assume that tiles divide evenly the number of rows and columns of +the matrix. However, we do assume square tiles. + +.. literalinclude:: ../../../../exercises/kernel-matrix-transpose-tiled_solution.cpp + :start-after: // _tiled_mattranspose_dims_start + :end-before: // _tiled_mattranspose_dims_end + :language: C++ + +Then, we wrap the matrix data pointers in ``RAJA::View`` objects to +simplify the multi-dimensional indexing: + +.. 
literalinclude:: ../../../../exercises/kernel-matrix-transpose-tiled_solution.cpp + :start-after: // _tiled_mattranspose_views_start + :end-before: // _tiled_mattranspose_views_end + :language: C++ + +The C-style for-loop implementation looks like this: + +.. literalinclude:: ../../../../exercises/kernel-matrix-transpose-tiled_solution.cpp + :start-after: // _cstyle_tiled_mattranspose_start + :end-before: // _cstyle_tiled_mattranspose_end + :language: C++ + +.. note:: To prevent indexing out of bounds, when the tile dimensions do not + divide evenly the matrix dimensions, the algorithm requires a + bounds check in the inner loops. + +^^^^^^^^^^^^^^^^^^^^^^^^^^^ +``RAJA::kernel`` Variants +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +For ``RAJA::kernel`` variants, we use ``RAJA::statement::Tile`` types +for the outer loop tiling and ``RAJA::tile_fixed`` types to +indicate the tile dimensions. The complete sequential RAJA variant is: + +.. literalinclude:: ../../../../exercises/kernel-matrix-transpose-tiled_solution.cpp + :start-after: // _raja_tiled_mattranspose_start + :end-before: // _raja_tiled_mattranspose_end + :language: C++ + +The ``RAJA::statement::Tile`` types compute the number of tiles needed to +iterate over all matrix entries in each dimension and generate iteration +index bounds for each tile, which are used to generate loops for the inner +``RAJA::statement::For`` types. Thus, the explicit bounds checking logic in the +C-style variant is not needed. Note that the integer template parameters +in the ``RAJA::statement::For`` types refer to the entries in the iteration +space tuple passed to the ``RAJA::kernel`` method. + +The ``RAJA::kernel`` CUDA variant is similar with sequential policies replaced +with CUDA execution policies: + +.. literalinclude:: ../../../../exercises/kernel-matrix-transpose-tiled_solution.cpp + :start-after: // _raja_mattranspose_cuda_start + :end-before: // _raja_mattranspose_cuda_end + :language: C++ + +A notable difference between the CPU and GPU execution policy is the insertion +of the ``RAJA::statement::CudaKernel`` type in the GPU version, which indicates +that the execution will launch a CUDA device kernel. + +The CUDA thread-block dimensions are set based on the tile dimensions and the +iterates withing each tile are mapped directly to GPU threads in each block +due to the ``RAJA::cuda_thread_{x, y}_direct`` policies. + +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +``RAJA::launch`` Variants +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +For ``RAJA::launch`` variants, we use ``RAJA::tile`` methods +for the outer loop tiling and ``RAJA::loop`` methods +to iterate within the tiles. The complete sequential tiled +``RAJA::launch`` variant is: + +.. literalinclude:: ../../../../exercises/launch-matrix-transpose-tiled_solution.cpp + :start-after: // _raja_tiled_mattranspose_start + :end-before: // _raja_tiled_mattranspose_end + :language: C++ + +Similar to the ``RAJA::statement::Tile`` type in the ``RAJA::kernel`` variant +above, the ``RAJA::tile`` method computes the number of tiles needed to +iterate over all matrix entries in each dimension and generates a corresponding +iteration space for each tile, which is used to generate loops for the inner +``RAJA::loop`` methods. Thus, the explicit bounds checking logic in the +C-style variant is not needed. + +A CUDA ``RAJA::launch`` tiled variant for the GPU is similar with +CUDA policies in the ``RAJA::loop`` methods. The complete +``RAJA::launch`` variant is: + +.. 
literalinclude:: ../../../../exercises/launch-matrix-transpose-tiled_solution.cpp + :start-after: // _raja_mattranspose_cuda_start + :end-before: // _raja_mattranspose_cuda_end + :language: C++ + +A notable difference between the CPU and GPU ``RAJA::launch`` +implementations is the definition of the compute grid. For the CPU +version, the argument list is empty for the ``RAJA::LaunchParams`` constructor. +For the CUDA GPU implementation, we define a 'Team' of one two-dimensional +thread-block with 16 x 16 = 256 threads. + + + + diff --git a/docs/sphinx/user_guide/tutorial/naming_kernels.rst b/docs/sphinx/user_guide/tutorial/naming_kernels.rst deleted file mode 100644 index 4e30eb1dcf..0000000000 --- a/docs/sphinx/user_guide/tutorial/naming_kernels.rst +++ /dev/null @@ -1,59 +0,0 @@ -.. ## -.. ## Copyright (c) 2016-20, Lawrence Livermore National Security, LLC -.. ## and RAJA project contributors. See the RAJA/LICENSE file -.. ## for details. -.. ## -.. ## SPDX-License-Identifier: (BSD-3-Clause) -.. ## - -.. _teamsbasic-label: - ------------------------------------- -Naming kernels with NVTX/ROCTX tools ------------------------------------- - -Key RAJA feature shown in the following example: - - * Naming kernels using the ``Grid`` object in ``RAJA::ext::Launch`` methods. - -In this example we illustrate kernel naming capabilities within the RAJA Teams -framework for use with NVTX or ROCTX region naming capabilities. - -Recalling the ``RAJA::expt::launch`` API, naming a kernel is done using the third -argument of the ``Resources`` constructor as illustrated below:: - RAJA::expt::launch(RAJA::expt::ExecPlace , - RAJA::expt::Grid(RAJA::expt::Teams(Nteams,Nteams), - RAJA::expt::Threads(Nthreads,Nthreads) - "myKernel"), - [=] RAJA_HOST_DEVICE (RAJA::expt::LaunchContext ctx) { - - /* Express code here */ - - }); - -The kernel name is used to create NVTX (NVIDIA) or ROCTX (AMD) ranges enabling -developers to identify kernels using NVIDIA `Nsight `_ -and NVIDIA `Nvprof `_ profiling -tools or `ROCm `_ -profiling tools when using ROCTX. As an illustration, using Nvprof -kernels are identified as ranges of GPU activity through the user specified name:: - - ==73220== NVTX result: - ==73220== Thread "" (id = 290832) - ==73220== Domain "" - ==73220== Range "myKernel" - Type Time(%) Time Calls Avg Min Max Name - Range: 100.00% 32.868us 1 32.868us 32.868us 32.868us myKernel - GPU activities: 100.00% 2.0307ms 1 2.0307ms 2.0307ms 2.0307ms _ZN4RAJA4expt17launch_global_fcnIZ4mainEUlNS0_13LaunchContextEE_EEvS2_T_ - API calls: 100.00% 27.030us 1 27.030us 27.030us 27.030us cudaLaunchKernel - -In a similar fashion ROCm tools can be used to generate traces of the profile and -the resulting json file can be viewed using tools such as `perfetto -`_. - -As future work we plan to add support to other profiling tools; API changes may occur -based on user feedback and integration with other tools. Enabling NVTX profiling -with RAJA Teams requires RAJA to be configured with RAJA_ENABLE_NV_TOOLS_EXT=ON. -or RAJA_ENABLE_ROCTX=ON for ROCTX profiling on AMD platforms platforms. - -The file RAJA/examples/teams_reductions.cpp contains a complete working example code. diff --git a/docs/sphinx/user_guide/tutorial/nested_loop_reorder.rst b/docs/sphinx/user_guide/tutorial/nested_loop_reorder.rst deleted file mode 100644 index 0542d93216..0000000000 --- a/docs/sphinx/user_guide/tutorial/nested_loop_reorder.rst +++ /dev/null @@ -1,172 +0,0 @@ -.. ## -.. ## Copyright (c) 2016-22, Lawrence Livermore National Security, LLC -.. 
## and RAJA project contributors. See the RAJA/LICENSE file -.. ## for details. -.. ## -.. ## SPDX-License-Identifier: (BSD-3-Clause) -.. ## - -.. _nestedreorder-label: - ---------------------------------- -Nested Loop Interchange ---------------------------------- - -Key RAJA features shown in this example: - - * ``RAJA::kernel`` loop iteration templates - * RAJA nested loop execution policies - * Nested loop reordering (i.e., loop interchange) - * RAJA strongly-types indices - -In :ref:`loop_elements-kernel-label`, we introduced the basic mechanics in -RAJA for representing nested loops. In :ref:`matrixmultiply-label`, we -presented a complete example using RAJA nested loop features. The following -example shows the nested loop interchange process in more detail. -Specifically, we describe how to reorder nested policy arguments and introduce -strongly-typed index variables that can help users write correct nested loop -code with RAJA. The example does not perform any actual computation; each -kernel simply prints out the loop indices in the order that the iteration -spaces are traversed. Thus, only sequential execution policies are used. -However, the mechanics work the same way for other RAJA execution policies. - -Before we dive into the example, we note important features applied here that -represent the main differences between nested-loop RAJA and the -``RAJA::forall`` loop construct for simple (i.e., non-nested) loops: - - * An index space (e.g., range segment) and lambda index argument are - required for each level in a loop nest. This example contains - triply-nested loops, so there will be three ranges and three index - arguments. - - * The index spaces for the nested loop levels are specified in a RAJA tuple - object. The order of spaces in the tuple must match the order of index - arguments to the lambda for this to be correct, in general. RAJA provides - strongly-typed indices to help with this, which we show here. - - * An execution policy is required for each level in a loop nest. These - are specified as nested statements in the ``RAJA::KernelPolicy`` type. - - * The loop nest ordering is specified in the nested kernel policy -- - the first ``statement::For`` type identifies the outermost loop, the - second ``statement::For`` type identifies the loop nested inside the - outermost loop, and so on. - -We begin by defining three named **strongly-typed** variables for the loop -index variables. - -.. literalinclude:: ../../../../examples/tut_nested-loop-reorder.cpp - :start-after: _nestedreorder_idxtypes_start - :end-before: _nestedreorder_idxtypes_end - :language: C++ - -We also define three **typed** range segments which bind the ranges to the -index variable types via template specialization: - -.. literalinclude:: ../../../../examples/tut_nested-loop-reorder.cpp - :start-after: _nestedreorder_ranges_start - :end-before: _nestedreorder_ranges_end - :language: C++ - -When these features are used as in this example, the compiler will -generate error messages if the lambda expression index argument ordering -and types do not match the index ordering in the tuple. - -We present a complete example, and then describe its key elements: - -.. literalinclude:: ../../../../examples/tut_nested-loop-reorder.cpp - :start-after: _nestedreorder_kji_start - :end-before: _nestedreorder_kji_end - :language: C++ - -Here, the ``RAJA::kernel`` execution template takes two arguments: a tuple of -ranges, one for each of the three levels in the loop nest, and the lambda -expression loop body. 
Note that the lambda has an index argument for each -range and that their order and types match. - -The execution policy for the loop nest is specified in the -``RAJA::KernelPolicy`` type. Each level in the loop nest is identified by a -``statement::For`` type, which identifies the iteration space and -execution policy for the level. Here, each level uses a -sequential execution policy. This is for -illustration purposes; if you run the example code, you will see the loop -index triple printed in the exact order in which the kernel executes. -The integer that appears as the first template argument to each -``statement::For`` type corresponds to the index of a range in the tuple -and also to the associated lambda index argument; i.e., '0' is for 'i', -'1' is for 'j', and '2' is for 'k'. - -Here, the 'k' index corresponds to the outermost loop (slowest index), -the 'j' index corresponds to the middle loop, and the 'i' index is for the -innermost loop (fastest index). In other words, if written using C-style -for-loops, the loop would appear as:: - - for (int k = 2; k< 4; ++k) { - for (int j = 1; j < 3; ++j) { - for (int i = 0; i < 2; ++i) { - // print loop index triple... - } - } - } - -The integer argument to each ``statement::For`` type is needed so -that the levels in the loop nest can be reordered by changing the policy -while the kernel remains the same. Next, we permute the loop nest ordering -so that the 'j' loop is the outermost, the 'i' loop is in the middle, and -the 'k' loop is the innermost with the following policy: - -.. literalinclude:: ../../../../examples/tut_nested-loop-reorder.cpp - :start-after: _nestedreorder_jik_start - :end-before: _nestedreorder_jik_end - :language: C++ - -Note that we have simply reordered the nesting of the ``RAJA::statement::For`` -types. This is analogous to reordering 'for' statements in traditional C-style -nested loops. Here, the analogous C-style loop nest would appear as:: - - for (int j = 1; j < 3; ++j) { - for (int i = 0; i < 2; ++i) { - for (int k = 2; k< 4; ++k) { - // print loop index triple... - } - } - } - -Finally, for completeness, we permute the loops again so that the 'i' loop -is the outermost, the 'k' loop is in the middle, and the 'j' loop is the -innermost with the following policy: - -.. literalinclude:: ../../../../examples/tut_nested-loop-reorder.cpp - :start-after: _nestedreorder_ikj_start - :end-before: _nestedreorder_ikj_end - :language: C++ - -The analogous C-style loop nest would appear as:: - - for (int i = 0; j < 2; ++i) { - for (int k = 2; k< 4; ++k) { - for (int j = 1; j < 3; ++j) { - // print loop index triple... - } - } - } - -Hopefully, it should be clear how this works at this point. If not, -the typed indices and typed range segments can help by enabling the -compiler to let you know when something is not correct. - -For example, this version of the loop will generate a compilation error -(note that the kernel execution policy is the same as in the previous example): - -.. literalinclude:: ../../../../examples/tut_nested-loop-reorder.cpp - :start-after: _nestedreorder_typemismatch_start - :end-before: _nestedreorder_typemismatch_end - :language: C++ - -If you carefully compare the range ordering in the tuple to the -lambda argument types, you will see what's wrong. - -Do you see the problem? - -The file ``RAJA/examples/tut_nested-loop-reorder.cpp`` contains the complete -working example code. 
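The interchange mechanism described above comes down to swapping the nesting order of the ``RAJA::statement::For`` types while the kernel call itself is untouched. A minimal sketch follows, using plain ``RAJA::TypedRangeSegment<int>`` iteration spaces named ``IRange``, ``JRange``, and ``KRange`` for brevity instead of the strongly-typed indices discussed above.

.. code-block:: cpp

   // 'k' outermost (tuple entry 2), 'j' middle (entry 1), 'i' innermost (entry 0).
   using KJI_POL =
     RAJA::KernelPolicy<
       RAJA::statement::For<2, RAJA::seq_exec,
         RAJA::statement::For<1, RAJA::seq_exec,
           RAJA::statement::For<0, RAJA::seq_exec,
             RAJA::statement::Lambda<0>
           >
         >
       >
     >;

   // Same statements, permuted: 'j' outermost, 'i' middle, 'k' innermost.
   using JIK_POL =
     RAJA::KernelPolicy<
       RAJA::statement::For<1, RAJA::seq_exec,
         RAJA::statement::For<0, RAJA::seq_exec,
           RAJA::statement::For<2, RAJA::seq_exec,
             RAJA::statement::Lambda<0>
           >
         >
       >
     >;

   // The kernel call is identical for both policies; only the policy changes.
   RAJA::kernel<KJI_POL>(RAJA::make_tuple(IRange, JRange, KRange),
     [=](int i, int j, int k) {
       printf(" (%d, %d, %d)\n", i, j, k);  // prints indices in traversal order
     });
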
diff --git a/docs/sphinx/user_guide/tutorial/offset-layout-5pt-stencil.rst b/docs/sphinx/user_guide/tutorial/offset-layout-5pt-stencil.rst new file mode 100644 index 0000000000..8b75eb37cb --- /dev/null +++ b/docs/sphinx/user_guide/tutorial/offset-layout-5pt-stencil.rst @@ -0,0 +1,203 @@ +.. ## +.. ## Copyright (c) 2016-22, Lawrence Livermore National Security, LLC +.. ## and RAJA project contributors. See the RAJA/LICENSE file +.. ## for details. +.. ## +.. ## SPDX-License-Identifier: (BSD-3-Clause) +.. ## + +.. _tut-offsetlayout-label: + +------------------------------------------------ +OffsetLayout: Five-point Stencil +------------------------------------------------ + +This section contains an exercise file ``RAJA/exercises/offset-layout-stencil.cpp`` +for you to work through if you wish to get some practice with RAJA. The +file ``RAJA/exercises/offset-layout-stencil.cpp`` contains +complete working code for the examples discussed in this section. You can use +the solution file to check your work and for guidance if you get stuck. To build +the exercises execute ``make offset-layout-stencil`` and ``make offset-layout-stencil_solution`` +from the build directory. + +Key RAJA features shown in the following example: + + * ``RAJA::kernel`` loop execution template and execution policies + * ``RAJA::View`` multi-dimensional data access + * ``RAJA:make_offset_layout`` method to create an offset Layout + +The examples in this section apply a five-point stencil to the interior cells +of a two-dimensional lattice and store a resulting sum in a second +lattice of equal size. The five-point stencil associated with a lattice cell +accumulates the value in the cell and each of its four neighbors. We use +``RAJA::View`` and ``RAJA::OffsetLayout`` constructs to simplify +the multi-dimensional indexing so that we can write the stencil operation +naturally, as such:: + + output(row, col) = input(row, col) + + input(row - 1, col) + input(row + 1, col) + + input(row, col - 1) + input(row, col + 1) + +A lattice is assumed to have :math:`N_r \times N_c` interior cells with unit +values surrounded by a halo of cells containing zero values for a total +dimension of :math:`(N_r + 2) \times (N_c + 2)`. For example, when +:math:`N_r = N_c = 3`, the input lattice and values are: + + +---+---+---+---+---+ + | 0 | 0 | 0 | 0 | 0 | + +---+---+---+---+---+ + | 0 | 1 | 1 | 1 | 0 | + +---+---+---+---+---+ + | 0 | 1 | 1 | 1 | 0 | + +---+---+---+---+---+ + | 0 | 1 | 1 | 1 | 0 | + +---+---+---+---+---+ + | 0 | 0 | 0 | 0 | 0 | + +---+---+---+---+---+ + +After applying the stencil, the output lattice and values are: + + +---+---+---+---+---+ + | 0 | 0 | 0 | 0 | 0 | + +---+---+---+---+---+ + | 0 | 3 | 4 | 3 | 0 | + +---+---+---+---+---+ + | 0 | 4 | 5 | 4 | 0 | + +---+---+---+---+---+ + | 0 | 3 | 4 | 3 | 0 | + +---+---+---+---+---+ + | 0 | 0 | 0 | 0 | 0 | + +---+---+---+---+---+ + +For this :math:`(N_r + 2) \times (N_c + 2)` lattice case, here is our +(row, col) indexing scheme. 
+ + +----------+---------+---------+---------+---------+ + | (-1, 3) | (0, 3) | (1, 3) | (2, 3) | (3, 3) | + +----------+---------+---------+---------+---------+ + | (-1, 2) | (0, 2) | (1, 2) | (2, 2) | (3, 2) | + +----------+---------+---------+---------+---------+ + | (-1, 1) | (0, 1) | (1, 1) | (2, 1) | (3, 1) | + +----------+---------+---------+---------+---------+ + | (-1, 0) | (0, 0) | (1, 0) | (2, 0) | (3, 0) | + +----------+---------+---------+---------+---------+ + | (-1, -1) | (0, -1) | (1, -1) | (2, -1) | (3, -1) | + +----------+---------+---------+---------+---------+ + +Notably, :math:`[0, N_r) \times [0, N_c)` corresponds to the interior index +range over which we apply the stencil, and :math:`[-1,N_r+1) \times [-1, N_c+1)` +is the full lattice index range. + +For reference and comparison to the ``RAJA::kernel`` implementations +described below, we begin by walking through a C-style version of the stencil +computation. First, we define the size of our lattice: + +.. literalinclude:: ../../../../exercises/offset-layout-stencil_solution.cpp + :start-after: _stencil_define_start + :end-before: _stencil_define_end + :language: C++ + +Then, after allocating input and output arrays, we initialize the input: + +.. literalinclude:: ../../../../exercises/offset-layout-stencil_solution.cpp + :start-after: _stencil_input_init_start + :end-before: _stencil_input_init_end + :language: C++ + +and compute the reference output solution: + +.. literalinclude:: ../../../../exercises/offset-layout-stencil_solution.cpp + :start-after: _stencil_output_ref_start + :end-before: _stencil_output_ref_end + :language: C++ + + +^^^^^^^^^^^^^^^^^^^ +RAJA Offset Layouts +^^^^^^^^^^^^^^^^^^^ + +We use the ``RAJA::make_offset_layout`` method to construct a +``RAJA::OffsetLayout`` object that we use to create ``RAJA::View`` objects +for our input and output data arrays: + +.. literalinclude:: ../../../../exercises/offset-layout-stencil_solution.cpp + :start-after: _offsetlayout_views_start + :end-before: _offsetlayout_views_end + :language: C++ + +Here, the row index range is :math:`[-1, N_r+1)`, and the column index +range is :math:`[-1, N_c+1)`. The first argument to each call to the +``RAJA::View`` constructor is the pointer to the array that holds the View +data. The second argument is the ``RAJA::OffsetLayout`` object. + +``RAJA::OffsetLayout`` objects allow us to write loops over +data arrays using non-zero based indexing and without having to manually +compute offsets into the arrays. + +For more information about RAJA View and Layout types, please see +:ref:`feat-view-label`. + +^^^^^^^^^^^^^^^^^^^^^^^^^^^ +RAJA Kernel Variants +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +For the RAJA implementations of the stencil computation, we use two +``RAJA::TypedRangeSegment`` objects to define the row and column iteration +spaces for the interior cells: + +.. literalinclude:: ../../../../exercises/offset-layout-stencil_solution.cpp + :start-after: _offsetlayout_ranges_start + :end-before: _offsetlayout_ranges_end + :language: C++ + +Now, we have all the ingredients to implement the stencil computation using +``RAJA::kernel``. Here is a sequential CPU variant: + +.. literalinclude:: ../../../../exercises/offset-layout-stencil_solution.cpp + :start-after: _offsetlayout_rajaseq_start + :end-before: _offsetlayout_rajaseq_end + :language: C++ + +This RAJA variant does the computation as the C-style variant +introduced above. + +Since the input and output arrays are distinct, the stencil computation is +data parallel. 
Thus, we can use ``RAJA::kernel`` and an appropriate +execution policy to run the computation in parallel. Here is an OpenMP +collapse variant that maps the row-column product index space to OpenMP +threads: + +.. literalinclude:: ../../../../exercises/offset-layout-stencil_solution.cpp + :start-after: _offsetlayout_rajaomp_start + :end-before: _offsetlayout_rajaomp_end + :language: C++ + +Note that the lambda expression representing the kernel body is identical to +the ``RAJA::kernel`` sequential version. + +Here are variants for CUDA + +.. literalinclude:: ../../../../exercises/offset-layout-stencil_solution.cpp + :start-after: _offsetlayout_rajacuda_start + :end-before: _offsetlayout_rajacuda_end + :language: C++ + +and HIP + +.. literalinclude:: ../../../../exercises/offset-layout-stencil_solution.cpp + :start-after: _offsetlayout_rajahip_start + :end-before: _offsetlayout_rajahip_end + :language: C++ + +The only difference between the CPU and GPU variants is that the RAJA macro +``RAJA_DEVICE`` is used to decorate the lambda expression with the +``__device__`` annotation, which is required when capturing a lambda for use +in a GPU device environment as we have discussed in other examples in this +tutorial. + +One other point to note is that the CUDA variant in the exercise files +uses Unified Memory and the HIP variant uses distinct host and device memory +arrays, with explicit host-device data copy operations. Thus, new +``RAJA::View`` objects were created for the HIP variant to wrap the +device data pointers used in the HIP kernel. Please see the exercise files +for this example for details. diff --git a/docs/sphinx/user_guide/tutorial/offset-layout.rst b/docs/sphinx/user_guide/tutorial/offset-layout.rst deleted file mode 100644 index 7692738408..0000000000 --- a/docs/sphinx/user_guide/tutorial/offset-layout.rst +++ /dev/null @@ -1,133 +0,0 @@ -.. ## -.. ## Copyright (c) 2016-22, Lawrence Livermore National Security, LLC -.. ## and RAJA project contributors. See the RAJA/LICENSE file -.. ## for details. -.. ## -.. ## SPDX-License-Identifier: (BSD-3-Clause) -.. ## - -.. _offset-label: - ---------------------------------------------- -Stencil Computations (View Offsets) ---------------------------------------------- - -Key RAJA features shown in the following example: - - * ``RAJA::Kernel`` loop execution template - * RAJA kernel execution policies - * ``RAJA::View`` multi-dimensional data access - * ``RAJA:make_offset_layout`` method to apply index offsets - -This example applies a five-cell stencil sum to the interior cells of a -two-dimensional square lattice and stores the resulting sums in a second -lattice of equal size. The five-cell stencil accumulates values from each -interior cell and its four neighbors. We use ``RAJA::View`` and -``RAJA::Layout`` constructs to simplify the multi-dimensional indexing so -that we can write the stencil operation as follows:: - - output(row, col) = input(row, col) + - input(row - 1, col) + input(row + 1, col) + - input(row, col - 1) + input(row, col + 1) - -A lattice is assumed to have :math:`N_r \times N_c` interior cells with unit -values surrounded by a halo of cells containing zero values for a total -dimension of :math:`(N_r + 2) \times (N_c + 2)`. 
For example, when -:math:`N_r = N_c = 3`, the input lattice and values are: - - +---+---+---+---+---+ - | 0 | 0 | 0 | 0 | 0 | - +---+---+---+---+---+ - | 0 | 1 | 1 | 1 | 0 | - +---+---+---+---+---+ - | 0 | 1 | 1 | 1 | 0 | - +---+---+---+---+---+ - | 0 | 1 | 1 | 1 | 0 | - +---+---+---+---+---+ - | 0 | 0 | 0 | 0 | 0 | - +---+---+---+---+---+ - -After applying the stencil, the output lattice and values are: - - +---+---+---+---+---+ - | 0 | 0 | 0 | 0 | 0 | - +---+---+---+---+---+ - | 0 | 3 | 4 | 3 | 0 | - +---+---+---+---+---+ - | 0 | 4 | 5 | 4 | 0 | - +---+---+---+---+---+ - | 0 | 3 | 4 | 3 | 0 | - +---+---+---+---+---+ - | 0 | 0 | 0 | 0 | 0 | - +---+---+---+---+---+ - -For this :math:`(N_r + 2) \times (N_c + 2)` lattice case, here is our -(row, col) indexing scheme. - - +----------+---------+---------+---------+---------+ - | (-1, 3) | (0, 3) | (1, 3) | (2, 3) | (3, 3) | - +----------+---------+---------+---------+---------+ - | (-1, 2) | (0, 2) | (1, 2) | (2, 2) | (3, 2) | - +----------+---------+---------+---------+---------+ - | (-1, 1) | (0, 1) | (1, 1) | (2, 1) | (3, 1) | - +----------+---------+---------+---------+---------+ - | (-1, 0) | (0, 0) | (1, 0) | (2, 0) | (3, 0) | - +----------+---------+---------+---------+---------+ - | (-1, -1) | (0, -1) | (1, -1) | (2, -1) | (3, -1) | - +----------+---------+---------+---------+---------+ - -Notably :math:`[0, N_r) \times [0, N_c)` corresponds to the interior index -range over which we apply the stencil, and :math:`[-1,N_r] \times [-1, N_c]` -is the full lattice index range. - -^^^^^^^^^^^^^^^^^^^ -RAJA Offset Layouts -^^^^^^^^^^^^^^^^^^^ - -We use the ``RAJA::make_offset_layout`` method to construct a -``RAJA::OffsetLayout`` object that defines our two-dimensional indexing scheme. -Then, we create two ``RAJA::View`` objects for each of the input and output -lattice arrays. - -.. literalinclude:: ../../../../examples/tut_offset-layout.cpp - :start-after: _offsetlayout_views_start - :end-before: _offsetlayout_views_end - :language: C++ - -Here, the row index range is :math:`[-1, N_r]`, and the column index -range is :math:`[-1, N_c]`. The first argument to each call to the -``RAJA::View`` constructor is a pointer to an array that holds the data for -the view; we assume the arrays are properly allocated before these calls. - -The offset layout mechanics of RAJA allow us to write loops over -data arrays using non-zero based indexing and without having to manually -compute the proper offsets into the arrays. For more details on the -``RAJA::View`` and ``RAJA::Layout`` concepts we use in this example, please -refer to :ref:`view-label`. - -^^^^^^^^^^^^^^^^^^^^^^^^^^ -RAJA Kernel Implementation -^^^^^^^^^^^^^^^^^^^^^^^^^^ - -For the RAJA implementations of the example computation, we use two -``RAJA::RangeSegment`` objects to define the row and column iteration -spaces for the interior cells: - -.. literalinclude:: ../../../../examples/tut_offset-layout.cpp - :start-after: _offsetlayout_ranges_start - :end-before: _offsetlayout_ranges_end - :language: C++ - -Here, is an implementation using ``RAJA::kernel`` multi-dimensional loop -execution with a sequential execution policy. - -.. literalinclude:: ../../../../examples/tut_offset-layout.cpp - :start-after: _offsetlayout_rajaseq_start - :end-before: _offsetlayout_rajaseq_end - :language: C++ - -Since the stencil operation is data parallel, any parallel execution policy -may be used. 
The file ``RAJA/examples/tut_offset-layout.cpp`` contains a -complete working example code with various parallel implementations. For more -information about using the ``RAJA::kernel`` interface, please see -:ref:`loop_elements-kernel-label`. diff --git a/docs/sphinx/user_guide/tutorial/permuted-layout-batch-matrix-multiply.rst b/docs/sphinx/user_guide/tutorial/permuted-layout-batch-matrix-multiply.rst new file mode 100644 index 0000000000..9f229912db --- /dev/null +++ b/docs/sphinx/user_guide/tutorial/permuted-layout-batch-matrix-multiply.rst @@ -0,0 +1,161 @@ +.. ## +.. ## Copyright (c) 2016-22, Lawrence Livermore National Security, LLC +.. ## and RAJA project contributors. See the RAJA/LICENSE file +.. ## for details. +.. ## +.. ## SPDX-License-Identifier: (BSD-3-Clause) +.. ## + +.. _tut-permutedlayout-label: + +----------------------------------------------- +Permuted Layout: Batched Matrix-Multiplication +----------------------------------------------- + +This section contains an exercise file +``RAJA/exercises/permuted-layout-batch-matrix-multiply.cpp`` for you to work +through if you wish to get some practice with RAJA. The file +``RAJA/exercises/permuted-layout-batch-matrix-multiply_solution.cpp`` contains +complete working code for the examples discussed in this section. You can use +the solution file to check your work and for guidance if you get stuck. +To build the exercises execute ``make permuted-layout-batch-matrix-multiply`` +and ``make permuted-layout-batch-matrix-multiply_solution`` from the build +directory. + +Key RAJA features shown in the following example: + + * ``RAJA::forall`` loop traversal template + * RAJA execution policies + * ``RAJA::View`` multi-dimensional data access + * ``RAJA::make_permuted_layout`` method to permute data ordering + +This example performs a "batched" matrix multiplication operation for a +collection of :math:`3 \times 3` matrices. Each pair of matrices +:math:`A^{e}` and :math:`B^{e}`, indexed by 'e', is multiplied and the product +is stored in a matrix :math:`C^{e}`. :math:`A^{e}` matrix entries, for all +values of e, are stored in an array :math:`A`, all :math:`B^{e}` matrices +are stored in an array :math:`B`, and all :math:`C^{e}` matrices are stored in +an array :math:`C`. In the following discussion, the notation +:math:`A^{e}_{rc}` indicates the row r and column c entry of the +:math:`3 \times 3` matrix :math:`A^{e}`. + +In the exercise, we use two different data layouts for the arrays :math:`A`, +:math:`B`, and :math:`C` to represent different storage patterns for the +:math:`3 \times 3` matrices. Below, we describe these layouts +for two :math:`3 \times 3` matrices. The extension to more than two +matrices is straightforward as you will see in the exercise code. In the +exercise code, we time the execution of the batched matrix multiplication +operation to compare the performance for each layout and execution policy. +These comparisons are not completely conclusive as to which layout is best since +there may be additional performance to be gained by more specific tuning of +the memory layouts for an architecture and execution back-end. A complete, +detailed analysis of the performance implications of memory layout and access +patterns is beyond the scope of the exercise. + +In **layout 1**, the entries for each :math:`3 \times 3` matrix are contiguous +in memory following row major ordering; i.e., the ordering is column index, +then row index, then matrix index: + +.. 
math:: + A = [A^{0}_{00}, A^{0}_{01}, A^{0}_{02}, + A^{0}_{10}, A^{0}_{11}, A^{0}_{12}, + A^{0}_{20}, A^{0}_{21}, A^{0}_{22},\\ + A^{1}_{00}, A^{1}_{01}, A^{1}_{02}, + A^{1}_{10}, A^{1}_{11}, A^{1}_{12}, + A^{1}_{20}, A^{1}_{21}, A^{1}_{22}] + +In **layout 2**, the matrix entries are first ordered by matrix index, +then by column index, and finally by row index: + +.. math:: + A = [A^{0}_{00}, A^{1}_{00}, A^{0}_{01}, + A^{1}_{01}, A^{0}_{02}, A^{1}_{02}, + A^{0}_{10}, A^{1}_{10}, A^{0}_{11},\\ + A^{1}_{11}, A^{0}_{12}, A^{1}_{12}, + A^{0}_{20}, A^{1}_{20}, A^{0}_{21}, + A^{1}_{21}, A^{0}_{22}, A^{1}_{22}] + +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Permuted Layouts +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Next, we show how to construct the two data layouts described above using +``RAJA::View`` and ``RAJA::Layout`` objects. For more information on these +RAJA concepts, please see :ref:`feat-view-label`. + +The views to access data for layout 1 are constructed as follows: + +.. literalinclude:: ../../../../exercises/permuted-layout-batch-matrix-multiply_solution.cpp + :start-after: _permutedlayout_defviews_start + :end-before: _permutedlayout_defviews_end + :language: C++ + +The first argument to ``RAJA::make_permuted_layout`` is an array +whose entries correspond to the extent of each layout dimension. Here, we have +:math:`N` :math:`N_r \times N_c` matrices. The second argument, the layout +permutation, describes the striding order of the array indices. Note that +since this case follows the default RAJA ordering convention +(see :ref:`feat-view-label`), we use the identity permutation '(0,1,2)'. For each +matrix, the column index (index 2) has unit stride and the row index (index 1) +has stride :math:`N_c`, the number of columns in each matrix. The matrix index +(index 0) has stride :math:`N_r \times N_c`, the number of entries in each +matrix. + +The views for layout 2 are constructed similarly, with a different index +striding order: + +.. literalinclude:: ../../../../exercises/permuted-layout-batch-matrix-multiply_solution.cpp + :start-after: _permutedlayout_permviews_start + :end-before: _permutedlayout_permviews_end + :language: C++ + +Here, the first argument to ``RAJA::make_permuted_layout`` is the same as in +layout 1 since we have the same number of matrices with the same matrix +dimensions, and we will use the same indexing scheme to access the matrix +entries. However, the permutation we use is '(1,2,0)'. This makes the matrix +index (index 0) have unit stride, the column index (index 2) have stride +N, which is the number of matrices, and the row index (index 1) has +stride :math:`N \times N_c`. + +^^^^^^^^^^^^^^^^^^^^^^ +RAJA Kernel Variants +^^^^^^^^^^^^^^^^^^^^^^ + +The exercise files contain RAJA variants that run the batched matrix +multiplication kernel with different execution back-ends. As mentioned +earlier, we print out execution timings for each so you can compare the run +times of the different layouts described above. For example, the sequential +CPU variant using layout 1 is: + +.. literalinclude:: ../../../../exercises/permuted-layout-batch-matrix-multiply_solution.cpp + :start-after: _permutedlayout_batchedmatmult_loop_start + :end-before: _permutedlayout_batchedmatmult_loop_end + :language: C++ + +The sequential CPU variant using layout 2 is: + +.. 
literalinclude:: ../../../../exercises/permuted-layout-batch-matrix-multiply_solution.cpp + :start-after: _permutedlayout2_batchedmatmult_loop_start + :end-before: _permutedlayout2_batchedmatmult_loop_end + :language: C++ + +The only differences between these two are the names of the views that appear +in the lambda expression loop body since a different layout is used to create +view objects for each layout case. To make the algorithm code identical for all +cases, we could use type aliases for the view and layout types in a header +file similar to how we may abstract the execution policy out of the +algorithm, and compile the code for the case we want to run. + +For comparison, here is an OpenMP CPU variant using layout 1: + +.. literalinclude:: ../../../../exercises/permuted-layout-batch-matrix-multiply_solution.cpp + :start-after: _permutedlayout_batchedmatmult_omp_start + :end-before: _permutedlayout_batchedmatmult_omp_end + :language: C++ + +The only difference between this variant and the sequential CPU variant shown +above is the execution policy. The lambda expression loop body is identical +to the sequential CPU variant. + +The exercise files also contain variants for RAJA CUDA and HIP back-ends. +Their similarities and differences are the same as what we've just described. diff --git a/docs/sphinx/user_guide/tutorial/permuted-layout.rst b/docs/sphinx/user_guide/tutorial/permuted-layout.rst deleted file mode 100644 index 9e56c77bfe..0000000000 --- a/docs/sphinx/user_guide/tutorial/permuted-layout.rst +++ /dev/null @@ -1,116 +0,0 @@ -.. ## -.. ## Copyright (c) 2016-22, Lawrence Livermore National Security, LLC -.. ## and RAJA project contributors. See the RAJA/LICENSE file -.. ## for details. -.. ## -.. ## SPDX-License-Identifier: (BSD-3-Clause) -.. ## - -.. _permuted-layout-label: - ---------------------------------------------- -Batched Matrix-Multiply (Permuted Layouts) ---------------------------------------------- - -Key RAJA features shown in the following example: - - * ``RAJA::forall`` loop traversal template - * RAJA execution policies - * ``RAJA::View`` multi-dimensional data access - * ``RAJA::make_permuted_layout`` method to permute data ordering - -This example performs batched matrix multiplication for a set of -:math:`3 \times 3` matrices using two different data layouts. - -Matrices :math:`A` and :math:`B` are multiplied with the product stored in -matrix :math:`C`. The notation :math:`A^{e}_{rc}` indicates the row r and -column c entry of matrix e. We describe the two data layouts we use for two -matrices. The extension to more than two matrices is straightforward. Using -different data layouts, we can assess which performs best for a given -execution policy and computing environment. - -Layout 1: -Entries in each matrix are grouped together with each each having row major -ordering; i.e., - -.. math:: - A = [A^{0}_{00}, A^{0}_{01}, A^{0}_{02}, - A^{0}_{10}, A^{0}_{11}, A^{0}_{12}, - A^{0}_{20}, A^{0}_{21}, A^{0}_{22},\\ - A^{1}_{00}, A^{1}_{01}, A^{1}_{02}, - A^{1}_{10}, A^{1}_{11}, A^{1}_{12}, - A^{1}_{20}, A^{1}_{21}, A^{1}_{22}]; - -Layout 2: -Matrix entries are first ordered by matrix index, -then by column index, and finally by row index; i.e., - -.. 
math:: - A = [A^{0}_{00}, A^{1}_{00}, A^{0}_{01}, - A^{1}_{01}, A^{0}_{02}, A^{1}_{02}, - A^{0}_{10}, A^{1}_{10}, A^{0}_{11},\\ - A^{1}_{11}, A^{0}_{12}, A^{1}_{12}, - A^{0}_{20}, A^{1}_{20}, A^{0}_{21}, - A^{1}_{21}, A^{0}_{22}, A^{1}_{22}]; - -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Permuted Layouts -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Next, we show how to construct the two data layouts using ``RAJA::View`` and -``RAJA::Layout`` objects. For more details on these RAJA concepts, please -refer to :ref:`view-label`. - -The views for layout 1 are constructed as follows: - -.. literalinclude:: ../../../../examples/tut_batched-matrix-multiply.cpp - :start-after: _permutedlayout_defviews_start - :end-before: _permutedlayout_defviews_end - :language: C++ - -The first argument to ``RAJA::make_permuted_layout`` is a C++ array -whose entries correspond to the size of each array dimension; i.e., we have -'N' :math:`N_r \times N_c` matrices. The second argument describes the -striding order of the array dimensions. Note that since this case follows -the default RAJA ordering convention (see :ref:`view-label`), we use the -identity permutation '(0,1,2)'. For each matrix, the column index (index 2) -has unit stride and the row index (index 1) has stride 3 (number of columns). -The matrix index (index 0) has stride 9 (:math:`N_c \times N_r`). - -The views for layout 2 are constructed similarly: - -.. literalinclude:: ../../../../examples/tut_batched-matrix-multiply.cpp - :start-after: _permutedlayout_permviews_start - :end-before: _permutedlayout_permviews_end - :language: C++ - -Here, the first argument to ``RAJA::make_permuted_layout`` is the same as in -layout 1 since we have the same number of matrices, matrix dimensions and we -will use the same indexing scheme to access the matrix entries. However, the -permutation we use is '(1,2,0)'. This makes the matrix index (index 0) have -unit stride, the column index (index 2) for each matrix has stride N, which -is the number of matrices, and the row index (index 1) has -stride :math:`N \times N_c`. - -^^^^^^^^^^^^^^^^^^^ -Example Code -^^^^^^^^^^^^^^^^^^^ - -Complete working examples that run the batched matrix-multiplication -computation for both layouts and various RAJA execution policies is located -in the file ``RAJA/examples/tut_batched-matrix-multiply.cpp``. - -It compares the execution run times of the two layouts described above -using four RAJA back-ends (Sequential, OpenMP, CUDA, and HIP). The OpenMP -version for layout 1 looks like this: - -.. literalinclude:: ../../../../examples/tut_batched-matrix-multiply.cpp - :start-after: _permutedlayout_batchedmatmult_omp_start - :end-before: _permutedlayout_batchedmatmult_omp_end - :language: C++ - -The only differences between the lambda loop body for layout 1 and layout 2 -cases are the names of the views. To make the algorithm code identical for all -cases, we would use type aliases for the view and layout types in a header -file similarly to how we would abstract the execution policy out of the -algorithm. diff --git a/docs/sphinx/user_guide/tutorial/reductions.rst b/docs/sphinx/user_guide/tutorial/reductions.rst index bfd4344aff..effcb65378 100644 --- a/docs/sphinx/user_guide/tutorial/reductions.rst +++ b/docs/sphinx/user_guide/tutorial/reductions.rst @@ -6,47 +6,61 @@ .. ## SPDX-License-Identifier: (BSD-3-Clause) .. ## -.. _reductions-label: +.. 
_tut-reduction-label: ---------------------------------- -Reductions ---------------------------------- +----------------------------------------------------- +Reduction Types and Kernels with Multiple Reductions +----------------------------------------------------- -Key RAJA features shown in this example: +This section contains an exercise file ``RAJA/exercises/reductions.cpp`` +for you to work through if you wish to get some practice with RAJA. The +file ``RAJA/exercises/reductions_solution.cpp`` contains complete +working code for the examples discussed in this section. You can use the +solution file to check your work and for guidance if you get stuck. To build +the exercises execute ``make reductions`` and ``make reductions_solution`` +from the build directory. - * ``RAJA::forall`` loop execution template - * ``RAJA::RangeSegment`` iteration space construct - * RAJA reduction types - * RAJA reduction policies +Key RAJA features shown in this section are: -In the :ref:`dotproduct-label` example, we showed how to use the RAJA sum + * ``RAJA::forall`` loop execution template and execution policies + * ``RAJA::TypedRangeSegment`` iteration space construct + * RAJA reduction types and reduction policies + +In the :ref:`tut-dotproduct-label` exercise, we showed how to use the RAJA sum reduction type. The following example uses all supported RAJA reduction types: min, max, sum, min-loc, max-loc. +.. note:: RAJA 'min-loc' and 'max-loc' reductions determine the min and max + reduction value, respectively, along with an iteration index at + which the main/max value is found. + .. note:: Multiple RAJA reductions can be combined in any RAJA loop kernel execution method, and reduction operations can be combined with any other kernel operations. -We start by allocating an array (the memory manager in the example uses -CUDA Unified Memory if CUDA is enabled) and initializing its values in a +.. note:: Each RAJA reduction type requires a reduction policy that must + be compatible with the execution policy for the kernel in which + it is used. + +We start by allocating an array and initializing its values in a manner that makes the example mildly interesting and able to show what the different reduction types do. Specifically, the array is initialized to a sequence of alternating values ('1' and '-1'). Then, two values near the middle of the array are set to '-100' and '100': -.. literalinclude:: ../../../../examples/tut_reductions.cpp +.. literalinclude:: ../../../../exercises/reductions_solution.cpp :start-after: _reductions_array_init_start :end-before: _reductions_array_init_end :language: C++ We also define a range segment to iterate over the array: -.. literalinclude:: ../../../../examples/tut_reductions.cpp +.. literalinclude:: ../../../../exercises/reductions_solution.cpp :start-after: _reductions_range_start - :end-before: _reductions_arange_end + :end-before: _reductions_range_end :language: C++ -With these parameters and data initialization, all the code examples +With these parameters and data initialization, the code example presented below will generate the following results: * the sum will be zero @@ -55,25 +69,27 @@ presented below will generate the following results: * the min loc will be N/2 * the max loc will be N/2 + 1 -A sequential kernel that exercises all RAJA sequential reduction types is: +A sequential kernel that exercises all RAJA sequential reduction types +along with operations after the kernel to print the reduced values is: -.. 
literalinclude:: ../../../../examples/tut_reductions.cpp +.. literalinclude:: ../../../../exercises/reductions_solution.cpp :start-after: _reductions_raja_seq_start :end-before: _reductions_raja_seq_end :language: C++ Note that each reduction object takes an initial value at construction. Also, within the kernel, updating each reduction is done via an operator or method -that is basically what you would expect (i.e., '+=' for sum, 'min()' for min, -etc.). After the kernel executes, the reduced value computed by each reduction +that is basically what you would expect for the type of reduction +(e.g., '+=' for sum, 'min()' for min, etc.). After the kernel executes, the +reduced value computed by each reduction object is retrieved after the kernel by calling a 'get()' method on the reduction object. The min-loc/max-loc index values are obtained using 'getLoc()' methods. -For parallel multithreading execution via OpenMP, the example can be run -by replacing the execution and reduction policies with: +For parallel multithreading execution via OpenMP, the exercise can be run with +the execution and reduction policies: -.. literalinclude:: ../../../../examples/tut_reductions.cpp +.. literalinclude:: ../../../../exercises/reductions_solution.cpp :start-after: _reductions_raja_omppolicy_start :end-before: _reductions_raja_omppolicy_end :language: C++ @@ -81,21 +97,15 @@ by replacing the execution and reduction policies with: Similarly, the kernel containing the reductions can be run in parallel on a GPU using CUDA policies: -.. literalinclude:: ../../../../examples/tut_reductions.cpp +.. literalinclude:: ../../../../exercises/reductions_solution.cpp :start-after: _reductions_raja_cudapolicy_start :end-before: _reductions_raja_cudapolicy_end :language: C++ or HIP policies: -.. literalinclude:: ../../../../examples/tut_reductions.cpp +.. literalinclude:: ../../../../exercises/reductions_solution.cpp :start-after: _reductions_raja_hippolicy_start :end-before: _reductions_raja_hippolicy_end :language: C++ -.. note:: Each RAJA reduction type requires a reduction policy that must - be compatible with the execution policy for the kernel in which - it is used. - -The file ``RAJA/examples/tut_reductions.cpp`` contains the complete -working example code. diff --git a/docs/sphinx/user_guide/tutorial/scan.rst b/docs/sphinx/user_guide/tutorial/scan.rst index a06c7f2eb6..10cc0535fb 100644 --- a/docs/sphinx/user_guide/tutorial/scan.rst +++ b/docs/sphinx/user_guide/tutorial/scan.rst @@ -6,81 +6,105 @@ .. ## SPDX-License-Identifier: (BSD-3-Clause) .. ## -.. _scan-label: +.. _tut-scan-label: -------------------------------------------------- Parallel Scan Operations -------------------------------------------------- -Key RAJA features shown in this section: +This section contains an exercise file ``RAJA/exercises/scan.cpp`` +for you to work through if you wish to get some practice with RAJA. The +file ``RAJA/exercises/scan_solution.cpp`` contains complete +working code for the examples discussed in this section. You can use the +solution file to check your work and for guidance if you get stuck. To build +the exercises execute ``make scan`` and ``make scan_solution`` +from the build directory. - * ``RAJA::inclusive_scan`` operation - * ``RAJA::inclusive_scan_inplace`` operation - * ``RAJA::exclusive_scan`` operation - * ``RAJA::exclusive_scan_inplace`` operation - * RAJA operators for different types of scans; e.g., plus, minimum, maximum, etc. 
+Key RAJA features shown in this section are: -Below, we present examples of RAJA sequential, OpenMP, -and CUDA scan operations and show how different scan operations can be + * ``RAJA::inclusive_scan``, ``RAJA::inclusive_scan_inplace``, + ``RAJA::exclusive_scan``, and ``RAJA::exclusive_scan_inplace`` operations + and execution policies + * RAJA operators for different types of scans; e.g., plus, minimum, maximum, + etc. + +In this section, we present examples of various RAJA scan operations using +multiple RAJA execution back-ends. Different scan operations can be performed by passing different RAJA operators to the RAJA scan template methods. Each operator is a template type, where the template argument is the type of the values it operates on. For a summary of RAJA scan -functionality, please see :ref:`scan-label`. +functionality, please see :ref:`feat-scan-label`. .. note:: RAJA scan operations use the same execution policy types that - ``RAJA::forall`` loop execution templates do. + ``RAJA::forall`` kernel execution templates do. + +.. note:: RAJA scan operations take 'span' arguments to express the sequential + index range of array entries used in the scan. Typically, these + span objects are created using the ``RAJA::make_span`` method + as shown in the examples below. Each of the examples below uses the same integer arrays for input -and output values. We set the input array and print them as follows: +and output values. We initialize the input array and print its values as such: -.. literalinclude:: ../../../../examples/tut_scan.cpp +.. literalinclude:: ../../../../exercises/scan_solution.cpp :start-after: _scan_array_init_start :end-before: _scan_array_init_end :language: C++ -This generates the following sequence of values in the 'in' array:: +This generates the following sequence of values. This sequence will be used as +the 'in' array for each of the following examples.:: - 3 -1 2 15 7 5 17 9 6 18 1 10 0 14 13 4 11 12 8 16 + -1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 ^^^^^^^^^^^^^^^^ Inclusive Scans ^^^^^^^^^^^^^^^^ -A sequential inclusive scan operation is performed by: +RAJA's scan operations are standalone operations. That is, they cannot be +combined with other operations in a kernel. A sequential +inclusive scan operation can be executed like so: -.. literalinclude:: ../../../../examples/tut_scan.cpp +.. literalinclude:: ../../../../exercises/scan_solution.cpp :start-after: _scan_inclusive_seq_start :end-before: _scan_inclusive_seq_end :language: C++ -Since no operator is passed to the scan method, the default 'sum' operation +Since no operator is passed to the scan method, the default 'plus' operation is applied and the result generated in the 'out' array is a prefix-sum based on the 'in' array. The resulting 'out' array contains the values:: - 3 2 4 19 26 31 48 57 63 81 82 92 92 106 119 123 134 146 154 170 + -1 -1 0 2 5 9 14 20 27 35 44 54 65 77 90 104 119 135 152 170 + +In particular, each entry in the output array is a *partial sum* of all +input array entries up to that array index. -We can be explicit about the operation used in the scan by passing the -'plus' operator to the scan method: +We can be explicit about the operation used in the scan by passing the RAJA +'plus' operator ``RAJA::operators::plus`` to the scan method: -.. literalinclude:: ../../../../examples/tut_scan.cpp +.. 
literalinclude:: ../../../../exercises/scan_solution.cpp :start-after: _scan_inclusive_seq_plus_start :end-before: _scan_inclusive_seq_plus_end :language: C++ -The result in the 'out' array is the same. +The result in the 'out' array is the same as above. An inclusive parallel scan operation using OpenMP multithreading is accomplished similarly by replacing the execution policy type: -.. literalinclude:: ../../../../examples/tut_scan.cpp +.. literalinclude:: ../../../../exercises/scan_solution.cpp :start-after: _scan_inclusive_omp_plus_start :end-before: _scan_inclusive_omp_plus_end :language: C++ -As is commonly done with RAJA, the only difference between this code and -the previous one is that the execution policy is different. If we want to -run the scan on a GPU using CUDA, we would use a CUDA execution policy. This -will be shown shortly. +As expected, this produces the same result as the previous two examples. + +As is commonly the case with RAJA, the only difference between this code and +the previous one is the execution policy. If we want to +run the scan on a GPU using CUDA, we would use a CUDA execution policy as +is shown in examples below. + +.. note:: If no operator is passed to a RAJA scan operation, the default + plus operator is used, resulting in a prefix-sum. ^^^^^^^^^^^^^^^^ Exclusive Scans @@ -88,89 +112,102 @@ Exclusive Scans A sequential exclusive scan (plus) operation is performed by: -.. literalinclude:: ../../../../examples/tut_scan.cpp +.. literalinclude:: ../../../../exercises/scan_solution.cpp :start-after: _scan_exclusive_seq_plus_start :end-before: _scan_exclusive_seq_plus_end :language: C++ This generates the following sequence of values in the output array:: - 0 3 2 4 19 26 31 48 57 63 81 82 92 92 106 119 123 134 146 154 + 0 -1 -1 0 2 5 9 14 20 27 35 44 54 65 77 90 104 119 135 152 + +The result of an exclusive scan is similar to the result of an +inclusive scan, but differs in two ways. First, the first entry in +the exclusive scan output array is the `identity` of the operator used. +In the example here, it is zero, since the operator is 'plus'. +Second, the output sequence is shifted one position to the right +when compared to an inclusive scan. + +.. note:: The `identity` of an operator is the default value of a given type + for that operation. For example: + - The identity of an int for a sum operation is 0. + - The identity of an int for a maximum operation is -2147483648. -Note that the exclusive scan result is different than the inclusive scan -result in two ways. The first entry in the result is the `identity` of the -operator used (here, it is zero, since the operator is 'plus') and, after -that, the output sequence is shifted one position to the right. Running the same scan operation on a GPU using CUDA is done by: -.. literalinclude:: ../../../../examples/tut_scan.cpp +.. literalinclude:: ../../../../exercises/scan_solution.cpp :start-after: _scan_exclusive_cuda_plus_start :end-before: _scan_exclusive_cuda_plus_end :language: C++ Note that we pass the number of threads per CUDA thread block as the template -argument to the CUDA execution policy as we do in other cases. +argument to the CUDA execution policy as we do when using ``RAJA::forall``. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ In-place Scans and Other Operators ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -*In-place* scan operations generate the same results as the scan operations +*In-place* scan variants generate the same results as the scan operations we have just described. 
However, the result is generated in the input array directly so **only one array is passed to in-place scan methods.** Here is a sequential inclusive in-place scan that uses the 'minimum' operator: -.. literalinclude:: ../../../../examples/tut_scan.cpp +.. literalinclude:: ../../../../exercises/scan_solution.cpp :start-after: _scan_inclusive_inplace_seq_min_start :end-before: _scan_inclusive_inplace_seq_min_end :language: C++ -Note that, before the scan, we copy the input array into the output array so -the result is generated in the output array. Doing this, we avoid having to -re-initialize the input array to use it in other examples. +Note that, before the scan operation is invoked, we copy the +input array into the output array to provide the scan input array we want. This generates the following sequence in the output array:: - 3 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 + -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 + +Since the operator used in the scan is 'minimum' and the smallest values in +the input array is the first entry, the result is an array with that value +in all array slots. Here is a sequential exclusive in-place scan that uses the 'maximum' operator: -.. literalinclude:: ../../../../examples/tut_scan.cpp +.. literalinclude:: ../../../../exercises/scan_solution.cpp :start-after: _scan_exclusive_inplace_seq_max_start :end-before: _scan_exclusive_inplace_seq_max_end :language: C++ This generates the following sequence in the output array:: - -2147483648 3 3 3 15 15 15 17 17 17 18 18 18 18 18 18 18 18 18 18 + -2147483648 -1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 -Note that the first value in the result is the negative of the max int value; -i.e., the identity of the maximum operator. +Since it is an exclusive scan, the first value in the result is the negative +of the max int value, which is the identity of the 'maximum' operator. As you may expect at this point, running an exclusive in-place prefix-sum operation using OpenMP is accomplished by: -.. literalinclude:: ../../../../examples/tut_scan.cpp +.. literalinclude:: ../../../../exercises/scan_solution.cpp :start-after: _scan_exclusive_inplace_omp_plus_start :end-before: _scan_exclusive_inplace_omp_plus_end :language: C++ This generates the following sequence in the output array (as we saw earlier):: - 0 3 2 4 19 26 31 48 57 63 81 82 92 92 106 119 123 134 146 15 + 0 -1 -1 0 2 5 9 14 20 27 35 44 54 65 77 90 104 119 135 152 and the only difference is the execution policy template parameter. Lastly, we show a parallel inclusive in-place prefix-sum operation using CUDA: -.. literalinclude:: ../../../../examples/tut_scan.cpp +.. literalinclude:: ../../../../exercises/scan_solution.cpp :start-after: _scan_inclusive_inplace_cuda_plus_start :end-before: _scan_inclusive_inplace_cuda_plus_end :language: C++ -.. note:: RAJA scans for the HIP back-end are similar to those for CUDA. +and the same using the RAJA HIP back-end: -The file ``RAJA/examples/tut_scan.cpp`` contains the complete -working example code. +.. literalinclude:: ../../../../exercises/scan_solution.cpp + :start-after: _scan_inclusive_inplace_hip_plus_start + :end-before: _scan_inclusive_inplace_hip_plus_end + :language: C++ diff --git a/docs/sphinx/user_guide/tutorial/sort.rst b/docs/sphinx/user_guide/tutorial/sort.rst index a44fe9b7b2..1eefab9de9 100644 --- a/docs/sphinx/user_guide/tutorial/sort.rst +++ b/docs/sphinx/user_guide/tutorial/sort.rst @@ -6,87 +6,111 @@ .. ## SPDX-License-Identifier: (BSD-3-Clause) .. ## -.. 
_sort-label: +.. _tut-sort-label: -------------------------------------------------- Parallel Sort Operations -------------------------------------------------- -Key RAJA features shown in this section: +This section contains an exercise file ``RAJA/exercises/sort.cpp`` +for you to work through if you wish to get some practice with RAJA. The +file ``RAJA/exercises/sort_solution.cpp`` contains complete +working code for the examples discussed in this section. You can use the +solution file to check your work and for guidance if you get stuck. To build +the exercises execute ``make sort`` and ``make sort_solution`` +from the build directory. - * ``RAJA::sort`` operation - * ``RAJA::sort_pairs`` operation - * ``RAJA::stable_sort`` operation - * ``RAJA::stable_sort_pairs`` operation +Key RAJA features shown in this section are: + + * ``RAJA::sort``, ``RAJA::sort_pairs``, ``RAJA::stable_sort``, and ``RAJA::stable_sort_pairs`` operations and execution policies * RAJA comparators for different types of sorts; e.g., less, greater -Below, we present examples of RAJA sequential, OpenMP, -and CUDA sort operations and show how different sort orderings can be -achieved by passing different RAJA comparators to the RAJA sort template -methods. Each comparator is a template type, where the template argument is -the type of the values it compares. For a summary of RAJA sort -functionality, please see :ref:`sort-label`. +We show examples of RAJA sort operations using multiple RAJA execution +back-ends and describe how different sort orderings can be achieved by +passing different RAJA comparators to the RAJA sort template methods. Each +comparator is a template type, where the template argument is the type of +the values it compares. For a summary of available RAJA sorts, please see +:ref:`feat-sort-label`. .. note:: RAJA sort operations use the same execution policy types that ``RAJA::forall`` loop execution templates do. +.. note:: RAJA sort operations take 'span' arguments to express the sequential + index range of array entries used in the sort. Typically, these + span objects are created using the ``RAJA::make_span`` method + as shown in the examples below. + Each of the examples below uses the same integer arrays for input and output values. We set the input array and print them as follows: -.. literalinclude:: ../../../../examples/tut_sort.cpp +.. literalinclude:: ../../../../exercises/sort_solution.cpp :start-after: _sort_array_init_start :end-before: _sort_array_init_end :language: C++ -This generates the following sequence of values in the ``in`` array:: +This produces the following sequence of values in the ``in`` array:: 6 7 2 1 0 9 4 8 5 3 4 9 6 3 7 0 1 8 2 5 -and the following sequence of (key, value) pairs in the ``in`` and ``in_vals`` -arrays:: +and the following sequence of (key, value) pairs shown as pairs of values +in the ``in`` and ``in_vals`` arrays, respectively:: (6,0) (7,0) (2,0) (1,0) (0,0) (9,0) (4,0) (8,0) (5,0) (3,0) (4,1) (9,1) (6,1) (3,1) (7,1) (0,1) (1,1) (8,1) (2,1) (5,1) +.. note:: In the following sections, we discuss *stable* and *unstable* sort + operations. The difference between them is that a stable sort + preserves the relative order of equal elements, with respect to the + sort comparator operation, while an unstable sort may not preserve + the relative order of equal elements. For the examples below that + use integer arrays, there is no way to tell by inspecting the + output whether relative ordering is preserved for unstable sorts. 
+ However, the preservation of relative ordering can be seen in the + sort pairs examples below. + ^^^^^^^^^^^^^^^^ Unstable Sorts ^^^^^^^^^^^^^^^^ A sequential unstable sort operation is performed by: -.. literalinclude:: ../../../../examples/tut_sort.cpp +.. literalinclude:: ../../../../exercises/sort_solution.cpp :start-after: _sort_seq_start :end-before: _sort_seq_end :language: C++ -Since no comparator is passed to the sort method, the default less operation -is applied and the result generated in the ``out`` array is non-decreasing sort -on the ``out`` array. The resulting ``out`` array contains the values:: +Since no comparator is passed to the sort method, the default 'less' operator +``RAJA::operators::less`` is applied and the result generated in the +``out`` array is a non-decreasing sequence of values from the ``in`` array; +i.e.,:: 0 0 1 1 2 2 3 3 4 4 5 5 6 6 7 7 8 8 9 9 -We can be explicit about the operation used in the sort by passing the -less operator to the sort method: +We can be explicit about the operation used in the sort operation by passing +the 'less' operator to the sort method manually: -.. literalinclude:: ../../../../examples/tut_sort.cpp +.. literalinclude:: ../../../../exercises/sort_solution.cpp :start-after: _sort_seq_less_start :end-before: _sort_seq_less_end :language: C++ -The result in the ``out`` array is the same. +The result in the ``out`` array is the same as before:: + + 0 0 1 1 2 2 3 3 4 4 5 5 6 6 7 7 8 8 9 9 -An unstable parallel sort operation using OpenMP multi-threading is -accomplished similarly by replacing the execution policy type: +An unstable parallel sort operation using OpenMP multithreading is +accomplished similarly by replacing the execution policy type with +and OpenMP policy: -.. literalinclude:: ../../../../examples/tut_sort.cpp +.. literalinclude:: ../../../../exercises/sort_solution.cpp :start-after: _sort_omp_less_start :end-before: _sort_omp_less_end :language: C++ -As is commonly done with RAJA, the only difference between this code and +As is common with RAJA, the only difference between this code and the previous one is that the execution policy is different. If we want to -run the sort on a GPU using CUDA, we would use a CUDA execution policy. This -will be shown shortly. +run the sort on a GPU using CUDA or HIP, we would use a CUDA or HIP execution +policy. This is shown in examples that follow. ^^^^^^^^^^^^^^^^ Stable Sorts @@ -94,62 +118,80 @@ Stable Sorts A sequential stable sort (less) operation is performed by: -.. literalinclude:: ../../../../examples/tut_sort.cpp +.. literalinclude:: ../../../../exercises/sort_solution.cpp :start-after: _sort_stable_seq_less_start :end-before: _sort_stable_seq_less_end :language: C++ -This generates the following sequence of values in the output array:: +This generates the following sequence of values in the output array +as expected based on the examples we discussed above:: 0 0 1 1 2 2 3 3 4 4 5 5 6 6 7 7 8 8 9 9 Note that the stable sort result is the same as the unstable sort in this case -because we are sorting integers. We will show an example of sorting pairs later -where this is not the case. +because we are sorting an array of integers. We will show an example of +sorting pairs later where this is not the case. Running the same sort operation on a GPU using CUDA is done by: -.. literalinclude:: ../../../../examples/tut_sort.cpp +.. 
literalinclude:: ../../../../exercises/sort_solution.cpp :start-after: _sort_stable_cuda_less_start :end-before: _sort_stable_cuda_less_end :language: C++ Note that we pass the number of threads per CUDA thread block as the template -argument to the CUDA execution policy as we do in other cases. +argument to the CUDA execution policy as we do when using ``RAJA::forall``. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Other Comparators ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Using a different comparator allows sorting in a different order. -Here is a sequential stable sort that uses the greater operator: +Using a different comparator operator allows sorting in a different order. +Here is a sequential stable sort that uses the 'greater' operator +``RAJA::operators::greater``: -.. literalinclude:: ../../../../examples/tut_sort.cpp +.. literalinclude:: ../../../../exercises/sort_solution.cpp :start-after: _sort_stable_seq_greater_start :end-before: _sort_stable_seq_greater_end :language: C++ -This generates the following sequence of values in non-increasing order in -the output array:: +and similarly for HIP: + +.. literalinclude:: ../../../../exercises/sort_solution.cpp + :start-after: _sort_stable_hip_greater_start + :end-before: _sort_stable_hip_greater_end + :language: C++ + +Both of these sorts generate the following sequence of values in +non-increasing order in the output array:: 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 -Note that the only operators provided by RAJA that are valid to use in sort -because they form a strict weak ordering of elements for arithmetic types are -less and greater. Also note that the the cuda sort backend only supports -RAJA's operators less and greater. +.. note:: * The only operators provided by RAJA that are valid to use in sort + because they enforce a strict weak ordering of elements for + arithmetic types are 'less' and 'greater'. Users may provide other + operators for different sorting operations. + * Also the RAJA CUDA sort back-end only supports RAJA operators + 'less' and 'greater' because it uses the NVIDIA CUB library. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Sort Pairs ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Sort *Pairs* operations generate the same results as the sort operations -we have just described. However, an additional array of values is also permuted -to match the sorted array so **two arrays are passed to sort pairs methods.** +*Sort pairs* operations generate the same results as the sort operations +we have just described. Additionally, a second array of values is +reordered using the ordering of the first sorted array so +**two arrays are passed to sort pairs methods.** + +.. note:: For ``RAJA::sort_pairs`` algorithms, two arrays are passed. The + first array (*keys*) will be sorted according to the given + comparator operator. The elements in the second array (*values*) + will be reordered based on the final order of the first sorted array. -Here is a sequential unstable sort pairs that uses the less operator: +Here is a sequential unstable sort pairs operation that uses the 'less' +operator: -.. literalinclude:: ../../../../examples/tut_sort.cpp +.. 
literalinclude:: ../../../../exercises/sort_solution.cpp :start-after: _sort_pairs_seq_less_start :end-before: _sort_pairs_seq_less_end :language: C++ @@ -159,13 +201,14 @@ This generates the following sequence in the output array:: (0,0) (0,1) (1,0) (1,1) (2,0) (2,1) (3,0) (3,1) (4,0) (4,1) (5,1) (5,0) (6,1) (6,0) (7,0) (7,1) (8,0) (8,1) (9,1) (9,0) -Note that some of the pairs with equivalent keys stayed in the same order -they appeared in the unsorted arrays like ``(8,0) (8,1)``, while others are -reversed like ``(9,1) (9,0)``. +Note that some of the pairs with equivalent *keys* stayed in the same order +that they appeared in the unsorted arrays like ``(8,0) (8,1)``, while others are +reversed like ``(9,1) (9,0)``. This illustrates that relative ordering of +equal elements may not be preserved in an unstable sort. Here is a sequential stable sort pairs that uses the greater operator: -.. literalinclude:: ../../../../examples/tut_sort.cpp +.. literalinclude:: ../../../../exercises/sort_solution.cpp :start-after: _sort_stable_pairs_seq_greater_start :end-before: _sort_stable_pairs_seq_greater_end :language: C++ @@ -176,12 +219,12 @@ This generates the following sequence in the output array:: (4,0) (4,1) (3,0) (3,1) (2,0) (2,1) (1,0) (1,1) (0,0) (0,1) Note that all pairs with equivalent keys stayed in the same order that they -appeared in the unsorted arrays. +appeared in the unsorted input arrays. As you may expect at this point, running an stable sort pairs operation using OpenMP is accomplished by: -.. literalinclude:: ../../../../examples/tut_sort.cpp +.. literalinclude:: ../../../../exercises/sort_solution.cpp :start-after: _sort_stable_pairs_omp_greater_start :end-before: _sort_stable_pairs_omp_greater_end :language: C++ @@ -195,12 +238,12 @@ and the only difference is the execution policy template parameter. Lastly, we show a parallel unstable sort pairs operation using CUDA: -.. literalinclude:: ../../../../examples/tut_sort.cpp +.. literalinclude:: ../../../../exercises/sort_solution.cpp :start-after: _sort_pairs_cuda_greater_start :end-before: _sort_pairs_cuda_greater_end :language: C++ -.. note:: RAJA sorts for the HIP back-end are similar to those for CUDA. +.. note:: RAJA sorts for the HIP back-end are similar to those for CUDA. + The only difference is that a HIP execution policy template + parameter type is used. -The file ``RAJA/examples/tut_sort.cpp`` contains the complete -working example code. diff --git a/docs/sphinx/user_guide/tutorial/teams_basic.rst b/docs/sphinx/user_guide/tutorial/teams_basic.rst deleted file mode 100644 index 9ad9e99d70..0000000000 --- a/docs/sphinx/user_guide/tutorial/teams_basic.rst +++ /dev/null @@ -1,90 +0,0 @@ -.. ## -.. ## Copyright (c) 2016-20, Lawrence Livermore National Security, LLC -.. ## and RAJA project contributors. See the RAJA/LICENSE file -.. ## for details. -.. ## -.. ## SPDX-License-Identifier: (BSD-3-Clause) -.. ## - -.. _teamsbasic-label: - ------------------------------- -Team based loops (RAJA Teams) ------------------------------- - -Key RAJA features shown in the following examples: - - * ``RAJA::expt::launch`` method to create a run-time - selectable host/device execution space. - * ``RAJA::expt::loop`` methods to express algorithms - in terms of nested for loops. - -In this example, we introduce the RAJA Teams framework and discuss -hierarchical loop-based parallelism. Development with RAJA Teams occurs -inside an execution space. 
The execution space is launched using the -``RAJA::expt::launch`` method:: - - RAJA::expt::launch(RAJA::expt::ExecPlace , - RAJA::expt::Grid(RAJA::expt::Teams(Nteams,Nteams), - RAJA::expt::Threads(Nthreads,Nthreads)), - [=] RAJA_HOST_DEVICE (RAJA::expt::LaunchContext ctx) { - - /* Express code here */ - - }); - -The ``RAJA::expt::launch`` method is templated on both a host and a device launch policy. -As an example, the following constructs an execution space for a sequential -and CUDA kernel:: - - using launch_policy = RAJA::expt::LaunchPolicy - >; - -Kernel execution on either the host or device is driven by the first argument of -the method which takes a ``RAJA::expt::ExecPlace`` enum type, either ``HOST`` or ``DEVICE``. -Similar to thread, and block programming models, RAJA Teams carries out -computation in a predefined compute grid made up of threads which are -then grouped into teams. The execution space is then enclosed by a host/device -lambda which takes a ``RAJA::expt::LaunchContext`` object. The ``RAJA::expt::LaunchContext`` -may then be used to control the flow within the kernel, for example creating thread-team -synchronization points. - -Inside the execution space the ``RAJA::expt::loop`` methods enable developers -to express their code in terms of nested loops. The manner in which the loops -are executed depends on the template. Following the CUDA/HIP programming models -we follow a hierarchical structure in which outer loops are executed by thread-teams -and inner loops are executed by a thread in a team. - -.. literalinclude:: ../../../../examples/tut_teams_basic.cpp - :start-after: // _team_loops_start - :end-before: // _team_loops_end - :language: C++ - -The mapping between the thread and teams to programming model depends on -how they are defined. For example, we may define host and device mapping -strategies as the following:: - - using teams_x = RAJA::expt::LoopPolicy; - using thread_x = RAJA::expt::LoopPolicy; - -In the example above the ``RAJA::expt::LoopPolicy`` struct holds both the host and -device loop mapping strategies. On the host, both the team/thread strategies expand -out to standard C-style loops for execution: - -.. literalinclude:: ../../../../examples/tut_teams_basic.cpp - :start-after: // _c_style_loops_start - :end-before: // _c_style_loops_end - :language: C++ - -On the device the ``teams_x/y`` policies will map loop iterations directly to -CUDA thread blocks, while the ``thread_x/y`` policies will map loop iterations -directly to threads in a CUDA block. The CUDA equivalent is illustrated below: - -.. literalinclude:: ../../../../examples/tut_teams_basic.cpp - :start-after: // _device_loop_start - :end-before: // _device_loop_end - :language: C++ - -The file RAJA/examples/tut_teams_basic.cpp contains the complete working example code. diff --git a/docs/sphinx/user_guide/tutorial/tiled_matrix_transpose.rst b/docs/sphinx/user_guide/tutorial/tiled_matrix_transpose.rst deleted file mode 100644 index 8554e273a4..0000000000 --- a/docs/sphinx/user_guide/tutorial/tiled_matrix_transpose.rst +++ /dev/null @@ -1,84 +0,0 @@ -.. ## -.. ## Copyright (c) 2016-22, Lawrence Livermore National Security, LLC -.. ## and RAJA project contributors. See the RAJA/LICENSE file -.. ## for details. -.. ## -.. ## SPDX-License-Identifier: (BSD-3-Clause) -.. ## - -.. 
_tiledmatrixtranspose-label: - ----------------------- -Tiled Matrix Transpose ----------------------- - -Key RAJA features shown in this example are: - - * ``RAJA::kernel`` usage with multiple lambdas - * ``RAJA::statement::Tile`` type - -In this example, we compute the transpose of an input matrix -:math:`A` of size :math:`N_r \times N_c` and store the result in a second -matrix :math:`At` of size :math:`N_c \times N_r`. - -We compute the matrix transpose using a tiling algorithm, which iterates -over tiles of the matrix A and performs a transpose copy of a tile without -storing the tile in another array. The algorithm is expressed as a collection -of outer and inner loops. Iterations of the inner loop will transpose each tile, -while outer loops iterate over the tiles. - -We start with a non-RAJA C++ implementation, where we choose tile -dimensions smaller than the matrix dimensions. Note that we do not assume -that tiles divide evenly the number of rows and and columns of the matrix. -However, we do assume square tiles. First, we define matrix dimensions: - -.. literalinclude:: ../../../../examples/tut_tiled-matrix-transpose.cpp - :start-after: // _tiled_mattranspose_dims_start - :end-before: // _tiled_mattranspose_dims_end - :language: C++ - -Then, we wrap the matrix data pointers in ``RAJA::View`` objects to -simplify the multi-dimensional indexing: - -.. literalinclude:: ../../../../examples/tut_tiled-matrix-transpose.cpp - :start-after: // _tiled_mattranspose_views_start - :end-before: // _tiled_mattranspose_views_end - :language: C++ - -Then, the non-RAJA C++ implementation looks like this: - -.. literalinclude:: ../../../../examples/tut_tiled-matrix-transpose.cpp - :start-after: // _cstyle_tiled_mattranspose_start - :end-before: // _cstyle_tiled_mattranspose_end - :language: C++ - -Note that we need to include a bounds check in the code to avoid indexing out -of bounds when the tile sizes do not divide the matrix dimensions evenly. - -^^^^^^^^^^^^^^^^^^^^^ -RAJA::kernel Variants -^^^^^^^^^^^^^^^^^^^^^ - -For ``RAJA::kernel`` variants, we use ``RAJA::statement::Tile`` types -for the outer loop tiling and ``RAJA::tile_fixed`` types to -indicate the tile dimensions. The complete sequential RAJA variant is: - -.. literalinclude:: ../../../../examples/tut_tiled-matrix-transpose.cpp - :start-after: // _raja_tiled_mattranspose_start - :end-before: // _raja_tiled_mattranspose_end - :language: C++ - -The ``RAJA::statement::Tile`` types compute the number of tiles needed to -iterate over all matrix entries in each dimension and generate iteration -index bounds for each tile, which are used to generate loops for the inner -``RAJA::statement::For`` types. Thus, the bounds checking logic in the -non-RAJA variant is not needed. Note that the integer template parameters -to these statement types refer to the entries in the iteration space tuple -passed to the ``RAJA::kernel`` method. - -The file ``RAJA/examples/tut_tiled-matrix-transpose.cpp`` contains the complete working example code for the examples described in this section, including -OpenMP, CUDA, and HIP variants. - -A more advanced version using RAJA local arrays for CPU cache blocking and -using GPU shared memory is discussed in :ref:`matrixtransposelocalarray-label`. 
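For reference, the tiled-transpose pattern described above with ``RAJA::statement::Tile`` and ``RAJA::statement::For`` types can be written out roughly as follows. This is a minimal illustrative sketch, not the code in ``RAJA/examples/tut_tiled-matrix-transpose.cpp``; the function name ``tiledTranspose``, the tile size, and the use of ``RAJA::seq_exec`` with ``int`` data are assumptions made for illustration::

   #include "RAJA/RAJA.hpp"

   constexpr int TILE_DIM = 16;

   // Transpose an N_r x N_c matrix A into the N_c x N_r matrix At using
   // fixed-size square tiles. The Tile statements handle partial tiles at
   // the matrix edges, so no explicit bounds check is needed.
   void tiledTranspose(int* A, int* At, int N_r, int N_c)
   {
     RAJA::View<int, RAJA::Layout<2>> Aview(A, N_r, N_c);
     RAJA::View<int, RAJA::Layout<2>> Atview(At, N_c, N_r);

     using TILED_POL =
       RAJA::KernelPolicy<
         RAJA::statement::Tile<1, RAJA::tile_fixed<TILE_DIM>, RAJA::seq_exec,
           RAJA::statement::Tile<0, RAJA::tile_fixed<TILE_DIM>, RAJA::seq_exec,
             RAJA::statement::For<1, RAJA::seq_exec,    // row within a tile
               RAJA::statement::For<0, RAJA::seq_exec,  // column within a tile
                 RAJA::statement::Lambda<0>
               >
             >
           >
         >
       >;

     RAJA::kernel<TILED_POL>(
       RAJA::make_tuple(RAJA::TypedRangeSegment<int>(0, N_c),   // lambda arg 0: col
                        RAJA::TypedRangeSegment<int>(0, N_r)),  // lambda arg 1: row
       [=](int col, int row) {
         Atview(col, row) = Aview(row, col);
       });
   }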
- diff --git a/docs/sphinx/user_guide/tutorial/vertexsum_coloring.rst b/docs/sphinx/user_guide/tutorial/vertexsum_coloring.rst index f8423ba5d1..8e73076df0 100644 --- a/docs/sphinx/user_guide/tutorial/vertexsum_coloring.rst +++ b/docs/sphinx/user_guide/tutorial/vertexsum_coloring.rst @@ -6,127 +6,165 @@ .. ## SPDX-License-Identifier: (BSD-3-Clause) .. ## -.. _vertexsum-label: +.. _tut-vertexsum-label: -------------------------------------------------- -Mesh Vertex Sum Example: Iteration Space Coloring +Iteration Space Coloring: Mesh Vertex Sum -------------------------------------------------- -Key RAJA features shown in this example: +This section contains an exercise file ``RAJA/exercises/vertexsum-indexset.cpp`` +for you to work through if you wish to get some practice with RAJA. The +file ``RAJA/exercises/vertexsum-indexset_solution.cpp`` contains complete +working code for the examples discussed in this section. You can use the +solution file to check your work and for guidance if you get stuck. To build +the exercises execute ``make vertexsum-indexset`` and ``make vertexsum-indexset_solution`` +from the build directory. - * ``RAJA::forall`` loop execution template method - * ``RAJA::ListSegment`` iteration space construct - * ``RAJA::IndexSet`` iteration space segment container and associated execution policies +Key RAJA features shown in this example are: + * ``RAJA::forall`` loop execution template method + * ``RAJA::TypedListSegment`` iteration space construct + * ``RAJA::TypedIndexSet`` iteration space segment container and + associated execution policies The example computes a sum at each vertex on a logically-Cartesian 2D mesh as shown in the figure. .. figure:: ../figures/vertexsum.jpg - A portion of the area of each mesh element is summed to the vertices surrounding the element. + The "area" of each vertex is the sum of an area contribution from each element sharing the vertex (left). In particular, one quarter of the area of each mesh element is summed to the vertices surrounding the element (right). -Each sum is an average of the area of the mesh elements that share the vertex. -In many "staggered mesh" applications, such an operation is common and is -often written in a way that presents the algorithm clearly but prevents +Each sum is an average of the area of the four mesh elements that share the +vertex. In many "staggered mesh" applications, an operation like this is common +and is often written in a way that presents the algorithm clearly but prevents parallelization due to potential data races. That is, multiple loop iterates over mesh elements may attempt to write to the same shared vertex memory location at the same time. The example shows how RAJA constructs can be used to enable one to express such an algorithm in parallel and have it run correctly without fundamentally changing how it looks in source code. -After defining the number of elements in the mesh, necessary array offsets -and an array that indicates the mapping between an element and its four -surrounding vertices, a C-style version of the vertex sum calculation is: +We start by setting the size of the mesh, specifically, the total number of +elements and vertices and the number of elements and vertices in each direction: -.. literalinclude:: ../../../../examples/tut_vertexsum-coloring.cpp - :start-after: _cstyle_vertexsum_start - :end-before: _cstyle_vertexsum_end +.. 
literalinclude:: ../../../../exercises/vertexsum-indexset_solution.cpp + :start-after: _vertexsum_define_start + :end-before: _vertexsum_define_end :language: C++ -^^^^^^^^^^^^^^^^^^^^^^^ -RAJA Sequential Variant -^^^^^^^^^^^^^^^^^^^^^^^ +We also set up an array to map each element to its four surrounding vertices +and set the area of each element: + +.. literalinclude:: ../../../../exercises/vertexsum-indexset_solution.cpp + :start-after: _vertexsum_elemarea_start + :end-before: _vertexsum_elemarea_end + :language: C++ -A nested loop RAJA variant of this kernel is: +Then, a sequential C-style version of the vertex area calculation looks like +this: -.. literalinclude:: ../../../../examples/tut_vertexsum-coloring.cpp - :start-after: _raja_seq_vertexsum_start - :end-before: _raja_seq_vertexsum_end +.. literalinclude:: ../../../../exercises/vertexsum-indexset_solution.cpp + :start-after: _cstyle_vertexarea_seq_start + :end-before: _cstyle_vertexarea_seq_end + :language: C++ + +We can't parallelize the entire computation at once due to potential race +conditions where multiple threads may attempt to sum to a shared element +vertex simultaneously. However, we can parallelize the computation in +parts. Here is a C-style OpenMP parallel implementation: + +.. literalinclude:: ../../../../exercises/vertexsum-indexset_solution.cpp + :start-after: _cstyle_vertexarea_omp_start + :end-before: _cstyle_vertexarea_omp_end + :language: C++ + +What we've done is broken up the computation into four parts, each of which +can safely run in parallel because there are no overlapping writes to the +same entry in the vertex area array in each parallel section. Note that there +is an outer loop on length four, one iteration for each of the elements that +share a vertex. Inside the loop, we iterate over a subset of elements in +parallel using an indexing area that guarantees that we will have no +data races. In other words, we have "colored" the elements as shown in the +figure below. + +.. figure:: ../figures/vertexsum_color.png + :scale: 30 + :align: center + + We partition the mesh elements into four disjoint subsets shown by the colors and numbers so that within each subset no two elements share a vertex. + +For completeness, the computation of the four element indexing arrays is: + +.. literalinclude:: ../../../../exercises/vertexsum-indexset_solution.cpp + :start-after: _vertexarea_color_start + :end-before: _vertexarea_color_end :language: C++ -Note that this version cannot be guaranteed to run correctly in parallel -by simply changing the loop execution policies as we have done in other -examples. We would like to use RAJA to enable parallel execution and without -changing the way the kernel looks in source code. By applying a RAJA index -set and suitably-defined list segments, we can accomplish this. ^^^^^^^^^^^^^^^^^^^^^^^ RAJA Parallel Variants ^^^^^^^^^^^^^^^^^^^^^^^ -To enable the kernel to run safely in parallel, by eliminating the race -conditions, we partition the element iteration space into four subsets -(or `colors`) indicated by the numbers in the figure below, which represents -a portion of our logically-Cartesian 2D mesh. - - +---+---+---+---+ - | 2 | 3 | 2 | 3 | - +---+---+---+---+ - | 0 | 1 | 0 | 1 | - +---+---+---+---+ - | 2 | 3 | 2 | 3 | - +---+---+---+---+ - | 0 | 1 | 0 | 1 | - +---+---+---+---+ - -Note that none of the elements with the same number share a common vertex. -Thus, we can iterate over all elements with the same number (i.e., color) -in parallel. 
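As a rough sketch of that partitioning idea, the elements of the structured
mesh can be bucketed by the parity of their logical ``(i, j)`` indices; the
extent ``N_elem`` and the flattened element index used below are illustrative
assumptions, not the exercise's exact code::

   // Group elements into four disjoint "colors"; no two elements in the
   // same bucket share a vertex, so each bucket can be summed in parallel.
   std::vector<int> colored_elems[4];

   for (int j = 0; j < N_elem; ++j) {
     for (int i = 0; i < N_elem; ++i) {
       int color = (i % 2) + 2 * (j % 2);               // 0, 1, 2, or 3
       colored_elems[color].push_back(i + j * N_elem);  // flattened element id
     }
   }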
- -First, we define four vectors to gather the mesh element indices for each -color: - -.. literalinclude:: ../../../../examples/tut_vertexsum-coloring.cpp - :start-after: _colorvectors_vertexsum_start - :end-before: _colorvectors_vertexsum_end +To implement the vertex sum calculation using RAJA, we employ +``RAJA::TypedListSegment`` iteration space objects to enumerate the mesh +elements for each color and put them in a ``RAJA::TypedIndexSet`` object. +This allows us to execute the entire calculation using one ``RAJA::forall`` +call. + +We declare a type alias for the list segments to make the code more compact: + +.. literalinclude:: ../../../../exercises/vertexsum-indexset_solution.cpp + :start-after: _vertexarea_listsegtype_start + :end-before: _vertexarea_listsegtype_end :language: C++ -Then, we create a RAJA index set with four list segments, one for each color, -using the vectors: +Then, we build the index set: -.. literalinclude:: ../../../../examples/tut_vertexsum-coloring.cpp - :start-after: _colorindexset_vertexsum_start - :end-before: _colorindexset_vertexsum_end +.. literalinclude:: ../../../../exercises/vertexsum-indexset_solution.cpp + :start-after: _vertexarea_indexset_start + :end-before: _vertexarea_indexset_end :language: C++ -Now, we can use an index set execution policy that iterates over the +Note that we construct the list segments using the arrays we made earlier +to partition the elements. Then, we push them onto the index set. + +Now, we can use a two-level index set execution policy that iterates over the segments sequentially and executes each segment in parallel using OpenMP -multithreading (and ``RAJA::forall``): +multithreading to run the kernel: -.. literalinclude:: ../../../../examples/tut_vertexsum-coloring.cpp - :start-after: _raja_seq_colorindexset_vertexsum_start - :end-before: _raja_seq_colorindexset_vertexsum_end +.. literalinclude:: ../../../../exercises/vertexsum-indexset_solution.cpp + :start-after: _raja_vertexarea_omp_start + :end-before: _raja_vertexarea_omp_end :language: C++ -Note that we no longer need to use the offset variable to compute the -element index in terms of 'i' and 'j' since the loop is no longer nested -and the element indices are directly encoded in the list segments. +The execution of the RAJA version is similar to the C-style OpenMP variant +shown earlier, where we executed four OpenMP parallel loops in sequence, +but the code is more concise. In particular, we execute four parallel OpenMP +loops, one for each list segment in the index set. Also, note that we do +not have to manually extract the element index from the segments like we +did earlier since RAJA passes the segment entries directly to the lambda +expression. -For completeness, here is the RAJA variant where we iterate over the +Here is the RAJA variant where we iterate over the segments sequentially, and execute each segment in parallel via a CUDA -kernel launch on a GPU: +kernel launched on a GPU: -.. literalinclude:: ../../../../examples/tut_vertexsum-coloring.cpp - :start-after: _raja_cuda_colorindexset_vertexsum_start - :end-before: _raja_cuda_colorindexset_vertexsum_end +.. literalinclude:: ../../../../exercises/vertexsum-indexset_solution.cpp + :start-after: _raja_vertexarea_cuda_start + :end-before: _raja_vertexarea_cuda_end :language: C++ -Here, we have marked the lambda loop body with the 'RAJA_DEVICE' macro -and specified the number of threads in a CUDA thread block in the segment -execution policy. 
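A compact sketch of the CUDA flavor of this pattern follows; the color arrays
``colored_elems``, the element-to-vertex map ``e2v_map``, the per-element areas
``areae``, and the vertex areas ``areav`` are assumed names, and all arrays are
assumed to be device-accessible (e.g., CUDA unified memory)::

   using SegmentType = RAJA::TypedListSegment<int>;

   camp::resources::Resource cuda_res{camp::resources::Cuda()};

   RAJA::TypedIndexSet<SegmentType> cuda_colorset;
   for (int c = 0; c < 4; ++c) {
     cuda_colorset.push_back( SegmentType(colored_elems[c].data(),
                                          colored_elems[c].size(), cuda_res) );
   }

   // Outer level: iterate over the four segments sequentially.
   // Inner level: launch each segment as a CUDA kernel.
   using EXEC_POL = RAJA::ExecPolicy<RAJA::seq_segit, RAJA::cuda_exec<256>>;

   RAJA::forall<EXEC_POL>(cuda_colorset, [=] RAJA_DEVICE (int ie) {
     int* iv = &(e2v_map[4*ie]);
     areav[ iv[0] ] += areae[ie] / 4.0;
     areav[ iv[1] ] += areae[ie] / 4.0;
     areav[ iv[2] ] += areae[ie] / 4.0;
     areav[ iv[3] ] += areae[ie] / 4.0;
   });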
+The only differences here are that we have marked the lambda loop body with the +``RAJA_DEVICE`` macro, used a CUDA segment execution policy, and built a new +index set with list segments created using a CUDA resource so that the indices +live in device memory. + +The RAJA HIP variant, which we show for completeness, is similar: + +.. literalinclude:: ../../../../exercises/vertexsum-indexset_solution.cpp + :start-after: _raja_vertexarea_hip_start + :end-before: _raja_vertexarea_hip_end + :language: C++ -The RAJA HIP variant is similar. +The main difference for the HIP variant is that we use explicit device +memory allocation/deallocation and host-device memory copy operations. -The file ``RAJA/examples/tut_vertexsum-coloring.cpp`` contains a complete -working example code, including a RAJA HIP variant. diff --git a/docs/sphinx/user_guide/tutorial/view_layout.rst b/docs/sphinx/user_guide/tutorial/view_layout.rst new file mode 100644 index 0000000000..df20a8af3a --- /dev/null +++ b/docs/sphinx/user_guide/tutorial/view_layout.rst @@ -0,0 +1,303 @@ +.. ## +.. ## Copyright (c) 2016-22, Lawrence Livermore National Security, LLC +.. ## and RAJA project contributors. See the RAJA/LICENSE file +.. ## for details. +.. ## +.. ## SPDX-License-Identifier: (BSD-3-Clause) +.. ## + +.. _tut-view_layout-label: + +----------------------------------------------------------- +Data Views and Layouts +----------------------------------------------------------- + +This section contains an exercise file ``RAJA/exercises/view-layout.cpp`` +for you to work through if you wish to get some practice with RAJA. The +file ``RAJA/exercises/view-layout_solution.cpp`` contains complete +working code for the examples discussed in this section. You can use the +solution file to check your work and for guidance if you get stuck. To build +the exercises execute ``make view-layout`` and ``make view-layout_solution`` +from the build directory. + +Key RAJA features shown in this section are: + + * ``RAJA::View`` + * ``RAJA::Layout`` and ``RAJA::OffsetLayout`` constructs + * Layout permutations + +The examples in this section illustrate RAJA View and Layout concepts +and usage patterns. The goal is for you to gain an understanding of how +to use RAJA Views and Layouts to simplify and transform array data access +patterns. None of the examples use RAJA kernel execution methods, such +as ``RAJA::forall``. The intent is to focus on RAJA View and Layout mechanics. + +Consider a basic C-style implementation of a matrix-matrix multiplication +operation, using :math:`N \times N` matrices: + +.. literalinclude:: ../../../../exercises/view-layout_solution.cpp + :start-after: _cstyle_matmult_start + :end-before: _cstyle_matmult_end + :language: C++ + +As is commonly done for efficiency in C and C++, we have allocated the data +for the matrices as one-dimensional arrays. Thus, we need to manually compute +the data pointer offsets for the row and column indices in the kernel. +Here, we use the array ``Cref`` to hold a reference solution matrix that +we use to compare with results generated by the examples below. + +To simplify the multi-dimensional indexing, we can use ``RAJA::View`` objects, +which we define as: + +.. literalinclude:: ../../../../exercises/view-layout_solution.cpp + :start-after: _matmult_views_start + :end-before: _matmult_views_end + :language: C++ + +Here we define three ``RAJA::View`` objects, 'Aview', 'Bview', and 'Cview', +that *wrap* the array data pointers, 'A', 'B', and 'C', respectively. 
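A minimal sketch of what those view declarations look like, assuming square
``N x N`` matrices of type double as above, is::

   RAJA::View<double, RAJA::Layout<2, int>> Aview(A, N, N);
   RAJA::View<double, RAJA::Layout<2, int>> Bview(B, N, N);
   RAJA::View<double, RAJA::Layout<2, int>> Cview(C, N, N);

   // ...so the kernel body can read, e.g., Aview(row, k) * Bview(k, col)
   // instead of A[k + N*row] * B[col + N*k].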
We +pass a data pointer as the first argument to each view constructor and then +the extent of each matrix dimension as the second and third arguments. There +are two extent arguments since we indicate in the ``RAJA::Layout`` template +parameter list. The matrices are square and each extent is 'N'. Here, the +template parameters to ``RAJA::View`` are the array data type 'double' and +a ``RAJA::Layout`` type. Specifically:: + + RAJA::Layout<2, int> + +means that each View represents a two-dimensional default data layout, and +that we will use values of type 'int' to index into the arrays. + +Using the ``RAJA::View`` objects, we can access the data entries for the rows +and columns using a more natural, less error-prone syntax: + +.. literalinclude:: ../../../../exercises/view-layout_solution.cpp + :start-after: _cstyle_matmult_views_start + :end-before: _cstyle_matmult_views_end + :language: C++ + +Default Layouts Use Row-major Ordering +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The default data layout ordering in RAJA is *row-major*, which is the +convention for multi-dimensional array indexing in C and C++. This means that +the rightmost index will be stride-1, the index to the left of the rightmost +index will have stride equal to the extent of the rightmost dimension, and +so on. + +.. note:: RAJA Layouts and Views support any number of dimensions and + the default data access ordering is *row-major*. Please + see :ref:`feat-view-label` for more details. + +To illustrate the default data layout striding, we next show simple +one-, two-, and three-dimensional examples where the for-loop ordering +for the different dimensions is such that all data access is stride-1. We +begin by defining some dimensions, allocate and initialize arrays: + +.. literalinclude:: ../../../../exercises/view-layout_solution.cpp + :start-after: _default_views_init_start + :end-before: _default_views_init_end + :language: C++ + +The version of the array initialization kernel using a one-dimensional +``RAJA::View`` is: + +.. literalinclude:: ../../../../exercises/view-layout_solution.cpp + :start-after: _default_view1D_start + :end-before: _default_view1D_end + :language: C++ + +The version of the array initialization using a two-dimensional +``RAJA::View`` is: + +.. literalinclude:: ../../../../exercises/view-layout_solution.cpp + :start-after: _default_view2D_start + :end-before: _default_view2D_end + :language: C++ + +The three-dimensional version is: + +.. literalinclude:: ../../../../exercises/view-layout_solution.cpp + :start-after: _default_view3D_start + :end-before: _default_view3D_end + :language: C++ + +It's worth repeating that the data array access in all three variants shown +here using ``RAJA::View`` objects is stride-1 since we order the for-loops +in the loop nests to match the row-major ordering. + +RAJA Layout types support other data access patterns with different striding +orders, offsets, and permutations. To this point, we have used the default +Layout constructor. RAJA provides methods to generate Layouts for different +indexing patterns. We describe these in the next several sections. Next, we +show how to permute the data striding order using permuted Layouts. + +Permuted Layouts Change Data Striding Order +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Every ``RAJA::Layout`` object has a permutation. When a permutation is not +specified at creation, a Layout will use the identity permutation. Here are +examples where the identity permutation is explicitly provided. 
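A sketch of the basic pattern, with assumed extents ``Nx`` and ``Ny`` and an
assumed data pointer ``a``, is::

   std::array<RAJA::idx_t, 2> id_perm {{0, 1}};   // identity permutation

   RAJA::Layout<2> id_layout =
       RAJA::make_permuted_layout( {{Nx, Ny}}, id_perm );

   RAJA::View<int, RAJA::Layout<2>> IDview(a, id_layout);

Such a view strides through the data exactly as one built with the default
layout.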
First, in +two dimensions: + +.. literalinclude:: ../../../../exercises/view-layout_solution.cpp + :start-after: _default_perm_view2D_start + :end-before: _default_perm_view2D_end + :language: C++ + +Then, in three dimensions: + +.. literalinclude:: ../../../../exercises/view-layout_solution.cpp + :start-after: _default_perm_view3D_start + :end-before: _default_perm_view3D_end + :language: C++ + +These two examples access the data with stride-1 ordering, the same as in +the earlier examples, which is shown by the nested loop ordering. +The identity permutation in two dimensions is '{0, 1}' and is '{0, 1, 2}' +for three dimensions. The method ``RAJA::make_permuted_layout`` is used to +create a ``RAJA::Layout`` object with a permutation. The method takes two +arguments, the extents of each dimension and the permutation. + +.. note:: If a permuted Layout is created with the *identity permutation* + (e.g., {0,1,2}), the Layout is the same as if it were created by + +Next, we permute the striding order for the two-dimensional example: + +.. literalinclude:: ../../../../exercises/view-layout_solution.cpp + :start-after: _perm_2D_start + :end-before: _perm_2D_end + :language: C++ + +Read from right to left, the permutation '{1, 0}' specifies that the first +(zero) index 'i' is stride-1 and the second index (one) 'j' has stride equal +to the extent of the first Layout dimension 'Nx'. This is evident in the +for-loop ordering. + +Here is the three-dimensional case, where we have reversed the striding order +using the permutation '{2, 1, 0}': + +.. literalinclude:: ../../../../exercises/view-layout_solution.cpp + :start-after: _perma_view3D_start + :end-before: _perma_view3D_end + :language: C++ + +The data access remains stride-1 due to the for-loop reordering. For fun, +here is another three-dimensional permutation: + +.. literalinclude:: ../../../../exercises/view-layout_solution.cpp + :start-after: _permb_view3D_start + :end-before: _permb_view3D_end + :language: C++ + +The permutation is '{1, 2, 0}' so to make the data access stride-1, we +swap the 'j' and 'k' loops and leave the 'i' loop as the inner loop. + +Multi-dimensional Indices and Linear Indices +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +``RAJA::Layout`` types provide methods to convert between linear indices and +multi-dimensional indices and vice versa. Recall the Layout 'perm3a_layout' +from above that was created with the permutation '{2, 1, 0}'. To get the +linear index corresponding to the index triple '(1, 2, 0)', you can do +this:: + + int lin = perm3a_layout(1, 2, 0); + +The value of 'lin' is 7 = 1 + 2 * Nx + 0 * Nx * Ny. To get the index triple +for linear index 7, you can do:: + + int i, j, k; + perm3a_layout.toIndices(7, i, j, k); + +This sets 'i' to 1, 'j' to 2, and 'k' to 0. + +Similarly for the Layout 'permb_layout', which was created with the +permutation '{1, 2, 0}':: + + lin = perm3b_layout(1, 2, 0); + +sets 'lin' to 13 = 1 + 0 * Nx + 2 * Nx * Nz and:: + + perm3b_layout.toIndices(13, i, j, k); + +sets 'i' to 1, 'j' to 2, and 'k' to 0. + +There are more examples in the exercise file associated with this section. +Feel free to experiment with them. + +One important item to note is that, by default, there is no bounds checking +on indices passed to a ``RAJA::View`` data access method or ``RAJA::Layout`` +index computation methods. 
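For example, with assumed extents ``Nx`` and ``Ny``, an out-of-range index is
silently folded into the linear offset computation::

   RAJA::Layout<2> layout(Nx, Ny);

   auto last = layout(Nx - 1, Ny - 1);   // Nx * Ny - 1, the last valid offset
   auto oops = layout(Nx, 0);            // Nx * Ny, one past the end, computed
                                         // without any error or warning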
Therefore, it is the responsibility of a user +to ensure that indices passed to ``RAJA::View`` and ``RAJA::Layoout`` +methods are in bounds to avoid accessing data outside +of the View or computing invalid indices. + +.. note:: RAJA provides a CMake variable ``RAJA_ENABLE_BOUNDS_CHECK`` to + turn run time bounds checking on or off when the code is compiled. + Enabling bounds checking is useful for debugging and to ensure + your code is correct. However, when enabled, bounds checking adds + noticeable run time overhead. So it should not be enabled for + a production build of your code. + +Offset Layouts Apply Offsets to Indices +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The last topic we cover in this exercise is the ``RAJA::OffsetLayout`` type. +We first illustrate the concept of an offset with a C-style for-loop: + +.. literalinclude:: ../../../../exercises/view-layout_solution.cpp + :start-after: _cstyle_offlayout1D_start + :end-before: _cstyle_offlayout1D_end + :language: C++ + +Here, the for-loop runs from 'imin' to 'imax-1' (i.e., -5 to 5). To avoid +out-of-bounds negative indexing, we subtract 'imin' (i.e., -5) from the loop +index 'i'. + +To do the same thing with RAJA, we create a ``RAJA::OffsetLayout`` object +and use it to index into the array: + +.. literalinclude:: ../../../../exercises/view-layout_solution.cpp + :start-after: _raja_offlayout1D_start + :end-before: _raja_offlayout1D_end + :language: C++ + +``RAJA::OffsetLayout`` is a different type than ``RAJA::Layout`` because +it contains offset information. The arguments to the +``RAJA::make_offset_layout`` method are the index bounds. + +As expected, the two dimensional case is similar. First, a C-style loop: + +.. literalinclude:: ../../../../exercises/view-layout_solution.cpp + :start-after: _cstyle_offlayout2D_start + :end-before: _cstyle_offlayout2D_end + :language: C++ + +and then the same operation using a ``RAJA::OffsetLayout`` object: + +.. literalinclude:: ../../../../exercises/view-layout_solution.cpp + :start-after: _raja_offlayout2D_start + :end-before: _raja_offlayout2D_end + :language: C++ + +Note that the first argument passed to ``RAJA::make_offset_layout`` contains +the lower bounds for 'i' and 'j' and the second argument contains the upper +bounds. Also, the 'j' index is stride-1 by default since we did not pass +a permutation to the ``RAJA::make_offset_layout`` method, which is the same +as the non-offset Layout usage. + +Just like ``RAJA::Layout`` has a permutation, so does ``RAJA::OffsetLayout``. +Here is an example where we permute the (i, j) index stride ordering: + +.. literalinclude:: ../../../../exercises/view-layout_solution.cpp + :start-after: _raja_permofflayout2D_start + :end-before: _raja_permofflayout2D_end + :language: C++ + +The permutation '{1, 0}' is passed as the third argument to +``RAJA::make_offset_layout``. From the ordering of the for-loops, we can see +that the 'i' index is stride-1 and the 'j' index has stride equal to the +extent of the 'i' dimension so the for-loop nest strides through +the data with unit stride. + diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 2f2351ac7c..13c1395ee8 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -1,89 +1,51 @@ -############################################################################### -# Copyright (c) 2016-22, Lawrence Livermore National Security, LLC # and RAJA project contributors. See the RAJA/LICENSE file for details. 
# # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################### raja_add_executable( - NAME tut_teams_basic - SOURCES tut_teams_basic.cpp) + NAME tut_launch_basic + SOURCES tut_launch_basic.cpp) raja_add_executable( NAME resource-forall SOURCES resource-forall.cpp) raja_add_executable( - NAME tut_daxpy - SOURCES tut_daxpy.cpp) - -raja_add_executable( - NAME tut_add-vectors - SOURCES tut_add-vectors.cpp) - -raja_add_executable( - NAME tut_dot-product - SOURCES tut_dot-product.cpp) + NAME dynamic-forall + SOURCES dynamic-forall.cpp) raja_add_executable( - NAME tut_indexset-segments - SOURCES tut_indexset-segments.cpp) + NAME forall-param-reductions + SOURCES forall-param-reductions.cpp) raja_add_executable( - NAME tut_matrix-multiply - SOURCES tut_matrix-multiply.cpp) - -raja_add_executable( - NAME tut_nested-loop-reorder - SOURCES tut_nested-loop-reorder.cpp) + NAME resource-dynamic-forall + SOURCES resource-dynamic-forall.cpp) raja_add_executable( - NAME tut_vertexsum-coloring - SOURCES tut_vertexsum-coloring.cpp) - -raja_add_executable( - NAME tut_reductions - SOURCES tut_reductions.cpp) - -raja_add_executable( - NAME teams_flatten - SOURCES teams_flatten.cpp) - -raja_add_executable( - NAME teams_reductions - SOURCES teams_reductions.cpp) - -raja_add_executable( - NAME resource-runtime-teams - SOURCES resource-runtime-teams.cpp) - -raja_add_executable( - NAME tut_scan - SOURCES tut_scan.cpp) - -raja_add_executable( - NAME tut_sort - SOURCES tut_sort.cpp) + NAME tut_daxpy + SOURCES tut_daxpy.cpp) raja_add_executable( - NAME tut_atomic-histogram - SOURCES tut_atomic-histogram.cpp) + NAME dynamic_mat_transpose + SOURCES dynamic_mat_transpose.cpp) raja_add_executable( - NAME tut_offset-layout - SOURCES tut_offset-layout.cpp) + NAME tut_matrix-multiply + SOURCES tut_matrix-multiply.cpp) raja_add_executable( - NAME tut_batched-matrix-multiply - SOURCES tut_batched-matrix-multiply.cpp) + NAME launch_flatten + SOURCES launch_flatten.cpp) raja_add_executable( - NAME tut_matrix-transpose-local-array - SOURCES tut_matrix-transpose-local-array.cpp) + NAME launch_reductions + SOURCES launch_reductions.cpp) raja_add_executable( - NAME tut_tiled-matrix-transpose - SOURCES tut_tiled-matrix-transpose.cpp) + NAME resource-runtime-launch + SOURCES resource-runtime-launch.cpp) raja_add_executable( NAME tut_halo-exchange @@ -94,12 +56,12 @@ raja_add_executable( SOURCES pi-reduce_vs_atomic.cpp) raja_add_executable( - NAME raja-teams - SOURCES raja-teams.cpp) + NAME raja-launch + SOURCES raja-launch.cpp) raja_add_executable( - NAME teams_matrix-multiply - SOURCES teams_matrix-multiply.cpp) + NAME launch_matrix-multiply + SOURCES launch_matrix-multiply.cpp) raja_add_executable( NAME jacobi @@ -136,7 +98,7 @@ raja_add_executable( SOURCES resource-kernel.cpp) raja_add_executable( - NAME resource-teams - SOURCES resource-teams.cpp) + NAME resource-launch + SOURCES resource-launch.cpp) add_subdirectory(plugin) diff --git a/examples/dynamic-forall.cpp b/examples/dynamic-forall.cpp new file mode 100644 index 0000000000..18dbde8243 --- /dev/null +++ b/examples/dynamic-forall.cpp @@ -0,0 +1,145 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-22, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include +#include +#include + +#include "memoryManager.hpp" + +#include "RAJA/RAJA.hpp" + +/* + * Vector Addition Example with dynamic policy selection + * + * Computes c = a + b, where a, b, c are vectors of ints. + * a policy selected at run-time + * + * If CUDA is enabled, CUDA unified memory is used. + */ + +// +// Functions for checking and printing results +// +void checkResult(int* res, int len); +void printResult(int* res, int len); + +using policy_list = camp::list + ,RAJA::cuda_exec<512> +#endif + >; + +int main(int argc, char *argv[]) +{ + + if(argc != 2) { + RAJA_ABORT_OR_THROW("Usage ./dynamic-forall N, where N is the index of the policy to run"); + } + + // + // Run time policy section is demonstrated in this example by specifying + // kernel exection space as a command line argument + // Example usage ./dynamic_forall policy N + // + + const int pol = std::stoi(argv[1]); + + std::cout << "\n\nRAJA vector addition example...\n"; + std::cout << "Using policy # "<(N); + int *b = memoryManager::allocate(N); + int *c = memoryManager::allocate(N); + + for (int i = 0; i < N; ++i) { + a[i] = -i; + b[i] = i; + } + + +//----------------------------------------------------------------------------// + + std::cout << "\n Running C-style vector addition...\n"; + + // _cstyle_vector_add_start + for (int i = 0; i < N; ++i) { + c[i] = a[i] + b[i]; + } + // _cstyle_vector_add_end + + checkResult(c, N); +//printResult(c, N); + + +//----------------------------------------------------------------------------// +// Example of dynamic policy selection for forall +//----------------------------------------------------------------------------// + + //policy is chosen from the list + RAJA::expt::dynamic_forall(pol, RAJA::RangeSegment(0, N), [=] RAJA_HOST_DEVICE (int i) { + c[i] = a[i] + b[i]; + }); + // _rajaseq_vector_add_end + + checkResult(c, N); +//printResult(c, N); + + +//----------------------------------------------------------------------------// +// +// Clean up. +// + memoryManager::deallocate(a); + memoryManager::deallocate(b); + memoryManager::deallocate(c); + + std::cout << "\n DONE!...\n"; + + return 0; +} + +// +// Function to check result and report P/F. +// +void checkResult(int* res, int len) +{ + bool correct = true; + for (int i = 0; i < len; i++) { + if ( res[i] != 0 ) { correct = false; } + } + if ( correct ) { + std::cout << "\n\t result -- PASS\n"; + } else { + std::cout << "\n\t result -- FAIL\n"; + } +} + +// +// Function to print result. +// +void printResult(int* res, int len) +{ + std::cout << std::endl; + for (int i = 0; i < len; i++) { + std::cout << "result[" << i << "] = " << res[i] << std::endl; + } + std::cout << std::endl; +} diff --git a/examples/dynamic_mat_transpose.cpp b/examples/dynamic_mat_transpose.cpp new file mode 100644 index 0000000000..6386c0a42d --- /dev/null +++ b/examples/dynamic_mat_transpose.cpp @@ -0,0 +1,436 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-22, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include +#include +#include +#include + +#include "RAJA/RAJA.hpp" +#include "memoryManager.hpp" + +/* + * Matrix Transpose Example + * + * In this example, an input matrix A of dimension N_r x N_c is + * transposed and returned as a second matrix At of size N_c x N_r. + * + * This operation is carried out using a local memory tiling + * algorithm. The algorithm first loads matrix entries into an + * iteraion shared tile, a two-dimensional array, and then + * reads from the tile with row and column indices swapped for + * the output matrix. + * + * The algorithm is expressed as a collection of ``outer`` + * and ``inner`` for loops. Iterations of the inner loops will load/read + * data into the tile; while outer loops will iterate over the number + * of tiles needed to carry out the transpose. + * + * RAJA variants of the example use RAJA dynamic shared memory as tile memory. + * RAJA shared memory is mapped to device shared memory which + * enables threads in the same thread block to share data. Host versions + * of the algorithms will use a dynamically sized array + * + * RAJA features shown: + * - Basic usage of 'RAJA::launch' abstractions for nested loops + * - Hierachial parallism + * - Dynamic shared memory + * + * If CUDA is enabled, CUDA unified memory is used. + */ + +// +// Define dimensionality of matrices +// +const int DIM = 2; + +// +// Function for checking results +// +template +void checkResult(RAJA::View> Atview, int N_r, int N_c); + +// +// Function for printing results +// +template +void printResult(RAJA::View> Atview, int N_r, int N_c); + +using launch_policy = RAJA::LaunchPolicy< +#if defined(RAJA_ENABLE_OPENMP) + RAJA::omp_launch_t +#else + RAJA::seq_launch_t +#endif +#if defined(RAJA_ENABLE_CUDA) + , + RAJA::cuda_launch_t +#endif +#if defined(RAJA_ENABLE_HIP) + , + RAJA::hip_launch_t +#endif +#if defined(RAJA_ENABLE_SYCL) + , + RAJA::sycl_launch_t +#endif + >; + +/* + * Define team policies. + * Up to 3 dimension are supported: x,y,z + */ +using outer0 = RAJA::LoopPolicy< + RAJA::loop_exec +#if defined(RAJA_ENABLE_CUDA) + , + RAJA::cuda_block_x_direct +#endif +#if defined(RAJA_ENABLE_HIP) + , + RAJA::hip_block_x_direct +#endif +#if defined(RAJA_ENABLE_SYCL) + , + RAJA::sycl_group_0_direct +#endif + >; + +using outer1 = RAJA::LoopPolicy< +#if defined(RAJA_ENABLE_OPENMP) + RAJA::omp_for_exec +#else + RAJA::loop_exec +#endif +#if defined(RAJA_ENABLE_CUDA) + , + RAJA::cuda_block_y_direct +#endif +#if defined(RAJA_ENABLE_HIP) + , + RAJA::hip_block_y_direct +#endif +#if defined(RAJA_ENABLE_SYCL) + , + RAJA::sycl_group_1_direct +#endif + >; +/* + * Define thread policies. 
+ * Up to 3 dimension are supported: x,y,z + */ +using inner0 = RAJA::LoopPolicy< + RAJA::loop_exec +#if defined(RAJA_ENABLE_CUDA) + , + RAJA::cuda_thread_x_direct +#endif +#if defined(RAJA_ENABLE_HIP) + , + RAJA::hip_thread_x_direct +#endif +#if defined(RAJA_ENABLE_SYCL) + , + RAJA::sycl_local_0_direct +#endif + >; + +using inner1 = RAJA::LoopPolicy; + +template +void switch_ptrs(T *A, T *d_A) +{ + T *tmp_ptr; + tmp_ptr = d_A; + d_A = A; + A = tmp_ptr; +} + +int main(int argc, char *argv[]) +{ + + std::cout << "\n\nRAJA matrix transpose example...\n"; + + if(argc != 2) { + RAJA_ABORT_OR_THROW("Usage ./dynamic_mat_transpose host or ./dynamic_mat_transpose device"); + } + + // + // Run time policy section is demonstrated in this example by specifying + // kernel exection space as a command line argument (host or device). + // Example usage ./dynamic_mat_transpose host or ./dynamic_mat_transpose device + // + std::string exec_space = argv[1]; + if(!(exec_space.compare("host") == 0 || exec_space.compare("device") == 0 )){ + RAJA_ABORT_OR_THROW("Usage ./dynamic_mat_transpose host or ./dynamic_mat_transpose device"); + return 0; + } + + RAJA::ExecPlace select_cpu_or_gpu; + if(exec_space.compare("host") == 0) + { select_cpu_or_gpu = RAJA::ExecPlace::HOST; printf("Running RAJA::launch reductions example on the host \n"); } + if(exec_space.compare("device") == 0) + { select_cpu_or_gpu = RAJA::ExecPlace::DEVICE; printf("Running RAJA::launch reductions example on the device \n"); } + + + +#if defined(RAJA_ENABLE_SYCL) + memoryManager::sycl_res = new camp::resources::Resource{camp::resources::Sycl()}; + ::RAJA::sycl::detail::setQueue(memoryManager::sycl_res); +#endif + + // + // Define num rows/cols in matrix, tile dimensions, and number of tiles + // + // _mattranspose_localarray_dims_start + const int N_r = 267; + const int N_c = 251; + + const int TILE_DIM = 16; + + const int outer_Dimc = (N_c - 1) / TILE_DIM + 1; + const int outer_Dimr = (N_r - 1) / TILE_DIM + 1; + // _mattranspose_localarray_dims_end + + // + // Allocate matrix data + // + int *A = memoryManager::allocate(N_r * N_c); + int *At = memoryManager::allocate(N_r * N_c); + + // + // In the following implementations of matrix transpose, we + // use RAJA 'View' objects to access the matrix data. A RAJA view + // holds a pointer to a data array and enables multi-dimensional indexing + // into the data. + // + // _mattranspose_localarray_views_start + RAJA::View> Aview(A, N_r, N_c); + RAJA::View> Atview(At, N_c, N_r); + // _mattranspose_localarray_views_end + + // + // Initialize matrix data + // + for (int row = 0; row < N_r; ++row) { + for (int col = 0; col < N_c; ++col) { + Aview(row, col) = col; + } + } + //printResult(Aview, N_r, N_c); + + //----------------------------------------------------------------------------// + std::cout << "\n Running C-version of shared matrix transpose...\n"; + + std::memset(At, 0, N_r * N_c * sizeof(int)); + + // _mattranspose_localarray_cstyle_start + // + // (0) Outer loops to iterate over tiles + // + for (int by = 0; by < outer_Dimr; ++by) { + for (int bx = 0; bx < outer_Dimc; ++bx) { + + // Stack-allocated local array for data on a tile + int Tile[TILE_DIM][TILE_DIM]; + + // + // (1) Inner loops to read input matrix tile data into the array + // + // Note: loops are ordered so that input matrix data access + // is stride-1. 
+ // + for (int ty = 0; ty < TILE_DIM; ++ty) { + for (int tx = 0; tx < TILE_DIM; ++tx) { + + int col = bx * TILE_DIM + tx; // Matrix column index + int row = by * TILE_DIM + ty; // Matrix row index + + // Bounds check + if (row < N_r && col < N_c) { + Tile[ty][tx] = Aview(row, col); + } + } + } + + // + // (2) Inner loops to write array data into output array tile + // + // Note: loop order is swapped from above so that output matrix + // data access is stride-1. + // + for (int tx = 0; tx < TILE_DIM; ++tx) { + for (int ty = 0; ty < TILE_DIM; ++ty) { + + int col = bx * TILE_DIM + tx; // Matrix column index + int row = by * TILE_DIM + ty; // Matrix row index + + // Bounds check + if (row < N_r && col < N_c) { + Atview(col, row) = Tile[ty][tx]; + } + } + } + + } + } + // _mattranspose_localarray_cstyle_end + + checkResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); + + //----------------------------------------------------------------------------// + + std::cout << "\n Running RAJA matrix transpose w/ dynamic shared memory ...\n"; + +#if defined(RAJA_ENABLE_HIP) + + //Hip requires device side pointers + int *d_A = nullptr, *d_At = nullptr; + + if(select_cpu_or_gpu == RAJA::ExecPlace::DEVICE) { + d_A = memoryManager::allocate_gpu(N_r * N_c); + d_At = memoryManager::allocate_gpu(N_r * N_c); + + hipErrchk(hipMemcpy( d_A, A, N_r * N_c * sizeof(int), hipMemcpyHostToDevice )); + + //switch host/device pointers so we can reuse the views + switch_ptrs(d_A, A); + switch_ptrs(d_At, At); + } +#endif + + + constexpr size_t dynamic_shared_mem_size = TILE_DIM * TILE_DIM * sizeof(int); + + RAJA::launch + (select_cpu_or_gpu, + RAJA::LaunchParams(RAJA::Teams(outer_Dimr, outer_Dimc), + RAJA::Threads(TILE_DIM, TILE_DIM), dynamic_shared_mem_size), + "Matrix tranpose with dynamic shared memory kernel", + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) + { + + RAJA::loop(ctx, RAJA::RangeSegment(0, outer_Dimr), [&] (int by){ + RAJA::loop(ctx, RAJA::RangeSegment(0, outer_Dimc), [&] (int bx){ + + //Request memory from shared memory pool + int * tile_ptr = ctx.getSharedMemory(TILE_DIM * TILE_DIM); + + //Use RAJA View for simplified indexing + RAJA::View> Tile(tile_ptr, TILE_DIM, TILE_DIM); + + RAJA::loop(ctx, RAJA::RangeSegment(0, TILE_DIM), [&] (int ty){ + RAJA::loop(ctx, RAJA::RangeSegment(0, TILE_DIM), [&] (int tx){ + + int col = bx * TILE_DIM + tx; // Matrix column index + int row = by * TILE_DIM + ty; // Matrix row index + + // Bounds check + if (row < N_r && col < N_c) { + Tile(ty,tx) = Aview(row, col); + } + + }); + }); + + //Barrier is needed to ensure all threads have written to Tile + ctx.teamSync(); + + RAJA::loop(ctx, RAJA::RangeSegment(0, TILE_DIM), [&] (int ty){ + RAJA::loop(ctx, RAJA::RangeSegment(0, TILE_DIM), [&] (int tx){ + + int col = bx * TILE_DIM + tx; // Matrix column index + int row = by * TILE_DIM + ty; // Matrix row index + + // Bounds check + if (row < N_r && col < N_c) { + Atview(col, row) = Tile(ty, tx); + } + + }); + }); + + //The launch context uses bump style allocator to return different segments of shared memory + //to avoid requesting beyond the pre-allocated memory quantity we reset the allocator offset counter + //effectively releasing shared memory. 
+ ctx.releaseSharedMemory(); + + }); + }); + + }); + + +#if defined(RAJA_ENABLE_HIP) + if(select_cpu_or_gpu == RAJA::ExecPlace::DEVICE) { + switch_ptrs(d_At, At); + switch_ptrs(d_A, A); + + hipErrchk(hipMemcpy( d_At, At, N_r * N_c * sizeof(int), hipMemcpyDeviceToHost )); + } +#endif + + + checkResult(Atview, N_c, N_r); + //----------------------------------------------------------------------------// + + return 0; +} + + +// +// Function to check result and report P/F. +// +template +void checkResult(RAJA::View> Atview, int N_r, int N_c) +{ + bool match = true; + for (int row = 0; row < N_r; ++row) { + for (int col = 0; col < N_c; ++col) { + if (Atview(row, col) != row) { + match = false; + } + } + } + if (match) { + std::cout << "\n\t result -- PASS\n"; + } else { + std::cout << "\n\t result -- FAIL\n"; + } +}; + +// +// Function to print result. +// +template +void printResult(RAJA::View> Atview, int N_r, int N_c) +{ + std::cout << std::endl; + for (int row = 0; row < N_r; ++row) { + for (int col = 0; col < N_c; ++col) { + //std::cout << "At(" << row << "," << col << ") = " << Atview(row, col) + //<< std::endl; + printf("%d ",Atview(row, col)); + } + std::cout << "" << std::endl; + } + std::cout << std::endl; +} diff --git a/examples/forall-param-reductions.cpp b/examples/forall-param-reductions.cpp new file mode 100644 index 0000000000..9bb1aa62cb --- /dev/null +++ b/examples/forall-param-reductions.cpp @@ -0,0 +1,336 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-22, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include +#include +#include + +#include "memoryManager.hpp" + +#include "RAJA/RAJA.hpp" + +/* + * Reduction Example + * + * This example illustrates use of the RAJA reduction types: min, max, + * sum, min-loc, and max-loc. + * + * RAJA features shown: + * - `forall` loop iteration template method + * - Index range segment + * - Execution policies + * - Reduction types + * + * If CUDA is enabled, CUDA unified memory is used. + */ + +/* + Specify the number of threads in a GPU thread block +*/ +#if defined(RAJA_ENABLE_CUDA) +constexpr int CUDA_BLOCK_SIZE = 256; +#endif + +#if defined(RAJA_ENABLE_HIP) +constexpr int HIP_BLOCK_SIZE = 256; +#endif + +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) +{ + + std::cout << "\n\nRAJA reductions example...\n"; + + // _reductions_array_init_start +// +// Define array length +// + constexpr int N = 1000000; + +// +// Allocate array data and initialize data to alternating sequence of 1, -1. 
+// + int* a = memoryManager::allocate(N); + + for (int i = 0; i < N; ++i) { + if ( i % 2 == 0 ) { + a[i] = 1; + } else { + a[i] = -1; + } + } + +// +// Set min and max loc values +// + constexpr int minloc_ref = N / 2; + a[minloc_ref] = -100; + + constexpr int maxloc_ref = N / 2 + 1; + a[maxloc_ref] = 100; + // _reductions_array_init_end + +// +// Note: with this data initialization scheme, the following results will +// be observed for all reduction kernels below: +// +// - the sum will be zero +// - the min will be -100 +// - the max will be 100 +// - the min loc will be N/2 +// - the max loc will be N/2 + 1 +// +// + +// +// Define index range for iterating over a elements in all examples +// + // _reductions_range_start + RAJA::TypedRangeSegment arange(0, N); + // _reductions_range_end + +// +// Define ValLoc Type +// + + using VALLOC_INT = RAJA::expt::ValLoc; +//----------------------------------------------------------------------------// + + std::cout << "\n Running RAJA sequential reductions...\n"; + + // _reductions_raja_seq_start + using EXEC_POL1 = RAJA::seq_exec; + + int seq_sum = 0; + int seq_min = std::numeric_limits::max(); + int seq_max = std::numeric_limits::min(); + VALLOC_INT seq_minloc(std::numeric_limits::max(), -1); + VALLOC_INT seq_maxloc(std::numeric_limits::min(), -1); + + RAJA::forall(arange, + RAJA::expt::Reduce(&seq_sum), + RAJA::expt::Reduce(&seq_min), + RAJA::expt::Reduce(&seq_max), + RAJA::expt::Reduce(&seq_minloc), + RAJA::expt::Reduce(&seq_maxloc), + [=](int i, int &_seq_sum, int &_seq_min, int &_seq_max, VALLOC_INT &_seq_minloc, VALLOC_INT &_seq_maxloc) { + _seq_sum += a[i]; + + _seq_min = RAJA_MIN(a[i], _seq_min); + _seq_max = RAJA_MAX(a[i], _seq_max); + + _seq_minloc = RAJA_MIN(VALLOC_INT(a[i], i), _seq_minloc); + _seq_maxloc = RAJA_MAX(VALLOC_INT(a[i], i), _seq_maxloc); + //_seq_minloc.min(a[i], i); + //_seq_maxloc.max(a[i], i); + // Note : RAJA::expt::ValLoc objects provide min() and max() methods + // that are equivalent to the assignments with RAJA_MIN and RAJA_MAX + // above. 
+ } + ); + + std::cout << "\tsum = " << seq_sum << std::endl; + std::cout << "\tmin = " << seq_min << std::endl; + std::cout << "\tmax = " << seq_max << std::endl; + std::cout << "\tmin, loc = " << seq_minloc.getVal() << " , " + << seq_minloc.getLoc() << std::endl; + std::cout << "\tmax, loc = " << seq_maxloc.getVal() << " , " + << seq_maxloc.getLoc() << std::endl; + // _reductions_raja_seq_end + + +//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_OPENMP) + std::cout << "\n Running RAJA OpenMP reductions...\n"; + + // _reductions_raja_omppolicy_start + using EXEC_POL2 = RAJA::omp_parallel_for_exec; + // _reductions_raja_omppolicy_end + + int omp_sum = 0; + int omp_min = std::numeric_limits::max(); + int omp_max = std::numeric_limits::min(); + VALLOC_INT omp_minloc(std::numeric_limits::max(), -1); + VALLOC_INT omp_maxloc(std::numeric_limits::min(), -1); + + RAJA::forall(arange, + RAJA::expt::Reduce(&omp_sum), + RAJA::expt::Reduce(&omp_min), + RAJA::expt::Reduce(&omp_max), + RAJA::expt::Reduce(&omp_minloc), + RAJA::expt::Reduce(&omp_maxloc), + [=](int i, int &_omp_sum, int &_omp_min, int &_omp_max, VALLOC_INT &_omp_minloc, VALLOC_INT &_omp_maxloc) { + _omp_sum += a[i]; + + _omp_min = RAJA_MIN(a[i], _omp_min); + _omp_max = RAJA_MAX(a[i], _omp_max); + + _omp_minloc = RAJA_MIN(VALLOC_INT(a[i], i), _omp_minloc); + _omp_maxloc = RAJA_MAX(VALLOC_INT(a[i], i), _omp_maxloc); + //_omp_minloc.min(a[i], i); + //_omp_maxloc.max(a[i], i); + } + ); + + std::cout << "\tsum = " << omp_sum << std::endl; + std::cout << "\tmin = " << omp_min << std::endl; + std::cout << "\tmax = " << omp_max << std::endl; + std::cout << "\tmin, loc = " << omp_minloc.getVal() << " , " + << omp_minloc.getLoc() << std::endl; + std::cout << "\tmax, loc = " << omp_maxloc.getVal() << " , " + << omp_maxloc.getLoc() << std::endl; + +#endif + +//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_TARGET_OPENMP) + std::cout << "\n Running RAJA OpenMP Target reductions...\n"; + + // _reductions_raja_omppolicy_start + using EXEC_POL3 = RAJA::omp_target_parallel_for_exec_nt; + // _reductions_raja_omppolicy_end + + int omp_t_sum = 0; + int omp_t_min = std::numeric_limits::max(); + int omp_t_max = std::numeric_limits::min(); + VALLOC_INT omp_t_minloc(std::numeric_limits::max(), -1); + VALLOC_INT omp_t_maxloc(std::numeric_limits::min(), -1); + + RAJA::forall(arange, + RAJA::expt::Reduce(&omp_t_sum), + RAJA::expt::Reduce(&omp_t_min), + RAJA::expt::Reduce(&omp_t_max), + RAJA::expt::Reduce(&omp_t_minloc), + RAJA::expt::Reduce(&omp_t_maxloc), + [=](int i, int &_omp_t_sum, int &_omp_t_min, int &_omp_t_max, VALLOC_INT &_omp_t_minloc, VALLOC_INT &_omp_t_maxloc) { + _omp_t_sum += a[i]; + + _omp_t_min = RAJA_MIN(a[i], _omp_t_min); + _omp_t_max = RAJA_MAX(a[i], _omp_t_max); + + _omp_t_minloc = RAJA_MIN(VALLOC_INT(a[i], i), _omp_t_minloc); + _omp_t_maxloc = RAJA_MAX(VALLOC_INT(a[i], i), _omp_t_maxloc); + //_omp_t_minloc.min(a[i], i); + //_omp_t_maxloc.max(a[i], i); + } + ); + + std::cout << "\tsum = " << omp_t_sum << std::endl; + std::cout << "\tmin = " << omp_t_min << std::endl; + std::cout << "\tmax = " << omp_t_max << std::endl; + std::cout << "\tmin, loc = " << omp_t_minloc.getVal() << " , " + << omp_t_minloc.getLoc() << std::endl; + std::cout << "\tmax, loc = " << omp_t_maxloc.getVal() << " , " + << omp_t_maxloc.getLoc() << std::endl; + +#endif + + +//----------------------------------------------------------------------------// + +#if 
defined(RAJA_ENABLE_CUDA) + std::cout << "\n Running RAJA CUDA reductions...\n"; + + // _reductions_raja_cudapolicy_start + using EXEC_POL3 = RAJA::cuda_exec; + // _reductions_raja_cudapolicy_end + + int cuda_sum = 0; + int cuda_min = std::numeric_limits::max(); + int cuda_max = std::numeric_limits::min(); + VALLOC_INT cuda_minloc(std::numeric_limits::max(), -1); + VALLOC_INT cuda_maxloc(std::numeric_limits::min(), -1); + + RAJA::forall(arange, + RAJA::expt::Reduce(&cuda_sum), + RAJA::expt::Reduce(&cuda_min), + RAJA::expt::Reduce(&cuda_max), + RAJA::expt::Reduce(&cuda_minloc), + RAJA::expt::Reduce(&cuda_maxloc), + [=] RAJA_DEVICE (int i, int &_cuda_sum, int &_cuda_min, int &_cuda_max, VALLOC_INT &_cuda_minloc, VALLOC_INT &_cuda_maxloc) { + _cuda_sum += a[i]; + + _cuda_min = RAJA_MIN(a[i], _cuda_min); + _cuda_max = RAJA_MAX(a[i], _cuda_max); + + _cuda_minloc = RAJA_MIN(VALLOC_INT(a[i], i), _cuda_minloc); + _cuda_maxloc = RAJA_MAX(VALLOC_INT(a[i], i), _cuda_maxloc); + //_cuda_minloc.min(a[i], i); + //_cuda_maxloc.max(a[i], i); + } + ); + + std::cout << "\tsum = " << cuda_sum << std::endl; + std::cout << "\tmin = " << cuda_min << std::endl; + std::cout << "\tmax = " << cuda_max << std::endl; + std::cout << "\tmin, loc = " << cuda_minloc.getVal() << " , " + << cuda_minloc.getLoc() << std::endl; + std::cout << "\tmax, loc = " << cuda_maxloc.getVal() << " , " + << cuda_maxloc.getLoc() << std::endl; + +#endif + +//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_HIP) + std::cout << "\n Running RAJA HIP reductions...\n"; + + int* d_a = memoryManager::allocate_gpu(N); + hipErrchk(hipMemcpy( d_a, a, N * sizeof(int), hipMemcpyHostToDevice )); + + // _reductions_raja_hippolicy_start + using EXEC_POL3 = RAJA::hip_exec; + // _reductions_raja_hippolicy_end + + int hip_sum = 0; + int hip_min = std::numeric_limits::max(); + int hip_max = std::numeric_limits::min(); + VALLOC_INT hip_minloc(std::numeric_limits::max(), -1); + VALLOC_INT hip_maxloc(std::numeric_limits::min(), -1); + + RAJA::forall(arange, + RAJA::expt::Reduce(&hip_sum), + RAJA::expt::Reduce(&hip_min), + RAJA::expt::Reduce(&hip_max), + RAJA::expt::Reduce(&hip_minloc), + RAJA::expt::Reduce(&hip_maxloc), + [=] RAJA_DEVICE (int i, int &_hip_sum, int &_hip_min, int &_hip_max, VALLOC_INT &_hip_minloc, VALLOC_INT &_hip_maxloc) { + _hip_sum += d_a[i]; + + _hip_min = RAJA_MIN(d_a[i], _hip_min); + _hip_max = RAJA_MAX(d_a[i], _hip_max); + + _hip_minloc = RAJA_MIN(VALLOC_INT(d_a[i], i), _hip_minloc); + _hip_maxloc = RAJA_MAX(VALLOC_INT(d_a[i], i), _hip_maxloc); + //_hip_minloc.min(d_a[i], i); + //_hip_maxloc.max(d_a[i], i); + } + ); + + std::cout << "\tsum = " << hip_sum << std::endl; + std::cout << "\tmin = " << hip_min << std::endl; + std::cout << "\tmax = " << hip_max << std::endl; + std::cout << "\tmin, loc = " << hip_minloc.getVal() << " , " + << hip_minloc.getLoc() << std::endl; + std::cout << "\tmax, loc = " << hip_maxloc.getVal() << " , " + << hip_maxloc.getLoc() << std::endl; + + memoryManager::deallocate_gpu(d_a); +#endif + +//----------------------------------------------------------------------------// + +// +// Clean up. 
+// + memoryManager::deallocate(a); + + std::cout << "\n DONE!...\n"; + + return 0; +} diff --git a/examples/teams_flatten.cpp b/examples/launch_flatten.cpp similarity index 64% rename from examples/teams_flatten.cpp rename to examples/launch_flatten.cpp index 9ea516e5a3..c8171126c1 100644 --- a/examples/teams_flatten.cpp +++ b/examples/launch_flatten.cpp @@ -14,10 +14,10 @@ #include "RAJA/RAJA.hpp" /* - * Thread Flatten Example using RAJA Teams + * Thread Flatten Example using RAJA Launch * * This example illustrates use of the "flatten" - * policy inside RAJA Teams + * policy inside RAJA Launch * * The flatten policy enables reshaping * multi-dimensional thread teams to 1D @@ -34,16 +34,16 @@ */ #if defined(RAJA_ENABLE_CUDA) -using device_launch = RAJA::expt::LaunchPolicy>; -using device_inner_pol0 = RAJA::expt::LoopPolicy; -using device_inner_pol1 = RAJA::expt::LoopPolicy; -using device_flatten_pol = RAJA::expt::LoopPolicy; +using device_launch = RAJA::LaunchPolicy>; +using device_inner_pol0 = RAJA::LoopPolicy; +using device_inner_pol1 = RAJA::LoopPolicy; +using device_flatten_pol = RAJA::LoopPolicy; using reduce_policy = RAJA::cuda_reduce; #elif defined(RAJA_ENABLE_HIP) -using device_launch = RAJA::expt::LaunchPolicy>; -using device_inner_pol0 = RAJA::expt::LoopPolicy; -using device_inner_pol1 = RAJA::expt::LoopPolicy; -using device_flatten_pol = RAJA::expt::LoopPolicy; +using device_launch = RAJA::LaunchPolicy>; +using device_inner_pol0 = RAJA::LoopPolicy; +using device_inner_pol1 = RAJA::LoopPolicy; +using device_flatten_pol = RAJA::LoopPolicy; using reduce_policy = RAJA::hip_reduce; #endif @@ -51,8 +51,8 @@ using reduce_policy = RAJA::hip_reduce; * Define device launch policies */ -using host_launch = RAJA::expt::LaunchPolicy; -using host_loop = RAJA::expt::LoopPolicy; +using host_launch = RAJA::LaunchPolicy; +using host_loop = RAJA::LoopPolicy; int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) { @@ -68,9 +68,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Configure grid size // - RAJA::expt::Grid grid(RAJA::expt::Teams(1), - RAJA::expt::Threads(N, N), - "Teams Flatten Kernel"); + RAJA::LaunchParams launch_params(RAJA::Teams(1), + RAJA::Threads(N, N)); + // // Resource object for host, used to allocate memory @@ -97,13 +97,12 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::View> d_A_2DView(d_A_ptr, N, N); RAJA::View> d_A_1DView(d_A_ptr, NN); - - RAJA::expt::launch - (grid, [=] RAJA_HOST_DEVICE (RAJA::expt::LaunchContext ctx) + RAJA::launch + (launch_params, [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, N), [&] (int j) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, N), [&] (int i) { + RAJA::loop(ctx, RAJA::RangeSegment(0, N), [&] (int j) { + RAJA::loop(ctx, RAJA::RangeSegment(0, N), [&] (int i) { d_A_2DView(j, i) = i + j; }); }); @@ -112,7 +111,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // RAJA flatten policy will reshape a 2/3D thread team to 1D simplifying // accumulating memory contents - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, NN), [&] (int i) { + RAJA::loop(ctx, RAJA::RangeSegment(0, NN), [&] (int i) { device_kernel_sum += d_A_1DView(i); }); @@ -126,12 +125,12 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::View> h_A_2DView(h_A_ptr, N, N); RAJA::View> h_A_1DView(h_A_ptr, NN); - RAJA::expt::launch - (grid, [=] RAJA_HOST_DEVICE (RAJA::expt::LaunchContext ctx) + RAJA::launch + (launch_params, 
[=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, N), [&] (int j) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, N), [&] (int i) { + RAJA::loop(ctx, RAJA::RangeSegment(0, N), [&] (int j) { + RAJA::loop(ctx, RAJA::RangeSegment(0, N), [&] (int i) { h_A_2DView(j, i) = i + j; }); }); @@ -140,7 +139,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) //As loops are dispatched as standard C loops we can revert to using //a regular loop_exec policy - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, NN), [&] (int i) { + RAJA::loop(ctx, RAJA::RangeSegment(0, NN), [&] (int i) { host_kernel_sum += h_A_1DView(i); }); diff --git a/examples/teams_matrix-multiply.cpp b/examples/launch_matrix-multiply.cpp similarity index 77% rename from examples/teams_matrix-multiply.cpp rename to examples/launch_matrix-multiply.cpp index 01282ddce5..bf2f403573 100644 --- a/examples/teams_matrix-multiply.cpp +++ b/examples/launch_matrix-multiply.cpp @@ -15,15 +15,15 @@ #include "RAJA/RAJA.hpp" /* - * Matrix Multiplication Examples using RAJA Teams + * Matrix Multiplication Examples using RAJA Launch * * Example computes the product of two square matrices and introduces - * RAJA Teams loop capabilities via a sequence of implementations. + * RAJA Launch loop capabilities via a sequence of implementations. * * RAJA features shown: * - Index range segment * - View abstraction - * - Basic usage of 'RAJA Teams' abstractions for nested loops + * - Basic usage of 'RAJA Launch' abstractions for nested loops * * If CUDA is enabled, CUDA unified memory is used. */ @@ -37,15 +37,15 @@ /* * Define host/device launch policies */ -using launch_policy = RAJA::expt::LaunchPolicy< - RAJA::expt::seq_launch_t +using launch_policy = RAJA::LaunchPolicy< + RAJA::seq_launch_t #if defined(RAJA_ENABLE_CUDA) , - RAJA::expt::cuda_launch_t + RAJA::cuda_launch_t #endif #if defined(RAJA_ENABLE_HIP) , - RAJA::expt::hip_launch_t + RAJA::hip_launch_t #endif >; @@ -56,9 +56,9 @@ using gpu_block_x_policy = RAJA::cuda_block_x_direct; using gpu_block_y_policy = RAJA::cuda_block_y_direct; using gpu_thread_x_policy = RAJA::cuda_thread_x_loop; using gpu_thread_y_policy = RAJA::cuda_thread_y_loop; -using gpu_global_thread_x_policy = RAJA::expt::cuda_global_thread_x; -using gpu_global_thread_y_policy = RAJA::expt::cuda_global_thread_y; -using gpu_global_thread_xy_policy = RAJA::expt::cuda_global_thread_xy; +using gpu_global_thread_x_policy = RAJA::cuda_global_thread_x; +using gpu_global_thread_y_policy = RAJA::cuda_global_thread_y; +using gpu_global_thread_xy_policy = RAJA::cuda_global_thread_xy; #endif #if defined(RAJA_ENABLE_HIP) @@ -66,56 +66,50 @@ using gpu_block_x_policy = RAJA::hip_block_x_direct; using gpu_block_y_policy = RAJA::hip_block_y_direct; using gpu_thread_x_policy = RAJA::hip_thread_x_loop; using gpu_thread_y_policy = RAJA::hip_thread_y_loop; -using gpu_global_thread_x_policy = RAJA::expt::hip_global_thread_x; -using gpu_global_thread_y_policy = RAJA::expt::hip_global_thread_y; -using gpu_global_thread_xy_policy = RAJA::expt::hip_global_thread_xy; +using gpu_global_thread_x_policy = RAJA::hip_global_thread_x; +using gpu_global_thread_y_policy = RAJA::hip_global_thread_y; +using gpu_global_thread_xy_policy = RAJA::hip_global_thread_xy; #endif /* Define RAJA Team/Thread policies, if a device is available add a device policy. 
*/ -using teams_x = RAJA::expt::LoopPolicy; + >; -using teams_y = RAJA::expt::LoopPolicy; + >; -using threads_x = RAJA::expt::LoopPolicy; + >; -using threads_y = RAJA::expt::LoopPolicy; + >; -using global_thread_x = RAJA::expt::LoopPolicy; + >; -using global_thread_y = RAJA::expt::LoopPolicy; + >; // // Define dimensionality of matrices. @@ -320,13 +314,13 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) //two for loops. // _matmult_basickernel_start - RAJA::expt::launch(RAJA::expt::HOST, - RAJA::expt::Grid(RAJA::expt::Teams(NTeams,NTeams), - RAJA::expt::Threads(THREAD_SZ,THREAD_SZ)), - [=] RAJA_HOST_DEVICE(RAJA::expt::LaunchContext ctx) { + RAJA::launch(RAJA::ExecPlace::HOST, + RAJA::LaunchParams(RAJA::Teams(NTeams,NTeams), + RAJA::Threads(THREAD_SZ,THREAD_SZ)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { - RAJA::expt::loop(ctx, col_range, [&] (int col) { - RAJA::expt::loop(ctx, row_range, [&] (int row) { + RAJA::loop(ctx, col_range, [&] (int col) { + RAJA::loop(ctx, row_range, [&] (int row) { double dot = 0.0; for (int k = 0; k < N; ++k) { @@ -355,17 +349,17 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) //recompiling execution policies. When running exclusively on the host //the compute grid may be left uninitialized as loop methods get expanded to //standard C style loops. - using omp_launch_policy = RAJA::expt::LaunchPolicy; + using omp_launch_policy = RAJA::LaunchPolicy; - using omp_col_policy0 = RAJA::expt::LoopPolicy; + using omp_col_policy0 = RAJA::LoopPolicy; - using omp_row_policy0 = RAJA::expt::LoopPolicy; + using omp_row_policy0 = RAJA::LoopPolicy; - RAJA::expt::launch(RAJA::expt::Grid(), - [=] RAJA_HOST_DEVICE(RAJA::expt::LaunchContext ctx) { + RAJA::launch(RAJA::LaunchParams(), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { - RAJA::expt::loop(ctx, col_range, [&] (int col) { - RAJA::expt::loop(ctx, row_range, [&] (int row) { + RAJA::loop(ctx, col_range, [&] (int col) { + RAJA::loop(ctx, row_range, [&] (int row) { double dot = 0.0; for (int k = 0; k < N; ++k) { @@ -391,11 +385,11 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // This is the same as using an OpenMP 'parallel for' directive on the // outer loop with a 'collapse(2) clause. // - using global_thread_xy = RAJA::expt::LoopPolicy; + using global_thread_xy = RAJA::LoopPolicy; - RAJA::expt::launch(RAJA::expt::HOST, - RAJA::expt::Grid(), - [=] RAJA_HOST_DEVICE(RAJA::expt::LaunchContext ctx) { + RAJA::launch(RAJA::ExecPlace::HOST, + RAJA::LaunchParams(), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { RAJA::expt::loop(ctx, col_range, row_range, [&] (int col, int row) { @@ -431,13 +425,13 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // and col = threadIdx.x in the kernel. // // - RAJA::expt::launch(RAJA::expt::DEVICE, - RAJA::expt::Grid(RAJA::expt::Teams(N), - RAJA::expt::Threads(N)), - [=] RAJA_HOST_DEVICE(RAJA::expt::LaunchContext ctx) { + RAJA::launch(RAJA::ExecPlace::DEVICE, + RAJA::LaunchParams(RAJA::Teams(N), + RAJA::Threads(N)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { - RAJA::expt::loop(ctx, col_range, [&] (int col) { - RAJA::expt::loop(ctx, row_range, [&] (int row) { + RAJA::loop(ctx, col_range, [&] (int col) { + RAJA::loop(ctx, row_range, [&] (int row) { double dot = 0.0; for (int k = 0; k < N; ++k) { @@ -467,18 +461,18 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // The tiling capabilities in RAJA will also mask out of bounds iterations. 
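A minimal, host-only sketch of the RAJA::tile / RAJA::loop pattern the tiled kernels below rely on, assuming a RAJA build in which launch has been promoted out of the expt namespace (the point of this change); the policy aliases, the 10-element range, and the tile size of 4 are illustrative choices, not part of the example files. Because 10 is not a multiple of 4, the final tile is shortened automatically, which is the out-of-bounds masking the comment above refers to.

#include "RAJA/RAJA.hpp"
#include <cstdio>

// Host-only policies; the example files add CUDA/HIP alternatives.
using launch_pol = RAJA::LaunchPolicy<RAJA::seq_launch_t>;
using tile_pol   = RAJA::LoopPolicy<RAJA::loop_exec>;
using loop_pol   = RAJA::LoopPolicy<RAJA::loop_exec>;

int main()
{
  constexpr int N = 10;       // not a multiple of the tile size
  constexpr int TILE_SZ = 4;  // RAJA::tile trims the last, partial tile

  RAJA::launch<launch_pol>(
    RAJA::LaunchParams(),  // grid may stay empty for host-only execution
    [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) {

      RAJA::tile<tile_pol>(ctx, TILE_SZ, RAJA::RangeSegment(0, N),
        [&] (RAJA::RangeSegment const &tile) {

          // Each tile is itself a segment; iterate over its entries.
          RAJA::loop<loop_pol>(ctx, tile, [&] (int i) {
            printf("tile covering i = %d\n", i);
          });
        });
    });

  return 0;
}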
// - RAJA::expt::launch(RAJA::expt::DEVICE, - RAJA::expt::Grid(RAJA::expt::Teams(NTeams,NTeams), - RAJA::expt::Threads(THREAD_SZ,THREAD_SZ)), - [=] RAJA_HOST_DEVICE(RAJA::expt::LaunchContext ctx) { + RAJA::launch(RAJA::ExecPlace::DEVICE, + RAJA::LaunchParams(RAJA::Teams(NTeams,NTeams), + RAJA::Threads(THREAD_SZ,THREAD_SZ)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { - RAJA::expt::tile + RAJA::tile (ctx, THREAD_SZ, row_range, [&] (RAJA::RangeSegment const &row_tile) { - RAJA::expt::tile + RAJA::tile (ctx, THREAD_SZ, col_range, [&] (RAJA::RangeSegment const &col_tile) { - RAJA::expt::loop(ctx, row_tile, [&] (int col) { - RAJA::expt::loop(ctx, col_tile, [&] (int row) { + RAJA::loop(ctx, row_tile, [&] (int col) { + RAJA::loop(ctx, col_tile, [&] (int row) { double dot = 0.0; for (int k = 0; k < N; ++k) { @@ -527,13 +521,13 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // and col = threadIdx.x in the kernel. // // - RAJA::expt::launch(RAJA::expt::DEVICE, - RAJA::expt::Grid(RAJA::expt::Teams(N), - RAJA::expt::Threads(N)), - [=] RAJA_HOST_DEVICE(RAJA::expt::LaunchContext ctx) { + RAJA::launch(RAJA::ExecPlace::DEVICE, + RAJA::LaunchParams(RAJA::Teams(N), + RAJA::Threads(N)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { - RAJA::expt::loop(ctx, col_range, [&] (int col) { - RAJA::expt::loop(ctx, row_range, [&] (int row) { + RAJA::loop(ctx, col_range, [&] (int col) { + RAJA::loop(ctx, row_range, [&] (int row) { double dot = 0.0; for (int k = 0; k < N; ++k) { @@ -567,18 +561,18 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // The tiling capabilities in RAJA will also mask out of bounds iterations. // - RAJA::expt::launch(RAJA::expt::DEVICE, - RAJA::expt::Grid(RAJA::expt::Teams(NTeams,NTeams), - RAJA::expt::Threads(THREAD_SZ,THREAD_SZ)), - [=] RAJA_HOST_DEVICE(RAJA::expt::LaunchContext ctx) { + RAJA::launch(RAJA::ExecPlace::DEVICE, + RAJA::LaunchParams(RAJA::Teams(NTeams,NTeams), + RAJA::Threads(THREAD_SZ,THREAD_SZ)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { - RAJA::expt::tile + RAJA::tile (ctx, THREAD_SZ, row_range, [&] (RAJA::RangeSegment const &row_tile) { - RAJA::expt::tile + RAJA::tile (ctx, THREAD_SZ, col_range, [&] (RAJA::RangeSegment const &col_tile) { - RAJA::expt::loop(ctx, row_tile, [&] (int col) { - RAJA::expt::loop(ctx, col_tile, [&] (int row) { + RAJA::loop(ctx, row_tile, [&] (int col) { + RAJA::loop(ctx, col_tile, [&] (int row) { double dot = 0.0; for (int k = 0; k < N; ++k) { @@ -604,7 +598,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::memset(C, 0, N*N * sizeof(double)); - using seq_loop = RAJA::expt::LoopPolicy; + using seq_loop = RAJA::LoopPolicy; // // This example builds on the RAJA tiling capabilies presented earlier @@ -616,49 +610,49 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // This example also uses the teamSync() method in the launch context // to add a barrier ensuring all threads have loaded/read from shared memory // - RAJA::expt::launch(RAJA::expt::DEVICE, - RAJA::expt::Grid(RAJA::expt::Teams(NTeams,NTeams), - RAJA::expt::Threads(THREAD_SZ,THREAD_SZ)), - [=] RAJA_HOST_DEVICE(RAJA::expt::LaunchContext ctx) { + RAJA::launch(RAJA::ExecPlace::DEVICE, + RAJA::LaunchParams(RAJA::Teams(NTeams,NTeams), + RAJA::Threads(THREAD_SZ,THREAD_SZ)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { // // Loop over teams // - RAJA::expt::tile + RAJA::tile (ctx, THREAD_SZ, row_range, [&] (RAJA::RangeSegment const &y_tile) { - RAJA::expt::tile + RAJA::tile (ctx, 
THREAD_SZ, col_range, [&] (RAJA::RangeSegment const &x_tile) { RAJA_TEAM_SHARED double As[THREAD_SZ][THREAD_SZ]; RAJA_TEAM_SHARED double Bs[THREAD_SZ][THREAD_SZ]; RAJA_TEAM_SHARED double Cs[THREAD_SZ][THREAD_SZ]; - RAJA::expt::loop_icount(ctx, y_tile, [&](int row, int ty) { - RAJA::expt::loop_icount(ctx, x_tile, [&](int col, int tx) { + RAJA::loop_icount(ctx, y_tile, [&](int row, int ty) { + RAJA::loop_icount(ctx, x_tile, [&](int col, int tx) { Cs[ty][tx] = 0.0; }); }); - RAJA::expt::tile + RAJA::tile (ctx, THREAD_SZ, dot_range, [&] (RAJA::RangeSegment const &k_tile) { - RAJA::expt::loop_icount(ctx, y_tile, [&](int row, int ty) { - RAJA::expt::loop_icount(ctx, k_tile, [&](int k_id, int tx) { + RAJA::loop_icount(ctx, y_tile, [&](int row, int ty) { + RAJA::loop_icount(ctx, k_tile, [&](int k_id, int tx) { As[ty][tx] = Aview(row,k_id); }); }); - RAJA::expt::loop_icount(ctx, k_tile, [&](int k_id, int ty) { - RAJA::expt::loop_icount(ctx, x_tile, [&](int col, int tx) { + RAJA::loop_icount(ctx, k_tile, [&](int k_id, int ty) { + RAJA::loop_icount(ctx, x_tile, [&](int col, int tx) { Bs[ty][tx] = Bview(k_id,col); }); }); ctx.teamSync(); - RAJA::expt::loop_icount(ctx, y_tile, [&](int row, int ty) { - RAJA::expt::loop_icount(ctx, x_tile, [&](int col, int tx) { + RAJA::loop_icount(ctx, y_tile, [&](int row, int ty) { + RAJA::loop_icount(ctx, x_tile, [&](int col, int tx) { - RAJA::expt::loop_icount(ctx, k_tile, [&] (int gid, int e) { + RAJA::loop_icount(ctx, k_tile, [&] (int gid, int e) { Cs[ty][tx] += As[ty][e] * Bs[e][tx]; }); @@ -669,8 +663,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) }); // slide across matrix - RAJA::expt::loop_icount(ctx, y_tile, [&](int row, int ty) { - RAJA::expt::loop_icount(ctx, x_tile, [&](int col, int tx) { + RAJA::loop_icount(ctx, y_tile, [&](int row, int ty) { + RAJA::loop_icount(ctx, x_tile, [&](int col, int tx) { Cview(col,row) = Cs[ty][tx]; }); }); diff --git a/examples/teams_reductions.cpp b/examples/launch_reductions.cpp similarity index 73% rename from examples/teams_reductions.cpp rename to examples/launch_reductions.cpp index 5d2a4d6cbf..2929bf5075 100644 --- a/examples/teams_reductions.cpp +++ b/examples/launch_reductions.cpp @@ -14,7 +14,7 @@ #include "RAJA/RAJA.hpp" /* - * Reduction Example using RAJA Teams + * Reduction Example using RAJA Launch * * This example illustrates use of the RAJA reduction types: min, max, * sum, min-loc, and max-loc. 
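A minimal, host-only sketch of how the reducers named in the comment above are used from inside RAJA::launch, assuming seq_launch_t, loop_exec, and seq_reduce are available in the build; the alias names, array contents, and initial values are illustrative.

#include "RAJA/RAJA.hpp"
#include <cstdio>

using launch_pol = RAJA::LaunchPolicy<RAJA::seq_launch_t>;
using loop_pol   = RAJA::LoopPolicy<RAJA::loop_exec>;
using reduce_pol = RAJA::seq_reduce;

int main()
{
  constexpr int N = 100;
  int* a = new int[N];
  for (int i = 0; i < N; ++i) { a[i] = i - N/2; }

  // Reducers are captured by value and read back on the host after the kernel.
  RAJA::ReduceSum<reduce_pol, int> kernel_sum(0);
  RAJA::ReduceMin<reduce_pol, int> kernel_min(N);

  RAJA::launch<launch_pol>(
    RAJA::LaunchParams(),
    [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) {
      RAJA::loop<loop_pol>(ctx, RAJA::RangeSegment(0, N), [&] (int i) {
        kernel_sum += a[i];
        kernel_min.min(a[i]);
      });
    });

  printf("sum = %d, min = %d\n", kernel_sum.get(), kernel_min.get());
  delete[] a;
  return 0;
}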
@@ -28,32 +28,32 @@ */ #if defined(RAJA_ENABLE_OPENMP) -using host_launch = RAJA::expt::omp_launch_t; +using host_launch = RAJA::omp_launch_t; using host_loop = RAJA::omp_for_exec; #else -using host_launch = RAJA::expt::seq_launch_t; +using host_launch = RAJA::seq_launch_t; using host_loop = RAJA::loop_exec; #endif #if defined(RAJA_ENABLE_CUDA) -using device_launch = RAJA::expt::cuda_launch_t; -using device_loop = RAJA::expt::cuda_global_thread_x; +using device_launch = RAJA::cuda_launch_t; +using device_loop = RAJA::cuda_global_thread_x; #elif defined(RAJA_ENABLE_HIP) -using device_launch = RAJA::expt::hip_launch_t; -using device_loop = RAJA::expt::hip_global_thread_x; +using device_launch = RAJA::hip_launch_t; +using device_loop = RAJA::hip_global_thread_x; #endif -using launch_policy = RAJA::expt::LaunchPolicy; + >; -using loop_pol = RAJA::expt::LoopPolicy; + >; #if defined(RAJA_ENABLE_CUDA) using reduce_policy = RAJA::cuda_reduce; @@ -70,25 +70,25 @@ int main(int argc, char *argv[]) { if(argc != 2) { - RAJA_ABORT_OR_THROW("Usage ./teams_reductions host or ./tut_reductions device"); + RAJA_ABORT_OR_THROW("Usage ./launch_reductions host or ./launch_reductions device"); } // // Run time policy section is demonstrated in this example by specifying // kernel exection space as a command line argument (host or device). - // Example usage ./teams_reductions host or ./teams_reductions device + // Example usage ./launch_reductions host or ./launch_reductions device // std::string exec_space = argv[1]; if(!(exec_space.compare("host") == 0 || exec_space.compare("device") == 0 )){ - RAJA_ABORT_OR_THROW("Usage ./teams_reductions host or ./teams_reductions device"); + RAJA_ABORT_OR_THROW("Usage ./launch_reductions host or ./launch_reductions device"); return 0; } - RAJA::expt::ExecPlace select_cpu_or_gpu; + RAJA::ExecPlace select_cpu_or_gpu; if(exec_space.compare("host") == 0) - { select_cpu_or_gpu = RAJA::expt::HOST; printf("Running RAJA-Teams reductions example on the host \n"); } + { select_cpu_or_gpu = RAJA::ExecPlace::HOST; printf("Running RAJA-Launch reductions example on the host \n"); } if(exec_space.compare("device") == 0) - { select_cpu_or_gpu = RAJA::expt::DEVICE; printf("Running RAJA-Teams reductions example on the device \n"); } + { select_cpu_or_gpu = RAJA::ExecPlace::DEVICE; printf("Running RAJA-Launch reductions example on the device \n"); } // _reductions_array_init_start // @@ -149,15 +149,15 @@ int main(int argc, char *argv[]) const int TEAM_SZ = 256; const int GRID_SZ = RAJA_DIVIDE_CEILING_INT(N,TEAM_SZ); - RAJA::expt::launch + RAJA::launch (select_cpu_or_gpu, - RAJA::expt::Grid(RAJA::expt::Teams(GRID_SZ), - RAJA::expt::Threads(TEAM_SZ), - "Reduction Kernel"), - [=] RAJA_HOST_DEVICE(RAJA::expt::LaunchContext ctx) + RAJA::LaunchParams(RAJA::Teams(GRID_SZ), + RAJA::Threads(TEAM_SZ)), + "Launch Reductions", + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { - RAJA::expt::loop(ctx, arange, [&] (int i) { + RAJA::loop(ctx, arange, [&] (int i) { kernel_sum += a[i]; diff --git a/examples/raja-teams.cpp b/examples/raja-launch.cpp similarity index 73% rename from examples/raja-teams.cpp rename to examples/raja-launch.cpp index 4c54221b0f..24ed83dda7 100644 --- a/examples/raja-teams.cpp +++ b/examples/raja-launch.cpp @@ -15,9 +15,9 @@ /* - * RAJA Teams Example: Upper Triangular Pattern + Shared Memory + * RAJA Launch Example: Upper Triangular Pattern + Shared Memory * - * Teams introduces hierarchal parallelism through the concept of + * Launch introduces hierarchical parallelism through the 
concept of * teams and threads. Computation is executed in a pre-defined grid * composed of threads and grouped into teams. The teams model enables * developers to express parallelism through loops over teams, and inner loops @@ -34,19 +34,19 @@ /* * Define host/device launch policies */ -using launch_policy = RAJA::expt::LaunchPolicy< +using launch_policy = RAJA::LaunchPolicy< #if defined(RAJA_ENABLE_OPENMP) - RAJA::expt::omp_launch_t + RAJA::omp_launch_t #else - RAJA::expt::seq_launch_t + RAJA::seq_launch_t #endif #if defined(RAJA_ENABLE_CUDA) , - RAJA::expt::cuda_launch_t + RAJA::cuda_launch_t #endif #if defined(RAJA_ENABLE_HIP) , - RAJA::expt::hip_launch_t + RAJA::hip_launch_t #endif >; @@ -54,7 +54,7 @@ using launch_policy = RAJA::expt::LaunchPolicy< * Define team policies. * Up to 3 dimension are supported: x,y,z */ -using teams_x = RAJA::expt::LoopPolicy< +using teams_x = RAJA::LoopPolicy< #if defined(RAJA_ENABLE_OPENMP) RAJA::omp_parallel_for_exec #else @@ -73,7 +73,7 @@ using teams_x = RAJA::expt::LoopPolicy< * Define thread policies. * Up to 3 dimension are supported: x,y,z */ -using threads_x = RAJA::expt::LoopPolicy(N_tri * N_tri); } -#if defined(RAJA_DEVICE_ACTIVE) - if (select_cpu_or_gpu == RAJA::expt::DEVICE) { +#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) + if (select_cpu_or_gpu == RAJA::ExecPlace::DEVICE) { Ddat = device_res.allocate(N_tri * N_tri); } #endif /* - * RAJA::expt::launch just starts a "kernel" and doesn't provide any looping. + * RAJA::launch just starts a "kernel" and doesn't provide any looping. * * The first argument determines which policy should be executed, * @@ -144,7 +141,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) * and is used to perform thread synchronizations within a team. 
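The call shape this comment describes, condensed into a host-only sketch: the first argument picks the execution place, LaunchParams sets the team/thread grid, and the LaunchContext supplies teamSync() for barriers within a team. Only a host policy is wired in here, and the alias names and sizes are illustrative.

#include "RAJA/RAJA.hpp"
#include <cstdio>

using launch_pol = RAJA::LaunchPolicy<RAJA::seq_launch_t>;
using team_pol   = RAJA::LoopPolicy<RAJA::loop_exec>;
using thread_pol = RAJA::LoopPolicy<RAJA::loop_exec>;

int main()
{
  constexpr int NT = 4;

  RAJA::launch<launch_pol>(
    RAJA::ExecPlace::HOST,                                   // where to run
    RAJA::LaunchParams(RAJA::Teams(NT), RAJA::Threads(NT)),  // compute grid
    [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) {

      RAJA::loop<team_pol>(ctx, RAJA::RangeSegment(0, NT), [&] (int team) {

        RAJA_TEAM_SHARED int s_val[1];   // visible to all threads of a team

        RAJA::loop<thread_pol>(ctx, RAJA::RangeSegment(0, 1), [&] (int t) {
          s_val[t] = team;
        });

        ctx.teamSync();                  // barrier before other threads read it

        RAJA::loop<thread_pol>(ctx, RAJA::RangeSegment(0, NT), [&] (int t) {
          printf("team %d, thread %d sees s_val = %d\n", team, t, s_val[0]);
        });
      });
    });

  return 0;
}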
*/ - if (select_cpu_or_gpu == RAJA::expt::HOST){ + if (select_cpu_or_gpu == RAJA::ExecPlace::HOST){ std::cout << "\n Running upper triangular pattern example on the host...\n"; }else { std::cout << "\n Running upper triangular pattern example on the device...\n"; @@ -153,35 +150,37 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::View> D(Ddat, N_tri, N_tri); - RAJA::expt::launch(select_cpu_or_gpu, - RAJA::expt::Grid(RAJA::expt::Teams(N_tri), RAJA::expt::Threads(N_tri)), - [=] RAJA_HOST_DEVICE(RAJA::expt::LaunchContext ctx) { + RAJA::launch + (select_cpu_or_gpu, + RAJA::LaunchParams(RAJA::Teams(N_tri), RAJA::Threads(N_tri)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, N_tri), [&](int r) { + RAJA::loop(ctx, RAJA::RangeSegment(0, N_tri), [&](int r) { // Array shared within threads of the same team RAJA_TEAM_SHARED int s_A[1]; - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, 1), [&](int c) { + RAJA::loop(ctx, RAJA::RangeSegment(0, 1), [&](int c) { s_A[c] = r; }); // loop c ctx.teamSync(); - RAJA::expt::loop(ctx, RAJA::RangeSegment(r, N_tri), [&](int c) { + RAJA::loop(ctx, RAJA::RangeSegment(r, N_tri), [&](int c) { D(r, c) = r * N_tri + c; printf("r=%d, c=%d : D=%d : s_A = %d \n", r, c, D(r, c), s_A[0]); }); // loop c }); // loop r + }); // outer lambda - if (select_cpu_or_gpu == RAJA::expt::HOST) { + if (select_cpu_or_gpu == RAJA::ExecPlace::HOST) { host_res.deallocate(Ddat); } -#if defined(RAJA_DEVICE_ACTIVE) - if (select_cpu_or_gpu == RAJA::expt::DEVICE) { +#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) + if (select_cpu_or_gpu == RAJA::ExecPlace::DEVICE) { device_res.deallocate(Ddat); } #endif diff --git a/examples/resource-dynamic-forall.cpp b/examples/resource-dynamic-forall.cpp new file mode 100644 index 0000000000..bb54d558a5 --- /dev/null +++ b/examples/resource-dynamic-forall.cpp @@ -0,0 +1,171 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-22, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include +#include +#include + +#include "memoryManager.hpp" + +#include "RAJA/RAJA.hpp" + +/* + * Vector Addition Example with resource + dynamic policy selection + * + * Computes c = a + b, where a, b, c are vectors of ints using + * a policy selected at run-time + * + * If CUDA is enabled, CUDA unified memory is used. 
+ */ + +// +// Functions for checking and printing results +// +void checkResult(int* res, int len); +void printResult(int* res, int len); + +using policy_list = camp::list + ,RAJA::cuda_exec<512> +#endif + +#if defined(RAJA_ENABLE_HIP) + ,RAJA::hip_exec<256> + ,RAJA::hip_exec<512> +#endif + >; + + +int main(int argc, char *argv[]) +{ + + if(argc != 2) { + RAJA_ABORT_OR_THROW("Usage ./cuda-dynamic-forall N, where N is the index of the policy to run"); + } + + // + // Run time policy section is demonstrated in this example by specifying + // kernel exection space as a command line argument + // Example usage ./dynamic_forall policy N + // + + const int pol = std::stoi(argv[1]); + + RAJA::ExecPlace select_cpu_or_gpu; + if(pol < 2) { + select_cpu_or_gpu = RAJA::ExecPlace::HOST; + }else { + select_cpu_or_gpu = RAJA::ExecPlace::DEVICE; + } + + std::cout << "\n\nRAJA vector addition example...\n"; + std::cout << "Using policy # "<(N); + int *b = memoryManager::allocate(N); + int *c = memoryManager::allocate(N); + + for (int i = 0; i < N; ++i) { + a[i] = -i; + b[i] = i; + } + + +//----------------------------------------------------------------------------// + + std::cout << "\n Running C-style vector addition...\n"; + + // _cstyle_vector_add_start + for (int i = 0; i < N; ++i) { + c[i] = a[i] + b[i]; + } + // _cstyle_vector_add_end + + checkResult(c, N); +//printResult(c, N); + + +//----------------------------------------------------------------------------// +// Example of dynamic policy selection for forall +//----------------------------------------------------------------------------// + + RAJA::resources::Host host_res; +#if defined(RAJA_ENABLE_CUDA) + RAJA::resources::Cuda device_res; +#endif +#if defined(RAJA_ENABLE_HIP) + RAJA::resources::Hip device_res; +#endif + + //Get typed erased resource - it will internally store if we are running on the host or device +#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) + RAJA::resources::Resource res = RAJA::Get_Runtime_Resource(host_res, device_res, select_cpu_or_gpu); +#else + RAJA::resources::Resource res = RAJA::Get_Host_Resource(host_res, select_cpu_or_gpu); +#endif + + RAJA::expt::dynamic_forall + (res, pol, RAJA::RangeSegment(0, N), [=] RAJA_HOST_DEVICE (int i) { + + c[i] = a[i] + b[i]; + + }); + + checkResult(c, N); + //printResult(c, N); + + +//----------------------------------------------------------------------------// +// +// Clean up. +// + memoryManager::deallocate(a); + memoryManager::deallocate(b); + memoryManager::deallocate(c); + + std::cout << "\n DONE!...\n"; + + return 0; +} + +// +// Function to check result and report P/F. +// +void checkResult(int* res, int len) +{ + bool correct = true; + for (int i = 0; i < len; i++) { + if ( res[i] != 0 ) { correct = false; } + } + if ( correct ) { + std::cout << "\n\t result -- PASS\n"; + } else { + std::cout << "\n\t result -- FAIL\n"; + } +} + +// +// Function to print result. 
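A stripped-down, host-only sketch of the run-time policy selection this new example introduces; the two-entry policy_list is a hypothetical stand-in (the example also appends cuda_exec/hip_exec entries when a device back-end is enabled), and the fixed pol index replaces the command-line argument.

#include "RAJA/RAJA.hpp"
#include <cstdio>

// Hypothetical host-only list; index 0 selects loop_exec, index 1 simd_exec.
using policy_list = camp::list<RAJA::loop_exec, RAJA::simd_exec>;

int main()
{
  constexpr int N = 16;
  int* a = new int[N];
  int* b = new int[N];
  int* c = new int[N];
  for (int i = 0; i < N; ++i) { a[i] = -i; b[i] = i; }

  const int pol = 0;  // run-time index into policy_list (argv[1] in the example)

  RAJA::resources::Host host_res;
  RAJA::resources::Resource res =
      RAJA::Get_Host_Resource(host_res, RAJA::ExecPlace::HOST);

  // The execution policy is chosen at run time by indexing into policy_list.
  RAJA::expt::dynamic_forall<policy_list>(
      res, pol, RAJA::RangeSegment(0, N), [=] RAJA_HOST_DEVICE (int i) {
        c[i] = a[i] + b[i];
      });

  printf("c[N-1] = %d (expected 0)\n", c[N - 1]);

  delete[] a; delete[] b; delete[] c;
  return 0;
}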
+// +void printResult(int* res, int len) +{ + std::cout << std::endl; + for (int i = 0; i < len; i++) { + std::cout << "result[" << i << "] = " << res[i] << std::endl; + } + std::cout << std::endl; +} diff --git a/examples/resource-forall.cpp b/examples/resource-forall.cpp index 8b15ab4004..09871eb4fb 100644 --- a/examples/resource-forall.cpp +++ b/examples/resource-forall.cpp @@ -127,7 +127,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::cout << "\n Running RAJA omp_parallel_for_exec vector addition...\n"; - RAJA::forall(host, RAJA::RangeSegment(0, N), [=] RAJA_DEVICE (int i) { + RAJA::forall(host, RAJA::RangeSegment(0, N), + [=] (int i) { c[i] = a[i] + b[i]; }); @@ -139,7 +140,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::cout << "\n Running RAJA omp_parallel_for_static_exec (default chunksize) vector addition...\n"; - RAJA::forall>(host, RAJA::RangeSegment(0, N), [=] (int i) { + RAJA::forall>(host, RAJA::RangeSegment(0, N), + [=] (int i) { c[i] = a[i] + b[i]; }); @@ -151,7 +153,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::cout << "\n Running RAJA omp_for_dynamic_exec (chunksize = 16) vector addition...\n"; - RAJA::forall>(host, RAJA::RangeSegment(0, N), [=] (int i) { + RAJA::forall>(host, RAJA::RangeSegment(0, N), + [=] (int i) { c[i] = a[i] + b[i]; }); @@ -165,7 +168,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::cout << "\n Running RAJA tbb_for_dynamic vector addition...\n"; - RAJA::forall(host, RAJA::RangeSegment(0, N), [=] (int i) { + RAJA::forall(host, RAJA::RangeSegment(0, N), + [=] (int i) { c[i] = a[i] + b[i]; }); @@ -177,7 +181,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::cout << "\n Running RAJA tbb_for_static<8> vector addition...\n"; - RAJA::forall>(host, RAJA::RangeSegment(0, N), [=] (int i) { + RAJA::forall>(host, RAJA::RangeSegment(0, N), + [=] (int i) { c[i] = a[i] + b[i]; }); @@ -383,4 +388,3 @@ void printResult(int* res, int len) } std::cout << std::endl; } - diff --git a/examples/resource-teams.cpp b/examples/resource-launch.cpp similarity index 72% rename from examples/resource-teams.cpp rename to examples/resource-launch.cpp index 05b5430a61..81b2c8488b 100644 --- a/examples/resource-teams.cpp +++ b/examples/resource-launch.cpp @@ -14,7 +14,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) { #if defined(RAJA_ENABLE_CUDA) - std::cout << "\n Running RAJA Resource Teams on Multiple Streams...\n"; + std::cout << "\n Running RAJA Resource Launch on Multiple Streams...\n"; constexpr int N = 10; constexpr int M = 1000000; @@ -28,11 +28,11 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::RangeSegment m_range(0, M); RAJA::RangeSegment n_range(0, N); - using launch_policy = RAJA::expt::LaunchPolicy>; + using launch_policy = RAJA::LaunchPolicy>; - using teams_x = RAJA::expt::LoopPolicy; + using teams_x = RAJA::LoopPolicy; - using threads_x = RAJA::expt::LoopPolicy; + using threads_x = RAJA::LoopPolicy; RAJA::forall(def_host_res, n_range, [=, &def_cuda_res](int i){ @@ -40,13 +40,13 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::resources::Cuda res_cuda; RAJA::resources::Event e = - RAJA::expt::launch(res_cuda, - RAJA::expt::Grid(RAJA::expt::Teams(64), - RAJA::expt::Threads(1), "RAJA Teams kernel"), - [=] RAJA_HOST_DEVICE(RAJA::expt::LaunchContext ctx) { + RAJA::launch(res_cuda, + RAJA::LaunchParams(RAJA::Teams(64), + RAJA::Threads(1)), + [=] 
RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { - RAJA::expt::loop(ctx, m_range, [&] (int j) { - RAJA::expt::loop(ctx, one_range, [&] (int k) { + RAJA::loop(ctx, m_range, [&] (int j) { + RAJA::loop(ctx, one_range, [&] (int k) { d_array[i*M + j] = i * M + j; diff --git a/examples/resource-runtime-teams.cpp b/examples/resource-runtime-launch.cpp similarity index 76% rename from examples/resource-runtime-teams.cpp rename to examples/resource-runtime-launch.cpp index 3344510e0c..45a07bb045 100644 --- a/examples/resource-runtime-teams.cpp +++ b/examples/resource-runtime-launch.cpp @@ -29,28 +29,28 @@ * */ -using host_launch = RAJA::expt::seq_launch_t; +using host_launch = RAJA::seq_launch_t; using host_loop = RAJA::loop_exec; #if defined(RAJA_ENABLE_CUDA) -using device_launch = RAJA::expt::cuda_launch_t; -using device_loop = RAJA::expt::cuda_global_thread_x; +using device_launch = RAJA::cuda_launch_t; +using device_loop = RAJA::cuda_global_thread_x; #elif defined(RAJA_ENABLE_HIP) -using device_launch = RAJA::expt::hip_launch_t; -using device_loop = RAJA::expt::hip_global_thread_x; +using device_launch = RAJA::hip_launch_t; +using device_loop = RAJA::hip_global_thread_x; #endif -using launch_policy = RAJA::expt::LaunchPolicy; + >; -using loop_pol = RAJA::expt::LoopPolicy; + >; #if defined(RAJA_ENABLE_CUDA) using reduce_policy = RAJA::cuda_reduce; @@ -78,11 +78,11 @@ int main(int argc, char *argv[]) return 0; } - RAJA::expt::ExecPlace select_cpu_or_gpu; + RAJA::ExecPlace select_cpu_or_gpu; if(exec_space.compare("host") == 0) - { select_cpu_or_gpu = RAJA::expt::HOST; printf("Running RAJA-Teams reductions example on the host \n"); } + { select_cpu_or_gpu = RAJA::ExecPlace::HOST; printf("Running RAJA-Teams reductions example on the host \n"); } if(exec_space.compare("device") == 0) - { select_cpu_or_gpu = RAJA::expt::DEVICE; printf("Running RAJA-Teams reductions example on the device \n"); } + { select_cpu_or_gpu = RAJA::ExecPlace::DEVICE; printf("Running RAJA-Teams reductions example on the device \n"); } // _reductions_array_init_start // @@ -153,19 +153,18 @@ int main(int argc, char *argv[]) #endif //Get typed erased resource - it will internally store if we are running on the host or device -#if defined(RAJA_DEVICE_ACTIVE) - RAJA::resources::Resource res = RAJA::expt::Get_Runtime_Resource(host_res, device_res, select_cpu_or_gpu); +#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) + RAJA::resources::Resource res = RAJA::Get_Runtime_Resource(host_res, device_res, select_cpu_or_gpu); #else - RAJA::resources::Resource res = RAJA::expt::Get_Host_Resource(host_res, select_cpu_or_gpu); + RAJA::resources::Resource res = RAJA::Get_Host_Resource(host_res, select_cpu_or_gpu); #endif //How the kernel executes now depends on how the resource is constructed (host or device) - RAJA::expt::launch - (res, RAJA::expt::Grid(RAJA::expt::Teams(GRID_SZ), - RAJA::expt::Threads(TEAM_SZ), - "Reduction Kernel"), - [=] RAJA_HOST_DEVICE(RAJA::expt::LaunchContext ctx) { - RAJA::expt::loop(ctx, arange, [&] (int i) { + RAJA::launch + (res, RAJA::LaunchParams(RAJA::Teams(GRID_SZ), + RAJA::Threads(TEAM_SZ)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + RAJA::loop(ctx, arange, [&] (int i) { kernel_sum += a[i]; diff --git a/examples/tut_add-vectors.cpp b/examples/tut_add-vectors.cpp deleted file mode 100644 index 9d77468276..0000000000 --- a/examples/tut_add-vectors.cpp +++ /dev/null @@ -1,284 +0,0 @@ -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2016-22, Lawrence 
Livermore National Security, LLC -// and RAJA project contributors. See the RAJA/LICENSE file for details. -// -// SPDX-License-Identifier: (BSD-3-Clause) -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - -#include -#include -#include - -#include "memoryManager.hpp" - -#include "RAJA/RAJA.hpp" - -/* - * Vector Addition Example - * - * Computes c = a + b, where a, b, c are vectors of ints. - * It illustrates similarities between a C-style for-loop and a RAJA - * forall loop. - * - * RAJA features shown: - * - `forall` loop iteration template method - * - Index range segment - * - Execution policies - * - * If CUDA is enabled, CUDA unified memory is used. - */ - -/* - CUDA_BLOCK_SIZE - specifies the number of threads in a CUDA thread block -*/ -#if defined(RAJA_ENABLE_CUDA) -const int CUDA_BLOCK_SIZE = 256; -#endif - -#if defined(RAJA_ENABLE_HIP) -const int HIP_BLOCK_SIZE = 256; -#endif - -#if defined(RAJA_ENABLE_SYCL) -const int SYCL_BLOCK_SIZE = 256; -#endif - -// -// Functions for checking and printing results -// -void checkResult(int* res, int len); -void printResult(int* res, int len); - - -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) -{ - - std::cout << "\n\nRAJA vector addition example...\n"; - -#if defined(RAJA_ENABLE_SYCL) - memoryManager::sycl_res = new camp::resources::Resource{camp::resources::Sycl()}; - ::RAJA::sycl::detail::setQueue(memoryManager::sycl_res); -#endif - -// -// Define vector length -// - const int N = 1000000; - -// -// Allocate and initialize vector data -// - int *a = memoryManager::allocate(N); - int *b = memoryManager::allocate(N); - int *c = memoryManager::allocate(N); - - for (int i = 0; i < N; ++i) { - a[i] = -i; - b[i] = i; - } - - -//----------------------------------------------------------------------------// - - std::cout << "\n Running C-style vector addition...\n"; - - // _cstyle_vector_add_start - for (int i = 0; i < N; ++i) { - c[i] = a[i] + b[i]; - } - // _cstyle_vector_add_end - - checkResult(c, N); -//printResult(c, N); - - -//----------------------------------------------------------------------------// -// RAJA::seq_exec policy enforces strictly sequential execution.... -//----------------------------------------------------------------------------// - - std::cout << "\n Running RAJA sequential vector addition...\n"; - - // _rajaseq_vector_add_start - RAJA::forall(RAJA::RangeSegment(0, N), [=] (int i) { - c[i] = a[i] + b[i]; - }); - // _rajaseq_vector_add_end - - checkResult(c, N); -//printResult(c, N); - - -//----------------------------------------------------------------------------// -// RAJA::simd_exec policy should force the compiler to generate SIMD -// vectorization optimizations.... -//----------------------------------------------------------------------------// - - std::cout << "\n Running RAJA SIMD vector addition...\n"; - - RAJA::forall(RAJA::RangeSegment(0, N), [=] (int i) { - c[i] = a[i] + b[i]; - }); - - checkResult(c, N); -//printResult(c, N); - - -//----------------------------------------------------------------------------// -// RAJA::loop_exec policy means that the compiler is allowed to generate -// optimizations (e.g., SIMD) if it thinks it is safe to do so... 
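The policy distinction spelled out in the removed tutorial's comments, condensed into a standalone sketch; the vector length and pointer names are illustrative.

#include "RAJA/RAJA.hpp"
#include <cstdio>

int main()
{
  constexpr int N = 8;
  int* a = new int[N];
  int* b = new int[N];
  int* c = new int[N];
  for (int i = 0; i < N; ++i) { a[i] = -i; b[i] = i; }

  // seq_exec: strictly sequential execution.
  RAJA::forall<RAJA::seq_exec>(RAJA::RangeSegment(0, N), [=] (int i) {
    c[i] = a[i] + b[i];
  });

  // simd_exec: asks the compiler to vectorize the loop.
  RAJA::forall<RAJA::simd_exec>(RAJA::RangeSegment(0, N), [=] (int i) {
    c[i] = a[i] + b[i];
  });

  // loop_exec: the compiler may apply optimizations (e.g. SIMD) when it can
  // prove they are safe.
  RAJA::forall<RAJA::loop_exec>(RAJA::RangeSegment(0, N), [=] (int i) {
    c[i] = a[i] + b[i];
  });

  printf("c[N-1] = %d (expected 0)\n", c[N - 1]);
  delete[] a; delete[] b; delete[] c;
  return 0;
}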
-//----------------------------------------------------------------------------// - - std::cout << "\n Running RAJA loop-exec vector addition...\n"; - - RAJA::forall(RAJA::RangeSegment(0, N), [=] (int i) { - c[i] = a[i] + b[i]; - }); - - checkResult(c, N); -//printResult(c, N); - - -//----------------------------------------------------------------------------// - -#if defined(RAJA_ENABLE_OPENMP) - std::cout << "\n Running RAJA OpenMP vector addition...\n"; - - // _rajaomp_vector_add_start - RAJA::forall(RAJA::RangeSegment(0, N), [=] (int i) { - c[i] = a[i] + b[i]; - }); - // _rajaomp_vector_add_end - - checkResult(c, N); -//printResult(c, N); -#endif - - -//----------------------------------------------------------------------------// - -#if defined(RAJA_ENABLE_CUDA) - std::cout << "\n Running RAJA CUDA vector addition...\n"; - - // _rajacuda_vector_add_start - RAJA::forall>(RAJA::RangeSegment(0, N), - [=] RAJA_DEVICE (int i) { - c[i] = a[i] + b[i]; - }); - // _rajacuda_vector_add_end - - checkResult(c, N); -//printResult(c, N); - - const bool Asynchronous = false; - std::cout << "\n Running RAJA CUDA explicit (2 blocks per SM) vector addition...\n"; - - // _rajacuda_explicit_vector_add_start - RAJA::forall>(RAJA::RangeSegment(0, N), - [=] RAJA_DEVICE (int i) { - c[i] = a[i] + b[i]; - }); - // _rajacuda_explicit_vector_add_end - - checkResult(c, N); -//printResult(c, N); -#endif - -//----------------------------------------------------------------------------// - -#if defined(RAJA_ENABLE_HIP) - std::cout << "\n Running RAJA HIP vector addition...\n"; - - int *d_a = memoryManager::allocate_gpu(N); - int *d_b = memoryManager::allocate_gpu(N); - int *d_c = memoryManager::allocate_gpu(N); - - hipErrchk(hipMemcpy( d_a, a, N * sizeof(int), hipMemcpyHostToDevice )); - hipErrchk(hipMemcpy( d_b, b, N * sizeof(int), hipMemcpyHostToDevice )); - - // _rajahip_vector_add_start - RAJA::forall>(RAJA::RangeSegment(0, N), - [=] RAJA_DEVICE (int i) { - d_c[i] = d_a[i] + d_b[i]; - }); - // _rajahip_vector_add_end - - hipErrchk(hipMemcpy( c, d_c, N * sizeof(int), hipMemcpyDeviceToHost )); - - checkResult(c, N); -//printResult(c, N); - - memoryManager::deallocate_gpu(d_a); - memoryManager::deallocate_gpu(d_b); - memoryManager::deallocate_gpu(d_c); -#endif - -//----------------------------------------------------------------------------// - -#if defined(RAJA_ENABLE_SYCL) - std::cout << "\n Running RAJA SYCL vector addition...\n"; - - int *d_a = memoryManager::allocate_gpu(N); - int *d_b = memoryManager::allocate_gpu(N); - int *d_c = memoryManager::allocate_gpu(N); - - memoryManager::sycl_res->memcpy(d_a, a, N * sizeof(int)); - memoryManager::sycl_res->memcpy(d_b, b, N * sizeof(int)); - - // _rajasycl_vector_add_start - RAJA::forall>(RAJA::RangeSegment(0, N), - [=] RAJA_DEVICE (int i) { - d_c[i] = d_a[i] + d_b[i]; - }); - // _rajasycl_vector_add_end - - memoryManager::sycl_res->memcpy(c, d_c, N * sizeof(int)); - - checkResult(c, N); -//printResult(c, N); - - memoryManager::deallocate_gpu(d_a); - memoryManager::deallocate_gpu(d_b); - memoryManager::deallocate_gpu(d_c); -#endif - -//----------------------------------------------------------------------------// -// -// Clean up. -// - memoryManager::deallocate(a); - memoryManager::deallocate(b); - memoryManager::deallocate(c); - - std::cout << "\n DONE!...\n"; - - return 0; -} - -// -// Function to check result and report P/F. 
-// -void checkResult(int* res, int len) -{ - bool correct = true; - for (int i = 0; i < len; i++) { - if ( res[i] != 0 ) { correct = false; } - } - if ( correct ) { - std::cout << "\n\t result -- PASS\n"; - } else { - std::cout << "\n\t result -- FAIL\n"; - } -} - -// -// Function to print result. -// -void printResult(int* res, int len) -{ - std::cout << std::endl; - for (int i = 0; i < len; i++) { - std::cout << "result[" << i << "] = " << res[i] << std::endl; - } - std::cout << std::endl; -} - diff --git a/examples/tut_atomic-histogram.cpp b/examples/tut_atomic-histogram.cpp deleted file mode 100644 index fad8269982..0000000000 --- a/examples/tut_atomic-histogram.cpp +++ /dev/null @@ -1,230 +0,0 @@ -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2016-22, Lawrence Livermore National Security, LLC -// and RAJA project contributors. See the RAJA/LICENSE file for details. -// -// SPDX-License-Identifier: (BSD-3-Clause) -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - -#include -#include -#include -#include - -#include "memoryManager.hpp" - -#include "RAJA/RAJA.hpp" - -/* - * Atomic Histogram Example - * - * Given an array of length N containing integers ranging from [0, M), - * this example uses RAJA atomics to count the number of instances a - * number between 0 and M appear. - * - * RAJA features shown: - * - `forall` loop iteration template method - * - Atomic add - * - * If CUDA is enabled, CUDA unified memory is used. - */ - -/* - CUDA_BLOCK_SIZE - specifies the number of threads in a CUDA thread block -*/ -#if defined(RAJA_ENABLE_CUDA) -const int CUDA_BLOCK_SIZE = 256; -#endif - -#if defined(RAJA_ENABLE_HIP) -const int HIP_BLOCK_SIZE = 256; -#endif - -template -void printBins(T* bins, int M); - -int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) -{ - // - // Define the inital array containing values between 0 and M and - // create the iteration bounds - // - int M = 10; - int N = 30; - // _range_atomic_histogram_start - RAJA::TypedRangeSegment array_range(0, N); - // _range_atomic_histogram_end - - int* array = memoryManager::allocate(N); - int* bins = memoryManager::allocate(M); - - RAJA::forall(array_range, [=](int i) { - - array[i] = rand() % M; - - }); - //----------------------------------------------------------------------------// - - std::cout << "\n\n Running RAJA sequential binning" << std::endl; - std::memset(bins, 0, M * sizeof(int)); - - RAJA::forall(array_range, [=](int i) { - - RAJA::atomicAdd(&bins[array[i]], 1); - - }); - - printBins(bins, M); - -//----------------------------------------------------------------------------// - -#if defined(RAJA_ENABLE_OPENMP) - - std::cout << "\n\n Running RAJA OMP binning" << std::endl; - std::memset(bins, 0, M * sizeof(int)); - - // _rajaomp_atomic_histogram_start - RAJA::forall(array_range, [=](int i) { - - RAJA::atomicAdd(&bins[array[i]], 1); - - }); - // _rajaomp_atomic_histogram_end - - printBins(bins, M); - -//----------------------------------------------------------------------------// - - std::cout << "\n\n Running RAJA OMP binning with auto atomic" << std::endl; - std::memset(bins, 0, M * sizeof(int)); - - RAJA::forall(array_range, [=](int i) { - - RAJA::atomicAdd(&bins[array[i]], 1); - - }); - - printBins(bins, M); - -#endif - -//----------------------------------------------------------------------------// - -#if defined(RAJA_ENABLE_CUDA) - - std::cout << "\n\nRunning RAJA CUDA binning" << std::endl; - std::memset(bins, 
0, M * sizeof(int)); - - // _rajacuda_atomic_histogram_start - RAJA::forall< RAJA::cuda_exec >(array_range, - [=] RAJA_DEVICE(int i) { - - RAJA::atomicAdd(&bins[array[i]], 1); - - }); - // _rajacuda_atomic_histogram_end - - printBins(bins, M); - -//----------------------------------------------------------------------------// - - std::cout << "\n\nRunning RAJA CUDA binning with auto atomic" << std::endl; - std::memset(bins, 0, M * sizeof(int)); - - // _rajacuda_atomicauto_histogram_start - RAJA::forall< RAJA::cuda_exec >(array_range, - [=] RAJA_DEVICE(int i) { - - RAJA::atomicAdd(&bins[array[i]], 1); - - }); - // _rajacuda_atomicauto_histogram_end - - printBins(bins, M); - -#endif - -//----------------------------------------------------------------------------// - -#if defined(RAJA_ENABLE_HIP) - - std::cout << "\n\nRunning RAJA HIP binning" << std::endl; - std::memset(bins, 0, M * sizeof(int)); - - int* d_array = memoryManager::allocate_gpu(N); - int* d_bins = memoryManager::allocate_gpu(M); - hipErrchk(hipMemcpy( d_array, array, N * sizeof(int), hipMemcpyHostToDevice )); - hipErrchk(hipMemcpy( d_bins, bins, M * sizeof(int), hipMemcpyHostToDevice )); - - // _rajahip_atomic_histogram_start - RAJA::forall< RAJA::hip_exec >(array_range, - [=] RAJA_DEVICE(int i) { - - RAJA::atomicAdd(&d_bins[d_array[i]], 1); - - }); - // _rajahip_atomic_histogram_end - - hipErrchk(hipMemcpy( bins, d_bins, M * sizeof(int), hipMemcpyDeviceToHost )); - - printBins(bins, M); - -//----------------------------------------------------------------------------// - - std::cout << "\n\nRunning RAJA HIP binning with auto atomic" << std::endl; - std::memset(bins, 0, M * sizeof(int)); - hipErrchk(hipMemcpy( d_bins, bins, M * sizeof(int), hipMemcpyHostToDevice )); - - // _rajahip_atomicauto_histogram_start - RAJA::forall< RAJA::hip_exec >(array_range, - [=] RAJA_DEVICE(int i) { - - RAJA::atomicAdd(&d_bins[d_array[i]], 1); - - }); - // _rajahip_atomicauto_histogram_end - - hipErrchk(hipMemcpy( bins, d_bins, M * sizeof(int), hipMemcpyDeviceToHost )); - - printBins(bins, M); - - memoryManager::deallocate_gpu(d_array); - memoryManager::deallocate_gpu(d_bins); -#endif - -//----------------------------------------------------------------------------// - - - // - // Clean up dellacate data - // - memoryManager::deallocate(array); - memoryManager::deallocate(bins); - - std::cout << "\n DONE!...\n"; - - return 0; -} - -template -void printBins(T* bins, int M) -{ - - std::cout << "Number of instances |"; - for (int i = 0; i < M; ++i) { - std::cout << bins[i] << " "; - } - std::cout << "" << std::endl; - - std::cout << "---------------------------"; - for (int i = 0; i < M; ++i) { - std::cout << "-" - << ""; - } - std::cout << "" << std::endl; - - std::cout << "Index id |"; - for (int i = 0; i < M; ++i) { - std::cout << i << " "; - } - std::cout << "\n" << std::endl; -} diff --git a/examples/tut_batched-matrix-multiply.cpp b/examples/tut_batched-matrix-multiply.cpp deleted file mode 100644 index 434263ff99..0000000000 --- a/examples/tut_batched-matrix-multiply.cpp +++ /dev/null @@ -1,689 +0,0 @@ -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2016-22, Lawrence Livermore National Security, LLC -// and RAJA project contributors. See the RAJA/LICENSE file for details. 
-// -// SPDX-License-Identifier: (BSD-3-Clause) -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - -#include -#include -#include -#include - -#include "RAJA/RAJA.hpp" -#include "RAJA/util/Timer.hpp" - - -#include "memoryManager.hpp" - -/* - * Batched Matrix Multiply Example - * - * This example performs batched matrix multiplication - * for matrices of dimension 3 x 3 using two different - * data layouts. - * - * Matrices are stored in arrays A and B. Results - * are stored in a third array, C. - * We introduce the notation A^{e}_rc - * to correspond to the matrix entry in the row, r, - * column, c, of matrix, e. Below we describe the two - * layouts for the case of two (N=2) 3 x 3 matrices. - * - * Layout 1: - * Matrix entries are grouped together so that each - * matrix is in a row major ordering. - * i.e. A = [A^{0}_{00}, A^{0}_{01}, A^{0}_{02}, - * A^{0}_{10}, A^{0}_{11}, A^{0}_{12}, - * A^{0}_{20}, A^{0}_{21}, A^{0}_{22}, - * A^{1}_{00}, A^{1}_{01}, A^{1}_{02}, - * A^{1}_{10}, A^{1}_{11}, A^{1}_{12}, - * A^{1}_{20}, A^{1}_{21}, A^{1}_{22}]; - * - * Layout 2: - * Matrix entries are first ordered by matrix number, - * then by column number, and finally by row number. - * i.e. A = [A^{0}_{00}, A^{1}_{00}, A^{0}_{01}, - * A^{1}_{01}, A^{0}_{02}, A^{1}_{02}, - * A^{0}_{10}, A^{1}_{10}, A^{0}_{11}, - * A^{1}_{11}, A^{0}_{12}, A^{1}_{12}, - * A^{0}_{20}, A^{1}_{20}, A^{0}_{21}, - * A^{1}_{21}, A^{0}_{22}, A^{1}_{22}]; - * - * The extension to N > 2 matrices follows by direct - * extension. By exploring different data layouts, - * we can assess which performs best under a given - * execution policy and architecture. - * - * RAJA features shown: - * - `forall` loop iteration template method - * - RAJA View - * - RAJA make_permuted_layout - * - * If CUDA is enabled, CUDA unified memory is used. - */ - -/* - CUDA_BLOCK_SIZE - specifies the number of threads in a CUDA thread block -*/ -#if defined(RAJA_ENABLE_CUDA) -const int CUDA_BLOCK_SIZE = 256; -#endif - -#if defined(RAJA_ENABLE_HIP) -const int HIP_BLOCK_SIZE = 256; -#endif - -// -// By default a RAJA::Index_type -// is a long int -// -using RAJA::Index_type; - -// -//Function for checking results -// -template -void checkResult(T C, Index_type noMat, int nRows, int nCols); - -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) -{ - - std::cout << "\n\nRAJA batched matrix multiplication example...\n"; - -// Dimensions of matrices - const int N_c = 3; - const int N_r = 3; - -// Number of matrices - const Index_type N = 8000000; - -// Number of iterations - const int NITER = 20; - - std::cout << "\n Number of matrices to be multiplied: " << N << " \n \n"; - -// -// Initialize a RAJA timer object -// and variable to store minimum run time -// - auto timer = RAJA::Timer(); - double minRun; - -// -// Allocate space for data in layout 1 -// - double *A = memoryManager::allocate(N_c * N_r * N); - double *B = memoryManager::allocate(N_c * N_r * N); - double *C = memoryManager::allocate(N_c * N_r * N); - -// -// Layout 1 -// -// make_permuted_layout takes the number of entries in each dimension and a -// templated array indicating index arguments with slowest to fastest stride. -// Standard C++ arrays are used to hold the number of entries in each component. -// This example uses double braces to initalize the array and its subobjects. -// The layout object will index into the array as the following C macro would -// #define Aview(e, r, c) A[c + N_c*(r + N_r*e)]. 
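The two data layouts described above are easier to follow with concrete offsets. A small sketch, assuming RAJA::make_permuted_layout and RAJA::Layout behave as in the removed example; the two-matrix sizes and the probed (e, r, c) triple are illustrative.

#include "RAJA/RAJA.hpp"
#include <array>
#include <cstdio>

int main()
{
  // Two 3x3 matrices stored in a single array.
  constexpr int N = 2, N_r = 3, N_c = 3;

  // Layout 1: dimensions {e, r, c} ordered slowest to fastest,
  // equivalent to A[c + N_c*(r + N_r*e)]; the column index has unit stride.
  std::array<RAJA::idx_t, 3> perm1 {{0, 1, 2}};
  auto layout1 = RAJA::make_permuted_layout({{N, N_r, N_c}}, perm1);

  // Layout 2: dimensions {r, c, e} ordered slowest to fastest,
  // equivalent to A[e + N*(c + N_c*r)]; the matrix index has unit stride.
  std::array<RAJA::idx_t, 3> perm2 {{1, 2, 0}};
  auto layout2 = RAJA::make_permuted_layout({{N, N_r, N_c}}, perm2);

  // The same (e, r, c) triple maps to different linear offsets.
  printf("layout1(1,2,0) = %ld\n", (long) layout1(1, 2, 0));  // 3*(2 + 3*1) + 0 = 15
  printf("layout2(1,2,0) = %ld\n", (long) layout2(1, 2, 0));  // 1 + 2*(0 + 3*2) = 13

  // Views bind an array to a layout; the last Layout argument names the
  // unit-stride dimension (2 for layout 1, 0 for layout 2).
  double A[N * N_r * N_c];
  RAJA::View<double, RAJA::Layout<3, RAJA::Index_type, 2>> Aview1(A, layout1);
  RAJA::View<double, RAJA::Layout<3, RAJA::Index_type, 0>> Aview2(A, layout2);
  Aview1(1, 2, 0) = 1.0;  // writes A[15]
  Aview2(1, 2, 0) = 2.0;  // writes A[13]

  return 0;
}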
-// -// RAJA::Layout objects may be templated on dimension, argument type, and -// index with unit stride. Here, the column index has unit stride (argument 2). -// - // _permutedlayout_defviews_start - std::array perm1 {{0, 1, 2}}; - auto layout1 = - RAJA::make_permuted_layout( {{N, N_r, N_c}}, perm1 ); - - RAJA::View> Aview(A, layout1); - RAJA::View> Bview(B, layout1); - RAJA::View> Cview(C, layout1); - // _permutedlayout_defviews_end - -// -// Allocate space for data in layout 2 -// - double *A2 = memoryManager::allocate(N_c * N_r * N); - double *B2 = memoryManager::allocate(N_c * N_r * N); - double *C2 = memoryManager::allocate(N_c * N_r * N); - -// -// Permuted layout - equivalent to indexing using the following macro -// #define Aview2(e, r, c) A2[e + N*(c + N_c*r)] -// In this case the element index has unit stride (argument 0). -// - // _permutedlayout_permviews_start - std::array perm2 {{1, 2, 0}}; - auto layout2 = - RAJA::make_permuted_layout( {{N, N_r, N_c}}, perm2 ); - - RAJA::View> Aview2(A2, layout2); - RAJA::View> Bview2(B2, layout2); - RAJA::View> Cview2(C2, layout2); - // _permutedlayout_permviews_end - -// -// Initialize data -// -#if defined(RAJA_ENABLE_OPENMP) - using INIT_POL = RAJA::omp_parallel_for_exec; -#else - using INIT_POL = RAJA::loop_exec; -#endif - - RAJA::forall(RAJA::RangeSegment(0, N), [=](Index_type e) { - for (Index_type row = 0; row < N_r; ++row) { - for (Index_type col = 0; col < N_c; ++col) { - Aview(e, row, col) = row; - Bview(e, row, col) = col; - Cview(e, row, col) = 0; - - Aview2(e, row, col) = row; - Bview2(e, row, col) = col; - Cview2(e, row, col) = 0; - } - } - }); - -//----------------------------------------------------------------------------// - -#if defined(RAJA_ENABLE_OPENMP) - - std::cout << " \n Performing batched matrix multiplication" - << " with layout 1 (RAJA - omp parallel for) ... 
" << std::endl; - - minRun = std::numeric_limits::max(); - for (int i = 0; i < NITER; ++i) { - - timer.start(); - // _permutedlayout_batchedmatmult_omp_start - RAJA::forall( - RAJA::RangeSegment(0, N), [=](Index_type e) { - - Cview(e, 0, 0) = Aview(e, 0, 0) * Bview(e, 0, 0) - + Aview(e, 0, 1) * Bview(e, 1, 0) - + Aview(e, 0, 2) * Bview(e, 2, 0); - Cview(e, 0, 1) = Aview(e, 0, 0) * Bview(e, 0, 1) - + Aview(e, 0, 1) * Bview(e, 1, 1) - + Aview(e, 0, 2) * Bview(e, 2, 1); - Cview(e, 0, 2) = Aview(e, 0, 0) * Bview(e, 0, 2) - + Aview(e, 0, 1) * Bview(e, 1, 2) - + Aview(e, 0, 2) * Bview(e, 2, 2); - - Cview(e, 1, 0) = Aview(e, 1, 0) * Bview(e, 0, 0) - + Aview(e, 1, 1) * Bview(e, 1, 0) - + Aview(e, 1, 2) * Bview(e, 2, 0); - Cview(e, 1, 1) = Aview(e, 1, 0) * Bview(e, 0, 1) - + Aview(e, 1, 1) * Bview(e, 1, 1) - + Aview(e, 1, 2) * Bview(e, 2, 1); - Cview(e, 1, 2) = Aview(e, 1, 0) * Bview(e, 0, 2) - + Aview(e, 1, 1) * Bview(e, 1, 2) - + Aview(e, 1, 2) * Bview(e, 2, 2); - - Cview(e, 2, 0) = Aview(e, 2, 0) * Bview(e, 0, 0) - + Aview(e, 2, 1) * Bview(e, 1, 0) - + Aview(e, 2, 2) * Bview(e, 2, 0); - Cview(e, 2, 1) = Aview(e, 2, 0) * Bview(e, 0, 1) - + Aview(e, 2, 1) * Bview(e, 1, 1) - + Aview(e, 2, 2) * Bview(e, 2, 1); - Cview(e, 2, 2) = Aview(e, 2, 0) * Bview(e, 0, 2) - + Aview(e, 2, 1) * Bview(e, 1, 2) - + Aview(e, 2, 2) * Bview(e, 2, 2); - - }); - // _permutedlayout_batchedmatmult_omp_end - timer.stop(); - - RAJA::Timer::ElapsedType tMin = timer.elapsed(); - if (tMin < minRun) minRun = tMin; - timer.reset(); - } - - std::cout<< "\trun time : " << minRun << " seconds" << std::endl; - checkResult(Cview, N, N_r, N_c); - -//----------------------------------------------------------------------------// - - std::cout << " \n Performing batched matrix multiplication" - << " with layout 2 (RAJA - omp parallel for) ... 
" << std::endl; - - minRun = std::numeric_limits::max(); - for (int i = 0; i < NITER; ++i) { - - timer.start(); - RAJA::forall( - RAJA::RangeSegment(0, N), [=](Index_type e) { - - Cview2(e, 0, 0) = Aview2(e, 0, 0) * Bview2(e, 0, 0) - + Aview2(e, 0, 1) * Bview2(e, 1, 0) - + Aview2(e, 0, 2) * Bview2(e, 2, 0); - Cview2(e, 0, 1) = Aview2(e, 0, 0) * Bview2(e, 0, 1) - + Aview2(e, 0, 1) * Bview2(e, 1, 1) - + Aview2(e, 0, 2) * Bview2(e, 2, 1); - Cview2(e, 0, 2) = Aview2(e, 0, 0) * Bview2(e, 0, 2) - + Aview2(e, 0, 1) * Bview2(e, 1, 2) - + Aview2(e, 0, 2) * Bview2(e, 2, 2); - - Cview2(e, 1, 0) = Aview2(e, 1, 0) * Bview2(e, 0, 0) - + Aview2(e, 1, 1) * Bview2(e, 1, 0) - + Aview2(e, 1, 2) * Bview2(e, 2, 0); - Cview2(e, 1, 1) = Aview2(e, 1, 0) * Bview2(e, 0, 1) - + Aview2(e, 1, 1) * Bview2(e, 1, 1) - + Aview2(e, 1, 2) * Bview2(e, 2, 1); - Cview2(e, 1, 2) = Aview2(e, 1, 0) * Bview2(e, 0, 2) - + Aview2(e, 1, 1) * Bview2(e, 1, 2) - + Aview2(e, 1, 2) * Bview2(e, 2, 2); - - Cview2(e, 2, 0) = Aview2(e, 2, 0) * Bview2(e, 0, 0) - + Aview2(e, 2, 1) * Bview2(e, 1, 0) - + Aview2(e, 2, 2) * Bview2(e, 2, 0); - Cview2(e, 2, 1) = Aview2(e, 2, 0) * Bview2(e, 0, 1) - + Aview2(e, 2, 1) * Bview2(e, 1, 1) - + Aview2(e, 2, 2) * Bview2(e, 2, 1); - Cview2(e, 2, 2) = Aview2(e, 2, 0) * Bview2(e, 0, 2) - + Aview2(e, 2, 1) * Bview2(e, 1, 2) - + Aview2(e, 2, 2) * Bview2(e, 2, 2); - - }); - timer.stop(); - - RAJA::Timer::ElapsedType tMin = timer.elapsed(); - if (tMin < minRun) minRun = tMin; - timer.reset(); - } - std::cout<< "\trun time : " << minRun << " seconds" << std::endl; - checkResult(Cview2, N, N_r, N_c); - -#endif - -//----------------------------------------------------------------------------// - - std::cout << " \n Performing batched matrix multiplication" - << " with layout 1 (RAJA - sequential) ... " << std::endl; - - minRun = std::numeric_limits::max(); - for (int i = 0; i < NITER; ++i) { - - timer.start(); - RAJA::forall(RAJA::RangeSegment(0, N), [=](Index_type e) { - - Cview(e, 0, 0) = Aview(e, 0, 0) * Bview(e, 0, 0) - + Aview(e, 0, 1) * Bview(e, 1, 0) - + Aview(e, 0, 2) * Bview(e, 2, 0); - Cview(e, 0, 1) = Aview(e, 0, 0) * Bview(e, 0, 1) - + Aview(e, 0, 1) * Bview(e, 1, 1) - + Aview(e, 0, 2) * Bview(e, 2, 1); - Cview(e, 0, 2) = Aview(e, 0, 0) * Bview(e, 0, 2) - + Aview(e, 0, 1) * Bview(e, 1, 2) - + Aview(e, 0, 2) * Bview(e, 2, 2); - - Cview(e, 1, 0) = Aview(e, 1, 0) * Bview(e, 0, 0) - + Aview(e, 1, 1) * Bview(e, 1, 0) - + Aview(e, 1, 2) * Bview(e, 2, 0); - Cview(e, 1, 1) = Aview(e, 1, 0) * Bview(e, 0, 1) - + Aview(e, 1, 1) * Bview(e, 1, 1) - + Aview(e, 1, 2) * Bview(e, 2, 1); - Cview(e, 1, 2) = Aview(e, 1, 0) * Bview(e, 0, 2) - + Aview(e, 1, 1) * Bview(e, 1, 2) - + Aview(e, 1, 2) * Bview(e, 2, 2); - - Cview(e, 2, 0) = Aview(e, 2, 0) * Bview(e, 0, 0) - + Aview(e, 2, 1) * Bview(e, 1, 0) - + Aview(e, 2, 2) * Bview(e, 2, 0); - Cview(e, 2, 1) = Aview(e, 2, 0) * Bview(e, 0, 1) - + Aview(e, 2, 1) * Bview(e, 1, 1) - + Aview(e, 2, 2) * Bview(e, 2, 1); - Cview(e, 2, 2) = Aview(e, 2, 0) * Bview(e, 0, 2) - + Aview(e, 2, 1) * Bview(e, 1, 2) - + Aview(e, 2, 2) * Bview(e, 2, 2); - }); - timer.stop(); - - RAJA::Timer::ElapsedType tMin = timer.elapsed(); - if (tMin < minRun) minRun = tMin; - timer.reset(); - } - - std::cout << "\trun time : " << minRun << " seconds" << std::endl; - checkResult(Cview, N, N_r, N_c); - -//----------------------------------------------------------------------------// - - std::cout << " \n Performing batched matrix multiplication" - << " with layout 2 (RAJA - sequential) ... 
" << std::endl; - - minRun = std::numeric_limits::max(); - for (int i = 0; i < NITER; ++i) { - - timer.start(); - RAJA::forall(RAJA::RangeSegment(0, N), [=](Index_type e) { - - Cview2(e, 0, 0) = Aview2(e, 0, 0) * Bview2(e, 0, 0) - + Aview2(e, 0, 1) * Bview2(e, 1, 0) - + Aview2(e, 0, 2) * Bview2(e, 2, 0); - Cview2(e, 0, 1) = Aview2(e, 0, 0) * Bview2(e, 0, 1) - + Aview2(e, 0, 1) * Bview2(e, 1, 1) - + Aview2(e, 0, 2) * Bview2(e, 2, 1); - Cview2(e, 0, 2) = Aview2(e, 0, 0) * Bview2(e, 0, 2) - + Aview2(e, 0, 1) * Bview2(e, 1, 2) - + Aview2(e, 0, 2) * Bview2(e, 2, 2); - - Cview2(e, 1, 0) = Aview2(e, 1, 0) * Bview2(e, 0, 0) - + Aview2(e, 1, 1) * Bview2(e, 1, 0) - + Aview2(e, 1, 2) * Bview2(e, 2, 0); - Cview2(e, 1, 1) = Aview2(e, 1, 0) * Bview2(e, 0, 1) - + Aview2(e, 1, 1) * Bview2(e, 1, 1) - + Aview2(e, 1, 2) * Bview2(e, 2, 1); - Cview2(e, 1, 2) = Aview2(e, 1, 0) * Bview2(e, 0, 2) - + Aview2(e, 1, 1) * Bview2(e, 1, 2) - + Aview2(e, 1, 2) * Bview2(e, 2, 2); - - Cview2(e, 2, 0) = Aview2(e, 2, 0) * Bview2(e, 0, 0) - + Aview2(e, 2, 1) * Bview2(e, 1, 0) - + Aview2(e, 2, 2) * Bview2(e, 2, 0); - Cview2(e, 2, 1) = Aview2(e, 2, 0) * Bview2(e, 0, 1) - + Aview2(e, 2, 1) * Bview2(e, 1, 1) - + Aview2(e, 2, 2) * Bview2(e, 2, 1); - Cview2(e, 2, 2) = Aview2(e, 2, 0) * Bview2(e, 0, 2) - + Aview2(e, 2, 1) * Bview2(e, 1, 2) - + Aview2(e, 2, 2) * Bview2(e, 2, 2); - - }); - timer.stop(); - - RAJA::Timer::ElapsedType tMin = timer.elapsed(); - if (tMin < minRun) minRun = tMin; - timer.reset(); - } - std::cout<< "\trun time : "<< minRun << " seconds" << std::endl; - checkResult(Cview2, N, N_r, N_c); - -//----------------------------------------------------------------------------// - -#if defined(RAJA_ENABLE_CUDA) - - std::cout << " \n Performing batched matrix multiplication" - << " with layout 1 (RAJA - cuda) ... " << std::endl; - - minRun = std::numeric_limits::max(); - for (int i = 0; i < NITER; ++i) { - - timer.start(); - RAJA::forall>( - RAJA::RangeSegment(0, N), [=] RAJA_DEVICE(Index_type e) { - - Cview(e, 0, 0) = Aview(e, 0, 0) * Bview(e, 0, 0) - + Aview(e, 0, 1) * Bview(e, 1, 0) - + Aview(e, 0, 2) * Bview(e, 2, 0); - Cview(e, 0, 1) = Aview(e, 0, 0) * Bview(e, 0, 1) - + Aview(e, 0, 1) * Bview(e, 1, 1) - + Aview(e, 0, 2) * Bview(e, 2, 1); - Cview(e, 0, 2) = Aview(e, 0, 0) * Bview(e, 0, 2) - + Aview(e, 0, 1) * Bview(e, 1, 2) - + Aview(e, 0, 2) * Bview(e, 2, 2); - - Cview(e, 1, 0) = Aview(e, 1, 0) * Bview(e, 0, 0) - + Aview(e, 1, 1) * Bview(e, 1, 0) - + Aview(e, 1, 2) * Bview(e, 2, 0); - Cview(e, 1, 1) = Aview(e, 1, 0) * Bview(e, 0, 1) - + Aview(e, 1, 1) * Bview(e, 1, 1) - + Aview(e, 1, 2) * Bview(e, 2, 1); - Cview(e, 1, 2) = Aview(e, 1, 0) * Bview(e, 0, 2) - + Aview(e, 1, 1) * Bview(e, 1, 2) - + Aview(e, 1, 2) * Bview(e, 2, 2); - - Cview(e, 2, 0) = Aview(e, 2, 0) * Bview(e, 0, 0) - + Aview(e, 2, 1) * Bview(e, 1, 0) - + Aview(e, 2, 2) * Bview(e, 2, 0); - Cview(e, 2, 1) = Aview(e, 2, 0) * Bview(e, 0, 1) - + Aview(e, 2, 1) * Bview(e, 1, 1) - + Aview(e, 2, 2) * Bview(e, 2, 1); - Cview(e, 2, 2) = Aview(e, 2, 0) * Bview(e, 0, 2) - + Aview(e, 2, 1) * Bview(e, 1, 2) - + Aview(e, 2, 2) * Bview(e, 2, 2); - - }); - timer.stop(); - - RAJA::Timer::ElapsedType tMin = timer.elapsed(); - if (tMin < minRun) minRun = tMin; - timer.reset(); - } - - std::cout<< "\trun time: "<< minRun << " seconds" << std::endl; - checkResult(Cview, N, N_r, N_c); - -//----------------------------------------------------------------------------// - - std::cout << " \n Performing batched matrix multiplication" - << " with layout 2 (RAJA - cuda) ... 
" << std::endl; - - minRun = std::numeric_limits::max(); - for (int i = 0; i < NITER; ++i) { - - timer.start(); - RAJA::forall>( - RAJA::RangeSegment(0, N), [=] RAJA_DEVICE(Index_type e) { - - Cview2(e, 0, 0) = Aview2(e, 0, 0) * Bview2(e, 0, 0) - + Aview2(e, 0, 1) * Bview2(e, 1, 0) - + Aview2(e, 0, 2) * Bview2(e, 2, 0); - Cview2(e, 0, 1) = Aview2(e, 0, 0) * Bview2(e, 0, 1) - + Aview2(e, 0, 1) * Bview2(e, 1, 1) - + Aview2(e, 0, 2) * Bview2(e, 2, 1); - Cview2(e, 0, 2) = Aview2(e, 0, 0) * Bview2(e, 0, 2) - + Aview2(e, 0, 1) * Bview2(e, 1, 2) - + Aview2(e, 0, 2) * Bview2(e, 2, 2); - - Cview2(e, 1, 0) = Aview2(e, 1, 0) * Bview2(e, 0, 0) - + Aview2(e, 1, 1) * Bview2(e, 1, 0) - + Aview2(e, 1, 2) * Bview2(e, 2, 0); - Cview2(e, 1, 1) = Aview2(e, 1, 0) * Bview2(e, 0, 1) - + Aview2(e, 1, 1) * Bview2(e, 1, 1) - + Aview2(e, 1, 2) * Bview2(e, 2, 1); - Cview2(e, 1, 2) = Aview2(e, 1, 0) * Bview2(e, 0, 2) - + Aview2(e, 1, 1) * Bview2(e, 1, 2) - + Aview2(e, 1, 2) * Bview2(e, 2, 2); - - Cview2(e, 2, 0) = Aview2(e, 2, 0) * Bview2(e, 0, 0) - + Aview2(e, 2, 1) * Bview2(e, 1, 0) - + Aview2(e, 2, 2) * Bview2(e, 2, 0); - Cview2(e, 2, 1) = Aview2(e, 2, 0) * Bview2(e, 0, 1) - + Aview2(e, 2, 1) * Bview2(e, 1, 1) - + Aview2(e, 2, 2) * Bview2(e, 2, 1); - Cview2(e, 2, 2) = Aview2(e, 2, 0) * Bview2(e, 0, 2) - + Aview2(e, 2, 1) * Bview2(e, 1, 2) - + Aview2(e, 2, 2) * Bview2(e, 2, 2); - - }); - timer.stop(); - - RAJA::Timer::ElapsedType tMin = timer.elapsed(); - if (tMin < minRun) minRun = tMin; - timer.reset(); - } - std::cout<< "\trun time : "<< minRun << " seconds" << std::endl; - checkResult(Cview2, N, N_r, N_c); -#endif - -//----------------------------------------------------------------------------// - -#if defined(RAJA_ENABLE_HIP) - - std::cout << " \n Performing batched matrix multiplication" - << " with layout 1 (RAJA - hip) ... 
" << std::endl; - - double *d_A = memoryManager::allocate_gpu(N_c * N_r * N); - double *d_B = memoryManager::allocate_gpu(N_c * N_r * N); - double *d_C = memoryManager::allocate_gpu(N_c * N_r * N); - - double *d_A2 = memoryManager::allocate_gpu(N_c * N_r * N); - double *d_B2 = memoryManager::allocate_gpu(N_c * N_r * N); - double *d_C2 = memoryManager::allocate_gpu(N_c * N_r * N); - - RAJA::View> d_Aview(d_A, layout1); - RAJA::View> d_Bview(d_B, layout1); - RAJA::View> d_Cview(d_C, layout1); - - RAJA::View> d_Aview2(d_A2, layout2); - RAJA::View> d_Bview2(d_B2, layout2); - RAJA::View> d_Cview2(d_C2, layout2); - - hipErrchk(hipMemcpy( d_A, A, N_c * N_r * N * sizeof(double), hipMemcpyHostToDevice )); - hipErrchk(hipMemcpy( d_B, B, N_c * N_r * N * sizeof(double), hipMemcpyHostToDevice )); - hipErrchk(hipMemcpy( d_A2, A2, N_c * N_r * N * sizeof(double), hipMemcpyHostToDevice )); - hipErrchk(hipMemcpy( d_B2, B2, N_c * N_r * N * sizeof(double), hipMemcpyHostToDevice )); - - minRun = std::numeric_limits::max(); - for (int i = 0; i < NITER; ++i) { - - timer.start(); - RAJA::forall>( - RAJA::RangeSegment(0, N), [=] RAJA_DEVICE(Index_type e) { - - d_Cview(e, 0, 0) = d_Aview(e, 0, 0) * d_Bview(e, 0, 0) - + d_Aview(e, 0, 1) * d_Bview(e, 1, 0) - + d_Aview(e, 0, 2) * d_Bview(e, 2, 0); - d_Cview(e, 0, 1) = d_Aview(e, 0, 0) * d_Bview(e, 0, 1) - + d_Aview(e, 0, 1) * d_Bview(e, 1, 1) - + d_Aview(e, 0, 2) * d_Bview(e, 2, 1); - d_Cview(e, 0, 2) = d_Aview(e, 0, 0) * d_Bview(e, 0, 2) - + d_Aview(e, 0, 1) * d_Bview(e, 1, 2) - + d_Aview(e, 0, 2) * d_Bview(e, 2, 2); - - d_Cview(e, 1, 0) = d_Aview(e, 1, 0) * d_Bview(e, 0, 0) - + d_Aview(e, 1, 1) * d_Bview(e, 1, 0) - + d_Aview(e, 1, 2) * d_Bview(e, 2, 0); - d_Cview(e, 1, 1) = d_Aview(e, 1, 0) * d_Bview(e, 0, 1) - + d_Aview(e, 1, 1) * d_Bview(e, 1, 1) - + d_Aview(e, 1, 2) * d_Bview(e, 2, 1); - d_Cview(e, 1, 2) = d_Aview(e, 1, 0) * d_Bview(e, 0, 2) - + d_Aview(e, 1, 1) * d_Bview(e, 1, 2) - + d_Aview(e, 1, 2) * d_Bview(e, 2, 2); - - d_Cview(e, 2, 0) = d_Aview(e, 2, 0) * d_Bview(e, 0, 0) - + d_Aview(e, 2, 1) * d_Bview(e, 1, 0) - + d_Aview(e, 2, 2) * d_Bview(e, 2, 0); - d_Cview(e, 2, 1) = d_Aview(e, 2, 0) * d_Bview(e, 0, 1) - + d_Aview(e, 2, 1) * d_Bview(e, 1, 1) - + d_Aview(e, 2, 2) * d_Bview(e, 2, 1); - d_Cview(e, 2, 2) = d_Aview(e, 2, 0) * d_Bview(e, 0, 2) - + d_Aview(e, 2, 1) * d_Bview(e, 1, 2) - + d_Aview(e, 2, 2) * d_Bview(e, 2, 2); - - }); - timer.stop(); - - RAJA::Timer::ElapsedType tMin = timer.elapsed(); - if (tMin < minRun) minRun = tMin; - timer.reset(); - } - - hipErrchk(hipMemcpy( C, d_C, N_c * N_r * N * sizeof(double), hipMemcpyDeviceToHost )); - - std::cout<< "\trun time: "<< minRun << " seconds" << std::endl; - checkResult(Cview, N, N_r, N_c); - -//----------------------------------------------------------------------------// - - std::cout << " \n Performing batched matrix multiplication" - << " with layout 2 (RAJA - hip) ... 
" << std::endl; - - minRun = std::numeric_limits::max(); - for (int i = 0; i < NITER; ++i) { - - timer.start(); - RAJA::forall>( - RAJA::RangeSegment(0, N), [=] RAJA_DEVICE(Index_type e) { - - d_Cview2(e, 0, 0) = d_Aview2(e, 0, 0) * d_Bview2(e, 0, 0) - + d_Aview2(e, 0, 1) * d_Bview2(e, 1, 0) - + d_Aview2(e, 0, 2) * d_Bview2(e, 2, 0); - d_Cview2(e, 0, 1) = d_Aview2(e, 0, 0) * d_Bview2(e, 0, 1) - + d_Aview2(e, 0, 1) * d_Bview2(e, 1, 1) - + d_Aview2(e, 0, 2) * d_Bview2(e, 2, 1); - d_Cview2(e, 0, 2) = d_Aview2(e, 0, 0) * d_Bview2(e, 0, 2) - + d_Aview2(e, 0, 1) * d_Bview2(e, 1, 2) - + d_Aview2(e, 0, 2) * d_Bview2(e, 2, 2); - - d_Cview2(e, 1, 0) = d_Aview2(e, 1, 0) * d_Bview2(e, 0, 0) - + d_Aview2(e, 1, 1) * d_Bview2(e, 1, 0) - + d_Aview2(e, 1, 2) * d_Bview2(e, 2, 0); - d_Cview2(e, 1, 1) = d_Aview2(e, 1, 0) * d_Bview2(e, 0, 1) - + d_Aview2(e, 1, 1) * d_Bview2(e, 1, 1) - + d_Aview2(e, 1, 2) * d_Bview2(e, 2, 1); - d_Cview2(e, 1, 2) = d_Aview2(e, 1, 0) * d_Bview2(e, 0, 2) - + d_Aview2(e, 1, 1) * d_Bview2(e, 1, 2) - + d_Aview2(e, 1, 2) * d_Bview2(e, 2, 2); - - d_Cview2(e, 2, 0) = d_Aview2(e, 2, 0) * d_Bview2(e, 0, 0) - + d_Aview2(e, 2, 1) * d_Bview2(e, 1, 0) - + d_Aview2(e, 2, 2) * d_Bview2(e, 2, 0); - d_Cview2(e, 2, 1) = d_Aview2(e, 2, 0) * d_Bview2(e, 0, 1) - + d_Aview2(e, 2, 1) * d_Bview2(e, 1, 1) - + d_Aview2(e, 2, 2) * d_Bview2(e, 2, 1); - d_Cview2(e, 2, 2) = d_Aview2(e, 2, 0) * d_Bview2(e, 0, 2) - + d_Aview2(e, 2, 1) * d_Bview2(e, 1, 2) - + d_Aview2(e, 2, 2) * d_Bview2(e, 2, 2); - - }); - timer.stop(); - - RAJA::Timer::ElapsedType tMin = timer.elapsed(); - if (tMin < minRun) minRun = tMin; - timer.reset(); - } - - hipErrchk(hipMemcpy( C2, d_C2, N_c * N_r * N * sizeof(double), hipMemcpyDeviceToHost )); - - std::cout<< "\trun time : "<< minRun << " seconds" << std::endl; - checkResult(Cview2, N, N_r, N_c); - - memoryManager::deallocate_gpu(d_A); - memoryManager::deallocate_gpu(d_B); - memoryManager::deallocate_gpu(d_C); - memoryManager::deallocate_gpu(d_A2); - memoryManager::deallocate_gpu(d_B2); - memoryManager::deallocate_gpu(d_C2); -#endif - -//----------------------------------------------------------------------------// - -// -// Clean up. 
-// - memoryManager::deallocate(A); - memoryManager::deallocate(B); - memoryManager::deallocate(C); - memoryManager::deallocate(A2); - memoryManager::deallocate(B2); - memoryManager::deallocate(C2); - - std::cout << "\n DONE!...\n"; - return 0; -} - -// -// check result -// -template -void checkResult(T C, Index_type noMat, int nRows, int nCols) -{ - - bool status = true; - for (int e = 0; e < noMat; ++e) { - for (int row = 0; row < nRows; ++row) { - for (int col = 0; col < nCols; ++col) { - if (std::abs(C(e, row, col) - row * col * nCols) > 10e-12) { - status = false; - } - } - } - } - - if ( status ) { - std::cout << "\tresult -- PASS\n"; - } else { - std::cout << "\tresult -- FAIL\n"; - } -} diff --git a/examples/tut_halo-exchange.cpp b/examples/tut_halo-exchange.cpp index eea8493812..a225161e78 100644 --- a/examples/tut_halo-exchange.cpp +++ b/examples/tut_halo-exchange.cpp @@ -251,7 +251,7 @@ int main(int argc, char **argv) std::vector unpack_index_lists(num_neighbors, nullptr); std::vector unpack_index_list_lengths(num_neighbors, 0); create_unpack_lists(unpack_index_lists, unpack_index_list_lengths, halo_width, grid_dims); - // _halo_exchange_index_lisgeneratete_end + // _halo_exchange_index_list_generate_end // @@ -489,7 +489,8 @@ int main(int argc, char **argv) using workgroup_policy = RAJA::WorkGroupPolicy < RAJA::loop_work, RAJA::ordered, - RAJA::ragged_array_of_objects >; + RAJA::ragged_array_of_objects, + RAJA::indirect_function_call_dispatch >; using workpool = RAJA::WorkPool< workgroup_policy, int, @@ -733,7 +734,8 @@ int main(int argc, char **argv) using workgroup_policy = RAJA::WorkGroupPolicy < RAJA::omp_work, RAJA::ordered, - RAJA::ragged_array_of_objects >; + RAJA::ragged_array_of_objects, + RAJA::indirect_function_call_dispatch >; using workpool = RAJA::WorkPool< workgroup_policy, int, @@ -1047,7 +1049,8 @@ int main(int argc, char **argv) using workgroup_policy = RAJA::WorkGroupPolicy < RAJA::cuda_work_async, RAJA::unordered_cuda_loop_y_block_iter_x_threadblock_average, - RAJA::constant_stride_array_of_objects >; + RAJA::constant_stride_array_of_objects, + RAJA::indirect_function_call_dispatch >; using workpool = RAJA::WorkPool< workgroup_policy, int, @@ -1343,11 +1346,12 @@ int main(int argc, char **argv) } +#if defined(RAJA_ENABLE_HIP_INDIRECT_FUNCTION_CALL) //----------------------------------------------------------------------------// // RAJA::WorkGroup with hip_work allows deferred kernel fusion execution //----------------------------------------------------------------------------// { - std::cout << "\n Running RAJA Hip workgroup halo exchange...\n"; + std::cout << "\n Running RAJA Hip indirect dispatch workgroup halo exchange...\n"; double minCycle = std::numeric_limits::max(); @@ -1380,12 +1384,9 @@ int main(int argc, char **argv) using workgroup_policy = RAJA::WorkGroupPolicy < RAJA::hip_work_async, -#if defined(RAJA_ENABLE_HIP_INDIRECT_FUNCTION_CALL) RAJA::unordered_hip_loop_y_block_iter_x_threadblock_average, -#else - RAJA::ordered, -#endif - RAJA::constant_stride_array_of_objects >; + RAJA::constant_stride_array_of_objects, + RAJA::indirect_function_call_dispatch >; using workpool = RAJA::WorkPool< workgroup_policy, int, @@ -1518,6 +1519,196 @@ int main(int argc, char **argv) } + std::cout<< "\tmin cycle run time : " << minCycle << " seconds" << std::endl; + + // check results against reference copy + checkResult(vars, vars_ref, var_size, num_vars); + //printResult(vars, var_size, num_vars); + } +#endif + 
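The added block that follows is the direct-dispatch counterpart of the indirect-dispatch HIP variant above: rather than resolving each enqueued loop body through a device function pointer, the policy enumerates the (segment, loop-body) pairs the pool may hold, which lets the fused kernel avoid indirect calls at the cost of fixing that set at compile time. As a rough orientation for the hunk, a minimal sketch of the policy and enqueue pattern is given below; it is not part of the patch and assumes the Packer/UnPacker functors and pinned_allocator used in this example and the RAJA WorkGroup API of the v2022.x releases.

    // Sketch only -- not part of the patch.
    using range_segment = RAJA::TypedRangeSegment<int>;

    using workgroup_policy = RAJA::WorkGroupPolicy<
        RAJA::hip_work_async,
        RAJA::unordered_hip_loop_y_block_iter_x_threadblock_average,
        RAJA::constant_stride_array_of_objects,
        RAJA::direct_dispatch<camp::list<range_segment, Packer>,
                              camp::list<range_segment, UnPacker>>>;

    using workpool = RAJA::WorkPool<workgroup_policy, int,
                                    RAJA::xargs<>, pinned_allocator<char>>;

    workpool pool(pinned_allocator<char>{});
    // Only the (segment, loop-body) combinations named in direct_dispatch
    // can be enqueued; other callable types will not compile.
    pool.enqueue(range_segment(0, len), Packer{buffer, var, list});
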
+//----------------------------------------------------------------------------// +// RAJA::WorkGroup with hip_work allows deferred kernel fusion execution +//----------------------------------------------------------------------------// + { + std::cout << "\n Running RAJA Hip direct dispatch workgroup halo exchange...\n"; + + double minCycle = std::numeric_limits::max(); + + + std::vector hip_vars(num_vars, nullptr); + std::vector hip_pack_index_lists(num_neighbors, nullptr); + std::vector hip_unpack_index_lists(num_neighbors, nullptr); + + for (int v = 0; v < num_vars; ++v) { + hip_vars[v] = memoryManager::allocate_gpu(var_size); + } + + for (int l = 0; l < num_neighbors; ++l) { + int pack_len = pack_index_list_lengths[l]; + hip_pack_index_lists[l] = memoryManager::allocate_gpu(pack_len); + hipErrchk(hipMemcpy( hip_pack_index_lists[l], pack_index_lists[l], pack_len * sizeof(int), hipMemcpyHostToDevice )); + + int unpack_len = unpack_index_list_lengths[l]; + hip_unpack_index_lists[l] = memoryManager::allocate_gpu(unpack_len); + hipErrchk(hipMemcpy( hip_unpack_index_lists[l], unpack_index_lists[l], unpack_len * sizeof(int), hipMemcpyHostToDevice )); + } + + std::swap(vars, hip_vars); + std::swap(pack_index_lists, hip_pack_index_lists); + std::swap(unpack_index_lists, hip_unpack_index_lists); + + + using forall_policy = RAJA::hip_exec_async; + + struct Packer { + double* buffer; + double* var; + int* list; + RAJA_DEVICE void operator() (int i) const { + buffer[i] = var[list[i]]; + } + }; + + struct UnPacker { + double* buffer; + double* var; + int* list; + RAJA_DEVICE void operator()(int i) const { + var[list[i]] = buffer[i]; + } + }; + + using workgroup_policy = RAJA::WorkGroupPolicy < + RAJA::hip_work_async, + RAJA::unordered_hip_loop_y_block_iter_x_threadblock_average, + RAJA::constant_stride_array_of_objects, + RAJA::direct_dispatch, + camp::list> + >; + + using workpool = RAJA::WorkPool< workgroup_policy, + int, + RAJA::xargs<>, + pinned_allocator >; + + using workgroup = RAJA::WorkGroup< workgroup_policy, + int, + RAJA::xargs<>, + pinned_allocator >; + + using worksite = RAJA::WorkSite< workgroup_policy, + int, + RAJA::xargs<>, + pinned_allocator >; + + std::vector buffers(num_neighbors, nullptr); + + for (int l = 0; l < num_neighbors; ++l) { + + int buffer_len = num_vars * pack_index_list_lengths[l]; + + buffers[l] = memoryManager::allocate_gpu(buffer_len); + + } + + workpool pool_pack (pinned_allocator{}); + workpool pool_unpack(pinned_allocator{}); + + for (int c = 0; c < num_cycles; ++c ) { + timer.start(); + { + + // set vars + for (int v = 0; v < num_vars; ++v) { + + double* var = vars[v]; + + RAJA::forall(range_segment(0, var_size), [=] RAJA_DEVICE (int i) { + var[i] = i + v; + }); + } + + for (int l = 0; l < num_neighbors; ++l) { + + double* buffer = buffers[l]; + int* list = pack_index_lists[l]; + int len = pack_index_list_lengths[l]; + + // pack + for (int v = 0; v < num_vars; ++v) { + + double* var = vars[v]; + + pool_pack.enqueue(range_segment(0, len), Packer{buffer, var, list}); + + buffer += len; + } + } + + workgroup group_pack = pool_pack.instantiate(); + + worksite site_pack = group_pack.run(); + + hipErrchk(hipDeviceSynchronize()); + + // send all messages + + // recv all messages + + for (int l = 0; l < num_neighbors; ++l) { + + double* buffer = buffers[l]; + int* list = unpack_index_lists[l]; + int len = unpack_index_list_lengths[l]; + + // unpack + for (int v = 0; v < num_vars; ++v) { + + double* var = vars[v]; + + pool_unpack.enqueue(range_segment(0, len), 
UnPacker{buffer, var, list}); + + buffer += len; + } + } + + workgroup group_unpack = pool_unpack.instantiate(); + + worksite site_unpack = group_unpack.run(); + + hipErrchk(hipDeviceSynchronize()); + + } + timer.stop(); + + RAJA::Timer::ElapsedType tCycle = timer.elapsed(); + if (tCycle < minCycle) minCycle = tCycle; + timer.reset(); + } + + for (int l = 0; l < num_neighbors; ++l) { + + memoryManager::deallocate_gpu(buffers[l]); + + } + + + std::swap(vars, hip_vars); + std::swap(pack_index_lists, hip_pack_index_lists); + std::swap(unpack_index_lists, hip_unpack_index_lists); + + for (int v = 0; v < num_vars; ++v) { + hipErrchk(hipMemcpy( vars[v], hip_vars[v], var_size * sizeof(double), hipMemcpyDeviceToHost )); + memoryManager::deallocate_gpu(hip_vars[v]); + } + + for (int l = 0; l < num_neighbors; ++l) { + memoryManager::deallocate_gpu(hip_pack_index_lists[l]); + memoryManager::deallocate_gpu(hip_unpack_index_lists[l]); + } + + std::cout<< "\tmin cycle run time : " << minCycle << " seconds" << std::endl; // check results against reference copy diff --git a/examples/tut_indexset-segments.cpp b/examples/tut_indexset-segments.cpp deleted file mode 100644 index d072a6618e..0000000000 --- a/examples/tut_indexset-segments.cpp +++ /dev/null @@ -1,465 +0,0 @@ -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2016-22, Lawrence Livermore National Security, LLC -// and RAJA project contributors. See the RAJA/LICENSE file for details. -// -// SPDX-License-Identifier: (BSD-3-Clause) -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - -#include -#include -#include -#include -#include - -#include "memoryManager.hpp" - -#include "RAJA/RAJA.hpp" - -#include "camp/resource.hpp" - -/* - * Index sets and Segments Example - * - * This example uses the daxpy kernel from a previous example. It - * illustrates how to use RAJA index sets and segments. This is - * important for applications and algorithms that need to use - * indirection arrays for irregular access. Combining range and - * list segments in a single index set, when possible, can - * increase performance by allowing compilers to optimize for - * specific segment types (e.g., SIMD for range segments). - * - * RAJA features shown: - * - `forall` loop iteration template method - * - Index range segment - * - Index list segment - * - Strided index range segment - * - TypedIndexSet segment container - * - Hierarchical execution policies - * - * If CUDA is enabled, CUDA unified memory is used. 
- */ - -/* - CUDA_BLOCK_SIZE - specifies the number of threads in a CUDA thread block -*/ -#if defined(RAJA_ENABLE_CUDA) -const int CUDA_BLOCK_SIZE = 256; -#endif - -#if defined(RAJA_ENABLE_HIP) -const int HIP_BLOCK_SIZE = 256; -#endif - -//----------------------------------------------------------------------------// -// Define types for ListSegments and indices used in examples -//----------------------------------------------------------------------------// -// _raja_list_segment_type_start -using IdxType = RAJA::Index_type; -using ListSegType = RAJA::TypedListSegment; -// _raja_list_segment_type_end - -// -// Functions to check and print results -// -void checkResult(double* v1, double* v2, IdxType len); -void printResult(double* v, int len); - -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) -{ - - std::cout << "\n\nRAJA index sets and segments example...\n"; - -// -// Define vector length -// - const IdxType N = 1000000; - -// -// Allocate and initialize vector data -// - double* a0 = memoryManager::allocate(N); - double* aref = memoryManager::allocate(N); - - double* a = memoryManager::allocate(N); - double* b = memoryManager::allocate(N); - - double c = 3.14159; - - for (IdxType i = 0; i < N; i++) { - a0[i] = 1.0; - b[i] = 2.0; - } - - -//----------------------------------------------------------------------------// -// -// C-version of the daxpy kernel to set the reference result. -// - std::cout << "\n Running C-version of daxpy to set reference result...\n"; - - std::memcpy( aref, a0, N * sizeof(double) ); - - for (IdxType i = 0; i < N; i++) { - aref[i] += b[i] * c; - } - -//printResult(a, N); - -//----------------------------------------------------------------------------// -// -// In the following, we show RAJA versions of the daxpy operation and -// using different Segment constructs and TypedIndexSets. These are all -// run sequentially. The only thing that changes in these versions is -// the object passed to the 'forall' method that defines the iteration -// space. -// -//----------------------------------------------------------------------------// - - std::cout << "\n Running RAJA range segment daxpy...\n"; - - std::memcpy( a, a0, N * sizeof(double) ); - - // _rajaseq_daxpy_range_start - RAJA::forall(RAJA::RangeSegment(0, N), [=] (IdxType i) { - a[i] += b[i] * c; - }); - // _rajaseq_daxpy_range_end - - checkResult(a, aref, N); -//printResult(a, N); - -//----------------------------------------------------------------------------// -// Resource object used to construct list segment objects with indices -// living in host (CPU) memory. 
- - camp::resources::Resource host_res{camp::resources::Host()}; - - -// -// RAJA list segment version #1 -// - std::cout << "\n Running RAJA list segment daxpy...\n"; - - std::memcpy( a, a0, N * sizeof(double) ); - -// -// Collect indices in a vector to create list segment -// -// - // _rajaseq_daxpy_list_start - std::vector idx; - for (IdxType i = 0; i < N; ++i) { - idx.push_back(i); - } - - ListSegType idx_list( &idx[0], idx.size(), host_res ); - - RAJA::forall(idx_list, [=] (IdxType i) { - a[i] += b[i] * c; - }); - // _rajaseq_daxpy_list_end - - checkResult(a, aref, N); -//printResult(a, N); - -//----------------------------------------------------------------------------// -// -// RAJA list segment version #2 -// - std::cout << "\n Running RAJA list segment daxpy with indices reversed...\n"; - - std::memcpy( a, a0, N * sizeof(double) ); - -// -// Reverse the order of indices in the vector -// - // _raja_list_segment_daxpy_reverse_start - std::reverse( idx.begin(), idx.end() ); - - ListSegType idx_reverse_list( &idx[0], idx.size(), host_res ); - - RAJA::forall(idx_reverse_list, [=] (IdxType i) { - a[i] += b[i] * c; - }); - // _raja_list_segment_daxpy_reverse_end - - checkResult(a, aref, N); -//printResult(a, N); - -//----------------------------------------------------------------------------// -// -// Alternatively, we can also use a RAJA strided range segment to run the -// loop in reverse. -// - std::cout << "\n Running RAJA daxpy with indices reversed via negatively strided range segment...\n"; - - std::memcpy( a, a0, N * sizeof(double) ); - -// -// Reverse the order of indices in the vector -// - // _raja_range_segment_daxpy_negstride_start - RAJA::forall(RAJA::RangeStrideSegment(N-1, -1, -1), - [=] (IdxType i) { - a[i] += b[i] * c; - }); - // _raja_range_segment_daxpy_negstride_end - - checkResult(a, aref, N); -//printResult(a, N); - -//----------------------------------------------------------------------------// - -// -// Sequential index set execution policy used in several of the following -// example implementations. -// - - // _raja_seq_indexset_policy_daxpy_start - using SEQ_ISET_EXECPOL = RAJA::ExecPolicy; - // _raja_seq_indexset_policy_daxpy_end - -//----------------------------------------------------------------------------// - - std::cout << "\n Running RAJA index set (ListSegment) daxpy...\n"; - - std::memcpy( a, a0, N * sizeof(double) ); - - // _raja_indexset_list_daxpy_start - RAJA::TypedIndexSet is1; - - is1.push_back( idx_list ); // use list segment created earlier. 
- - RAJA::forall(is1, [=] (IdxType i) { - a[i] += b[i] * c; - }); - // _raja_indexset_list_daxpy_end - - checkResult(a, aref, N); -//printResult(a, N); - - -//----------------------------------------------------------------------------// - - std::cout << "\n Running RAJA index set (2 RangeSegments) daxpy...\n"; - - std::memcpy( a, a0, N * sizeof(double) ); - - // _raja_indexset_2ranges_daxpy_start - RAJA::TypedIndexSet is2; - is2.push_back( RAJA::RangeSegment(0, N/2) ); - is2.push_back( RAJA::RangeSegment(N/2, N) ); - - RAJA::forall(is2, [=] (IdxType i) { - a[i] += b[i] * c; - }); - // _raja_indexset_2ranges_daxpy_end - - checkResult(a, aref, N); -//printResult(a, N); - - -//----------------------------------------------------------------------------// - - std::cout << "\n Running RAJA index set (2 RangeSegments, 1 ListSegment) daxpy...\n"; - - std::memcpy( a, a0, N * sizeof(double) ); - - // _raja_indexset_2ranges_1list_daxpy_start -// -// Collect indices in a vector to create list segment -// - std::vector idx1; - for (IdxType i = N/3; i < 2*N/3; ++i) { - idx1.push_back(i); - } - - ListSegType idx1_list( &idx1[0], idx1.size(), host_res ); - - RAJA::TypedIndexSet is3; - is3.push_back( RAJA::RangeSegment(0, N/3) ); - is3.push_back( idx1_list ); - is3.push_back( RAJA::RangeSegment(2*N/3, N) ); - - RAJA::forall(is3, [=] (IdxType i) { - a[i] += b[i] * c; - }); - // _raja_indexset_2ranges_1list_daxpy_end - - checkResult(a, aref, N); -//printResult(a, N); - - -//----------------------------------------------------------------------------// - -#if defined(RAJA_ENABLE_OPENMP) -// -// Run the previous version in parallel (2 different ways) just for fun... -// - - std::cout << - "\n Running RAJA index set (2 RangeSegments, 1 ListSegment) daxpy\n" << - " (sequential iteration over segments, OpenMP parallel segment execution)...\n"; - - std::memcpy( a, a0, N * sizeof(double) ); - - // _raja_indexset_ompinnerpolicy_daxpy_start - using OMP_ISET_EXECPOL1 = RAJA::ExecPolicy; - // _raja_indexset_ompinnerpolicy_daxpy_end - - RAJA::forall(is3, [=] (IdxType i) { - a[i] += b[i] * c; - }); - - checkResult(a, aref, N); -//printResult(a, N); - - -//----------------------------------------------------------------------------// - - std::cout << - "\n Running RAJA index set (2 RangeSegments, 1 ListSegment) daxpy\n" << - " (OpenMP parallel iteration over segments, sequential segment execution)...\n"; - - std::memcpy( a, a0, N * sizeof(double) ); - - // _raja_indexset_ompouterpolicy_daxpy_start - using OMP_ISET_EXECPOL2 = RAJA::ExecPolicy; - // _raja_indexset_ompouterpolicy_daxpy_end - - RAJA::forall(is3, [=] (IdxType i) { - a[i] += b[i] * c; - }); - - checkResult(a, aref, N); -//printResult(a, N); -#endif - -//----------------------------------------------------------------------------// - -#if defined(RAJA_ENABLE_CUDA) - -// -// We create a new resource object and index set so that list segment -// indices live in CUDA deviec memory. 
-// - camp::resources::Resource cuda_res{camp::resources::Cuda()}; - - ListSegType idx1_list_cuda( &idx1[0], idx1.size(), cuda_res ); - - RAJA::TypedIndexSet is3_cuda; - is3_cuda.push_back( RAJA::RangeSegment(0, N/3) ); - is3_cuda.push_back( idx1_list_cuda ); - is3_cuda.push_back( RAJA::RangeSegment(2*N/3, N) ); - - - std::cout << - "\n Running RAJA index set (2 RangeSegments, 1 ListSegment) daxpy\n" << - " (sequential iteration over segments, CUDA parallel segment execution)...\n"; - - // _raja_indexset_cudapolicy_daxpy_start - using CUDA_ISET_EXECPOL = RAJA::ExecPolicy>; - // _raja_indexset_cudapolicy_daxpy_end - - std::memcpy( a, a0, N * sizeof(double) ); - - RAJA::forall(is3_cuda, [=] RAJA_DEVICE (IdxType i) { - a[i] += b[i] * c; - }); - - checkResult(a, aref, N); -//printResult(a, N); -#endif - -//----------------------------------------------------------------------------// - -#if defined(RAJA_ENABLE_HIP) - -// -// We create a new resource object and index set so that list segment -// indices live in Hip deviec memory. -// - camp::resources::Resource hip_res{camp::resources::Hip()}; - - ListSegType idx1_list_hip( &idx1[0], idx1.size(), hip_res ); - - RAJA::TypedIndexSet is3_hip; - is3_hip.push_back( RAJA::RangeSegment(0, N/3) ); - is3_hip.push_back( idx1_list_hip ); - is3_hip.push_back( RAJA::RangeSegment(2*N/3, N) ); - - std::cout << - "\n Running RAJA index set (2 RangeSegments, 1 ListSegment) daxpy\n" << - " (sequential iteration over segments, HIP parallel segment execution)...\n"; - - // _raja_indexset_hippolicy_daxpy_start - using HIP_ISET_EXECPOL = RAJA::ExecPolicy>; - // _raja_indexset_hippolicy_daxpy_end - - double* d_a = memoryManager::allocate_gpu(N); - double* d_b = memoryManager::allocate_gpu(N); - - hipErrchk(hipMemcpy( d_a, a0, N * sizeof(double), hipMemcpyHostToDevice )); - hipErrchk(hipMemcpy( d_b, b, N * sizeof(double), hipMemcpyHostToDevice )); - - RAJA::forall(is3_hip, [=] RAJA_DEVICE (IdxType i) { - d_a[i] += d_b[i] * c; - }); - - hipErrchk(hipMemcpy( a, d_a, N * sizeof(double), hipMemcpyDeviceToHost )); - - checkResult(a, aref, N); -//printResult(a, N); - - memoryManager::deallocate_gpu(d_a); - memoryManager::deallocate_gpu(d_b); -#endif - -//----------------------------------------------------------------------------// - -// -// Clean up. -// - memoryManager::deallocate(a); - memoryManager::deallocate(b); - memoryManager::deallocate(a0); - memoryManager::deallocate(aref); - - std::cout << "\n DONE!...\n"; - - return 0; -} - -// -// Function to check result and report P/F. -// -void checkResult(double* v1, double* v2, IdxType len) -{ - bool match = true; - for (IdxType i = 0; i < len; i++) { - if ( v1[i] != v2[i] ) { match = false; } - } - if ( match ) { - std::cout << "\n\t result -- PASS\n"; - } else { - std::cout << "\n\t result -- FAIL\n"; - } -} - -// -// Function to print result. 
-// -void printResult(double* v, IdxType len) -{ - std::cout << std::endl; - for (IdxType i = 0; i < len; i++) { - std::cout << "result[" << i << "] = " << v[i] << std::endl; - } - std::cout << std::endl; -} - diff --git a/examples/tut_teams_basic.cpp b/examples/tut_launch_basic.cpp similarity index 64% rename from examples/tut_teams_basic.cpp rename to examples/tut_launch_basic.cpp index 06b04bd510..2ea0c83ea8 100644 --- a/examples/tut_teams_basic.cpp +++ b/examples/tut_launch_basic.cpp @@ -12,18 +12,18 @@ #include "RAJA/RAJA.hpp" /* - * Developing with RAJA Teams + * Developing with RAJA Launch * * This example serves as a basic overview of - * capabilities with the RAJA Teams API. + * capabilities with the RAJA Launch API. * * RAJA features shown: - * - RAJA::expt::launch - * - RAJA::expt::loop + * - RAJA::launch + * - RAJA::loop */ /* - * The RAJA teams framework enables developers + * The RAJA::Launch framework enables developers * to expressed algorithms in terms of nested * loops within an execution space. RAJA teams * enables run time selection of a host or @@ -34,27 +34,27 @@ */ // __host_launch_start -using host_launch = RAJA::expt::seq_launch_t; +using host_launch = RAJA::seq_launch_t; // __host_launch_end #if defined(RAJA_ENABLE_CUDA) // __device_launch_start -using device_launch = RAJA::expt::cuda_launch_t; +using device_launch = RAJA::cuda_launch_t; // __device_launch_end #elif defined(RAJA_ENABLE_HIP) -using device_launch = RAJA::expt::hip_launch_t; +using device_launch = RAJA::hip_launch_t; #endif -using launch_policy = RAJA::expt::LaunchPolicy< +using launch_policy = RAJA::LaunchPolicy< host_launch -#if defined(RAJA_DEVICE_ACTIVE) +#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) ,device_launch #endif >; /* - * RAJA teams follows a similar thread/block programming model - * as found in CUDA/HIP/SYCL. Loops within an execution + * RAJA launch exposes a thread/block programming model + * as used in CUDA/HIP/SYCL. Loops within an execution * maybe mapped to either threads or teams. Under this * programming model, computation is performed with * a collection of threads which are grouped into teams. @@ -64,7 +64,7 @@ using launch_policy = RAJA::expt::LaunchPolicy< * On the host the loops expands to standard C style for loops. 
*/ -using teams_x = RAJA::expt::LoopPolicy< +using teams_x = RAJA::LoopPolicy< RAJA::loop_exec #if defined(RAJA_ENABLE_CUDA) , @@ -76,7 +76,7 @@ using teams_x = RAJA::expt::LoopPolicy< #endif >; -using teams_y = RAJA::expt::LoopPolicy< +using teams_y = RAJA::LoopPolicy< RAJA::loop_exec #if defined(RAJA_ENABLE_CUDA) , @@ -88,7 +88,7 @@ using teams_y = RAJA::expt::LoopPolicy< #endif >; -using threads_x = RAJA::expt::LoopPolicy; -using threads_y = RAJA::expt::LoopPolicy; -#if defined(RAJA_DEVICE_ACTIVE) +#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) __global__ void gpuKernel() { //Equivalent CUDA/HIP style thread/block mapping @@ -133,29 +133,35 @@ __global__ void gpuKernel() } #endif -int main(int argc, char *argv[]) +#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) +int main(int argc, char* argv[]) +#else +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv)) +#endif { +#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) + if(argc != 2) { - RAJA_ABORT_OR_THROW("Usage ./tut_teams_basic host or ./tut_teams_basic device"); + RAJA_ABORT_OR_THROW("Usage ./tut_launch_basic host or ./tut_launch_basic device"); } // // Run time policy section is demonstrated in this example by specifying // kernel exection space as a command line argument (host or device). -// Example usage ./tut_teams_basic host or ./tut_teams_basic device +// Example usage ./tut_launch_basic host or ./tut_launch_basic device // std::string exec_space = argv[1]; if(!(exec_space.compare("host") == 0 || exec_space.compare("device") == 0 )){ - RAJA_ABORT_OR_THROW("Usage ./tut_teams_basic host or ./tut_teams_basic device"); + RAJA_ABORT_OR_THROW("Usage ./tut_launch_basic host or ./tut_launch_basic device"); return 0; } - RAJA::expt::ExecPlace select_cpu_or_gpu; + RAJA::ExecPlace select_cpu_or_gpu; if(exec_space.compare("host") == 0) - { select_cpu_or_gpu = RAJA::expt::HOST; printf("Running RAJA-Teams on the host \n"); } + { select_cpu_or_gpu = RAJA::ExecPlace::HOST; printf("Running RAJA-Teams on the host \n"); } if(exec_space.compare("device") == 0) - { select_cpu_or_gpu = RAJA::expt::DEVICE; printf("Running RAJA-Teams on the device \n"); } + { select_cpu_or_gpu = RAJA::ExecPlace::DEVICE; printf("Running RAJA-Teams on the device \n"); } // // The following three kernels illustrate loop based parallelism @@ -168,20 +174,22 @@ int main(int argc, char *argv[]) const int Nteams = 2; const int Nthreads = 2; // __compute_grid_end - - RAJA::expt::launch(select_cpu_or_gpu, - RAJA::expt::Grid(RAJA::expt::Teams(Nteams,Nteams), - RAJA::expt::Threads(Nthreads,Nthreads)), - [=] RAJA_HOST_DEVICE (RAJA::expt::LaunchContext ctx) { - // _team_loops_start - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Nteams), [&] (int by) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Nteams), [&] (int bx) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Nthreads), [&] (int ty) { - RAJA::expt::loop(ctx, RAJA::RangeSegment(0, Nthreads), [&] (int tx) { + RAJA::launch(select_cpu_or_gpu, + RAJA::LaunchParams(RAJA::Teams(Nteams,Nteams), + RAJA::Threads(Nthreads,Nthreads)), + + [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { + + // _team_loops_start + RAJA::loop(ctx, RAJA::TypedRangeSegment(0, Nteams), [&] (int by) { + RAJA::loop(ctx, RAJA::TypedRangeSegment(0, Nteams), [&] (int bx) { + + RAJA::loop(ctx, RAJA::TypedRangeSegment(0, Nthreads), [&] (int ty) { + RAJA::loop(ctx, RAJA::TypedRangeSegment(0, Nthreads), [&] (int tx) { + printf("RAJA Teams: threadId_x %d threadId_y %d teamId_x %d teamId_y %d \n", + tx, ty, bx, by); - 
printf("RAJA Teams: threadId_x %d threadId_y %d teamId_x %d teamId_y %d \n", - tx, ty, bx, by); }); }); @@ -193,16 +201,16 @@ int main(int argc, char *argv[]) }); //Equivalent C style loops - if(select_cpu_or_gpu == RAJA::expt::HOST) { + if(select_cpu_or_gpu == RAJA::ExecPlace::HOST) { // _c_style_loops_start - for(int by=0; by>>(); cudaDeviceSynchronize(); #endif #if defined(RAJA_ENABLE_HIP) - if(select_cpu_or_gpu == RAJA::expt::DEVICE) + if(select_cpu_or_gpu == RAJA::ExecPlace::DEVICE) hipLaunchKernelGGL((gpuKernel), dim3(griddim), dim3(blockdim), 0, 0); hipDeviceSynchronize(); #endif +#else + std::cout << "Please build with CUDA or HIP to run this example ...\n"; +#endif + return 0; } diff --git a/examples/tut_matrix-multiply.cpp b/examples/tut_matrix-multiply.cpp index 8d4b4adeab..be9bebbd11 100644 --- a/examples/tut_matrix-multiply.cpp +++ b/examples/tut_matrix-multiply.cpp @@ -150,9 +150,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // row, column, and dot-product loops for RAJA variants // // _matmult_ranges_start - RAJA::RangeSegment row_range(0, N); - RAJA::RangeSegment col_range(0, N); - RAJA::RangeSegment dot_range(0, N); + RAJA::TypedRangeSegment row_range(0, N); + RAJA::TypedRangeSegment col_range(0, N); + RAJA::TypedRangeSegment dot_range(0, N); // _matmult_ranges_end //----------------------------------------------------------------------------// @@ -1012,10 +1012,11 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) Shmem aShared, bShared, cShared; - RAJA::kernel_param(RAJA::make_tuple(RAJA::RangeSegment(0, N), - RAJA::RangeSegment(0, N), - RAJA::RangeSegment(0, N)), - RAJA::make_tuple(aShared, bShared, cShared), + RAJA::kernel_param( + RAJA::make_tuple(RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N)), + RAJA::make_tuple(aShared, bShared, cShared), // Zero out thread local memory for storing dot products [=] RAJA_HOST_DEVICE (int tn, int tp, Shmem &cShared) { diff --git a/examples/tut_nested-loop-reorder.cpp b/examples/tut_nested-loop-reorder.cpp deleted file mode 100644 index 9bcf4c5a5b..0000000000 --- a/examples/tut_nested-loop-reorder.cpp +++ /dev/null @@ -1,141 +0,0 @@ -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2016-22, Lawrence Livermore National Security, LLC -// and RAJA project contributors. See the RAJA/LICENSE file for details. -// -// SPDX-License-Identifier: (BSD-3-Clause) -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - -#include -#include - -#include "RAJA/RAJA.hpp" - -/* - * Nested Loop Reorder Example - * - * This example shows how to reorder RAJA nested loops by reordering - * nested policy arguments. It does no actual computation and just - * prints out the loop indices to show the different orderings. - * - * RAJA features shown: - * - Index range segment - * - 'RAJA::kernel' loop abstractions and execution policies - * - Nested loop reordering - * - Strongly-typed loop indices - */ - -// -// Define three named loop index types used in the triply-nested loop examples. -// These will trigger compilation errors if lambda index argument ordering -// and types do not match the typed range index ordering. 
-// -// _nestedreorder_idxtypes_start -RAJA_INDEX_VALUE(KIDX, "KIDX"); -RAJA_INDEX_VALUE(JIDX, "JIDX"); -RAJA_INDEX_VALUE(IIDX, "IIDX"); -// _nestedreorder_idxtypes_end - - -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) -{ - - std::cout << "\n\nRAJA nested loop reorder example...\n"; - -// -// Typed index ranges -// -// _nestedreorder_ranges_start - RAJA::TypedRangeSegment KRange(2, 4); - RAJA::TypedRangeSegment JRange(1, 3); - RAJA::TypedRangeSegment IRange(0, 2); -// _nestedreorder_ranges_end - -//----------------------------------------------------------------------------// - - std::cout << "\n Running loop reorder example (K-outer, J-middle, I-inner)" - << "...\n\n" << " (I, J, K)\n" << " ---------\n"; - - // _nestedreorder_kji_start - using KJI_EXECPOL = RAJA::KernelPolicy< - RAJA::statement::For<2, RAJA::seq_exec, // k - RAJA::statement::For<1, RAJA::seq_exec, // j - RAJA::statement::For<0, RAJA::seq_exec,// i - RAJA::statement::Lambda<0> - > - > - > - >; - - RAJA::kernel( RAJA::make_tuple(IRange, JRange, KRange), - [=] (IIDX i, JIDX j, KIDX k) { - printf( " (%d, %d, %d) \n", (int)(*i), (int)(*j), (int)(*k)); - }); - // _nestedreorder_kji_end - - -//----------------------------------------------------------------------------// - - std::cout << "\n Running loop reorder example (J-outer, I-middle, K-inner)" - << "...\n\n" << " (I, J, K)\n" << " ---------\n"; - - // _nestedreorder_jik_start - using JIK_EXECPOL = RAJA::KernelPolicy< - RAJA::statement::For<1, RAJA::seq_exec, // j - RAJA::statement::For<0, RAJA::seq_exec, // i - RAJA::statement::For<2, RAJA::seq_exec,// k - RAJA::statement::Lambda<0> - > - > - > - >; - // _nestedreorder_jik_end - - RAJA::kernel( RAJA::make_tuple(IRange, JRange, KRange), - [=] (IIDX i, JIDX j, KIDX k) { - printf( " (%d, %d, %d) \n", (int)(*i), (int)(*j), (int)(*k)); - }); - - -//----------------------------------------------------------------------------// - - std::cout << "\n Running loop reorder example (I-outer, K-middle, J-inner)" - << "...\n\n" << " (I, J, K)\n" << " ---------\n"; - - // _nestedreorder_ikj_start - using IKJ_EXECPOL = RAJA::KernelPolicy< - RAJA::statement::For<0, RAJA::seq_exec, // i - RAJA::statement::For<2, RAJA::seq_exec, // k - RAJA::statement::For<1, RAJA::seq_exec,// j - RAJA::statement::Lambda<0> - > - > - > - >; - // _nestedreorder_ikj_end - - RAJA::kernel( RAJA::make_tuple(IRange, JRange, KRange), - [=] (IIDX i, JIDX j, KIDX k) { - printf( " (%d, %d, %d) \n", (int)(*i), (int)(*j), (int)(*k)); - }); - - -#if 0 -//----------------------------------------------------------------------------// -// The following demonstrates that code will not compile if lambda argument -// types/order do not match the types/order of the For statements. -//----------------------------------------------------------------------------// - - // _nestedreorder_typemismatch_start - RAJA::kernel( RAJA::make_tuple(IRange, JRange, KRange), - [=] (JIDX i, IIDX j, KIDX k) { - printf( " (%d, %d, %d) \n", (int)(*i), (int)(*j), (int)(*k)); - }); - // _nestedreorder_typemismatch_end - -#endif - - std::cout << "\n DONE!...\n"; - - return 0; -} - diff --git a/examples/tut_vertexsum-coloring.cpp b/examples/tut_vertexsum-coloring.cpp deleted file mode 100644 index 10116b0ab9..0000000000 --- a/examples/tut_vertexsum-coloring.cpp +++ /dev/null @@ -1,446 +0,0 @@ -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2016-22, Lawrence Livermore National Security, LLC -// and RAJA project contributors. 
See the RAJA/LICENSE file for details. -// -// SPDX-License-Identifier: (BSD-3-Clause) -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - -#include -#include -#include -#include -#include - -#include "memoryManager.hpp" - -#include "RAJA/RAJA.hpp" - -#include "camp/resource.hpp" - -/* - * Mesh Vertex Sum with Index Coloring Example - * - * Example computes a sum at each vertex on a logically-Cartesian - * 2D mesh. Each sum includes a contribution from each mesh element - * that share a vertex. In many "staggered mesh" applications, such - * operations are written in a way that prevents parallelization due - * to potential data races -- specifically, multiple loop iterates - * over mesh elements writing to the same shared vertex memory location. - * This example illustrates how RAJA contructs can be used to enable one - * to get some parallelism from such operations without fundamentally - * changing how the algorithm looks in source code. - * - * RAJA features shown: - * - `forall` loop iteration template method - * - Index list segment - * - TypedIndexSet segment container - * - Hierarchical execution policies - * - * If CUDA is enabled, CUDA unified memory is used. - */ - -/* - CUDA_BLOCK_SIZE - specifies the number of threads in a CUDA thread block -*/ -#if defined(RAJA_ENABLE_CUDA) -const int CUDA_BLOCK_SIZE = 256; -#endif - -#if defined(RAJA_ENABLE_HIP) -const int HIP_BLOCK_SIZE = 256; -#endif - -// -// Functions to check and print result. -// -void checkResult(double* vol, double* volref, int n); -void printMeshData(double* v, int n, int joff); - - -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) -{ - - std::cout << "\n\nRAJA mesh vertex sum example...\n"; - -// -// 2D mesh has N^2 "interior" vertices, (N+2)^2 total vertices and -// (N+1)^2 elements (including "ghost" elems) -// - const int N = 1000; - const int N_elem = N + 1; - const int N_vert = N + 2; - double* elemvol = memoryManager::allocate(N_elem*N_elem); - double* vertexvol = memoryManager::allocate(N_vert*N_vert); - double* vertexvol_ref = memoryManager::allocate(N_vert*N_vert); - int* elem2vert_map = memoryManager::allocate(4*N_elem*N_elem); - -// -// Some basic mesh parameters (offsets, mesh spacing factor 'h'), -// set up elem to vertex mapping array. -// - int jeoff = N_elem; - - int jvoff = N_vert; - - double h = 0.1; - - for (int j = 0 ; j < N_elem ; ++j) { - for (int i = 0 ; i < N_elem ; ++i) { - int ielem = i + j*jeoff ; - int imap = 4 * ielem ; - elem2vert_map[imap] = ielem + j; - elem2vert_map[imap+1] = ielem + j + 1; - elem2vert_map[imap+2] = ielem + j + jvoff; - elem2vert_map[imap+3] = ielem + j + 1 + jvoff; - } - } - -// -// Initialize hexahedral element volumes so every element volume -// depends on its i,j coordinates. 
-// - std::memset(elemvol, 0, N_elem*N_elem * sizeof(double)); - - for (int j = 0 ; j < N_elem ; ++j) { - for (int i = 0 ; i < N_elem ; ++i) { - int ielem = i + j*jeoff ; - elemvol[ielem] = h*(i+1) * h*(j+1); - } - } - -//std::cout << "\n Element volumes...\n"; -//printMeshData(elemvol, N_elem, jeoff); - -//----------------------------------------------------------------------------// - - std::cout << "\n Running C-version of vertex sum...\n"; - - std::memset(vertexvol_ref, 0, N_vert*N_vert * sizeof(double)); - - // _cstyle_vertexsum_start - for (int j = 0 ; j < N_elem ; ++j) { - for (int i = 0 ; i < N_elem ; ++i) { - int ie = i + j*jeoff ; - int* iv = &(elem2vert_map[4*ie]); - vertexvol_ref[ iv[0] ] += elemvol[ie] / 4.0 ; - vertexvol_ref[ iv[1] ] += elemvol[ie] / 4.0 ; - vertexvol_ref[ iv[2] ] += elemvol[ie] / 4.0 ; - vertexvol_ref[ iv[3] ] += elemvol[ie] / 4.0 ; - } - } - // _cstyle_vertexsum_end - -//std::cout << "\n Vertex volumes (reference)...\n"; -//printMeshData(vertexvol_ref, N_vert, jvoff); - - -//----------------------------------------------------------------------------// - - std::cout << "\n Running RAJA nested sequential version...\n"; - - std::memset(vertexvol, 0, N_vert*N_vert * sizeof(double)); - - // _raja_seq_vertexsum_start - using EXEC_POL1 = - RAJA::KernelPolicy< - RAJA::statement::For<1, RAJA::seq_exec, // j - RAJA::statement::For<0, RAJA::seq_exec, // i - RAJA::statement::Lambda<0> - > - > - >; - - RAJA::kernel( RAJA::make_tuple(RAJA::RangeSegment(0, N_elem), - RAJA::RangeSegment(0, N_elem)), - [=](int i, int j) { - int ie = i + j*jeoff ; - int* iv = &(elem2vert_map[4*ie]); - vertexvol[ iv[0] ] += elemvol[ie] / 4.0 ; - vertexvol[ iv[1] ] += elemvol[ie] / 4.0 ; - vertexvol[ iv[2] ] += elemvol[ie] / 4.0 ; - vertexvol[ iv[3] ] += elemvol[ie] / 4.0 ; - }); - // _raja_seq_vertexsum_end - - checkResult(vertexvol, vertexvol_ref, N_vert); -//std::cout << "\n Vertex volumes...\n"; -//printMeshData(vertexvol, N_vert, jvoff); - -//----------------------------------------------------------------------------// - -// -// Note that the C-style and RAJA versions of the vertex sum calculation -// above cannot safely execute in parallel due to potential data races; -// i.e., multiple loop iterates over mesh elements writing to the same -// shared vertex memory location. -// -// In the following, we partition the element iteration space into four -// subsets (or "colors") indicated by numbers in the figure below. -// -// ----------------- -// | 2 | 3 | 2 | 3 | -// ----------------- -// | 0 | 1 | 0 | 1 | -// ----------------- -// | 2 | 3 | 2 | 3 | -// ----------------- -// | 0 | 1 | 0 | 1 | -// ----------------- -// -// Since none of the elements with the same number share a common vertex, -// we can iterate over each subset ("color") in parallel. -// -// We use RAJA ListSegments and a RAJA TypedIndexSet to define the element -// partitioning. -// - -// -// First, gather the element indices for each color in a vector. 
-// - // _colorvectors_vertexsum_start - std::vector idx0; - std::vector idx1; - std::vector idx2; - std::vector idx3; - - for (int j = 0 ; j < N_elem ; ++j) { - for (int i = 0 ; i < N_elem ; ++i) { - int ie = i + j*jeoff ; - if ( i % 2 == 0 ) { - if ( j % 2 == 0 ) { - idx0.push_back(ie); - } else { - idx2.push_back(ie); - } - } else { - if ( j % 2 == 0 ) { - idx1.push_back(ie); - } else { - idx3.push_back(ie); - } - } - } - } - // _colorvectors_vertexsum_end - -// -// Second, create a RAJA TypedIndexSet with four ListSegments -// -// The TypedIndexSet is a variadic template, where the template arguments -// are the segment types that the TypedIndexSet can hold. -// - // _colorindexset_vertexsum_start - using SegmentType = RAJA::TypedListSegment; - - RAJA::TypedIndexSet colorset; - - camp::resources::Resource host_res{camp::resources::Host()}; - - colorset.push_back( SegmentType(&idx0[0], idx0.size(), host_res) ); - colorset.push_back( SegmentType(&idx1[0], idx1.size(), host_res) ); - colorset.push_back( SegmentType(&idx2[0], idx2.size(), host_res) ); - colorset.push_back( SegmentType(&idx3[0], idx3.size(), host_res) ); - // _colorindexset_vertexsum_end - -//----------------------------------------------------------------------------// - -// -// RAJA vertex volume calculation - sequential TypedIndexSet version -// (sequential iteration over segments, -// sequential iteration of each segment) -// -// NOTE: we do not need i,j indices for this version since the element -// indices are contained in the list segments -// - std::cout << "\n Running RAJA sequential index set version...\n"; - - std::memset(vertexvol, 0, N_vert*N_vert * sizeof(double)); - - // _raja_seq_colorindexset_vertexsum_start - using EXEC_POL2 = RAJA::ExecPolicy; - - RAJA::forall(colorset, [=](int ie) { - int* iv = &(elem2vert_map[4*ie]); - vertexvol[ iv[0] ] += elemvol[ie] / 4.0 ; - vertexvol[ iv[1] ] += elemvol[ie] / 4.0 ; - vertexvol[ iv[2] ] += elemvol[ie] / 4.0 ; - vertexvol[ iv[3] ] += elemvol[ie] / 4.0 ; - }); - // _raja_seq_colorindexset_vertexsum_end - - checkResult(vertexvol, vertexvol_ref, N_vert); -//std::cout << "\n Vertex volumes...\n"; -//printMeshData(vertexvol, N_vert, jvoff); - -//----------------------------------------------------------------------------// - -#if defined(RAJA_ENABLE_OPENMP) -// -// RAJA vertex volume calculation - OpenMP TypedIndexSet version -// (sequential iteration over segments, -// OpenMP parallel iteration of each segment) -// - std::cout << "\n Running RAJA OpenMP index set version...\n"; - - std::memset(vertexvol, 0, N_vert*N_vert * sizeof(double)); - - using EXEC_POL3 = RAJA::ExecPolicy; - - RAJA::forall(colorset, [=](int ie) { - int* iv = &(elem2vert_map[4*ie]); - vertexvol[ iv[0] ] += elemvol[ie] / 4.0 ; - vertexvol[ iv[1] ] += elemvol[ie] / 4.0 ; - vertexvol[ iv[2] ] += elemvol[ie] / 4.0 ; - vertexvol[ iv[3] ] += elemvol[ie] / 4.0 ; - }); - - checkResult(vertexvol, vertexvol_ref, N_vert); -//std::cout << "\n Vertex volumes...\n"; -//printMeshData(vertexvol, N_vert, jvoff); -#endif - -//----------------------------------------------------------------------------// - -#if defined(RAJA_ENABLE_CUDA) -// -// RAJA vertex volume calculation - CUDA TypedIndexSet version -// (sequential iteration over segments, -// CUDA parallel execution of each segment) -// - std::cout << "\n Running RAJA CUDA index set version...\n"; - -// -// We create a RAJA TypedIndexSet with four ListSegments as before, -// but now we use a CUDA resource so the segment indices live in -// device memory. 
-// - RAJA::TypedIndexSet colorset_cuda; - - camp::resources::Resource cuda_res{camp::resources::Cuda()}; - - colorset_cuda.push_back( SegmentType(&idx0[0], idx0.size(), cuda_res) ); - colorset_cuda.push_back( SegmentType(&idx1[0], idx1.size(), cuda_res) ); - colorset_cuda.push_back( SegmentType(&idx2[0], idx2.size(), cuda_res) ); - colorset_cuda.push_back( SegmentType(&idx3[0], idx3.size(), cuda_res) ); - - std::memset(vertexvol, 0, N_vert*N_vert * sizeof(double)); - - // _raja_cuda_colorindexset_vertexsum_start - using EXEC_POL4 = RAJA::ExecPolicy>; - - RAJA::forall(colorset_cuda, [=] RAJA_DEVICE (int ie) { - int* iv = &(elem2vert_map[4*ie]); - vertexvol[ iv[0] ] += elemvol[ie] / 4.0 ; - vertexvol[ iv[1] ] += elemvol[ie] / 4.0 ; - vertexvol[ iv[2] ] += elemvol[ie] / 4.0 ; - vertexvol[ iv[3] ] += elemvol[ie] / 4.0 ; - }); - // _raja_cuda_colorindexset_vertexsum_end - - checkResult(vertexvol, vertexvol_ref, N_vert); -//std::cout << "\n Vertex volumes...\n"; -//printMeshData(vertexvol, N_vert, jvoff); -#endif - -//----------------------------------------------------------------------------// - -#if defined(RAJA_ENABLE_HIP) -// -// RAJA vertex volume calculation - HIP IndexSet version -// (sequential iteration over segments, -// HIP parallel execution of each segment) -// - double* d_elemvol = memoryManager::allocate_gpu(N_elem*N_elem); - double* d_vertexvol = memoryManager::allocate_gpu(N_vert*N_vert); - int* d_elem2vert_map = memoryManager::allocate_gpu(4*N_elem*N_elem); - - hipMemcpy(d_elemvol, elemvol, N_elem*N_elem*sizeof(double), hipMemcpyHostToDevice); - hipMemcpy(d_elem2vert_map, elem2vert_map, 4*N_elem*N_elem*sizeof(int), hipMemcpyHostToDevice); - - std::cout << "\n Running RAJA HIP index set version...\n"; - - std::memset(vertexvol, 0, N_vert*N_vert * sizeof(double)); - hipMemcpy(d_vertexvol, vertexvol, N_vert*N_vert*sizeof(double), hipMemcpyHostToDevice); - -// -// We create a RAJA TypedIndexSet with four ListSegments as before, -// but now we use a Hip resource so the segment indices live in -// device memory. -// - RAJA::TypedIndexSet colorset_hip; - - camp::resources::Resource hip_res{camp::resources::Hip()}; - - colorset_hip.push_back( SegmentType(&idx0[0], idx0.size(), hip_res) ); - colorset_hip.push_back( SegmentType(&idx1[0], idx1.size(), hip_res) ); - colorset_hip.push_back( SegmentType(&idx2[0], idx2.size(), hip_res) ); - colorset_hip.push_back( SegmentType(&idx3[0], idx3.size(), hip_res) ); - - using EXEC_POL4 = RAJA::ExecPolicy>; - - RAJA::forall(colorset_hip, [=] RAJA_DEVICE (int ie) { - int* iv = &(d_elem2vert_map[4*ie]); - d_vertexvol[ iv[0] ] += d_elemvol[ie] / 4.0 ; - d_vertexvol[ iv[1] ] += d_elemvol[ie] / 4.0 ; - d_vertexvol[ iv[2] ] += d_elemvol[ie] / 4.0 ; - d_vertexvol[ iv[3] ] += d_elemvol[ie] / 4.0 ; - }); - - hipMemcpy(vertexvol, d_vertexvol, N_vert*N_vert*sizeof(double), hipMemcpyDeviceToHost); - checkResult(vertexvol, vertexvol_ref, N_vert); - - memoryManager::deallocate_gpu(d_elemvol); - memoryManager::deallocate_gpu(d_vertexvol); - memoryManager::deallocate_gpu(d_elem2vert_map); -//std::cout << "\n Vertex volumes...\n"; -//printMeshData(vertexvol, N_vert, jvoff); -#endif - -//----------------------------------------------------------------------------// - - // Clean up... 
- memoryManager::deallocate(elemvol); - memoryManager::deallocate(vertexvol); - memoryManager::deallocate(vertexvol_ref); - memoryManager::deallocate(elem2vert_map); - - std::cout << "\n DONE!...\n"; - - return 0; -} - -// -// Function to compare result to reference and print result P/F. -// -void checkResult(double* vol, double* volref, int n) -{ - bool match = true; - for (int i = 0; i < n*n; i++) { - if ( std::abs(vol[i] - volref[i]) > 10e-12 ) { match = false; } - } - if ( match ) { - std::cout << "\n\t result -- PASS\n"; - } else { - std::cout << "\n\t result -- FAIL\n"; - } -} - -// -// Function to print mesh data with mesh indices. -// -void printMeshData(double* v, int n, int joff) -{ - std::cout << std::endl; - for (int j = 0 ; j < n ; ++j) { - for (int i = 0 ; i < n ; ++i) { - int ii = i + j*joff ; - std::cout << "v(" << i << "," << j << ") = " - << v[ii] << std::endl; - } - } - std::cout << std::endl; -} diff --git a/exercises/CMakeLists.txt b/exercises/CMakeLists.txt index e6251e0fe3..7289dd9001 100644 --- a/exercises/CMakeLists.txt +++ b/exercises/CMakeLists.txt @@ -5,4 +5,144 @@ # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################### +raja_add_executable( + NAME atomic-histogram + SOURCES atomic-histogram.cpp) +raja_add_executable( + NAME atomic-histogram_solution + SOURCES atomic-histogram_solution.cpp) + +raja_add_executable( + NAME dot-product + SOURCES dot-product.cpp) +raja_add_executable( + NAME dot-product_solution + SOURCES dot-product_solution.cpp) + +raja_add_executable( + NAME kernelintro-execpols + SOURCES kernelintro-execpols.cpp) +raja_add_executable( + NAME kernelintro-execpols_solution + SOURCES kernelintro-execpols_solution.cpp) + +raja_add_executable( + NAME launchintro-execpols + SOURCES launchintro-execpols.cpp) +raja_add_executable( + NAME launchintro-execpols_solution + SOURCES launchintro-execpols_solution.cpp) + +raja_add_executable( + NAME kernelintro-nested-loop-reorder + SOURCES kernelintro-nested-loop-reorder.cpp) +raja_add_executable( + NAME kernelintro-nested-loop-reorder_solution + SOURCES kernelintro-nested-loop-reorder_solution.cpp) + +raja_add_executable( + NAME kernel-matrix-transpose + SOURCES kernel-matrix-transpose.cpp) +raja_add_executable( + NAME kernel-matrix-transpose_solution + SOURCES kernel-matrix-transpose_solution.cpp) + +raja_add_executable( + NAME launch-matrix-transpose + SOURCES launch-matrix-transpose.cpp) +raja_add_executable( + NAME launch-matrix-transpose_solution + SOURCES launch-matrix-transpose_solution.cpp) + +raja_add_executable( + NAME kernel-matrix-transpose-tiled + SOURCES kernel-matrix-transpose-tiled.cpp) +raja_add_executable( + NAME kernel-matrix-transpose-tiled_solution + SOURCES kernel-matrix-transpose-tiled_solution.cpp) + +raja_add_executable( + NAME launch-matrix-transpose-tiled + SOURCES launch-matrix-transpose-tiled.cpp) +raja_add_executable( + NAME launch-matrix-transpose-tiled_solution + SOURCES launch-matrix-transpose-tiled_solution.cpp) + +raja_add_executable( + NAME kernel-matrix-transpose-local-array + SOURCES kernel-matrix-transpose-local-array.cpp) +raja_add_executable( + NAME kernel-matrix-transpose-local-array_solution + SOURCES kernel-matrix-transpose-local-array_solution.cpp) + +raja_add_executable( + NAME launch-matrix-transpose-local-array + SOURCES launch-matrix-transpose-local-array.cpp) +raja_add_executable( + NAME launch-matrix-transpose-local-array_solution + SOURCES launch-matrix-transpose-local-array_solution.cpp) + 
+raja_add_executable( + NAME offset-layout-stencil + SOURCES offset-layout-stencil.cpp) +raja_add_executable( + NAME offset-layout-stencil_solution + SOURCES offset-layout-stencil_solution.cpp) + +raja_add_executable( + NAME permuted-layout-batch-matrix-multiply + SOURCES permuted-layout-batch-matrix-multiply.cpp) +raja_add_executable( + NAME permuted-layout-batch-matrix-multiply_solution + SOURCES permuted-layout-batch-matrix-multiply_solution.cpp) + +raja_add_executable( + NAME reductions + SOURCES reductions.cpp) +raja_add_executable( + NAME reductions_solution + SOURCES reductions_solution.cpp) + +raja_add_executable( + NAME scan + SOURCES scan.cpp) +raja_add_executable( + NAME scan_solution + SOURCES scan_solution.cpp) + +raja_add_executable( + NAME segment-indexset-basics + SOURCES segment-indexset-basics.cpp) +raja_add_executable( + NAME segment-indexset-basics_solution + SOURCES segment-indexset-basics_solution.cpp) + +raja_add_executable( + NAME sort + SOURCES sort.cpp) +raja_add_executable( + NAME sort_solution + SOURCES sort_solution.cpp) + +raja_add_executable( + NAME vector-addition + SOURCES vector-addition.cpp) +raja_add_executable( + NAME vector-addition_solution + SOURCES vector-addition_solution.cpp) + +raja_add_executable( + NAME vertexsum-indexset + SOURCES vertexsum-indexset.cpp) +raja_add_executable( + NAME vertexsum-indexset_solution + SOURCES vertexsum-indexset_solution.cpp) + +raja_add_executable( + NAME view-layout + SOURCES view-layout.cpp) +raja_add_executable( + NAME view-layout_solution + SOURCES view-layout_solution.cpp) + add_subdirectory(tutorial_halfday) diff --git a/exercises/Dockerfile b/exercises/Dockerfile new file mode 100644 index 0000000000..1a72413e41 --- /dev/null +++ b/exercises/Dockerfile @@ -0,0 +1,36 @@ +############################################################################### +# Copyright (c) 2016-22, Lawrence Livermore National Security, LLC +# and RAJA project contributors. See the RAJA/LICENSE file for details. +# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################### + +FROM ghcr.io/rse-ops/cuda-ubuntu-20.04:cuda-11.1.1 AS nvcc11 + +ARG USER=AWSUSER +ENV HOME /home/${USER} + +RUN apt-get update && apt-get install -y supervisor + +RUN useradd --create-home --shell /bin/bash ${USER} +USER ${USER} + +WORKDIR $HOME +RUN git clone --recursive -b task/tut-reorg-aws https://github.com/llnl/raja + +WORKDIR $HOME/raja/build +RUN . /opt/spack/share/spack/setup-env.sh && spack load cuda && \ + cmake -DCMAKE_CXX_COMPILER=g++ -DENABLE_CUDA=On -DCMAKE_CUDA_STANDARD=14 -DCMAKE_CUDA_ARCHITECTURES=70 -DENABLE_OPENMP=On .. 
+ +WORKDIR /opt/archives +RUN curl -L https://github.com/gitpod-io/openvscode-server/releases/download/openvscode-server-v1.69.1/openvscode-server-v1.69.1-linux-x64.tar.gz > \ + /opt/archives/openvscode-server-v1.69.1-linux-x64.tar.gz +RUN tar xzf openvscode-server-v1.69.1-linux-x64.tar.gz && chown -R ${USER}:${USER} openvscode-server-v1.69.1-linux-x64 + +USER root +ADD supervisord.conf /etc/supervisord.conf +RUN sed -i "s/XXX/${USER}/g" /etc/supervisord.conf + +RUN touch /var/log/openvscode-server.log && chown -R ${USER}:${USER} /var/log/openvscode-server.log + +CMD ["/usr/bin/supervisord"] diff --git a/exercises/tutorial_halfday/ex4_atomic-histogram.cpp b/exercises/atomic-histogram.cpp similarity index 73% rename from exercises/tutorial_halfday/ex4_atomic-histogram.cpp rename to exercises/atomic-histogram.cpp index 8ad89a45e3..dac9f9bcdd 100644 --- a/exercises/tutorial_halfday/ex4_atomic-histogram.cpp +++ b/exercises/atomic-histogram.cpp @@ -15,7 +15,7 @@ #include "memoryManager.hpp" /* - * EXERCISE #4: Atomic histogram + * Atomic histogram exercise * * In this exercise, you will use use RAJA atomic operations to compute * an array which represents a histogram of values in another array. @@ -38,14 +38,15 @@ */ /* - CUDA_BLOCK_SIZE - specifies the number of threads in a CUDA thread block - - Uncomment to use when filling in exercises. - + Specifies the number of threads in a GPU thread block +*/ #if defined(RAJA_ENABLE_CUDA) -const int CUDA_BLOCK_SIZE = 256; +//const int CUDA_BLOCK_SIZE = 256; +#endif + +#if defined(RAJA_ENABLE_HIP) +//const int HIP_BLOCK_SIZE = 256; #endif -*/ // // Functions to check and print result. @@ -57,32 +58,35 @@ void printArray(int* v, int len); int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { - std::cout << "\n\nExercise #4: Atomic histogram...\n"; + std::cout << "\n\nExercise: Atomic histogram...\n"; // // Define array bounds and initialize array to compute histogram of values // on. // - int M = 20; - int N = 100000; + + // _array_atomic_histogram_start + constexpr int M = 20; + constexpr int N = 100000; int* array = memoryManager::allocate(N); int* hist = memoryManager::allocate(M); - int* hist_ref = memoryManager::allocate(M); for (int i = 0; i < N; ++i) { array[i] = rand() % M; } + // _array_atomic_histogram_end + int* hist_ref = memoryManager::allocate(M); //----------------------------------------------------------------------------// // C-style sequential variant establishes reference solution to compare with. //----------------------------------------------------------------------------// - std::memset(hist_ref, 0, M * sizeof(int)); - std::cout << "\n\n Running C-style sequential historgram...\n"; + std::memset(hist_ref, 0, M * sizeof(int)); + for (int i = 0; i < N; ++i) { hist_ref[ array[i] ]++; } @@ -120,6 +124,10 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::memset(hist, 0, M * sizeof(int)); + // _range_atomic_histogram_start + //RAJA::TypedRangeSegment array_range(0,N); + // _range_atomic_histogram_end + /// /// TODO... /// @@ -127,7 +135,11 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) /// method with RAJA::seq_exec execution policy type and a /// RAJA::atomicAdd operation with RAJA::seq_atomic policy. /// - + /// You will need to uncomment the range segment definition + /// above to use it in the kernel. 
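A minimal sketch of the sequential variant this TODO asks for, assuming the array_range segment above has been uncommented (it follows the same pattern as the solution file later in this diff; hist and array are the exercise's own arrays):

  RAJA::TypedRangeSegment<int> array_range(0, N);

  // Sequential execution policy paired with a sequential atomic policy.
  RAJA::forall<RAJA::seq_exec>(array_range, [=](int i) {
    // Each iteration increments the histogram bin selected by array[i].
    RAJA::atomicAdd<RAJA::seq_atomic>(&hist[array[i]], 1);
  });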
+ /// + //RAJA::forall(array_range, [=](int i) { + //}); checkResult(hist, hist_ref, M); //printArray(hist, M); @@ -151,7 +163,6 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) /// and a RAJA::atomicAdd operation with RAJA::omp_atomic policy. /// - checkResult(hist, hist_ref, M); //printArray(hist, M); @@ -177,7 +188,6 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) /// and a RAJA::atomicAdd operation with RAJA::auto_atomic policy. /// - checkResult(hist, hist_ref, M); //printArray(hist, M); @@ -201,7 +211,9 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) /// method with RAJA::cuda_exec execution policy type /// and a RAJA::atomicAdd operation with RAJA::cuda_atomic policy. /// - + /// NOTE: You will need to uncomment 'CUDA_BLOCK_SIZE' near the + /// top of the file if you want to use it here. + /// checkResult(hist, hist_ref, M); //printArray(hist, M); @@ -227,6 +239,63 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) /// method with RAJA::cuda_exec execution policy type /// and a RAJA::atomicAdd operation with RAJA::auto_atomic policy. /// + /// NOTE: You will need to uncomment 'CUDA_BLOCK_SIZE' near the + /// top of the file if you want to use it here. + /// + + checkResult(hist, hist_ref, M); +//printArray(hist, M); + +#endif + +//----------------------------------------------------------------------------// +// RAJA hip_atomic policy is used with the RAJA HIP execution policy. +//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_HIP) + + std::cout << "\n Running RAJA HIP atomic histogram...\n"; + + std::memset(hist, 0, M * sizeof(int)); + + /// + /// TODO... + /// + /// EXERCISE: Implement the atomic histogram kernel using a RAJA::forall + /// method with RAJA::hip_exec execution policy type + /// and a RAJA::atomicAdd operation with RAJA::hip_atomic policy. + /// + /// NOTE: You will need to uncomment 'HIP_BLOCK_SIZE' near the + /// top of the file if you want to use it here. + /// + + checkResult(hist, hist_ref, M); +//printArray(hist, M); + +#endif + + +//----------------------------------------------------------------------------// +// RAJA auto_atomic policy can also be used with the RAJA HIP +// execution policy. +//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_HIP) + + std::cout << "\n Running RAJA HIP histogram with auto atomic policy...\n"; + + std::memset(hist, 0, M * sizeof(int)); + + /// + /// TODO... + /// + /// EXERCISE: Implement the atomic histogram kernel using a RAJA::forall + /// method with RAJA::hip_exec execution policy type + /// and a RAJA::atomicAdd operation with RAJA::auto_atomic policy. + /// + /// NOTE: You will need to uncomment 'HIP_BLOCK_SIZE' near the + /// top of the file if you want to use it here. 
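For the GPU variants requested above, a hedged sketch along the same lines as the solution file might look like the following; CUDA_BLOCK_SIZE and HIP_BLOCK_SIZE are the constants the exercise asks you to uncomment near the top of the file:

  #if defined(RAJA_ENABLE_CUDA)
  // CUDA execution policy paired with a CUDA atomic policy; the lambda must be
  // decorated with RAJA_DEVICE so it can run on the device.
  RAJA::forall<RAJA::cuda_exec<CUDA_BLOCK_SIZE>>(array_range, [=] RAJA_DEVICE (int i) {
    RAJA::atomicAdd<RAJA::cuda_atomic>(&hist[array[i]], 1);
  });
  #endif

  #if defined(RAJA_ENABLE_HIP)
  // HIP variant: same pattern with hip_exec; auto_atomic resolves to the
  // appropriate backend atomic inside the device lambda.
  RAJA::forall<RAJA::hip_exec<HIP_BLOCK_SIZE>>(array_range, [=] RAJA_DEVICE (int i) {
    RAJA::atomicAdd<RAJA::auto_atomic>(&hist[array[i]], 1);
  });
  #endif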
+ /// checkResult(hist, hist_ref, M); //printArray(hist, M); diff --git a/exercises/tutorial_halfday/ex4_atomic-histogram_solution.cpp b/exercises/atomic-histogram_solution.cpp similarity index 68% rename from exercises/tutorial_halfday/ex4_atomic-histogram_solution.cpp rename to exercises/atomic-histogram_solution.cpp index 7f937e48d5..924721385f 100644 --- a/exercises/tutorial_halfday/ex4_atomic-histogram_solution.cpp +++ b/exercises/atomic-histogram_solution.cpp @@ -15,7 +15,7 @@ #include "memoryManager.hpp" /* - * EXERCISE #4: Atomic histogram + * Atomic histogram exercise * * In this exercise, you will use use RAJA atomic operations to compute * an array which represents a histogram of values in another array. @@ -38,12 +38,16 @@ */ /* - CUDA_BLOCK_SIZE - specifies the number of threads in a CUDA thread block + Specifies the number of threads in a GPU thread block */ #if defined(RAJA_ENABLE_CUDA) const int CUDA_BLOCK_SIZE = 256; #endif +#if defined(RAJA_ENABLE_HIP) +const int HIP_BLOCK_SIZE = 256; +#endif + // // Functions to check and print result. // @@ -54,23 +58,26 @@ void printArray(int* v, int len); int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { - std::cout << "\n\nExercise #4: Atomic histogram...\n"; + std::cout << "\n\nExercise: Atomic histogram...\n"; // // Define array bounds and initialize array to compute histogram of values // on. // - int M = 20; - int N = 100000; + + // _array_atomic_histogram_start + constexpr int M = 20; + constexpr int N = 100000; int* array = memoryManager::allocate(N); int* hist = memoryManager::allocate(M); - int* hist_ref = memoryManager::allocate(M); for (int i = 0; i < N; ++i) { array[i] = rand() % M; } + // _array_atomic_histogram_end + int* hist_ref = memoryManager::allocate(M); //----------------------------------------------------------------------------// // C-style sequential variant establishes reference solution to compare with. @@ -93,10 +100,10 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) #if defined(RAJA_ENABLE_OPENMP) - std::memset(hist, 0, M * sizeof(int)); - std::cout << "\n\n Running C-style OpenMP historgram...\n"; + std::memset(hist, 0, M * sizeof(int)); + #pragma omp parallel for for (int i = 0; i < N; ++i) { #pragma omp atomic @@ -113,15 +120,18 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) // RAJA::seq_exec policy enforces strictly sequential execution. 
//----------------------------------------------------------------------------// + std::cout << "\n Running RAJA sequential atomic histogram...\n"; + std::memset(hist, 0, M * sizeof(int)); - using EXEC_POL1 = RAJA::seq_exec; - using ATOMIC_POL1 = RAJA::seq_atomic; + // _range_atomic_histogram_start + RAJA::TypedRangeSegment array_range(0,N); + // _range_atomic_histogram_end - std::cout << "\n Running RAJA sequential atomic histogram...\n"; + RAJA::forall(array_range, [=](int i) { + + RAJA::atomicAdd(&hist[array[i]], 1); - RAJA::forall(RAJA::RangeSegment(0, N), [=](int i) { - RAJA::atomicAdd(&hist[array[i]], 1); }); checkResult(hist, hist_ref, M); @@ -134,16 +144,17 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) #if defined(RAJA_ENABLE_OPENMP) + std::cout << "\n Running RAJA OpenMP atomic histogram...\n"; + std::memset(hist, 0, M * sizeof(int)); - using EXEC_POL2 = RAJA::omp_parallel_for_exec; - using ATOMIC_POL2 = RAJA::omp_atomic; + // _rajaomp_atomic_histogram_start + RAJA::forall(array_range, [=](int i) { - std::cout << "\n Running RAJA OpenMP atomic histogram...\n"; + RAJA::atomicAdd(&hist[array[i]], 1); - RAJA::forall(RAJA::RangeSegment(0, N), [=](int i) { - RAJA::atomicAdd(&hist[array[i]], 1); }); + // _rajaomp_atomic_histogram_end checkResult(hist, hist_ref, M); //printArray(hist, M); @@ -158,15 +169,14 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) #if defined(RAJA_ENABLE_OPENMP) + std::cout << "\n Running RAJA OpenMP histogram with auto atomic policy...\n"; + std::memset(hist, 0, M * sizeof(int)); - using EXEC_POL3 = RAJA::omp_parallel_for_exec; - using ATOMIC_POL3 = RAJA::auto_atomic; + RAJA::forall(array_range, [=](int i) { + + RAJA::atomicAdd(&hist[array[i]], 1); - std::cout << "\n Running RAJA OpenMP histogram with auto atomic policy...\n"; - - RAJA::forall(RAJA::RangeSegment(0, N), [=](int i) { - RAJA::atomicAdd(&hist[array[i]], 1); }); checkResult(hist, hist_ref, M); @@ -181,16 +191,17 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) #if defined(RAJA_ENABLE_CUDA) + std::cout << "\n Running RAJA CUDA atomic histogram...\n"; + std::memset(hist, 0, M * sizeof(int)); - std::cout << "\n Running RAJA CUDA atomic histogram...\n"; + // _rajacuda_atomic_histogram_start + RAJA::forall>(array_range, [=] RAJA_DEVICE (int i) { - using EXEC_POL4 = RAJA::cuda_exec; - using ATOMIC_POL4 = RAJA::cuda_atomic; + RAJA::atomicAdd(&hist[array[i]], 1); - RAJA::forall(RAJA::RangeSegment(0, N), [=] RAJA_DEVICE (int i) { - RAJA::atomicAdd(&hist[array[i]], 1); }); + // _rajacuda_atomic_histogram_end checkResult(hist, hist_ref, M); //printArray(hist, M); @@ -205,16 +216,65 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) #if defined(RAJA_ENABLE_CUDA) + std::cout << "\n Running RAJA CUDA histogram with auto atomic policy...\n"; + std::memset(hist, 0, M * sizeof(int)); - using EXEC_POL5 = RAJA::cuda_exec; - using ATOMIC_POL5 = RAJA::auto_atomic; + // _rajacuda_atomicauto_histogram_start + RAJA::forall>(array_range, [=] RAJA_DEVICE (int i) { - std::cout << "\n Running RAJA CUDA histogram with auto atomic policy...\n"; + RAJA::atomicAdd(&hist[array[i]], 1); + + }); + // _rajacuda_atomicauto_histogram_end + + checkResult(hist, hist_ref, M); +//printArray(hist, M); + +#endif + +//----------------------------------------------------------------------------// +// RAJA hip_atomic policy is used with the RAJA HIP execution policy. 
+//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_HIP) + + std::cout << "\n Running RAJA HIP atomic histogram...\n"; + + std::memset(hist, 0, M * sizeof(int)); + + // _rajahip_atomic_histogram_start + RAJA::forall>(array_range, [=] RAJA_DEVICE (int i) { + + RAJA::atomicAdd(&hist[array[i]], 1); + + }); + // _rajahip_atomic_histogram_end + + checkResult(hist, hist_ref, M); +//printArray(hist, M); + +#endif + + +//----------------------------------------------------------------------------// +// RAJA auto_atomic policy can also be used with the RAJA HIP +// execution policy. +//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_HIP) + + std::cout << "\n Running RAJA HIP histogram with auto atomic policy...\n"; - RAJA::forall(RAJA::RangeSegment(0, N), [=] RAJA_DEVICE (int i) { - RAJA::atomicAdd(&hist[array[i]], 1); + std::memset(hist, 0, M * sizeof(int)); + + // _rajahip_atomicauto_histogram_start + RAJA::forall>(array_range, [=] RAJA_DEVICE (int i) { + + RAJA::atomicAdd(&hist[array[i]], 1); + }); + // _rajahip_atomicauto_histogram_end checkResult(hist, hist_ref, M); //printArray(hist, M); diff --git a/exercises/dot-product.cpp b/exercises/dot-product.cpp new file mode 100644 index 0000000000..67ec877f89 --- /dev/null +++ b/exercises/dot-product.cpp @@ -0,0 +1,236 @@ +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include +#include +#include + +#include "memoryManager.hpp" + +#include "RAJA/RAJA.hpp" + +/* + * Vector Dot Product Exercise + * + * Computes dot = (a,b), where a, b are vectors of + * doubles and dot is a scalar double. It illustrates how RAJA + * supports a portable parallel reduction opertion in a way that + * the code looks like it does in a sequential implementation. + * + * RAJA features shown: + * - `forall` loop iteration template method + * - Index range segment + * - Execution policies + * - Reduction types + * + * If CUDA is enabled, CUDA unified memory is used. + */ + +// +// Function to check dot product result. +// +void checkResult(double compdot, double refdot); + +int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +{ + + std::cout << "\n\nExercise: vector dot product...\n"; + +#if defined(RAJA_ENABLE_SYCL) + memoryManager::sycl_res = new camp::resources::Resource{camp::resources::Sycl()}; + ::RAJA::sycl::detail::setQueue(memoryManager::sycl_res); +#endif + +// +// Define vector length +// + constexpr int N = 1000000; + +// +// Allocate and initialize vector data +// + double *a = memoryManager::allocate(N); + double *b = memoryManager::allocate(N); + + for (int i = 0; i < N; ++i) { + a[i] = 1.0; + b[i] = 1.0; + } + +//----------------------------------------------------------------------------// + +// +// C-style dot product operation. +// + std::cout << "\n Running C-version of dot product...\n"; + + // _csytle_dotprod_start + double dot = 0.0; + + for (int i = 0; i < N; ++i) { + dot += a[i] * b[i]; + } + + std::cout << "\t (a, b) = " << dot << std::endl; + // _csytle_dotprod_end + + double dot_ref = dot; + +//----------------------------------------------------------------------------// + + std::cout << "\n Running RAJA sequential dot product...\n"; + + dot = 0.0; + + /// + /// TODO... 
+ /// + /// EXERCISE: Implement the dot product kernel using a RAJA::seq_exec + /// execution policy type and RAJA::seq_reduce. + /// + /// NOTE: We've done this one for you to help you get started... + /// + + RAJA::ReduceSum seqdot(0.0); + + RAJA::forall(RAJA::TypedRangeSegment(0, N), [=] (int i) { + seqdot += a[i] * b[i]; + }); + + dot = seqdot.get(); + + std::cout << "\t (a, b) = " << dot << std::endl; + + checkResult(dot, dot_ref); + + +//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_OPENMP) + std::cout << "\n Running RAJA OpenMP dot product...\n"; + + dot = 0.0; + + /// + /// TODO... + /// + /// EXERCISE: Implement the dot product kernel using a RAJA::omp_parallel_for_exec + /// execution policy type and RAJA::omp_reduce reduction policy type. + /// + + std::cout << "\t (a, b) = " << dot << std::endl; + + checkResult(dot, dot_ref); +#endif + + +//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_CUDA) + +//const int CUDA_BLOCK_SIZE = 256; + + std::cout << "\n Running RAJA CUDA dot product...\n"; + + dot = 0.0; + + /// + /// TODO... + /// + /// EXERCISE: Implement the dot product kernel using a RAJA::cuda_exec + /// execution policy type and RAJA::cuda_reduce reduction policy type. + /// + /// NOTE: You will need to uncomment 'CUDA_BLOCK_SIZE' above. + /// if you want to use it here. + /// + + std::cout << "\t (a, b) = " << dot << std::endl; + + checkResult(dot, dot_ref); +#endif + +//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_HIP) + +//const int HIP_BLOCK_SIZE = 256; + + std::cout << "\n Running RAJA HIP dot product...\n"; + + dot = 0.0; + + int *d_a = memoryManager::allocate_gpu(N); + int *d_b = memoryManager::allocate_gpu(N); + + hipErrchk(hipMemcpy( d_a, a, N * sizeof(int), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy( d_b, b, N * sizeof(int), hipMemcpyHostToDevice )); + + /// + /// TODO... + /// + /// EXERCISE: Implement the dot product kernel using a RAJA::hip_exec + /// execution policy type and RAJA::hip_reduce reduction policy type. + /// + /// NOTE: You will need to uncomment 'HIP_BLOCK_SIZE' above + /// if you want to use it here. + /// + + std::cout << "\t (a, b) = " << dot << std::endl; + + checkResult(dot, dot_ref); + + memoryManager::deallocate_gpu(d_a); + memoryManager::deallocate_gpu(d_b); +#endif + +//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_SYCL) + +//const int SYCL_BLOCK_SIZE = 256; + + std::cout << "\n Running RAJA SYCL dot product...\n"; + + dot = 0.0; + + /// + /// TODO... + /// + /// EXERCISE: Implement the dot product kernel using a RAJA::sycl_exec + /// execution policy type and RAJA::sycl_reduce. + /// + /// NOTE: You will need to uncomment 'SYCL_BLOCK_SIZE' above + /// if you want to use it here. + /// + + std::cout << "\t (a, b) = " << dot << std::endl; + + checkResult(dot, dot_ref); + +#endif + +//----------------------------------------------------------------------------// + + + memoryManager::deallocate(a); + memoryManager::deallocate(b); + + std::cout << "\n DONE!...\n"; + + return 0; +} + +// +// Function to check computed dot product and report P/F. 
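As a rough guide to the OpenMP and CUDA reduction TODOs above, a sketch in the style of the solution file (renamed below from examples/tut_dot-product.cpp) might read as follows; CUDA_BLOCK_SIZE is the block-size constant the exercise asks you to uncomment:

  // OpenMP: omp_parallel_for_exec execution policy + omp_reduce reduction policy.
  RAJA::ReduceSum<RAJA::omp_reduce, double> ompdot(0.0);
  RAJA::forall<RAJA::omp_parallel_for_exec>(RAJA::TypedRangeSegment<int>(0, N),
    [=](int i) {
      ompdot += a[i] * b[i];
  });
  dot = ompdot.get();

  // CUDA: cuda_exec execution policy + cuda_reduce reduction policy.
  // (The HIP variant is analogous with hip_exec/hip_reduce, operating on the
  // d_a/d_b device copies made above.)
  RAJA::ReduceSum<RAJA::cuda_reduce, double> cudot(0.0);
  RAJA::forall<RAJA::cuda_exec<CUDA_BLOCK_SIZE>>(RAJA::TypedRangeSegment<int>(0, N),
    [=] RAJA_DEVICE (int i) {
      cudot += a[i] * b[i];
  });
  dot = cudot.get();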
+// +void checkResult(double compdot, double refdot) +{ + if ( compdot == refdot ) { + std::cout << "\n\t result -- PASS\n"; + } else { + std::cout << "\n\t result -- FAIL\n"; + } +} + diff --git a/examples/tut_dot-product.cpp b/exercises/dot-product_solution.cpp similarity index 87% rename from examples/tut_dot-product.cpp rename to exercises/dot-product_solution.cpp index 00a1691048..9d984fa066 100644 --- a/examples/tut_dot-product.cpp +++ b/exercises/dot-product_solution.cpp @@ -14,7 +14,7 @@ #include "RAJA/RAJA.hpp" /* - * Vector Dot Product Example + * Vector Dot Product Exercise * * Computes dot = (a,b), where a, b are vectors of * doubles and dot is a scalar double. It illustrates how RAJA @@ -30,21 +30,6 @@ * If CUDA is enabled, CUDA unified memory is used. */ -/* - CUDA_BLOCK_SIZE - specifies the number of threads in a CUDA thread block -*/ -#if defined(RAJA_ENABLE_CUDA) -const int CUDA_BLOCK_SIZE = 256; -#endif - -#if defined(RAJA_ENABLE_HIP) -const int HIP_BLOCK_SIZE = 256; -#endif - -#if defined(RAJA_ENABLE_SYCL) -const int SYCL_BLOCK_SIZE = 256; -#endif - // // Function to check dot product result. // @@ -53,7 +38,7 @@ void checkResult(double compdot, double refdot); int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) { - std::cout << "\n\nRAJA vector dot product example...\n"; + std::cout << "\n\nExercise: vector dot product...\n"; #if defined(RAJA_ENABLE_SYCL) memoryManager::sycl_res = new camp::resources::Resource{camp::resources::Sycl()}; @@ -63,7 +48,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Define vector length // - const int N = 1000000; + constexpr int N = 1000000; // // Allocate and initialize vector data @@ -89,9 +74,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) for (int i = 0; i < N; ++i) { dot += a[i] * b[i]; } - // _csytle_dotprod_end std::cout << "\t (a, b) = " << dot << std::endl; + // _csytle_dotprod_end double dot_ref = dot; @@ -99,10 +84,12 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::cout << "\n Running RAJA sequential dot product...\n"; + dot = 0.0; + // _rajaseq_dotprod_start RAJA::ReduceSum seqdot(0.0); - RAJA::forall(RAJA::RangeSegment(0, N), [=] (int i) { + RAJA::forall(RAJA::TypedRangeSegment(0, N), [=] (int i) { seqdot += a[i] * b[i]; }); @@ -119,6 +106,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) #if defined(RAJA_ENABLE_OPENMP) std::cout << "\n Running RAJA OpenMP dot product...\n"; + dot = 0.0; + // _rajaomp_dotprod_start RAJA::ReduceSum ompdot(0.0); @@ -138,8 +127,13 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) + + const int CUDA_BLOCK_SIZE = 256; + std::cout << "\n Running RAJA CUDA dot product...\n"; + dot = 0.0; + // _rajacuda_dotprod_start RAJA::ReduceSum cudot(0.0); @@ -159,13 +153,18 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_HIP) + + const int HIP_BLOCK_SIZE = 256; + std::cout << "\n Running RAJA HIP dot product...\n"; - int *d_a = memoryManager::allocate_gpu(N); - int *d_b = memoryManager::allocate_gpu(N); + dot = 0.0; + + double *d_a = memoryManager::allocate_gpu(N); + double *d_b = memoryManager::allocate_gpu(N); - hipErrchk(hipMemcpy( d_a, a, N * sizeof(int), hipMemcpyHostToDevice )); - hipErrchk(hipMemcpy( d_b, b, N * sizeof(int), 
hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy( d_a, a, N * sizeof(double), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy( d_b, b, N * sizeof(double), hipMemcpyHostToDevice )); // _rajahip_dotprod_start RAJA::ReduceSum hpdot(0.0); @@ -189,8 +188,13 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_SYCL) + + const int SYCL_BLOCK_SIZE = 256; + std::cout << "\n Running RAJA SYCL dot product...\n"; + dot = 0.0; + // _rajasycl_dotprod_start RAJA::ReduceSum hpdot(0.0); diff --git a/exercises/kernel-matrix-transpose-local-array.cpp b/exercises/kernel-matrix-transpose-local-array.cpp new file mode 100644 index 0000000000..c9e6dfa062 --- /dev/null +++ b/exercises/kernel-matrix-transpose-local-array.cpp @@ -0,0 +1,689 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-22, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include +#include +#include +#include + +#include "RAJA/RAJA.hpp" +#include "memoryManager.hpp" + +/* + * Matrix Transpose Exercise + * + * In this exercise, an input matrix A of dimension N_r x N_c is + * transposed and returned as a second matrix At of size N_c x N_r. + * + * This operation is carried out using a local memory tiling + * algorithm. The algorithm first loads matrix entries into an + * iteraion shared tile, a two-dimensional array, and then + * reads from the tile with row and column indices swapped for + * the output matrix. + * + * The algorithm is expressed as a collection of ``outer`` + * and ``inner`` for loops. Iterations of the inner loops will load/read + * data into the tile; while outer loops will iterate over the number + * of tiles needed to carry out the transpose. + * + * RAJA variants of the exercise use RAJA local arrays as tile memory. + * Furthermore, the tiling pattern is handled by RAJA's tile statements. + * For CPU execution, RAJA local arrays are used to improve + * performance via cache blocking. For CUDA GPU execution, + * RAJA shared memory is mapped to CUDA shared memory which + * enables threads in the same thread block to share data. + * + * RAJA features shown: + * - Basic usage of 'RAJA::kernel' abstractions for nested loops + * - Multiple lambdas + * - Options for specifying lambda arguments + * - Tile statement + * - ForICount statement + * - RAJA local arrays + * + * If CUDA is enabled, CUDA unified memory is used. 
+ */ + +// +// Define dimensionality of matrices +// +constexpr int DIM = 2; + +// +// Function for checking results +// +template +void checkResult(RAJA::View> Atview, int N_r, int N_c); + +// +// Function for printing results +// +template +void printResult(RAJA::View> Atview, int N_r, int N_c); + + +int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +{ + + std::cout << "\n\nRAJA shared matrix transpose exercise...\n"; + + // + // Define num rows/cols in matrix, tile dimensions, and number of tiles + // + // _mattranspose_localarray_dims_start + constexpr int N_r = 267; + constexpr int N_c = 251; + + constexpr int TILE_DIM = 16; + + constexpr int outer_Dimc = (N_c - 1) / TILE_DIM + 1; + constexpr int outer_Dimr = (N_r - 1) / TILE_DIM + 1; + // _mattranspose_localarray_dims_end + + // + // Allocate matrix data + // + int *A = memoryManager::allocate(N_r * N_c); + int *At = memoryManager::allocate(N_r * N_c); + + // + // In the following implementations of matrix transpose, we + // use RAJA 'View' objects to access the matrix data. A RAJA view + // holds a pointer to a data array and enables multi-dimensional indexing + // into the data. + // + // _mattranspose_localarray_views_start + RAJA::View> Aview(A, N_r, N_c); + RAJA::View> Atview(At, N_c, N_r); + // _mattranspose_localarray_views_end + + // + // Initialize matrix data + // + for (int row = 0; row < N_r; ++row) { + for (int col = 0; col < N_c; ++col) { + Aview(row, col) = col; + } + } + // printResult(Aview, N_r, N_c); + + //----------------------------------------------------------------------------// + std::cout << "\n Running C-version of shared matrix transpose...\n"; + + std::memset(At, 0, N_r * N_c * sizeof(int)); + + // _mattranspose_localarray_cstyle_start + // + // (0) Outer loops to iterate over tiles + // + for (int by = 0; by < outer_Dimr; ++by) { + for (int bx = 0; bx < outer_Dimc; ++bx) { + + // Stack-allocated local array for data on a tile + int Tile[TILE_DIM][TILE_DIM]; + + // + // (1) Inner loops to read input matrix tile data into the array + // + // Note: loops are ordered so that input matrix data access + // is stride-1. + // + for (int ty = 0; ty < TILE_DIM; ++ty) { + for (int tx = 0; tx < TILE_DIM; ++tx) { + + int col = bx * TILE_DIM + tx; // Matrix column index + int row = by * TILE_DIM + ty; // Matrix row index + + // Bounds check + if (row < N_r && col < N_c) { + Tile[ty][tx] = Aview(row, col); + } + } + } + + // + // (2) Inner loops to write array data into output array tile + // + // Note: loop order is swapped from above so that output matrix + // data access is stride-1. + // + for (int tx = 0; tx < TILE_DIM; ++tx) { + for (int ty = 0; ty < TILE_DIM; ++ty) { + + int col = bx * TILE_DIM + tx; // Matrix column index + int row = by * TILE_DIM + ty; // Matrix row index + + // Bounds check + if (row < N_r && col < N_c) { + Atview(col, row) = Tile[ty][tx]; + } + } + } + + } + } + // _mattranspose_localarray_cstyle_end + + checkResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); + + //----------------------------------------------------------------------------// + + // + // The following RAJA variants use the RAJA::Kernel + // method to carryout the transpose + // + + // Here we define a RAJA local array type. 
+ // The array type is templated on + // 1) Data type + // 2) Index permutation + // 3) Dimensions of the array + // + + // _mattranspose_localarray_start + using TILE_MEM = + RAJA::LocalArray, RAJA::SizeList>; + TILE_MEM Tile_Array; + // _mattranspose_localarray_end + + // **NOTE** Although the LocalArray is constructed + // the array memory has not been allocated. + + //--------------------------------------------------------------------------// + std::cout << "\n Running RAJA - sequential matrix transpose exercise ...\n"; + + std::memset(At, 0, N_r * N_c * sizeof(int)); + + // _mattranspose_localarray_raja_start + /// + /// TODO... + /// + /// EXERCISE: Uncomment this code block. + /// + /* + using SEQ_EXEC_POL_I = + RAJA::KernelPolicy< + RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::loop_exec, + RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::loop_exec, + + /// + /// TODO... + /// + /// EXERCISE: Initialize the local memory statement as position 2 + /// in the paramater list. + /// + + RAJA::statement::ForICount<1, RAJA::statement::Param<0>, RAJA::loop_exec, + RAJA::statement::ForICount<0, RAJA::statement::Param<1>, RAJA::loop_exec, + RAJA::statement::Lambda<0> + > + >, + + RAJA::statement::ForICount<0, RAJA::statement::Param<1>, RAJA::loop_exec, + RAJA::statement::ForICount<1, RAJA::statement::Param<0>, RAJA::loop_exec, + RAJA::statement::Lambda<1> + > + > + + > + > + > + >; + + RAJA::kernel_param( + RAJA::make_tuple(RAJA::TypedRangeSegment(0, N_c), + RAJA::TypedRangeSegment(0, N_r)), + + RAJA::make_tuple((int)0, (int)0, Tile_Array), + + [=](int col, int row, int tx, int ty, TILE_MEM &Tile_Array) { + Tile_Array(ty, tx) = Aview(row, col); + }, + + [=](int col, int row, int tx, int ty, TILE_MEM &Tile_Array) { + Atview(col, row) = Tile_Array(ty, tx); + } + + ); + */ + // _mattranspose_localarray_raja_end + + checkResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); + + +#if defined(RAJA_ENABLE_OPENMP) + //--------------------------------------------------------------------------// + std::cout << "\n Running RAJA - OpenMP (parallel outer loop) matrix " + "transpose exercise ...\n"; + + std::memset(At, 0, N_r * N_c * sizeof(int)); + + /// + /// TODO... + /// + /// EXERCISE: Uncomment this code block. + /// + /* + using OPENMP_EXEC_1_POL = + RAJA::KernelPolicy< + // + // (0) Execution policies for outer loops + // These loops iterate over the number of + // tiles needed to carry out the transpose + // + RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::omp_parallel_for_exec, + RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::loop_exec, + // This statement will initalize local array memory inside a + // kernel. The cpu_tile_mem policy specifies that memory should be + // allocated on the stack. The entries in the RAJA::ParamList + // identify RAJA local arrays in the parameter tuple to intialize. + RAJA::statement::InitLocalMem, + // + // (1) Execution policies for the first set of inner + // loops. These loops copy data from the global matrices + // to the local tile. + // + + /// + /// TODO... + /// + /// EXERCISE: Use two ForICount statements with loop_exec to call the first lambda. + /// + + // + // (2) Execution policies for the second set of inner + // loops. These loops copy data from the local tile to + // the global matrix. + // Note: The order of the loops have been + // swapped! This enables us to swap which + // index has unit stride. + // + + /// + /// TODO... + /// + /// EXERCISE: Use two ForICount statements with loop_exec to call the second lambda. 
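  // A hedged sketch of the statements the TODOs above elide, modeled on the
  // sequential policy earlier in this file and the parallel-inner-loop policy
  // that follows: InitLocalMem at parameter position 2, then two ForICount
  // pairs that read into and write out of Tile_Array.
  //
  //   RAJA::statement::InitLocalMem<RAJA::cpu_tile_mem, RAJA::ParamList<2>,
  //
  //     // (1) read the input tile: stride-1 access of Aview
  //     RAJA::statement::ForICount<1, RAJA::statement::Param<0>, RAJA::loop_exec,
  //       RAJA::statement::ForICount<0, RAJA::statement::Param<1>, RAJA::loop_exec,
  //         RAJA::statement::Lambda<0>
  //       >
  //     >,
  //
  //     // (2) write the output tile: loop order swapped so Atview access is stride-1
  //     RAJA::statement::ForICount<0, RAJA::statement::Param<1>, RAJA::loop_exec,
  //       RAJA::statement::ForICount<1, RAJA::statement::Param<0>, RAJA::loop_exec,
  //         RAJA::statement::Lambda<1>
  //       >
  //     >
  //   >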
+ /// + > + > + > + >; + + RAJA::kernel_param( + RAJA::make_tuple(RAJA::TypedRangeSegment(0, N_c), + RAJA::TypedRangeSegment(0, N_r)), + RAJA::make_tuple((int)0, (int)0, Tile_Array), + + [=](int col, int row, int tx, int ty, TILE_MEM &Tile_Array) { + + Tile_Array(ty, tx) = Aview(row, col); + + }, + + [=](int col, int row, int tx, int ty, TILE_MEM &Tile_Array) { + + Atview(col, row) = Tile_Array(ty, tx); + + } + ); + */ + + checkResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); + + //--------------------------------------------------------------------------// + std::cout << "\n Running RAJA - OpenMP (parallel inner loops) matrix " + "transpose exercise ...\n"; + + std::memset(At, 0, N_r * N_c * sizeof(int)); + + using OPENMP_EXEC_2_POL = + RAJA::KernelPolicy< + // + // (0) Execution policies for outer loops + // These loops iterate over the number of + // tiles needed to carry out the transpose + // + RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::loop_exec, + RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::loop_exec, + // This statement will initalize local array memory inside a + // kernel. The cpu_tile_mem policy specifies that memory should be + // allocated on the stack. The entries in the RAJA::ParamList + // identify RAJA local arrays to intialize in the parameter tuple. + RAJA::statement::InitLocalMem, + // + // (1) Execution policies for the first set of inner + // loops. These loops copy data from the global matrices + // to the local tile. + // + RAJA::statement::ForICount<1, RAJA::statement::Param<1>, RAJA::omp_parallel_for_exec, + RAJA::statement::ForICount<0, RAJA::statement::Param<0>, RAJA::loop_exec, + RAJA::statement::Lambda<0> + > + >, + // + // (2) Execution policies for the second set of inner + // loops. These loops copy data from the local tile to + // the global matrix. + // Note: The order of the loops have been + // swapped! This enables us to swap which + // index has unit stride. + // + RAJA::statement::ForICount<0, RAJA::statement::Param<0>, RAJA::loop_exec, + RAJA::statement::ForICount<1, RAJA::statement::Param<1>, RAJA::loop_exec, + RAJA::statement::Lambda<1> + > + > + > + > + > + >; + + RAJA::kernel_param( + RAJA::make_tuple(RAJA::TypedRangeSegment(0, N_c), + RAJA::TypedRangeSegment(0, N_r)), + RAJA::make_tuple((int)0, (int)0, Tile_Array), + + [=](int col, int row, int tx, int ty, TILE_MEM &Tile_Array) { + + Tile_Array(ty, tx) = Aview(row, col); + + }, + + [=](int col, int row, int tx, int ty, TILE_MEM &Tile_Array) { + + Atview(col, row) = Tile_Array(ty, tx); + + } + ); + + checkResult(Atview, N_c, N_r); + // printResult(Atview, N_r, N_c); +#endif + + //--------------------------------------------------------------------------// +#if defined(RAJA_ENABLE_CUDA) + std::cout << "\n Running RAJA - CUDA matrix transpose exercise ...\n"; + + std::memset(At, 0, N_r * N_c * sizeof(int)); + + using CUDA_EXEC_POL = + RAJA::KernelPolicy< + RAJA::statement::CudaKernel< + // + // (0) Execution policies for outer loops + // These loops iterate over the number of + // tiles needed to carry out the transpose + // + RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::cuda_block_y_loop, + RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::cuda_block_x_loop, + // This statement will initalize local array memory inside a + // kernel. The cpu_tile_mem policy specifies that memory should be + // allocated on the stack. The entries in the RAJA::ParamList + // identify RAJA local arrays to intialize in the parameter tuple. 
+ RAJA::statement::InitLocalMem, + // + // (1) Execution policies for the first set of inner + // loops. These loops copy data from the global matrices + // to the local tile. + // + RAJA::statement::ForICount<1, RAJA::statement::Param<0>, RAJA::cuda_thread_y_direct, + RAJA::statement::ForICount<0, RAJA::statement::Param<1>, RAJA::cuda_thread_x_direct, + RAJA::statement::Lambda<0> + > + >, + // Synchronize threads to ensure all loads + // to the local array are complete + RAJA::statement::CudaSyncThreads, + // + // (2) Execution policies for the second set of inner + // loops. These loops copy data from the local tile to + // the global matrix. + // Note: The order of the loops have been + // swapped! This enables us to swap which + // index has unit stride. + // + RAJA::statement::ForICount<0, RAJA::statement::Param<1>, RAJA::cuda_thread_y_direct, + RAJA::statement::ForICount<1, RAJA::statement::Param<0>, RAJA::cuda_thread_x_direct, + RAJA::statement::Lambda<1> + > + >, + // Synchronize threads to ensure all reads + // from the local array are complete + RAJA::statement::CudaSyncThreads + > + > + > + > + >; + + + RAJA::kernel_param( + RAJA::make_tuple(RAJA::TypedRangeSegment(0, N_c), + RAJA::TypedRangeSegment(0, N_r)), + RAJA::make_tuple((int)0, (int)0, Tile_Array), + + [=] RAJA_DEVICE (int col, int row, int tx, int ty, TILE_MEM &Tile_Array) { + + Tile_Array(ty, tx) = Aview(row, col); + + }, + + [=] RAJA_DEVICE(int col, int row, int tx, int ty, TILE_MEM &Tile_Array) { + + Atview(col, row) = Tile_Array(ty, tx); + + } + ); + + checkResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); +#endif + +//--------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_HIP) + //--------------------------------------------------------------------------// + std::cout << "\n Running RAJA - HIP matrix transpose exercise ...\n"; + + int *d_A = memoryManager::allocate_gpu(N_r * N_c); + int *d_At = memoryManager::allocate_gpu(N_r * N_c); + + // + // In the following implementations of matrix transpose, we + // use RAJA 'View' objects to access the matrix data. A RAJA view + // holds a pointer to a data array and enables multi-dimensional indexing + // into the data. + // + RAJA::View> d_Aview(d_A, N_r, N_c); + RAJA::View> d_Atview(d_At, N_c, N_r); + + std::memset(At, 0, N_r * N_c * sizeof(int)); + hipErrchk(hipMemcpy( d_A, A, N_r * N_c * sizeof(int), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy( d_At, At, N_r * N_c * sizeof(int), hipMemcpyHostToDevice )); + + using HIP_EXEC_POL = + RAJA::KernelPolicy< + RAJA::statement::HipKernel< + // + // (0) Execution policies for outer loops + // These loops iterate over the number of + // tiles needed to carry out the transpose + // + RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::hip_block_y_loop, + RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::hip_block_x_loop, + // This statement will initalize local array memory inside a + // kernel. The cpu_tile_mem policy specifies that memory should be + // allocated on the stack. The entries in the RAJA::ParamList + // identify RAJA local arrays to intialize in the parameter tuple. + RAJA::statement::InitLocalMem, + // + // (1) Execution policies for the first set of inner + // loops. These loops copy data from the global matrices + // to the local tile. 
+ // + RAJA::statement::ForICount<1, RAJA::statement::Param<0>, RAJA::hip_thread_y_direct, + RAJA::statement::ForICount<0, RAJA::statement::Param<1>, RAJA::hip_thread_x_direct, + RAJA::statement::Lambda<0> + > + >, + // Synchronize threads to ensure all loads + // to the local array are complete + RAJA::statement::HipSyncThreads, + // + // (2) Execution policies for the second set of inner + // loops. These loops copy data from the local tile to + // the global matrix. + // Note: The order of the loops have been + // swapped! This enables us to swap which + // index has unit stride. + // + RAJA::statement::ForICount<0, RAJA::statement::Param<1>, RAJA::hip_thread_y_direct, + RAJA::statement::ForICount<1, RAJA::statement::Param<0>, RAJA::hip_thread_x_direct, + RAJA::statement::Lambda<1> + > + >, + // Synchronize threads to ensure all reads + // from the local array are complete + RAJA::statement::HipSyncThreads + > + > + > + > + >; + + + RAJA::kernel_param( + RAJA::make_tuple(RAJA::TypedRangeSegment(0, N_c), + RAJA::TypedRangeSegment(0, N_r)), + RAJA::make_tuple((int)0, (int)0, Tile_Array), + + [=] RAJA_DEVICE (int col, int row, int tx, int ty, TILE_MEM &Tile_Array) { + + Tile_Array(ty, tx) = d_Aview(row, col); + + }, + + [=] RAJA_DEVICE(int col, int row, int tx, int ty, TILE_MEM &Tile_Array) { + + d_Atview(col, row) = Tile_Array(ty, tx); + + } + ); + + hipErrchk(hipMemcpy( At, d_At, N_r * N_c * sizeof(int), hipMemcpyDeviceToHost )); + checkResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); +#endif + + + //--------------------------------------------------------------------------// + std::cout << "\n Running RAJA - sequential matrix transpose exercise with args in statement ...\n"; + + std::memset(At, 0, N_r * N_c * sizeof(int)); + + //Alias for convenience + using RAJA::Segs; + using RAJA::Offsets; + using RAJA::Params; + + // _mattranspose_localarray_raja_lambdaargs_start + /// + /// TODO... + /// + /// EXERCISE: Uncomment this code block. + /// + /* + using SEQ_EXEC_POL_II = + RAJA::KernelPolicy< + RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::loop_exec, + RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::loop_exec, + + RAJA::statement::InitLocalMem, + + RAJA::statement::For<1, RAJA::loop_exec, + RAJA::statement::For<0, RAJA::loop_exec, + RAJA::statement::Lambda<0, Segs<0>, Segs<1>, Offsets<0>, Offsets<1>, Params<0> > + > + >, + + RAJA::statement::For<0, RAJA::loop_exec, + RAJA::statement::For<1, RAJA::loop_exec, + RAJA::statement::Lambda<1, Segs<0, 1>, Offsets<0, 1>, Params<0> > + > + > + + > + > + > + >; + + RAJA::kernel_param( + RAJA::make_tuple(RAJA::TypedRangeSegment(0, N_c), + RAJA::TypedRangeSegment(0, N_r)), + + RAJA::make_tuple(Tile_Array), + + [=](int col, int row, int tx, int ty, TILE_MEM &Tile_Array) { + Tile_Array(ty, tx) = Aview(row, col); + }, + + [=](int col, int row, int tx, int ty, TILE_MEM &Tile_Array) { + Atview(col, row) = Tile_Array(ty, tx); + } + ); + */ + // _mattranspose_localarray_raja_lambdaargs_end + + checkResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); + +//--------------------------------------------------------------------------// + + return 0; +} + + +// +// Function to check result and report P/F. 
+// +template +void checkResult(RAJA::View> Atview, int N_r, int N_c) +{ + bool match = true; + for (int row = 0; row < N_r; ++row) { + for (int col = 0; col < N_c; ++col) { + if (Atview(row, col) != row) { + match = false; + } + } + } + if (match) { + std::cout << "\n\t result -- PASS\n"; + } else { + std::cout << "\n\t result -- FAIL\n"; + } +}; + +// +// Function to print result. +// +template +void printResult(RAJA::View> Atview, int N_r, int N_c) +{ + std::cout << std::endl; + for (int row = 0; row < N_r; ++row) { + for (int col = 0; col < N_c; ++col) { + std::cout << "At(" << row << "," << col << ") = " << Atview(row, col) + << std::endl; + } + std::cout << "" << std::endl; + } + std::cout << std::endl; +} diff --git a/examples/tut_matrix-transpose-local-array.cpp b/exercises/kernel-matrix-transpose-local-array_solution.cpp similarity index 87% rename from examples/tut_matrix-transpose-local-array.cpp rename to exercises/kernel-matrix-transpose-local-array_solution.cpp index 1a62446fd5..06841483fa 100644 --- a/examples/tut_matrix-transpose-local-array.cpp +++ b/exercises/kernel-matrix-transpose-local-array_solution.cpp @@ -14,9 +14,9 @@ #include "memoryManager.hpp" /* - * Matrix Transpose Example + * Matrix Transpose Exercise * - * In this example, an input matrix A of dimension N_r x N_c is + * In this exercise, an input matrix A of dimension N_r x N_c is * transposed and returned as a second matrix At of size N_c x N_r. * * This operation is carried out using a local memory tiling @@ -30,7 +30,7 @@ * data into the tile; while outer loops will iterate over the number * of tiles needed to carry out the transpose. * - * RAJA variants of the example use RAJA local arrays as tile memory. + * RAJA variants of the exercise use RAJA local arrays as tile memory. * Furthermore, the tiling pattern is handled by RAJA's tile statements. * For CPU execution, RAJA local arrays are used to improve * performance via cache blocking. For CUDA GPU execution, @@ -51,7 +51,7 @@ // // Define dimensionality of matrices // -const int DIM = 2; +constexpr int DIM = 2; // // Function for checking results @@ -69,19 +69,19 @@ void printResult(RAJA::View> Atview, int N_r, int N_c); int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) { - std::cout << "\n\nRAJA shared matrix transpose example...\n"; + std::cout << "\n\nRAJA shared matrix transpose exercise...\n"; // // Define num rows/cols in matrix, tile dimensions, and number of tiles // // _mattranspose_localarray_dims_start - const int N_r = 267; - const int N_c = 251; + constexpr int N_r = 267; + constexpr int N_c = 251; - const int TILE_DIM = 16; + constexpr int TILE_DIM = 16; - const int outer_Dimc = (N_c - 1) / TILE_DIM + 1; - const int outer_Dimr = (N_r - 1) / TILE_DIM + 1; + constexpr int outer_Dimc = (N_c - 1) / TILE_DIM + 1; + constexpr int outer_Dimr = (N_r - 1) / TILE_DIM + 1; // _mattranspose_localarray_dims_end // @@ -195,7 +195,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // the array memory has not been allocated. 
//--------------------------------------------------------------------------// - std::cout << "\n Running RAJA - sequential matrix transpose example ...\n"; + std::cout << "\n Running RAJA - sequential matrix transpose exercise ...\n"; std::memset(At, 0, N_r * N_c * sizeof(int)); @@ -224,8 +224,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; - RAJA::kernel_param( RAJA::make_tuple(RAJA::RangeSegment(0, N_c), - RAJA::RangeSegment(0, N_r)), + RAJA::kernel_param( + RAJA::make_tuple(RAJA::TypedRangeSegment(0, N_c), + RAJA::TypedRangeSegment(0, N_r)), RAJA::make_tuple((int)0, (int)0, Tile_Array), @@ -235,70 +236,19 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) [=](int col, int row, int tx, int ty, TILE_MEM &Tile_Array) { Atview(col, row) = Tile_Array(ty, tx); + } - }); + ); // _mattranspose_localarray_raja_end checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); - //--------------------------------------------------------------------------// - std::cout << "\n Running RAJA - sequential matrix transpose example with args in statement ...\n"; - - std::memset(At, 0, N_r * N_c * sizeof(int)); - - //Alias for convenience - using RAJA::Segs; - using RAJA::Offsets; - using RAJA::Params; - - // _mattranspose_localarray_raja_lambdaargs_start - using SEQ_EXEC_POL_II = - RAJA::KernelPolicy< - RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::loop_exec, - RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::loop_exec, - - RAJA::statement::InitLocalMem, - - RAJA::statement::For<1, RAJA::loop_exec, - RAJA::statement::For<0, RAJA::loop_exec, - RAJA::statement::Lambda<0, Segs<0>, Segs<1>, Offsets<0>, Offsets<1>, Params<0> > - > - >, - - RAJA::statement::For<0, RAJA::loop_exec, - RAJA::statement::For<1, RAJA::loop_exec, - RAJA::statement::Lambda<1, Segs<0, 1>, Offsets<0, 1>, Params<0> > - > - > - - > - > - > - >; - - RAJA::kernel_param( RAJA::make_tuple(RAJA::RangeSegment(0, N_c), - RAJA::RangeSegment(0, N_r)), - - RAJA::make_tuple(Tile_Array), - - [=](int col, int row, int tx, int ty, TILE_MEM &Tile_Array) { - Tile_Array(ty, tx) = Aview(row, col); - }, - - [=](int col, int row, int tx, int ty, TILE_MEM &Tile_Array) { - Atview(col, row) = Tile_Array(ty, tx); - - }); - // _mattranspose_localarray_raja_lambdaargs_end - - checkResult(Atview, N_c, N_r); - // printResult(Atview, N_c, N_r); #if defined(RAJA_ENABLE_OPENMP) //--------------------------------------------------------------------------// std::cout << "\n Running RAJA - OpenMP (parallel outer loop) matrix " - "transpose example ...\n"; + "transpose exercise ...\n"; std::memset(At, 0, N_r * N_c * sizeof(int)); @@ -345,27 +295,29 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) >; RAJA::kernel_param( - RAJA::make_tuple(RAJA::RangeSegment(0, N_c), RAJA::RangeSegment(0, N_r)), - RAJA::make_tuple((int)0, (int)0, Tile_Array), + RAJA::make_tuple(RAJA::TypedRangeSegment(0, N_c), + RAJA::TypedRangeSegment(0, N_r)), + RAJA::make_tuple((int)0, (int)0, Tile_Array), - [=](int col, int row, int tx, int ty, TILE_MEM &Tile_Array) { + [=](int col, int row, int tx, int ty, TILE_MEM &Tile_Array) { - Tile_Array(ty, tx) = Aview(row, col); + Tile_Array(ty, tx) = Aview(row, col); - }, + }, - [=](int col, int row, int tx, int ty, TILE_MEM &Tile_Array) { + [=](int col, int row, int tx, int ty, TILE_MEM &Tile_Array) { - Atview(col, row) = Tile_Array(ty, tx); + Atview(col, row) = Tile_Array(ty, tx); - }); + } + ); checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); 
//--------------------------------------------------------------------------// std::cout << "\n Running RAJA - OpenMP (parallel inner loops) matrix " - "transpose example ...\n"; + "transpose exercise ...\n"; std::memset(At, 0, N_r * N_c * sizeof(int)); @@ -412,20 +364,22 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) >; RAJA::kernel_param( - RAJA::make_tuple(RAJA::RangeSegment(0, N_c), RAJA::RangeSegment(0, N_r)), - RAJA::make_tuple((int)0, (int)0, Tile_Array), + RAJA::make_tuple(RAJA::TypedRangeSegment(0, N_c), + RAJA::TypedRangeSegment(0, N_r)), + RAJA::make_tuple((int)0, (int)0, Tile_Array), - [=](int col, int row, int tx, int ty, TILE_MEM &Tile_Array) { + [=](int col, int row, int tx, int ty, TILE_MEM &Tile_Array) { - Tile_Array(ty, tx) = Aview(row, col); + Tile_Array(ty, tx) = Aview(row, col); - }, + }, - [=](int col, int row, int tx, int ty, TILE_MEM &Tile_Array) { + [=](int col, int row, int tx, int ty, TILE_MEM &Tile_Array) { - Atview(col, row) = Tile_Array(ty, tx); + Atview(col, row) = Tile_Array(ty, tx); - }); + } + ); checkResult(Atview, N_c, N_r); // printResult(Atview, N_r, N_c); @@ -433,7 +387,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) //--------------------------------------------------------------------------// #if defined(RAJA_ENABLE_CUDA) - std::cout << "\n Running RAJA - CUDA matrix transpose example ...\n"; + std::cout << "\n Running RAJA - CUDA matrix transpose exercise ...\n"; std::memset(At, 0, N_r * N_c * sizeof(int)); @@ -489,20 +443,22 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::kernel_param( - RAJA::make_tuple(RAJA::RangeSegment(0, N_c), RAJA::RangeSegment(0, N_r)), - RAJA::make_tuple((int)0, (int)0, Tile_Array), + RAJA::make_tuple(RAJA::TypedRangeSegment(0, N_c), + RAJA::TypedRangeSegment(0, N_r)), + RAJA::make_tuple((int)0, (int)0, Tile_Array), - [=] RAJA_DEVICE (int col, int row, int tx, int ty, TILE_MEM &Tile_Array) { + [=] RAJA_DEVICE (int col, int row, int tx, int ty, TILE_MEM &Tile_Array) { - Tile_Array(ty, tx) = Aview(row, col); + Tile_Array(ty, tx) = Aview(row, col); - }, + }, - [=] RAJA_DEVICE(int col, int row, int tx, int ty, TILE_MEM &Tile_Array) { + [=] RAJA_DEVICE(int col, int row, int tx, int ty, TILE_MEM &Tile_Array) { - Atview(col, row) = Tile_Array(ty, tx); + Atview(col, row) = Tile_Array(ty, tx); - }); + } + ); checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); @@ -512,7 +468,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) #if defined(RAJA_ENABLE_HIP) //--------------------------------------------------------------------------// - std::cout << "\n Running RAJA - HIP matrix transpose example ...\n"; + std::cout << "\n Running RAJA - HIP matrix transpose exercise ...\n"; int *d_A = memoryManager::allocate_gpu(N_r * N_c); int *d_At = memoryManager::allocate_gpu(N_r * N_c); @@ -582,26 +538,82 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::kernel_param( - RAJA::make_tuple(RAJA::RangeSegment(0, N_c), RAJA::RangeSegment(0, N_r)), - RAJA::make_tuple((int)0, (int)0, Tile_Array), + RAJA::make_tuple(RAJA::TypedRangeSegment(0, N_c), + RAJA::TypedRangeSegment(0, N_r)), + RAJA::make_tuple((int)0, (int)0, Tile_Array), - [=] RAJA_DEVICE (int col, int row, int tx, int ty, TILE_MEM &Tile_Array) { + [=] RAJA_DEVICE (int col, int row, int tx, int ty, TILE_MEM &Tile_Array) { - Tile_Array(ty, tx) = d_Aview(row, col); + Tile_Array(ty, tx) = d_Aview(row, col); - }, + }, - [=] RAJA_DEVICE(int col, int row, int tx, 
int ty, TILE_MEM &Tile_Array) { + [=] RAJA_DEVICE(int col, int row, int tx, int ty, TILE_MEM &Tile_Array) { - d_Atview(col, row) = Tile_Array(ty, tx); + d_Atview(col, row) = Tile_Array(ty, tx); - }); + } + ); hipErrchk(hipMemcpy( At, d_At, N_r * N_c * sizeof(int), hipMemcpyDeviceToHost )); checkResult(Atview, N_c, N_r); // printResult(Atview, N_c, N_r); #endif + + //--------------------------------------------------------------------------// + std::cout << "\n Running RAJA - sequential matrix transpose exercise with args in statement ...\n"; + + std::memset(At, 0, N_r * N_c * sizeof(int)); + + //Alias for convenience + using RAJA::Segs; + using RAJA::Offsets; + using RAJA::Params; + + // _raja_mattranspose_lambdaargs_start + using SEQ_EXEC_POL_II = + RAJA::KernelPolicy< + RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::loop_exec, + RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::loop_exec, + + RAJA::statement::InitLocalMem, + + RAJA::statement::For<1, RAJA::loop_exec, + RAJA::statement::For<0, RAJA::loop_exec, + RAJA::statement::Lambda<0, Segs<0>, Segs<1>, Offsets<0>, Offsets<1>, Params<0> > + > + >, + + RAJA::statement::For<0, RAJA::loop_exec, + RAJA::statement::For<1, RAJA::loop_exec, + RAJA::statement::Lambda<1, Segs<0, 1>, Offsets<0, 1>, Params<0> > + > + > + + > + > + > + >; + + RAJA::kernel_param( + RAJA::make_tuple(RAJA::TypedRangeSegment(0, N_c), + RAJA::TypedRangeSegment(0, N_r)), + + RAJA::make_tuple(Tile_Array), + + [=](int col, int row, int tx, int ty, TILE_MEM &Tile_Array) { + Tile_Array(ty, tx) = Aview(row, col); + }, + + [=](int col, int row, int tx, int ty, TILE_MEM &Tile_Array) { + Atview(col, row) = Tile_Array(ty, tx); + } + ); + // _raja_mattranspose_lambdaargs_start + + checkResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); //--------------------------------------------------------------------------// return 0; diff --git a/exercises/kernel-matrix-transpose-tiled.cpp b/exercises/kernel-matrix-transpose-tiled.cpp new file mode 100644 index 0000000000..d513f0041b --- /dev/null +++ b/exercises/kernel-matrix-transpose-tiled.cpp @@ -0,0 +1,382 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-22, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include +#include +#include +#include + +#include "memoryManager.hpp" + +#include "RAJA/RAJA.hpp" + +/* + * Tiled Matrix Transpose Exercise + * + * In this exercise, an input matrix A of dimension N_r x N_c is + * transposed and returned as a second matrix At. + * + * This operation is carried out using a tiling algorithm. + * The algorithm iterates over tiles of the matrix A and + * performs a transpose copy without explicitly storing the tile. + * + * The algorithm is expressed as a collection of ``outer`` + * and ``inner`` for loops. Iterations of the inner loop will + * tranpose tile entries; while outer loops will iterate over + * the number of tiles needed to carryout the transpose. + * We do not assume that tiles divide the number of rows and + * and columns of the matrix. + * + * RAJA features shown: + * - Basic usage of 'RAJA::kernel' abstractions for nested loops + * - Tiling statement + * + * If CUDA is enabled, CUDA unified memory is used. 
+ */ + +// +// Define dimensionality of matrices +// +constexpr int DIM = 2; + +// +// Function for checking results +// +template +void checkResult(RAJA::View> Atview, int N_r, int N_c); + +// +// Function for printing results +// +template +void printResult(RAJA::View> Atview, int N_r, int N_c); + + +int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +{ + + std::cout << "\n\nRAJA matrix transpose exercise...\n"; + + // + // Define num rows/cols in matrix, tile dimensions, and number of tiles. + // + // _tiled_mattranspose_dims_start + constexpr int N_r = 56; + constexpr int N_c = 75; + + constexpr int TILE_DIM = 16; + + constexpr int outer_Dimc = (N_c - 1) / TILE_DIM + 1; + constexpr int outer_Dimr = (N_r - 1) / TILE_DIM + 1; + // _tiled_mattranspose_dims_end + + // + // Allocate matrix data + // + int *A = memoryManager::allocate(N_r * N_c); + int *At = memoryManager::allocate(N_r * N_c); + + // + // In the following implementations of tiled matrix transpose, we + // use RAJA 'View' objects to access the matrix data. A RAJA view + // holds a pointer to a data array and enables multi-dimensional indexing + // into the data. + // + // _tiled_mattranspose_views_start + RAJA::View> Aview(A, N_r, N_c); + RAJA::View> Atview(At, N_c, N_r); + // _tiled_mattranspose_views_end + + // + // Initialize matrix data + // + for (int row = 0; row < N_r; ++row) { + for (int col = 0; col < N_c; ++col) { + Aview(row, col) = col; + } + } + //printResult(Aview, N_r, N_c); + + //----------------------------------------------------------------------------// + std::cout << "\n Running C-version of tiled matrix transpose...\n"; + + std::memset(At, 0, N_r * N_c * sizeof(int)); + + // _cstyle_tiled_mattranspose_start + // + // (0) Outer loops to iterate over tiles + // + for (int by = 0; by < outer_Dimr; ++by) { + for (int bx = 0; bx < outer_Dimc; ++bx) { + // + // (1) Loops to iterate over tile entries + // + for (int ty = 0; ty < TILE_DIM; ++ty) { + for (int tx = 0; tx < TILE_DIM; ++tx) { + + int col = bx * TILE_DIM + tx; // Matrix column index + int row = by * TILE_DIM + ty; // Matrix row index + + // Bounds check + if (row < N_r && col < N_c) { + Atview(col, row) = Aview(row, col); + } + } + } + + } + } + // _cstyle_tiled_mattranspose_end + + checkResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); + //----------------------------------------------------------------------------// + + // + // The following RAJA variants use the RAJA::kernel method to carryout the + // transpose. + // + // Here, we define RAJA range segments to establish the iteration spaces. + // Further partioning of the iteration space is carried out in the + // tile_fixed statements. Iterations inside a RAJA loop is given by their + // global iteration number. + // + RAJA::TypedRangeSegment row_Range(0, N_r); + RAJA::TypedRangeSegment col_Range(0, N_c); + + //----------------------------------------------------------------------------// + std::cout << "\n Running sequential tiled matrix transpose ...\n"; + std::memset(At, 0, N_r * N_c * sizeof(int)); + + // + // The following policy carries out the transpose + // using sequential loops. The template parameter inside + // tile_fixed corresponds to the dimension size of the tile. + // + // _raja_tiled_mattranspose_start + + /// + /// TODO... + /// + /// EXERCISE: Implement a sequential RAJA::kernel execution policy for a + /// tiled matrix transpose. + /// + /// NOTE: We have done this first one for you. 
+ /// + + using TILED_KERNEL_EXEC_POL = + RAJA::KernelPolicy< + RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::loop_exec, + RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::loop_exec, + RAJA::statement::For<1, RAJA::loop_exec, + RAJA::statement::For<0, RAJA::loop_exec, + RAJA::statement::Lambda<0> + > + > + > + > + >; + + RAJA::kernel( RAJA::make_tuple(col_Range, row_Range), + [=](int col, int row) { + Atview(col, row) = Aview(row, col); + }); + // _raja_tiled_mattranspose_end + + checkResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); + +//----------------------------------------------------------------------------// +#if defined(RAJA_ENABLE_OPENMP) + std::cout << "\n Running openmp tiled matrix transpose - parallel top inner loop...\n"; + + std::memset(At, 0, N_r * N_c * sizeof(int)); + + // + // This policy loops over tiles sequentially while exposing parallelism on + // one of the inner loops. + // + + /// + /// TODO... + /// + /// EXERCISE: Implement an openmp RAJA::kernel execution policy for a + /// tiled matrix transpose. + /// + + /// + /// TODO... + /// + /// EXERCISE: Uncomment this code block. + /// + /* + RAJA::kernel( RAJA::make_tuple(col_Range, row_Range), + [=](int col, int row) { + Atview(col, row) = Aview(row, col); + }); + */ + + checkResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); + //----------------------------------------------------------------------------// + + std::cout << "\n Running openmp tiled matrix transpose - collapsed inner loops...\n"; + + std::memset(At, 0, N_r * N_c * sizeof(int)); + + // + // This policy loops over tiles sequentially while collapsing inner loops + // into a single OpenMP parallel for loop enabling parallel loads/reads + // to/from the tile. + // + using TILED_KERNEL_EXEC_POL_OMP2 = + RAJA::KernelPolicy< + RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::loop_exec, + RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::loop_exec, + RAJA::statement::Collapse, + RAJA::statement::Lambda<0> + > //closes collapse + > // closes Tile 0 + > // closes Tile 1 + >; // closes policy list + + RAJA::kernel( RAJA::make_tuple(col_Range, row_Range), + [=](int col, int row) { + Atview(col, row) = Aview(row, col); + }); + + checkResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); + +#endif + +//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_CUDA) + std::cout << "\n Running cuda tiled matrix transpose ...\n"; + + std::memset(At, 0, N_r * N_c * sizeof(int)); + + // _raja_mattranspose_cuda_start + + /// + /// TODO... + /// + /// EXERCISE: Implement a CUDA RAJA::kernel execution policy for a + /// tiled matrix transpose. + /// + + /// + /// TODO... + /// + /// EXERCISE: Uncomment this code block. 
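The Collapse statement in the OpenMP variant above is missing its execution-policy and ArgList template arguments. A hedged reconstruction of the collapsed-inner-loop tiled policy, assuming a RAJA build with OpenMP enabled and the two inner loops (tuple arguments 0 and 1) being the ones collapsed:

#if defined(RAJA_ENABLE_OPENMP)
using TILED_KERNEL_EXEC_POL_OMP2 = RAJA::KernelPolicy<
  RAJA::statement::Tile<1, RAJA::tile_fixed<TILE_DIM>, RAJA::loop_exec,
    RAJA::statement::Tile<0, RAJA::tile_fixed<TILE_DIM>, RAJA::loop_exec,
      // Collapse both loops over tile entries into one OpenMP parallel loop.
      RAJA::statement::Collapse<RAJA::omp_parallel_collapse_exec,
                                RAJA::ArgList<0, 1>,
        RAJA::statement::Lambda<0>
      >  // closes Collapse
    >    // closes Tile 0
  >      // closes Tile 1
>;       // closes policy list

// Same iteration spaces and lambda as the sequential tiled variant:
//   RAJA::kernel<TILED_KERNEL_EXEC_POL_OMP2>(
//     RAJA::make_tuple(col_Range, row_Range),
//     [=](int col, int row) { Atview(col, row) = Aview(row, col); });
#endif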
+ /// + /* + RAJA::kernel( RAJA::make_tuple(col_Range, row_Range), + [=] RAJA_DEVICE (int col, int row) { + Atview(col, row) = Aview(row, col); + }); + */ + // _raja_mattranspose_cuda_end + + checkResult(Atview, N_c, N_r); + //printResult(Atview, N_c, N_r); +#endif + +//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_HIP) + std::cout << "\n Running hip tiled matrix transpose ...\n"; + + int* d_A = memoryManager::allocate_gpu(N_r * N_c); + int* d_At = memoryManager::allocate_gpu(N_r * N_c); + + RAJA::View> d_Aview(d_A, N_r, N_c); + RAJA::View> d_Atview(d_At, N_c, N_r); + + std::memset(At, 0, N_r * N_c * sizeof(int)); + hipErrchk(hipMemcpy( d_A, A, N_r * N_c * sizeof(int), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy( d_At, At, N_r * N_c * sizeof(int), hipMemcpyHostToDevice )); + + using TILED_KERNEL_EXEC_POL_HIP = + RAJA::KernelPolicy< + RAJA::statement::HipKernel< + RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::hip_block_y_loop, + RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::hip_block_x_loop, + RAJA::statement::For<1, RAJA::hip_thread_x_direct, + RAJA::statement::For<0, RAJA::hip_thread_y_direct, + RAJA::statement::Lambda<0> + > + > + > + > + > + >; + + RAJA::kernel( RAJA::make_tuple(col_Range, row_Range), + [=] RAJA_DEVICE (int col, int row) { + d_Atview(col, row) = d_Aview(row, col); + }); + + hipErrchk(hipMemcpy( At, d_At, N_r * N_c * sizeof(int), hipMemcpyDeviceToHost )); + checkResult(Atview, N_c, N_r); + //printResult(Atview, N_c, N_r); +#endif + + //----------------------------------------------------------------------------// + + + // + // Clean up. + // + memoryManager::deallocate(A); + memoryManager::deallocate(At); + + std::cout << "\n DONE!...\n"; + + return 0; +} + +// +// Function to check result and report P/F. +// +template +void checkResult(RAJA::View> Atview, int N_r, int N_c) +{ + bool match = true; + for (int row = 0; row < N_r; ++row) { + for (int col = 0; col < N_c; ++col) { + if (Atview(row, col) != row) { + match = false; + } + } + } + if (match) { + std::cout << "\n\t result -- PASS\n"; + } else { + std::cout << "\n\t result -- FAIL\n"; + } +}; + +// +// Function to print result. +// +template +void printResult(RAJA::View> Atview, int N_r, int N_c) +{ + std::cout << std::endl; + for (int row = 0; row < N_r; ++row) { + for (int col = 0; col < N_c; ++col) { + // std::cout << "At(" << row << "," << col << ") = " << Atview(row, col) + // << std::endl; + std::cout<> Atview, int N_r, int N_c); int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) { - std::cout << "\n\nRAJA tiled matrix transpose example...\n"; + std::cout << "\n\nRAJA matrix transpose exercise...\n"; // // Define num rows/cols in matrix, tile dimensions, and number of tiles. 
// // _tiled_mattranspose_dims_start - const int N_r = 56; - const int N_c = 75; + constexpr int N_r = 56; + constexpr int N_c = 75; - const int TILE_DIM = 16; + constexpr int TILE_DIM = 16; - const int outer_Dimc = (N_c - 1) / TILE_DIM + 1; - const int outer_Dimr = (N_r - 1) / TILE_DIM + 1; + constexpr int outer_Dimc = (N_c - 1) / TILE_DIM + 1; + constexpr int outer_Dimr = (N_r - 1) / TILE_DIM + 1; // _tiled_mattranspose_dims_end // @@ -101,7 +101,6 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } //printResult(Aview, N_r, N_c); - //----------------------------------------------------------------------------// std::cout << "\n Running C-version of tiled matrix transpose...\n"; @@ -146,8 +145,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // tile_fixed statements. Iterations inside a RAJA loop is given by their // global iteration number. // - RAJA::RangeSegment row_Range(0, N_r); - RAJA::RangeSegment col_Range(0, N_c); + RAJA::TypedRangeSegment row_Range(0, N_r); + RAJA::TypedRangeSegment col_Range(0, N_c); //----------------------------------------------------------------------------// std::cout << "\n Running sequential tiled matrix transpose ...\n"; @@ -159,12 +158,12 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // tile_fixed corresponds to the dimension size of the tile. // // _raja_tiled_mattranspose_start - using KERNEL_EXEC_POL = + using TILED_KERNEL_EXEC_POL = RAJA::KernelPolicy< - RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::seq_exec, - RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::seq_exec, - RAJA::statement::For<1, RAJA::seq_exec, - RAJA::statement::For<0, RAJA::seq_exec, + RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::loop_exec, + RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::loop_exec, + RAJA::statement::For<1, RAJA::loop_exec, + RAJA::statement::For<0, RAJA::loop_exec, RAJA::statement::Lambda<0> > > @@ -172,7 +171,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; - RAJA::kernel( RAJA::make_tuple(col_Range, row_Range), + RAJA::kernel( RAJA::make_tuple(col_Range, row_Range), [=](int col, int row) { Atview(col, row) = Aview(row, col); }); @@ -191,7 +190,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // This policy loops over tiles sequentially while exposing parallelism on // one of the inner loops. // - using KERNEL_EXEC_POL_OMP = + using TILED_KERNEL_EXEC_POL_OMP = RAJA::KernelPolicy< RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::omp_parallel_for_exec, RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::loop_exec, @@ -204,12 +203,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; - RAJA::kernel( - RAJA::make_tuple(col_Range, row_Range), - [=](int col, int row) { - - Atview(col, row) = Aview(row, col); - + RAJA::kernel( RAJA::make_tuple(col_Range, row_Range), + [=](int col, int row) { + Atview(col, row) = Aview(row, col); }); checkResult(Atview, N_c, N_r); @@ -225,10 +221,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // into a single OpenMP parallel for loop enabling parallel loads/reads // to/from the tile. 
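The CUDA tiled policy in this hunk also loses its tile-size and policy template arguments. A reconstruction under the same assumptions (TILE_DIM-sized tiles, CUDA-enabled build), mapping one thread block per tile and one thread per tile entry:

#if defined(RAJA_ENABLE_CUDA)
using TILED_KERNEL_EXEC_POL_CUDA = RAJA::KernelPolicy<
  RAJA::statement::CudaKernel<
    // Blocks sweep the tiles ...
    RAJA::statement::Tile<1, RAJA::tile_fixed<TILE_DIM>, RAJA::cuda_block_y_loop,
      RAJA::statement::Tile<0, RAJA::tile_fixed<TILE_DIM>, RAJA::cuda_block_x_loop,
        // ... threads sweep the entries within each tile.
        RAJA::statement::For<1, RAJA::cuda_thread_x_direct,
          RAJA::statement::For<0, RAJA::cuda_thread_y_direct,
            RAJA::statement::Lambda<0>
          >
        >
      >
    >
  >
>;

//   RAJA::kernel<TILED_KERNEL_EXEC_POL_CUDA>(
//     RAJA::make_tuple(col_Range, row_Range),
//     [=] RAJA_DEVICE (int col, int row) {
//       Atview(col, row) = Aview(row, col);
//     });
#endif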
// - using KERNEL_EXEC_POL_OMP2 = + using TILED_KERNEL_EXEC_POL_OMP2 = RAJA::KernelPolicy< - RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::seq_exec, - RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::seq_exec, + RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::loop_exec, + RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::loop_exec, RAJA::statement::Collapse, RAJA::statement::Lambda<0> @@ -237,12 +233,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > // closes Tile 1 >; // closes policy list - RAJA::kernel( - RAJA::make_tuple(col_Range, row_Range), - [=](int col, int row) { - - Atview(col, row) = Aview(row, col); - + RAJA::kernel( RAJA::make_tuple(col_Range, row_Range), + [=](int col, int row) { + Atview(col, row) = Aview(row, col); }); checkResult(Atview, N_c, N_r); @@ -257,14 +250,15 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::memset(At, 0, N_r * N_c * sizeof(int)); - using KERNEL_EXEC_POL_CUDA = + // _raja_mattranspose_cuda_start + using TILED_KERNEL_EXEC_POL_CUDA = RAJA::KernelPolicy< RAJA::statement::CudaKernel< RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::cuda_block_y_loop, RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::cuda_block_x_loop, RAJA::statement::For<1, RAJA::cuda_thread_x_direct, RAJA::statement::For<0, RAJA::cuda_thread_y_direct, - RAJA::statement::Lambda<0> + RAJA::statement::Lambda<0> > > > @@ -272,13 +266,11 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; - RAJA::kernel( - RAJA::make_tuple(col_Range, row_Range), - [=] RAJA_DEVICE (int col, int row) { - - Atview(col, row) = Aview(row, col); - + RAJA::kernel( RAJA::make_tuple(col_Range, row_Range), + [=] RAJA_DEVICE (int col, int row) { + Atview(col, row) = Aview(row, col); }); + // _raja_mattranspose_cuda_end checkResult(Atview, N_c, N_r); //printResult(Atview, N_c, N_r); @@ -299,14 +291,14 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) hipErrchk(hipMemcpy( d_A, A, N_r * N_c * sizeof(int), hipMemcpyHostToDevice )); hipErrchk(hipMemcpy( d_At, At, N_r * N_c * sizeof(int), hipMemcpyHostToDevice )); - using KERNEL_EXEC_POL_HIP = + using TILED_KERNEL_EXEC_POL_HIP = RAJA::KernelPolicy< RAJA::statement::HipKernel< RAJA::statement::Tile<1, RAJA::tile_fixed, RAJA::hip_block_y_loop, RAJA::statement::Tile<0, RAJA::tile_fixed, RAJA::hip_block_x_loop, RAJA::statement::For<1, RAJA::hip_thread_x_direct, RAJA::statement::For<0, RAJA::hip_thread_y_direct, - RAJA::statement::Lambda<0> + RAJA::statement::Lambda<0> > > > @@ -314,12 +306,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) > >; - RAJA::kernel( - RAJA::make_tuple(col_Range, row_Range), - [=] RAJA_DEVICE (int col, int row) { - - d_Atview(col, row) = d_Aview(row, col); - + RAJA::kernel( RAJA::make_tuple(col_Range, row_Range), + [=] RAJA_DEVICE (int col, int row) { + d_Atview(col, row) = d_Aview(row, col); }); hipErrchk(hipMemcpy( At, d_At, N_r * N_c * sizeof(int), hipMemcpyDeviceToHost )); diff --git a/exercises/kernel-matrix-transpose.cpp b/exercises/kernel-matrix-transpose.cpp new file mode 100644 index 0000000000..01b0ff5b78 --- /dev/null +++ b/exercises/kernel-matrix-transpose.cpp @@ -0,0 +1,272 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-22, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include +#include +#include +#include + +#include "memoryManager.hpp" + +#include "RAJA/RAJA.hpp" + +/* + * Tiled Matrix Transpose Exercise + * + * In this exercise, an input matrix A of dimension N_r x N_c is + * transposed and returned as a second matrix At. + * + * RAJA features shown: + * - Basic usage of 'RAJA::kernel' abstractions for nested loops + * + * If CUDA is enabled, CUDA unified memory is used. + */ + +// +// Define dimensionality of matrices +// +constexpr int DIM = 2; + +// +// Function for checking results +// +template +void checkResult(RAJA::View> Atview, int N_r, int N_c); + +// +// Function for printing results +// +template +void printResult(RAJA::View> Atview, int N_r, int N_c); + + +int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +{ + + std::cout << "\n\nRAJA matrix transpose exercise...\n"; + + // + // Define num rows/cols in matrix. + // + // _mattranspose_dims_start + constexpr int N_r = 56; + constexpr int N_c = 75; + // _mattranspose_dims_end + + // + // Allocate matrix data + // + int *A = memoryManager::allocate(N_r * N_c); + int *At = memoryManager::allocate(N_r * N_c); + + // + // In the following implementations of matrix transpose, we + // use RAJA 'View' objects to access the matrix data. A RAJA view + // holds a pointer to a data array and enables multi-dimensional indexing + // into the data. + // + // _mattranspose_views_start + RAJA::View> Aview(A, N_r, N_c); + RAJA::View> Atview(At, N_c, N_r); + // _mattranspose_views_end + + // + // Initialize matrix data + // + for (int row = 0; row < N_r; ++row) { + for (int col = 0; col < N_c; ++col) { + Aview(row, col) = col; + } + } + //printResult(Aview, N_r, N_c); + + //----------------------------------------------------------------------------// + std::cout << "\n Running C-version of matrix transpose...\n"; + + std::memset(At, 0, N_r * N_c * sizeof(int)); + + // _cstyle_mattranspose_start + for (int row = 0; row < N_r; ++row) { + for (int col = 0; col < N_c; ++col) { + Atview(col, row) = Aview(row, col); + } + } + // _cstyle_mattranspose_end + + checkResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); + + //----------------------------------------------------------------------------// + + // + // The following RAJA variants use the RAJA::kernel method to carryout the + // transpose. + // + // Here, we define RAJA range segments to establish the iteration spaces. + // Iterations inside a RAJA loop is given by their global iteration number. + // +//RAJA::TypedRangeSegment row_Range(0, N_r); +//RAJA::TypedRangeSegment col_Range(0, N_c); + + //----------------------------------------------------------------------------// + std::cout << "\n Running sequential matrix transpose ...\n"; + std::memset(At, 0, N_r * N_c * sizeof(int)); + + // + // The following policy carries out the transpose + // using sequential loops. + // + // _raja_mattranspose_start + + /// + /// TODO... + /// + /// EXERCISE: Implement a sequential RAJA::kernel execution policy for a + /// basic matrix transpose. + /// + /// Uncomment 'row_Range' and 'col_Range' objects above so they + /// can be used in the kernel. + /// + + /// + /// TODO... + /// + /// EXERCISE: Uncomment this code block. 
+ /// + /* + RAJA::kernel( RAJA::make_tuple(col_Range, row_Range), + [=](int col, int row) { + Atview(col, row) = Aview(row, col); + }); + */ + // _raja_mattranspose_end + + checkResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); +//----------------------------------------------------------------------------// +#if defined(RAJA_ENABLE_OPENMP) + std::cout << "\n Running openmp matrix transpose - parallel top inner loop...\n"; + + std::memset(At, 0, N_r * N_c * sizeof(int)); + + // + // This policy loops sequentially while exposing parallelism on + // one of the inner loops. + // + + /// + /// TODO... + /// + /// EXERCISE: Implement an openmp RAJA::kernel execution policy for a + /// basic matrix transpose. + /// + /// Uncomment 'row_Range' and 'col_Range' objects above so they + /// can be used in the kernel. + /// + + /// + /// TODO... + /// + /// EXERCISE: Uncomment this code block. + /// + /* + RAJA::kernel( RAJA::make_tuple(col_Range, row_Range), + [=](int col, int row) { + Atview(col, row) = Aview(row, col); + }); + */ + + checkResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); +#endif +//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_CUDA) + std::cout << "\n Running cuda matrix transpose ...\n"; + + std::memset(At, 0, N_r * N_c * sizeof(int)); + + // _raja_mattranspose_cuda_start + + /// + /// TODO... + /// + /// EXERCISE: Implement a CUDA RAJA::kernel execution policy for a + /// basic matrix transpose. + /// + /// Uncomment 'row_Range' and 'col_Range' objects above so they + /// can be used in the kernel. + /// + + /// + /// TODO... + /// + /// EXERCISE: Uncomment this code block. + /// + /* + RAJA::kernel( RAJA::make_tuple(col_Range, row_Range), + [=] RAJA_DEVICE (int col, int row) { + Atview(col, row) = Aview(row, col); + }); + */ + // _raja_mattranspose_cuda_end + + checkResult(Atview, N_c, N_r); + //printResult(Atview, N_c, N_r); +#endif + +//----------------------------------------------------------------------------// + + // + // Clean up. + // + memoryManager::deallocate(A); + memoryManager::deallocate(At); + + std::cout << "\n DONE!...\n"; + + return 0; +} + +// +// Function to check result and report P/F. +// +template +void checkResult(RAJA::View> Atview, int N_r, int N_c) +{ + bool match = true; + for (int row = 0; row < N_r; ++row) { + for (int col = 0; col < N_c; ++col) { + if (Atview(row, col) != row) { + match = false; + } + } + } + if (match) { + std::cout << "\n\t result -- PASS\n"; + } else { + std::cout << "\n\t result -- FAIL\n"; + } +}; + +// +// Function to print result. +// +template +void printResult(RAJA::View> Atview, int N_r, int N_c) +{ + std::cout << std::endl; + for (int row = 0; row < N_r; ++row) { + for (int col = 0; col < N_c; ++col) { + // std::cout << "At(" << row << "," << col << ") = " << Atview(row, col) + // << std::endl; + std::cout< +#include +#include +#include + +#include "memoryManager.hpp" + +#include "RAJA/RAJA.hpp" + +/* + * Tiled Matrix Transpose Exercise + * + * In this exercise, an input matrix A of dimension N_r x N_c is + * transposed and returned as a second matrix At. + * + * RAJA features shown: + * - Basic usage of 'RAJA::kernel' abstractions for nested loops + * + * If CUDA is enabled, CUDA unified memory is used. 
+ */ + +// +// Define dimensionality of matrices +// +constexpr int DIM = 2; + +// +// Function for checking results +// +template +void checkResult(RAJA::View> Atview, int N_r, int N_c); + +// +// Function for printing results +// +template +void printResult(RAJA::View> Atview, int N_r, int N_c); + + +int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +{ + + std::cout << "\n\nRAJA matrix transpose exercise...\n"; + + // + // Define num rows/cols in matrix. + // + // _mattranspose_dims_start + constexpr int N_r = 56; + constexpr int N_c = 75; + // _mattranspose_dims_end + + // + // Allocate matrix data + // + int *A = memoryManager::allocate(N_r * N_c); + int *At = memoryManager::allocate(N_r * N_c); + + // + // In the following implementations of matrix transpose, we + // use RAJA 'View' objects to access the matrix data. A RAJA view + // holds a pointer to a data array and enables multi-dimensional indexing + // into the data. + // + // _mattranspose_views_start + RAJA::View> Aview(A, N_r, N_c); + RAJA::View> Atview(At, N_c, N_r); + // _mattranspose_views_end + + // + // Initialize matrix data + // + for (int row = 0; row < N_r; ++row) { + for (int col = 0; col < N_c; ++col) { + Aview(row, col) = col; + } + } + //printResult(Aview, N_r, N_c); + + //----------------------------------------------------------------------------// + std::cout << "\n Running C-version of matrix transpose...\n"; + + std::memset(At, 0, N_r * N_c * sizeof(int)); + + // _cstyle_mattranspose_start + for (int row = 0; row < N_r; ++row) { + for (int col = 0; col < N_c; ++col) { + Atview(col, row) = Aview(row, col); + } + } + // _cstyle_mattranspose_end + + checkResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); + + //--------------------------------------------------------------------------// + + // + // The following RAJA variants use the RAJA::kernel method to carryout the + // transpose. + // + // Here, we define RAJA range segments to establish the iteration spaces. + // Iterations inside a RAJA loop is given by their global iteration number. + // + RAJA::TypedRangeSegment row_Range(0, N_r); + RAJA::TypedRangeSegment col_Range(0, N_c); + + //----------------------------------------------------------------------------// + std::cout << "\n Running sequential matrix transpose ...\n"; + std::memset(At, 0, N_r * N_c * sizeof(int)); + + // + // The following policy carries out the transpose + // using sequential loops. + // + // _raja_mattranspose_start + using KERNEL_EXEC_POL = + RAJA::KernelPolicy< + RAJA::statement::For<1, RAJA::loop_exec, + RAJA::statement::For<0, RAJA::loop_exec, + RAJA::statement::Lambda<0> + > + > + >; + + RAJA::kernel( RAJA::make_tuple(col_Range, row_Range), + [=](int col, int row) { + Atview(col, row) = Aview(row, col); + }); + // _raja_mattranspose_end + + checkResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); +//----------------------------------------------------------------------------// +#if defined(RAJA_ENABLE_OPENMP) + std::cout << "\n Running openmp matrix transpose - parallel top inner loop...\n"; + + std::memset(At, 0, N_r * N_c * sizeof(int)); + + // + // This policy loops sequentially while exposing parallelism on + // one of the inner loops. 
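In the solution variant above, the range segments and the kernel launch take template arguments that are not visible here (the index element type and the execution policy). A sketch of how those lines typically read, assuming int indices:

RAJA::TypedRangeSegment<int> row_Range(0, N_r);
RAJA::TypedRangeSegment<int> col_Range(0, N_c);

// The policy type is passed explicitly to RAJA::kernel; tuple position 0 is
// the column range and position 1 the row range, so the lambda sees (col, row).
RAJA::kernel<KERNEL_EXEC_POL>(
    RAJA::make_tuple(col_Range, row_Range),
    [=](int col, int row) {
      Atview(col, row) = Aview(row, col);
    });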
+ // + using KERNEL_EXEC_POL_OMP = + RAJA::KernelPolicy< + RAJA::statement::For<1, RAJA::omp_parallel_for_exec, + RAJA::statement::For<0, RAJA::loop_exec, + RAJA::statement::Lambda<0> + > + > + >; + + RAJA::kernel( RAJA::make_tuple(col_Range, row_Range), + [=](int col, int row) { + Atview(col, row) = Aview(row, col); + }); + + checkResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); +#endif +//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_CUDA) + std::cout << "\n Running cuda matrix transpose ...\n"; + + std::memset(At, 0, N_r * N_c * sizeof(int)); + + // _raja_mattranspose_cuda_start + using KERNEL_EXEC_POL_CUDA = + RAJA::KernelPolicy< + RAJA::statement::CudaKernel< + RAJA::statement::For<1, RAJA::cuda_thread_x_loop, + RAJA::statement::For<0, RAJA::cuda_thread_y_loop, + RAJA::statement::Lambda<0> + > + > + > + >; + + RAJA::kernel( RAJA::make_tuple(col_Range, row_Range), + [=] RAJA_DEVICE (int col, int row) { + Atview(col, row) = Aview(row, col); + }); + // _raja_mattranspose_cuda_end + + checkResult(Atview, N_c, N_r); + //printResult(Atview, N_c, N_r); +#endif + +//----------------------------------------------------------------------------// + + // + // Clean up. + // + memoryManager::deallocate(A); + memoryManager::deallocate(At); + + std::cout << "\n DONE!...\n"; + + return 0; +} + +// +// Function to check result and report P/F. +// +template +void checkResult(RAJA::View> Atview, int N_r, int N_c) +{ + bool match = true; + for (int row = 0; row < N_r; ++row) { + for (int col = 0; col < N_c; ++col) { + if (Atview(row, col) != row) { + match = false; + } + } + } + if (match) { + std::cout << "\n\t result -- PASS\n"; + } else { + std::cout << "\n\t result -- FAIL\n"; + } +}; + +// +// Function to print result. +// +template +void printResult(RAJA::View> Atview, int N_r, int N_c) +{ + std::cout << std::endl; + for (int row = 0; row < N_r; ++row) { + for (int col = 0; col < N_c; ++col) { + // std::cout << "At(" << row << "," << col << ") = " << Atview(row, col) + // << std::endl; + std::cout< +#include +#include +#include +#include + +#include "RAJA/RAJA.hpp" + +#include "camp/resource.hpp" + +#include "memoryManager.hpp" + +/* + * RAJA::kernel execution policies + * + * In this exercise, you will use a variety of nested-loop execution + * policies to initalize entries in a three-dimensional tensor. The + * goal of the exercise is to gain familiarity with RAJA::kernel + * execution policies for various RAJA execution back-ends. + * + * RAJA features you will use: + * - `RAJA::kernel` kernel execution template method and exec policies + * - Simple RAJA View/Layout + * - RAJA Range segment + * + * If CUDA is enabled, CUDA unified memory is used. + * If HIP is enabled, HIP global device memory is used, with explicit + * host-device mem copy operations. + */ + +#if defined(RAJA_ENABLE_CUDA) +// _cuda_tensorinit_kernel_start +template< int i_block_size, int j_block_size, int k_block_size > +__launch_bounds__(i_block_size*j_block_size*k_block_size) +__global__ void nested_init(double* a, double c, int N) +{ + int i = blockIdx.x * i_block_size + threadIdx.x; + int j = blockIdx.y * j_block_size + threadIdx.y; + int k = blockIdx.z; + + if ( i < N && j < N && k < N ) { + a[i+N*(j+N*k)] = c * i * j * k ; + } +} +// _cuda_tensorinit_kernel_end +#endif + +// +// Function to check result. 
+// +void checkResult(double* a, double* aref, const int n); + + +int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +{ + + std::cout << "\n\nExercise: RAJA::kernel execution policies tensor init...\n"; + +// _init_define_start +// +// 3D tensor has N^3 entries +// + constexpr int N = 100; + constexpr int N_tot = N * N * N; + constexpr double c = 0.0001; + double* a = memoryManager::allocate(N_tot); + double* a_ref = memoryManager::allocate(N_tot); +// _init_define_end + +//----------------------------------------------------------------------------// +// C-style sequential variant establishes reference solution to compare with. +//----------------------------------------------------------------------------// + + std::cout << "\n Running C-style sequential tensor init: create reference solution ...\n"; + +// _cstyle_tensorinit_seq_start + for (int k = 0; k < N; ++k ) { + for (int j = 0; j < N; ++j ) { + for (int i = 0; i < N; ++i ) { + a_ref[i+N*(j+N*k)] = c * i * j * k ; + } + } + } +// _cstyle_tensorinit_seq_end + + +//----------------------------------------------------------------------------// +// We introduce a RAJA View to wrap the tensor data pointer and simplify +// multi-dimensional indexing. +// We use this in the rest of the examples in this file. +//----------------------------------------------------------------------------// + + std::cout << "\n Running C-style sequential tensor init...\n"; + +// _3D_raja_view_start + RAJA::View< double, RAJA::Layout<3, int> > aView(a, N, N, N); +// _3D_raja_view_end + +// _cstyle_tensorinit_view_seq_start + for (int k = 0; k < N; ++k ) { + for (int j = 0; j < N; ++j ) { + for (int i = 0; i < N; ++i ) { + aView(i, j, k) = c * i * j * k ; + } + } + } +// _cstyle_tensorinit_view_seq_end + + checkResult(a, a_ref, N_tot); + +//----------------------------------------------------------------------------// + + std::cout << "\n Running RAJA sequential tensor init...\n"; + + // set tensor data to zero to ensure we initializing it correctly. + std::memset(a, 0, N_tot * sizeof(double)); + + /// + /// TODO... + /// + /// EXERCISE: Implement a sequential RAJA::kernel based version of the + /// the tensor initialization kernel. Hint: recall the + /// kernelintro-nested-loop-reorder.cpp exercise file used in + /// the previous tutorial section. + /// + + checkResult(a, a_ref, N_tot); + +#if defined(RAJA_ENABLE_OPENMP) + +//----------------------------------------------------------------------------// +// C-style and RAJA OpenMP multithreading variants. +//----------------------------------------------------------------------------// + + std::cout << "\n Running C-style OpenMP tensor init...\n"; + + // set tensor data to zero to ensure we initializing it correctly. + std::memset(a, 0, N_tot * sizeof(double)); + + // _cstyle_tensorinit_omp_outer_start + #pragma omp parallel for + for (int k = 0; k < N; ++k ) { + for (int j = 0; j < N; ++j ) { + for (int i = 0; i < N; ++i ) { + aView(i, j, k) = c * i * j * k ; + } + } + } +// _cstyle_tensorinit_omp_outer_end + + checkResult(a, a_ref, N_tot); + +//----------------------------------------------------------------------------// + + std::cout << "\n Running RAJA OpenMP tensor init...\n"; + + // set tensor data to zero to ensure we initializing it correctly. 
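The sequential tensor-init TODO above is answered by the solution file later in this diff with a three-deep nest of sequential For statements. A sketch with the template arguments written out (the EXEC_POL1 name and int segments mirror that solution):

using EXEC_POL1 = RAJA::KernelPolicy<
  RAJA::statement::For<2, RAJA::loop_exec,       // k
    RAJA::statement::For<1, RAJA::loop_exec,     // j
      RAJA::statement::For<0, RAJA::loop_exec,   // i
        RAJA::statement::Lambda<0>
      >
    >
  >
>;

//   RAJA::kernel<EXEC_POL1>(
//     RAJA::make_tuple(RAJA::TypedRangeSegment<int>(0, N),
//                      RAJA::TypedRangeSegment<int>(0, N),
//                      RAJA::TypedRangeSegment<int>(0, N)),
//     [=](int i, int j, int k) { aView(i, j, k) = c * i * j * k; });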
+ std::memset(a, 0, N_tot * sizeof(double)); + +// _raja_tensorinit_omp_outer_start + using EXEC_POL2 = + RAJA::KernelPolicy< + RAJA::statement::For<2, RAJA::omp_parallel_for_exec, // k + RAJA::statement::For<1, RAJA::loop_exec, // j + RAJA::statement::For<0, RAJA::loop_exec, // i + RAJA::statement::Lambda<0> + > + > + > + >; + + RAJA::kernel( + RAJA::make_tuple( RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N) ), + + [=]( int i, int j, int k) { + aView(i, j, k) = c * i * j * k ; + } + ); +// _raja_tensorinit_omp_outer_end + + checkResult(a, a_ref, N_tot); + +//----------------------------------------------------------------------------// + + std::cout << "\n Running C-style OpenMP collapse (3) tensor init...\n"; + + // set tensor data to zero to ensure we initializing it correctly. + std::memset(a, 0, N_tot * sizeof(double)); + + // _cstyle_tensorinit_omp_collapse_start + #pragma omp parallel for collapse(3) + for (int k = 0; k < N; ++k ) { + for (int j = 0; j < N; ++j ) { + for (int i = 0; i < N; ++i ) { + aView(i, j, k) = c * i * j * k ; + } + } + } +// _cstyle_tensorinit_omp_collapse_end + + checkResult(a, a_ref, N_tot); + +//----------------------------------------------------------------------------// + + std::cout << "\n Running RAJA OpenMP collapse(3) tensor init...\n"; + + // set tensor data to zero to ensure we initializing it correctly. + std::memset(a, 0, N_tot * sizeof(double)); + +// _raja_tensorinit_omp_collapse_start + using EXEC_POL3 = + RAJA::KernelPolicy< + RAJA::statement::Collapse, // k, j, i + RAJA::statement::Lambda<0> + > + >; + + RAJA::kernel( + RAJA::make_tuple( RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N) ), + + [=]( int i, int j, int k) { + aView(i, j, k) = c * i * j * k ; + } + ); +// _raja_tensorinit_omp_collapse_end + + checkResult(a, a_ref, N_tot); + +//----------------------------------------------------------------------------// + + std::cout << "\n Running RAJA OpenMP collapse(2) tensor init...\n"; + + // set tensor data to zero to ensure we initializing it correctly. + std::memset(a, 0, N_tot * sizeof(double)); + + /// + /// TODO... + /// + /// EXERCISE: Implement an OpenMP RAJA::kernel based version of the + /// kernel that collapses the outer two (k, j) loops and + /// runs the inner 'i' loop sequentially. Hint: adjust the + /// entries in the 'ArgList' above and insert a 'For' statement + /// statement to execute the inner loop. + /// + + checkResult(a, a_ref, N_tot); + +#endif // if defined(RAJA_ENABLE_OPENMP) + + +#if defined(RAJA_ENABLE_CUDA) + +//----------------------------------------------------------------------------// +// C-style and RAJA CUDA GPU variants. +//----------------------------------------------------------------------------// + + std::cout << "\n Running RAJA CUDA tensor init...\n"; + + // set tensor data to zero to ensure we initializing it correctly. 
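The collapse(2) TODO above asks for the outer k and j loops to be fused; the solution file later in this diff defines EXEC_POL4 for exactly this. A reconstruction with the Collapse template arguments restored (the ArgList order is an assumption matching k outer, j inner):

#if defined(RAJA_ENABLE_OPENMP)
using EXEC_POL4 = RAJA::KernelPolicy<
  // Fuse the k (argument 2) and j (argument 1) loops into one OpenMP loop,
  // then run the i (argument 0) loop sequentially inside each iteration.
  RAJA::statement::Collapse<RAJA::omp_parallel_collapse_exec,
                            RAJA::ArgList<2, 1>,
    RAJA::statement::For<0, RAJA::loop_exec,
      RAJA::statement::Lambda<0>
    >
  >
>;
#endif

// Launched exactly like the collapse(3) variant above; only the policy changes.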
+ std::memset(a, 0, N_tot * sizeof(double)); + +// _raja_tensorinit_cuda_start + using EXEC_POL5 = + RAJA::KernelPolicy< + RAJA::statement::CudaKernel< + RAJA::statement::For<2, RAJA::cuda_thread_z_loop, // k + RAJA::statement::For<1, RAJA::cuda_thread_y_loop, // j + RAJA::statement::For<0, RAJA::cuda_thread_x_loop, // i + RAJA::statement::Lambda<0> + > + > + > + > + >; + + RAJA::kernel( + RAJA::make_tuple( RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N) ), + + [=] __device__ ( int i, int j, int k) { + aView(i, j, k) = c * i * j * k ; + } + ); +// _raja_tensorinit_cuda_end + + checkResult(a, a_ref, N_tot); + +//----------------------------------------------------------------------------// + + std::cout << "\n Running RAJA CUDA tensor init tiled-direct...\n"; + + // set tensor data to zero to ensure we initializing it correctly. + std::memset(a, 0, N_tot * sizeof(double)); + + // + // Define total thread-block size and size of each block dimension + // +// _cuda_blockdim_start + constexpr int block_size = 256; + constexpr int i_block_sz = 32; + constexpr int j_block_sz = block_size / i_block_sz; + constexpr int k_block_sz = 1; +// _cuda_blockdim_end + +// _raja_tensorinit_cuda_tiled_direct_start + using EXEC_POL6 = + RAJA::KernelPolicy< + RAJA::statement::CudaKernelFixed< i_block_sz * j_block_sz * k_block_sz, + RAJA::statement::Tile<1, RAJA::tile_fixed, + RAJA::cuda_block_y_direct, + RAJA::statement::Tile<0, RAJA::tile_fixed, + RAJA::cuda_block_x_direct, + RAJA::statement::For<2, RAJA::cuda_block_z_direct, // k + RAJA::statement::For<1, RAJA::cuda_thread_y_direct, // j + RAJA::statement::For<0, RAJA::cuda_thread_x_direct, // i + RAJA::statement::Lambda<0> + > + > + > + > + > + > + >; + + RAJA::kernel( + RAJA::make_tuple( RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N) ), + + [=] __device__ ( int i, int j, int k) { + aView(i, j, k) = c * i * j * k ; + } + ); +// _raja_tensorinit_cuda_tiled_direct_end + + checkResult(a, a_ref, N_tot); + +//----------------------------------------------------------------------------// + + std::cout << "\n Running CUDA tensor init tiled-direct...\n"; + + // set tensor data to zero to ensure we initializing it correctly. + std::memset(a, 0, N_tot * sizeof(double)); + +// _cuda_tensorinit_tiled_direct_start + dim3 nthreads_per_block(i_block_sz, j_block_sz, k_block_sz); + static_assert(i_block_sz*j_block_sz*k_block_sz == block_size, + "Invalid block_size"); + + dim3 nblocks(static_cast(RAJA_DIVIDE_CEILING_INT(N, i_block_sz)), + static_cast(RAJA_DIVIDE_CEILING_INT(N, j_block_sz)), + static_cast(RAJA_DIVIDE_CEILING_INT(N, k_block_sz))); + + nested_init + <<>>(a, c, N); + cudaErrchk( cudaGetLastError() ); + cudaErrchk(cudaDeviceSynchronize()); +// _cuda_tensorinit_tiled_direct_end + + checkResult(a, a_ref, N_tot); + +#endif // if defined(RAJA_ENABLE_CUDA) + + +#if defined(RAJA_ENABLE_HIP) + +//----------------------------------------------------------------------------// +// RAJA HIP GPU variants. +//----------------------------------------------------------------------------// + + std::cout << "\n Running RAJA HIP tensor init...\n"; + + // set tensor data to zero to ensure we initializing it correctly. 
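In the raw CUDA comparison above, the kernel's block-size template arguments and the <<<grid, block>>> launch configuration do not survive in the text. A reconstruction, assuming the block-size constants defined just before it (the cast target type in the grid computation is an assumption):

#if defined(RAJA_ENABLE_CUDA)
  dim3 nthreads_per_block(i_block_sz, j_block_sz, k_block_sz);
  static_assert(i_block_sz * j_block_sz * k_block_sz == block_size,
                "Invalid block_size");

  // One grid cell per block-sized chunk of the N x N x N index space.
  dim3 nblocks(static_cast<unsigned int>(RAJA_DIVIDE_CEILING_INT(N, i_block_sz)),
               static_cast<unsigned int>(RAJA_DIVIDE_CEILING_INT(N, j_block_sz)),
               static_cast<unsigned int>(RAJA_DIVIDE_CEILING_INT(N, k_block_sz)));

  nested_init<i_block_sz, j_block_sz, k_block_sz>
      <<<nblocks, nthreads_per_block>>>(a, c, N);
  cudaErrchk(cudaGetLastError());
  cudaErrchk(cudaDeviceSynchronize());
#endif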
+ std::memset(a, 0, N_tot * sizeof(double)); + double *d_a = memoryManager::allocate_gpu(N_tot); + +// _3D_raja_device_view_start + RAJA::View< double, RAJA::Layout<3, int> > d_aView(d_a, N, N, N); +// _3D_raja_device_view_end + + hipErrchk(hipMemcpy( d_a, a, N_tot * sizeof(double), hipMemcpyHostToDevice )); + +// _raja_tensorinit_hip_start + using EXEC_POL7 = + RAJA::KernelPolicy< + RAJA::statement::HipKernel< + RAJA::statement::For<2, RAJA::hip_thread_z_loop, // k + RAJA::statement::For<1, RAJA::hip_thread_y_loop, // j + RAJA::statement::For<0, RAJA::hip_thread_x_loop, // i + RAJA::statement::Lambda<0> + > + > + > + > + >; + + RAJA::kernel( + RAJA::make_tuple( RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N) ), + + [=] __device__ ( int i, int j, int k) { + d_aView(i, j, k) = c * i * j * k ; + } + ); +// _raja_tensorinit_hip_end + + hipErrchk(hipMemcpy( a, d_a, N_tot * sizeof(double), hipMemcpyDeviceToHost )); + checkResult(a, a_ref, N_tot); + +//----------------------------------------------------------------------------// + + std::cout << "\n Running RAJA HIP tensor init tiled-direct...\n"; + + // + // Define total thread-block size and size of each block dimension + // + constexpr int block_size = 256; + constexpr int i_block_sz = 32; + constexpr int j_block_sz = block_size / i_block_sz; + constexpr int k_block_sz = 1; + + // set tensor data to zero to ensure we initializing it correctly. + std::memset(a, 0, N_tot * sizeof(double)); + hipErrchk(hipMemcpy( d_a, a, N_tot * sizeof(double), hipMemcpyHostToDevice )); + +// _raja_tensorinit_hip_tiled_direct_start + using EXEC_POL8 = + RAJA::KernelPolicy< + RAJA::statement::HipKernelFixed< i_block_sz * j_block_sz * k_block_sz, + RAJA::statement::Tile<1, RAJA::tile_fixed, + RAJA::hip_block_y_direct, + RAJA::statement::Tile<0, RAJA::tile_fixed, + RAJA::hip_block_x_direct, + RAJA::statement::For<2, RAJA::hip_block_z_direct, // k + RAJA::statement::For<1, RAJA::hip_thread_y_direct, // j + RAJA::statement::For<0, RAJA::hip_thread_x_direct, // i + RAJA::statement::Lambda<0> + > + > + > + > + > + > + >; + + RAJA::kernel( + RAJA::make_tuple( RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N) ), + + [=] __device__ ( int i, int j, int k) { + d_aView(i, j, k) = c * i * j * k ; + } + ); +// _raja_tensorinit_hip_tiled_direct_end + + hipErrchk(hipMemcpy( a, d_a, N_tot * sizeof(double), hipMemcpyDeviceToHost )); + checkResult(a, a_ref, N_tot); + + memoryManager::deallocate_gpu(d_a); + +#endif // if defined(RAJA_ENABLE_HIP) + +//----------------------------------------------------------------------------// + + // Clean up... + memoryManager::deallocate(a); + memoryManager::deallocate(a_ref); + + std::cout << "\n DONE!...\n"; + + return 0; +} + +// +// Function to compare result to reference and print result P/F. 
+// +void checkResult(double* a, double* aref, const int n) +{ + bool correct = true; + + int i = 0; + while ( correct && (i < n) ) { + correct = std::abs(a[i] - aref[i]) < 10e-12; + i++; + } + + if ( correct ) { + std::cout << "\n\t result -- PASS\n"; + } else { + std::cout << "\n\t result -- FAIL\n"; + } +} diff --git a/exercises/kernelintro-execpols_solution.cpp b/exercises/kernelintro-execpols_solution.cpp new file mode 100644 index 0000000000..50c360dde6 --- /dev/null +++ b/exercises/kernelintro-execpols_solution.cpp @@ -0,0 +1,533 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-22, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include +#include +#include +#include +#include + +#include "RAJA/RAJA.hpp" + +#include "camp/resource.hpp" + +#include "memoryManager.hpp" + +/* + * RAJA::kernel execution policies + * + * In this exercise, you will use a variety of nested-loop execution + * policies to initalize entries in a three-dimensional tensor. The + * goal of the exercise is to gain familiarity with RAJA::kernel + * execution policies for various RAJA execution back-ends. + * + * RAJA features you will use: + * - `RAJA::kernel` kernel execution template method and exec policies + * - Simple RAJA View/Layout + * - RAJA Range segment + * + * If CUDA is enabled, CUDA unified memory is used. + * If HIP is enabled, HIP global device memory is used, with explicit + * host-device mem copy operations. + */ + +#if defined(RAJA_ENABLE_CUDA) +// _cuda_tensorinit_kernel_start +template< int i_block_size, int j_block_size, int k_block_size > +__launch_bounds__(i_block_size*j_block_size*k_block_size) +__global__ void nested_init(double* a, double c, int N) +{ + int i = blockIdx.x * i_block_size + threadIdx.x; + int j = blockIdx.y * j_block_size + threadIdx.y; + int k = blockIdx.z; + + if ( i < N && j < N && k < N ) { + a[i+N*(j+N*k)] = c * i * j * k ; + } +} +// _cuda_tensorinit_kernel_end +#endif + +// +// Function to check result. +// +void checkResult(double* a, double* aref, const int n); + + +int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +{ + + std::cout << "\n\nExercise: RAJA::kernel execution policies tensor init...\n"; + +// _init_define_start +// +// 3D tensor has N^3 entries +// + constexpr int N = 100; + constexpr int N_tot = N * N * N; + constexpr double c = 0.0001; + double* a = memoryManager::allocate(N_tot); + double* a_ref = memoryManager::allocate(N_tot); +// _init_define_end + +//----------------------------------------------------------------------------// +// C-style sequential variant establishes reference solution to compare with. +//----------------------------------------------------------------------------// + + std::cout << "\n Running C-style sequential tensor init: create reference solution ...\n"; + +// _cstyle_tensorinit_seq_start + for (int k = 0; k < N; ++k ) { + for (int j = 0; j < N; ++j ) { + for (int i = 0; i < N; ++i ) { + a_ref[i+N*(j+N*k)] = c * i * j * k ; + } + } + } +// _cstyle_tensorinit_seq_end + + +//----------------------------------------------------------------------------// +// We introduce a RAJA View to wrap the tensor data pointer and simplify +// multi-dimensional indexing. +// We use this in the rest of the examples in this file. 
+//----------------------------------------------------------------------------// + + std::cout << "\n Running C-style sequential tensor init...\n"; + +// _3D_raja_view_start + RAJA::View< double, RAJA::Layout<3, int> > aView(a, N, N, N); +// _3D_raja_view_end + +// _cstyle_tensorinit_view_seq_start + for (int k = 0; k < N; ++k ) { + for (int j = 0; j < N; ++j ) { + for (int i = 0; i < N; ++i ) { + aView(i, j, k) = c * i * j * k ; + } + } + } +// _cstyle_tensorinit_view_seq_end + + checkResult(a, a_ref, N_tot); + +//----------------------------------------------------------------------------// + + std::cout << "\n Running RAJA sequential tensor init...\n"; + + // set tensor data to zero to ensure we initializing it correctly. + std::memset(a, 0, N_tot * sizeof(double)); + +// _raja_tensorinit_seq_start + using EXEC_POL1 = + RAJA::KernelPolicy< + RAJA::statement::For<2, RAJA::loop_exec, // k + RAJA::statement::For<1, RAJA::loop_exec, // j + RAJA::statement::For<0, RAJA::loop_exec,// i + RAJA::statement::Lambda<0> + > + > + > + >; + + RAJA::kernel( + RAJA::make_tuple( RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N) ), + + [=]( int i, int j, int k) { + aView(i, j, k) = c * i * j * k ; + } + ); +// _raja_tensorinit_seq_end + + checkResult(a, a_ref, N_tot); + +#if defined(RAJA_ENABLE_OPENMP) + +//----------------------------------------------------------------------------// +// C-style and RAJA OpenMP multithreading variants. +//----------------------------------------------------------------------------// + + std::cout << "\n Running C-style OpenMP tensor init...\n"; + + // set tensor data to zero to ensure we initializing it correctly. + std::memset(a, 0, N_tot * sizeof(double)); + + // _cstyle_tensorinit_omp_outer_start + #pragma omp parallel for + for (int k = 0; k < N; ++k ) { + for (int j = 0; j < N; ++j ) { + for (int i = 0; i < N; ++i ) { + aView(i, j, k) = c * i * j * k ; + } + } + } +// _cstyle_tensorinit_omp_outer_end + + checkResult(a, a_ref, N_tot); + +//----------------------------------------------------------------------------// + + std::cout << "\n Running RAJA OpenMP tensor init...\n"; + + // set tensor data to zero to ensure we initializing it correctly. + std::memset(a, 0, N_tot * sizeof(double)); + +// _raja_tensorinit_omp_outer_start + using EXEC_POL2 = + RAJA::KernelPolicy< + RAJA::statement::For<2, RAJA::omp_parallel_for_exec, // k + RAJA::statement::For<1, RAJA::loop_exec, // j + RAJA::statement::For<0, RAJA::loop_exec, // i + RAJA::statement::Lambda<0> + > + > + > + >; + + RAJA::kernel( + RAJA::make_tuple( RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N) ), + + [=]( int i, int j, int k) { + aView(i, j, k) = c * i * j * k ; + } + ); +// _raja_tensorinit_omp_outer_end + + checkResult(a, a_ref, N_tot); + +//----------------------------------------------------------------------------// + + std::cout << "\n Running C-style OpenMP collapse (3) tensor init...\n"; + + // set tensor data to zero to ensure we initializing it correctly. 
+ std::memset(a, 0, N_tot * sizeof(double)); + + // _cstyle_tensorinit_omp_collapse_start + #pragma omp parallel for collapse(3) + for (int k = 0; k < N; ++k ) { + for (int j = 0; j < N; ++j ) { + for (int i = 0; i < N; ++i ) { + aView(i, j, k) = c * i * j * k ; + } + } + } +// _cstyle_tensorinit_omp_collapse_end + + checkResult(a, a_ref, N_tot); + +//----------------------------------------------------------------------------// + + std::cout << "\n Running RAJA OpenMP collapse(3) tensor init...\n"; + + // set tensor data to zero to ensure we initializing it correctly. + std::memset(a, 0, N_tot * sizeof(double)); + +// _raja_tensorinit_omp_collapse_start + using EXEC_POL3 = + RAJA::KernelPolicy< + RAJA::statement::Collapse, // k, j, i + RAJA::statement::Lambda<0> + > + >; + + RAJA::kernel( + RAJA::make_tuple( RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N) ), + + [=]( int i, int j, int k) { + aView(i, j, k) = c * i * j * k ; + } + ); +// _raja_tensorinit_omp_collapse_end + + checkResult(a, a_ref, N_tot); + +//----------------------------------------------------------------------------// + + std::cout << "\n Running RAJA OpenMP collapse(2) tensor init...\n"; + + // set tensor data to zero to ensure we initializing it correctly. + std::memset(a, 0, N_tot * sizeof(double)); + +// _raja_tensorinit_omp_collapse_start + using EXEC_POL4 = + RAJA::KernelPolicy< + RAJA::statement::Collapse, // k, j + RAJA::statement::For<0, RAJA::loop_exec, // i + RAJA::statement::Lambda<0> + > + > + >; + + RAJA::kernel( + RAJA::make_tuple( RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N) ), + + [=]( int i, int j, int k) { + aView(i, j, k) = c * i * j * k ; + } + ); +// _raja_tensorinit_omp_collapse_end + + checkResult(a, a_ref, N_tot); + +#endif // if defined(RAJA_ENABLE_OPENMP) + + +#if defined(RAJA_ENABLE_CUDA) + +//----------------------------------------------------------------------------// +// C-style and RAJA CUDA GPU variants. +//----------------------------------------------------------------------------// + + std::cout << "\n Running RAJA CUDA tensor init...\n"; + + // set tensor data to zero to ensure we initializing it correctly. + std::memset(a, 0, N_tot * sizeof(double)); + +// _raja_tensorinit_cuda_start + using EXEC_POL5 = + RAJA::KernelPolicy< + RAJA::statement::CudaKernel< + RAJA::statement::For<2, RAJA::cuda_thread_z_loop, // k + RAJA::statement::For<1, RAJA::cuda_thread_y_loop, // j + RAJA::statement::For<0, RAJA::cuda_thread_x_loop, // i + RAJA::statement::Lambda<0> + > + > + > + > + >; + + RAJA::kernel( + RAJA::make_tuple( RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N) ), + + [=] __device__ ( int i, int j, int k) { + aView(i, j, k) = c * i * j * k ; + } + ); +// _raja_tensorinit_cuda_end + + checkResult(a, a_ref, N_tot); + +//----------------------------------------------------------------------------// + + std::cout << "\n Running RAJA CUDA tensor init tiled-direct...\n"; + + // set tensor data to zero to ensure we initializing it correctly. 
+ std::memset(a, 0, N_tot * sizeof(double)); + + // + // Define total thread-block size and size of each block dimension + // +// _cuda_blockdim_start + constexpr int block_size = 256; + constexpr int i_block_sz = 32; + constexpr int j_block_sz = block_size / i_block_sz; + constexpr int k_block_sz = 1; +// _cuda_blockdim_end + +// _raja_tensorinit_cuda_tiled_direct_start + using EXEC_POL6 = + RAJA::KernelPolicy< + RAJA::statement::CudaKernelFixed< i_block_sz * j_block_sz * k_block_sz, + RAJA::statement::Tile<1, RAJA::tile_fixed, + RAJA::cuda_block_y_direct, + RAJA::statement::Tile<0, RAJA::tile_fixed, + RAJA::cuda_block_x_direct, + RAJA::statement::For<2, RAJA::cuda_block_z_direct, // k + RAJA::statement::For<1, RAJA::cuda_thread_y_direct, // j + RAJA::statement::For<0, RAJA::cuda_thread_x_direct, // i + RAJA::statement::Lambda<0> + > + > + > + > + > + > + >; + + RAJA::kernel( + RAJA::make_tuple( RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N) ), + + [=] __device__ ( int i, int j, int k) { + aView(i, j, k) = c * i * j * k ; + } + ); +// _raja_tensorinit_cuda_tiled_direct_end + + checkResult(a, a_ref, N_tot); + +//----------------------------------------------------------------------------// + + std::cout << "\n Running CUDA tensor init tiled-direct...\n"; + + // set tensor data to zero to ensure we initializing it correctly. + std::memset(a, 0, N_tot * sizeof(double)); + +// _cuda_tensorinit_tiled_direct_start + dim3 nthreads_per_block(i_block_sz, j_block_sz, k_block_sz); + static_assert(i_block_sz*j_block_sz*k_block_sz == block_size, + "Invalid block_size"); + + dim3 nblocks(static_cast(RAJA_DIVIDE_CEILING_INT(N, i_block_sz)), + static_cast(RAJA_DIVIDE_CEILING_INT(N, j_block_sz)), + static_cast(RAJA_DIVIDE_CEILING_INT(N, k_block_sz))); + + nested_init + <<>>(a, c, N); + cudaErrchk( cudaGetLastError() ); + cudaErrchk(cudaDeviceSynchronize()); +// _cuda_tensorinit_tiled_direct_end + + checkResult(a, a_ref, N_tot); + +#endif // if defined(RAJA_ENABLE_CUDA) + + +#if defined(RAJA_ENABLE_HIP) + +//----------------------------------------------------------------------------// +// RAJA HIP GPU variants. +//----------------------------------------------------------------------------// + + std::cout << "\n Running RAJA HIP tensor init...\n"; + + // set tensor data to zero to ensure we initializing it correctly. 
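The HIP variants that follow stage the tensor through explicit device memory; the element-type template argument on the device allocation is missing in the text. A sketch of the round trip, assuming the memoryManager helpers used elsewhere in these exercises:

#if defined(RAJA_ENABLE_HIP)
  // Device copy of the tensor plus a View over it.
  double* d_a = memoryManager::allocate_gpu<double>(N_tot);
  RAJA::View<double, RAJA::Layout<3, int>> d_aView(d_a, N, N, N);

  hipErrchk(hipMemcpy(d_a, a, N_tot * sizeof(double), hipMemcpyHostToDevice));

  // ... run a HIP RAJA::kernel that writes d_aView(i, j, k) ...

  hipErrchk(hipMemcpy(a, d_a, N_tot * sizeof(double), hipMemcpyDeviceToHost));
  checkResult(a, a_ref, N_tot);

  memoryManager::deallocate_gpu(d_a);
#endif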
+ std::memset(a, 0, N_tot * sizeof(double)); + double *d_a = memoryManager::allocate_gpu(N_tot); + +// _3D_raja_device_view_start + RAJA::View< double, RAJA::Layout<3, int> > d_aView(d_a, N, N, N); +// _3D_raja_device_view_end + + hipErrchk(hipMemcpy( d_a, a, N_tot * sizeof(double), hipMemcpyHostToDevice )); + +// _raja_tensorinit_hip_start + using EXEC_POL7 = + RAJA::KernelPolicy< + RAJA::statement::HipKernel< + RAJA::statement::For<2, RAJA::hip_thread_z_loop, // k + RAJA::statement::For<1, RAJA::hip_thread_y_loop, // j + RAJA::statement::For<0, RAJA::hip_thread_x_loop, // i + RAJA::statement::Lambda<0> + > + > + > + > + >; + + RAJA::kernel( + RAJA::make_tuple( RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N) ), + + [=] __device__ ( int i, int j, int k) { + d_aView(i, j, k) = c * i * j * k ; + } + ); +// _raja_tensorinit_hip_end + + hipErrchk(hipMemcpy( a, d_a, N_tot * sizeof(double), hipMemcpyDeviceToHost )); + checkResult(a, a_ref, N_tot); + +//----------------------------------------------------------------------------// + + std::cout << "\n Running RAJA HIP tensor init tiled-direct...\n"; + + // + // Define total thread-block size and size of each block dimension + // + constexpr int block_size = 256; + constexpr int i_block_sz = 32; + constexpr int j_block_sz = block_size / i_block_sz; + constexpr int k_block_sz = 1; + + // set tensor data to zero to ensure we initializing it correctly. + std::memset(a, 0, N_tot * sizeof(double)); + hipErrchk(hipMemcpy( d_a, a, N_tot * sizeof(double), hipMemcpyHostToDevice )); + +// _raja_tensorinit_hip_tiled_direct_start + using EXEC_POL8 = + RAJA::KernelPolicy< + RAJA::statement::HipKernelFixed< i_block_sz * j_block_sz * k_block_sz, + RAJA::statement::Tile<1, RAJA::tile_fixed, + RAJA::hip_block_y_direct, + RAJA::statement::Tile<0, RAJA::tile_fixed, + RAJA::hip_block_x_direct, + RAJA::statement::For<2, RAJA::hip_block_z_direct, // k + RAJA::statement::For<1, RAJA::hip_thread_y_direct, // j + RAJA::statement::For<0, RAJA::hip_thread_x_direct, // i + RAJA::statement::Lambda<0> + > + > + > + > + > + > + >; + + RAJA::kernel( + RAJA::make_tuple( RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N), + RAJA::TypedRangeSegment(0, N) ), + + [=] __device__ ( int i, int j, int k) { + d_aView(i, j, k) = c * i * j * k ; + } + ); +// _raja_tensorinit_hip_tiled_direct_end + + hipErrchk(hipMemcpy( a, d_a, N_tot * sizeof(double), hipMemcpyDeviceToHost )); + checkResult(a, a_ref, N_tot); + + memoryManager::deallocate_gpu(d_a); + +#endif // if defined(RAJA_ENABLE_HIP) + +//----------------------------------------------------------------------------// + + // Clean up... + memoryManager::deallocate(a); + memoryManager::deallocate(a_ref); + + std::cout << "\n DONE!...\n"; + + return 0; +} + +// +// Function to compare result to reference and print result P/F. 
+// +void checkResult(double* a, double* aref, const int n) +{ + bool correct = true; + + int i = 0; + while ( correct && (i < n) ) { + correct = std::abs(a[i] - aref[i]) < 10e-12; + i++; + } + + if ( correct ) { + std::cout << "\n\t result -- PASS\n"; + } else { + std::cout << "\n\t result -- FAIL\n"; + } +} diff --git a/exercises/kernelintro-nested-loop-reorder.cpp b/exercises/kernelintro-nested-loop-reorder.cpp new file mode 100644 index 0000000000..c9327ecc56 --- /dev/null +++ b/exercises/kernelintro-nested-loop-reorder.cpp @@ -0,0 +1,184 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-22, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include +#include + +#include "RAJA/RAJA.hpp" + +/* + * Nested Loop Basics and Loop Reordering (RAJA::kernel) + * + * In this exercise, we introduce basic RAJA::kernel mechanics for executing + * nested loop kernels, including using execution policies to permute the + * order of loops in a loop nest. The exercise performs no actual + * computation and just prints out loop indices to show different + * loop ordering. Also, to avoid difficulty in interpreting parallel + * output, the execution policies use sequential execution. + * + * RAJA features shown: + * - 'RAJA::kernel' loop abstractions and execution policies + * - 'RAJA::TypedRangeSegment' iteration spaces + * - Strongly-typed loop indices + */ + +// +// Define three named loop index integer types used in the triply-nested loops. +// These will trigger compilation errors if lambda index argument ordering +// and types do not match the typed range index ordering. See final +// example in this file. +// +// _raja_typed_indices_start +RAJA_INDEX_VALUE_T(KIDX, int, "KIDX"); +RAJA_INDEX_VALUE_T(JIDX, int, "JIDX"); +RAJA_INDEX_VALUE_T(IIDX, int, "IIDX"); +// _raja_typed_indices_end + + +int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +{ + + // _range_min_max_start + constexpr int imin = 0; + constexpr int imax = 2; + constexpr int jmin = 1; + constexpr int jmax = 3; + constexpr int kmin = 2; + constexpr int kmax = 4; + // _range_min_max_end + +// +// The RAJA variants of the loop nest use the following typed range segments +// based on the typed indices defined above, outside of main(). 
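The typed range segments declared next take the strongly-typed index types (KIDX, JIDX, IIDX) as template arguments, which are not visible in the text. A likely reconstruction, together with how the kernel lambda consumes them:

RAJA::TypedRangeSegment<KIDX> KRange(kmin, kmax);
RAJA::TypedRangeSegment<JIDX> JRange(jmin, jmax);
RAJA::TypedRangeSegment<IIDX> IRange(imin, imax);

// The lambda takes the matching typed indices in tuple order and
// dereferences them to recover the underlying ints:
//   RAJA::kernel<KJI_EXECPOL>(RAJA::make_tuple(IRange, JRange, KRange),
//     [=](IIDX i, JIDX j, KIDX k) {
//       printf(" (%d, %d, %d) \n", (int)(*i), (int)(*j), (int)(*k));
//     });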
+// + // _raja_typed_index_ranges_start + RAJA::TypedRangeSegment KRange(kmin, kmax); + RAJA::TypedRangeSegment JRange(jmin, jmax); + RAJA::TypedRangeSegment IRange(imin, imax); + // _raja_typed_index_ranges_end + + + std::cout << "\n\nRAJA::kernel nested loop reorder example...\n"; + +//----------------------------------------------------------------------------// +//----------------------------------------------------------------------------// + + std::cout << "\n Running C-style nested loop order: K-outer, J-middle, I-inner" + << "...\n\n" << " (I, J, K)\n" << " ---------\n"; + + // _cstyle_kji_loops_start + for (int k = kmin; k < kmax; ++k) { + for (int j = jmin; j < jmax; ++j) { + for (int i = imin; i < imax; ++i) { + printf( " (%d, %d, %d) \n", i, j, k); + } + } + } + // _cstyle_kji_loops_end + +//----------------------------------------------------------------------------// + + std::cout << "\n\n Running RAJA nested loop order (K-outer, J-middle, I-inner)" + << "...\n\n" << " (I, J, K)\n" << " ---------\n"; + + // _raja_kji_loops_start + using KJI_EXECPOL = RAJA::KernelPolicy< + RAJA::statement::For<2, RAJA::seq_exec, // k + RAJA::statement::For<1, RAJA::seq_exec, // j + RAJA::statement::For<0, RAJA::seq_exec,// i + RAJA::statement::Lambda<0> + > + > + > + >; + + RAJA::kernel( RAJA::make_tuple(IRange, JRange, KRange), + [=] (IIDX i, JIDX j, KIDX k) { + printf( " (%d, %d, %d) \n", (int)(*i), (int)(*j), (int)(*k)); + }); + // _raja_kji_loops_end + +//----------------------------------------------------------------------------// +//----------------------------------------------------------------------------// + + std::cout << "\n Running C-style nested loop order: J-outer, I-middle, K-inner" + << "...\n\n" << " (I, J, K)\n" << " ---------\n"; + + // _cstyle_jik_loops_start + for (int j = jmin; j < jmax; ++j) { + for (int i = imin; i < imax; ++i) { + for (int k = kmin; k < kmax; ++k) { + printf( " (%d, %d, %d) \n", i, j, k); + } + } + } + // _cstyle_jik_loops_end + +//----------------------------------------------------------------------------// + std::cout << "\n Running RAJA nested loop order (J-outer, I-middle, K-inner)" + << "...\n\n" << " (I, J, K)\n" << " ---------\n"; + + /// + /// TODO... + /// + /// EXERCISE: Make a RAJA version of the kernel with j on outer loop, + /// i on middle loop, and k on inner loop + /// + +//----------------------------------------------------------------------------// +//----------------------------------------------------------------------------// + + std::cout << "\n Running C-style nested loop order: I-outer, K-middle, J-inner" + << "...\n\n" << " (I, J, K)\n" << " ---------\n"; + + // _cstyle_ikj_loops_start + for (int i = imin; i < imax; ++i) { + for (int k = kmin; k < kmax; ++k) { + for (int j = jmin; j < jmax; ++j) { + printf( " (%d, %d, %d) \n", i, j, k); + } + } + } + // _cstyle_ikj_loops_end + +//----------------------------------------------------------------------------// + + std::cout << "\n Running RAJA nested loop order (I-outer, K-middle, J-inner)" + << "...\n\n" << " (I, J, K)\n" << " ---------\n"; + + /// + /// TODO... + /// + /// EXERCISE: Make a RAJA version of the kernel with i on outer loop, + /// k on middle loop, and j on inner loop + /// + + +//----------------------------------------------------------------------------// +//----------------------------------------------------------------------------// + +#if 0 // Enable this code block to generate compiler error. 
+//----------------------------------------------------------------------------// +// The following demonstrates that code will not compile if lambda argument +// types/order do not match the types/order For statements in the execution +// policy. To see this, enable this code section and try to compile this file. +//----------------------------------------------------------------------------// + + // _raja_compile_error_start + RAJA::kernel( RAJA::make_tuple(IRange, JRange, KRange), + [=] (JIDX i, IIDX j, KIDX k) { + printf( " (%d, %d, %d) \n", (int)(*i), (int)(*j), (int)(*k)); + }); + // _raja_compile_error_end + +#endif + + std::cout << "\n DONE!...\n"; + + return 0; +} + diff --git a/exercises/tutorial_halfday/ex7_nested-loop-reorder_solution.cpp b/exercises/kernelintro-nested-loop-reorder_solution.cpp similarity index 52% rename from exercises/tutorial_halfday/ex7_nested-loop-reorder_solution.cpp rename to exercises/kernelintro-nested-loop-reorder_solution.cpp index 2461e0e40a..14ef279f73 100644 --- a/exercises/tutorial_halfday/ex7_nested-loop-reorder_solution.cpp +++ b/exercises/kernelintro-nested-loop-reorder_solution.cpp @@ -11,64 +11,81 @@ #include "RAJA/RAJA.hpp" /* - * EXERCISE #6: Nested Loop Reordering + * Nested Loop Basics and Loop Reordering (RAJA::kernel) * - * In this exercise, you will use RAJA::kernel execution policies - * to permute the order of loops in a triple loop nest. In particular, - * you will reorder loop statements in execution policies. The exercise - * does no actual computation and just prints out the loop indices to show - * the different orderings. - * - * To avoid the complexity of interpreting parallel output, the execution - * policies you will write will use sequential execution. + * In this exercise, we introduce basic RAJA::kernel mechanics for executing + * nested loop kernels, including using execution policies to permute the + * order of loops in a loop nest. The exercise performs no actual + * computation and just prints out loop indices to show different + * loop ordering. Also, to avoid difficulty in interpreting parallel + * output, the execution policies use sequential execution. * * RAJA features shown: - * - Index range segment * - 'RAJA::kernel' loop abstractions and execution policies - * - Nested loop reordering + * - 'RAJA::TypedRangeSegment' iteration spaces * - Strongly-typed loop indices */ // -// Define three named loop index types used in the triply-nested loops. +// Define three named loop index integer types used in the triply-nested loops. // These will trigger compilation errors if lambda index argument ordering // and types do not match the typed range index ordering. See final // example in this file. // -RAJA_INDEX_VALUE(KIDX, "KIDX"); -RAJA_INDEX_VALUE(JIDX, "JIDX"); -RAJA_INDEX_VALUE(IIDX, "IIDX"); +// _raja_typed_indices_start +RAJA_INDEX_VALUE_T(KIDX, int, "KIDX"); +RAJA_INDEX_VALUE_T(JIDX, int, "JIDX"); +RAJA_INDEX_VALUE_T(IIDX, int, "IIDX"); +// _raja_typed_indices_end int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) { - std::cout << "\n\nExercise #7: RAJA nested loop reorder example...\n"; + // _range_min_max_start + constexpr int imin = 0; + constexpr int imax = 2; + constexpr int jmin = 1; + constexpr int jmax = 3; + constexpr int kmin = 2; + constexpr int kmax = 4; + // _range_min_max_end + +// +// The RAJA variants of the loop nest use the following typed range segments +// based on the typed indices defined above, outside of main(). 
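Those typed indices are also what make the #if 0 example above a compile error rather than a silent bug: RAJA_INDEX_VALUE_T generates a distinct wrapper type per index name, so a lambda whose argument types are permuted no longer matches the segment tuple. A quick hedged way to confirm the distinctness at compile time (assuming <type_traits> is available; this check is not part of the exercise files):

    #include <type_traits>

    // IIDX, JIDX, and KIDX all wrap an int but are unrelated types, so they
    // cannot be interchanged in a lambda argument list.
    static_assert(!std::is_same<IIDX, JIDX>::value, "typed indices are distinct");
    static_assert(!std::is_same<JIDX, KIDX>::value, "typed indices are distinct");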
+// + // _raja_typed_index_ranges_start + RAJA::TypedRangeSegment KRange(kmin, kmax); + RAJA::TypedRangeSegment JRange(jmin, jmax); + RAJA::TypedRangeSegment IRange(imin, imax); + // _raja_typed_index_ranges_end + + + std::cout << "\n\nRAJA::kernel nested loop reorder example...\n"; + +//----------------------------------------------------------------------------// +//----------------------------------------------------------------------------// - std::cout << "\n Running C-style loop nest with loop ordering: K-outer, J-middle, I-inner" + std::cout << "\n Running C-style nested loop order: K-outer, J-middle, I-inner" << "...\n\n" << " (I, J, K)\n" << " ---------\n"; - for (int k = 2; k < 4; ++k) { - for (int j = 1; j < 3; ++j) { - for (int i = 0; i < 2; ++i) { + // _cstyle_kji_loops_start + for (int k = kmin; k < kmax; ++k) { + for (int j = jmin; j < jmax; ++j) { + for (int i = imin; i < imax; ++i) { printf( " (%d, %d, %d) \n", i, j, k); } } } + // _cstyle_kji_loops_end -// -// The RAJA variants of the loop nest used following typed range segments -// based on the typed indices defined above, outside of main(). -// - RAJA::TypedRangeSegment KRange(2, 4); - RAJA::TypedRangeSegment JRange(1, 3); - RAJA::TypedRangeSegment IRange(0, 2); - //----------------------------------------------------------------------------// - std::cout << "\n\n Running RAJA nested loop example (K-outer, J-middle, I-inner)" + std::cout << "\n\n Running RAJA nested loop order (K-outer, J-middle, I-inner)" << "...\n\n" << " (I, J, K)\n" << " ---------\n"; + // _raja_kji_loops_start using KJI_EXECPOL = RAJA::KernelPolicy< RAJA::statement::For<2, RAJA::seq_exec, // k RAJA::statement::For<1, RAJA::seq_exec, // j @@ -83,13 +100,29 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) [=] (IIDX i, JIDX j, KIDX k) { printf( " (%d, %d, %d) \n", (int)(*i), (int)(*j), (int)(*k)); }); + // _raja_kji_loops_end - +//----------------------------------------------------------------------------// //----------------------------------------------------------------------------// - std::cout << "\n Running RAJA nested loop example (J-outer, I-middle, K-inner)" + std::cout << "\n Running C-style nested loop order: J-outer, I-middle, K-inner" + << "...\n\n" << " (I, J, K)\n" << " ---------\n"; + + // _cstyle_jik_loops_start + for (int j = jmin; j < jmax; ++j) { + for (int i = imin; i < imax; ++i) { + for (int k = kmin; k < kmax; ++k) { + printf( " (%d, %d, %d) \n", i, j, k); + } + } + } + // _cstyle_jik_loops_end + +//----------------------------------------------------------------------------// + std::cout << "\n Running RAJA nested loop order (J-outer, I-middle, K-inner)" << "...\n\n" << " (I, J, K)\n" << " ---------\n"; + // _raja_jik_loops_start using JIK_EXECPOL = RAJA::KernelPolicy< RAJA::statement::For<1, RAJA::seq_exec, // j RAJA::statement::For<0, RAJA::seq_exec, // i @@ -104,13 +137,30 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) [=] (IIDX i, JIDX j, KIDX k) { printf( " (%d, %d, %d) \n", (int)(*i), (int)(*j), (int)(*k)); }); + // _raja_jik_loops_end + +//----------------------------------------------------------------------------// +//----------------------------------------------------------------------------// + + std::cout << "\n Running C-style nested loop order: I-outer, K-middle, J-inner" + << "...\n\n" << " (I, J, K)\n" << " ---------\n"; + // _cstyle_ikj_loops_start + for (int i = imin; i < imax; ++i) { + for (int k = kmin; k < kmax; ++k) { + for (int j = jmin; j < jmax; ++j) { + 
printf( " (%d, %d, %d) \n", i, j, k); + } + } + } + // _cstyle_ikj_loops_end //----------------------------------------------------------------------------// - std::cout << "\n Running RAJA nested loop example (I-outer, K-middle, J-inner)" + std::cout << "\n Running RAJA nested loop order (I-outer, K-middle, J-inner)" << "...\n\n" << " (I, J, K)\n" << " ---------\n"; + // _raja_ikj_loops_start using IKJ_EXECPOL = RAJA::KernelPolicy< RAJA::statement::For<0, RAJA::seq_exec, // i RAJA::statement::For<2, RAJA::seq_exec, // k @@ -125,19 +175,25 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) [=] (IIDX i, JIDX j, KIDX k) { printf( " (%d, %d, %d) \n", (int)(*i), (int)(*j), (int)(*k)); }); + // _raja_ikj_loops_end -#if 0 +//----------------------------------------------------------------------------// +//----------------------------------------------------------------------------// + +#if 0 // Enable this code block to generate compiler error. //----------------------------------------------------------------------------// // The following demonstrates that code will not compile if lambda argument // types/order do not match the types/order For statements in the execution // policy. To see this, enable this code section and try to compile this file. //----------------------------------------------------------------------------// + // _raja_compile_error_start RAJA::kernel( RAJA::make_tuple(IRange, JRange, KRange), [=] (JIDX i, IIDX j, KIDX k) { printf( " (%d, %d, %d) \n", (int)(*i), (int)(*j), (int)(*k)); }); + // _raja_compile_error_end #endif diff --git a/exercises/launch-matrix-transpose-local-array.cpp b/exercises/launch-matrix-transpose-local-array.cpp new file mode 100644 index 0000000000..06fe36d53a --- /dev/null +++ b/exercises/launch-matrix-transpose-local-array.cpp @@ -0,0 +1,442 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-22, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include +#include +#include +#include + +#include "RAJA/RAJA.hpp" +#include "memoryManager.hpp" + +/* + * Matrix Transpose Example + * + * In this example, an input matrix A of dimension N_r x N_c is + * transposed and returned as a second matrix At of size N_c x N_r. + * + * This operation is carried out using a local memory tiling + * algorithm. The algorithm first loads matrix entries into an + * iteraion shared tile, a two-dimensional array, and then + * reads from the tile with row and column indices swapped for + * the output matrix. + * + * The algorithm is expressed as a collection of ``outer`` + * and ``inner`` for loops. Iterations of the inner loops will load/read + * data into the tile; while outer loops will iterate over the number + * of tiles needed to carry out the transpose. + * + * RAJA variants of the example use RAJA_TEAM_SHARED as tile memory. + * Furthermore, the tiling pattern is handled by RAJA's tile methods. + * For CPU execution, RAJA_TEAM_SHARED are used to improve + * performance via cache blocking. For CUDA GPU execution, + * RAJA shared memory is mapped to CUDA shared memory which + * enables threads in the same thread block to share data. 
+ * + * RAJA features shown: + * - Basic usage of 'RAJA::launch' abstractions for nested loops + * - tile methods + * - loop_icount methods + * - RAJA_TEAM_SHARED + * + * If CUDA is enabled, CUDA unified memory is used. + */ + +// +// Define dimensionality of matrices and tile size +// +const int DIM = 2; +#define TILE_DIM (16) // #define to appease msvc + +// +// Function for checking results +// +template +void checkResult(RAJA::View> Atview, int N_r, int N_c); + +// +// Function for printing results +// +template +void printResult(RAJA::View> Atview, int N_r, int N_c); + + +int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +{ + + std::cout << "\n\nRAJA shared matrix transpose example...\n"; + + // + // Define num rows/cols in matrix, tile dimensions, and number of tiles + // + // _mattranspose_localarray_dims_start + constexpr int N_r = 267; + constexpr int N_c = 251; + + constexpr int outer_Dimc = (N_c - 1) / TILE_DIM + 1; + constexpr int outer_Dimr = (N_r - 1) / TILE_DIM + 1; + // _mattranspose_localarray_dims_end + + // + // Allocate matrix data + // + int *A = memoryManager::allocate(N_r * N_c); + int *At = memoryManager::allocate(N_r * N_c); + + // + // In the following implementations of matrix transpose, we + // use RAJA 'View' objects to access the matrix data. A RAJA view + // holds a pointer to a data array and enables multi-dimensional indexing + // into the data. + // + // _mattranspose_localarray_views_start + RAJA::View> Aview(A, N_r, N_c); + RAJA::View> Atview(At, N_c, N_r); + // _mattranspose_localarray_views_end + + // + // Initialize matrix data + // + for (int row = 0; row < N_r; ++row) { + for (int col = 0; col < N_c; ++col) { + Aview(row, col) = col; + } + } + // printResult(Aview, N_r, N_c); + + //----------------------------------------------------------------------------// + std::cout << "\n Running C-version of shared matrix transpose...\n"; + + std::memset(At, 0, N_r * N_c * sizeof(int)); + + // _mattranspose_localarray_cstyle_start + // + // (0) Outer loops to iterate over tiles + // + for (int by = 0; by < outer_Dimr; ++by) { + for (int bx = 0; bx < outer_Dimc; ++bx) { + + // Stack-allocated local array for data on a tile + int Tile[TILE_DIM][TILE_DIM]; + + // + // (1) Inner loops to read input matrix tile data into the array + // + // Note: loops are ordered so that input matrix data access + // is stride-1. + // + for (int ty = 0; ty < TILE_DIM; ++ty) { + for (int tx = 0; tx < TILE_DIM; ++tx) { + + int col = bx * TILE_DIM + tx; // Matrix column index + int row = by * TILE_DIM + ty; // Matrix row index + + // Bounds check + if (row < N_r && col < N_c) { + Tile[ty][tx] = Aview(row, col); + } + } + } + + // + // (2) Inner loops to write array data into output array tile + // + // Note: loop order is swapped from above so that output matrix + // data access is stride-1. 
+ // + for (int tx = 0; tx < TILE_DIM; ++tx) { + for (int ty = 0; ty < TILE_DIM; ++ty) { + + int col = bx * TILE_DIM + tx; // Matrix column index + int row = by * TILE_DIM + ty; // Matrix row index + + // Bounds check + if (row < N_r && col < N_c) { + Atview(col, row) = Tile[ty][tx]; + } + } + } + + } + } + // _mattranspose_localarray_cstyle_end + + checkResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); + + //----------------------------------------------------------------------------// + + std::cout << "\n Running RAJA - sequential matrix transpose example ...\n"; + + std::memset(At, 0, N_r * N_c * sizeof(int)); + + // _mattranspose_localarray_raja_start + using loop_pol_1 = RAJA::LoopPolicy; + using launch_policy_1 = RAJA::LaunchPolicy; + + RAJA::launch( + RAJA::LaunchParams(), //LaunchParams may be empty when only running on the cpu + [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { + + RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_r), [&] (RAJA::TypedRangeSegment const &row_tile) { + + RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_c), [&] (RAJA::TypedRangeSegment const &col_tile) { + + RAJA_TEAM_SHARED double Tile_Array[TILE_DIM][TILE_DIM]; + + /// + /// TODO ... + /// + /// Exercise Implement loop_icount methods to load tiles of the + /// input matrix into the RAJA_TEAM_SHARED memory array + /// + + RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx) { + RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) { + + Atview(col, row) = Tile_Array[ty][tx]; + + }); + }); + + }); + }); + + }); + // _mattranspose_localarray_raja_end + + checkResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); + +#if defined(RAJA_ENABLE_OPENMP) + //--------------------------------------------------------------------------// + std::cout << "\n Running RAJA - OpenMP (parallel outer loop) matrix " + "transpose example ...\n"; + + std::memset(At, 0, N_r * N_c * sizeof(int)); + + // + // This policy loops over tiles sequentially while exposing parallelism on + // one of the inner loops. + // + + /// + /// TODO... + /// + /// EXERCISE: Implement an omp_pol_2 type that will distribute loop iterations + /// within the omp parallel region. 
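The first TODO above (in the sequential RAJA variant of this file) asks for the tile-load phase that mirrors the write-out phase already given. A sketch of one way to fill it in; the loop policy template argument (loop_pol_1 here) does not survive the diff rendering above, but loop_icount takes it as a template parameter, and the solution file below follows the same pattern:

          RAJA::loop_icount<loop_pol_1>(ctx, row_tile, [&] (int row, int ty) {
            RAJA::loop_icount<loop_pol_1>(ctx, col_tile, [&] (int col, int tx) {

              // loop_icount provides both the global index (row, col) and the
              // tile-local offset (ty, tx), so the shared tile is indexed
              // locally while the matrix view is indexed globally.
              Tile_Array[ty][tx] = Aview(row, col);

            });
          });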
+ /// + + //using loop_pol_2 = RAJA::LoopPolicy; + using launch_policy_2 = RAJA::LaunchPolicy; + + RAJA::launch( + RAJA::LaunchParams(), //LaunchParams may be empty when only running on the cpu + [=] RAJA_HOST_DEVICE (RAJA::LaunchContext /*ctx*/) { + + /* + RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_r), [&] (RAJA::TypedRangeSegment const &row_tile) { + RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_c), [&] (RAJA::TypedRangeSegment const &col_tile) { + + RAJA_TEAM_SHARED double Tile_Array[TILE_DIM][TILE_DIM]; + + RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) { + RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx) { + + Tile_Array[ty][tx] = Aview(row, col); + + }); + }); + + RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx) { + RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) { + + Atview(col, row) = Tile_Array[ty][tx]; + + }); + }); + + }); + }); + */ + }); + + checkResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); +#endif + + //--------------------------------------------------------------------------// +#if defined(RAJA_ENABLE_CUDA) + std::cout << "\n Running RAJA - CUDA matrix transpose example ...\n"; + + std::memset(At, 0, N_r * N_c * sizeof(int)); + + constexpr int c_block_sz = TILE_DIM; + constexpr int r_block_sz = TILE_DIM; + const int n_blocks_c = RAJA_DIVIDE_CEILING_INT(N_c, c_block_sz); + const int n_blocks_r = RAJA_DIVIDE_CEILING_INT(N_r, r_block_sz); + + /// TODO... + /// + /// EXERCISE: Define loop policies to mapp loop iterations to blocks, threads directly + /// + + const bool cuda_async = false; + using cuda_launch_policy = RAJA::LaunchPolicy>; + + RAJA::launch( + RAJA::LaunchParams(RAJA::Teams(n_blocks_c, n_blocks_r), + RAJA::Threads(c_block_sz, r_block_sz)), + [=] RAJA_HOST_DEVICE (RAJA::LaunchContext /*ctx*/) { + /* + RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_r), [&] (RAJA::TypedRangeSegment const &row_tile) { + + RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_c), [&] (RAJA::TypedRangeSegment const &col_tile) { + + RAJA_TEAM_SHARED double Tile_Array[TILE_DIM][TILE_DIM]; + + RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) { + RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx) { + + Tile_Array[ty][tx] = Aview(row, col); + + }); + }); + + RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx) { + RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) { + + Atview(col, row) = Tile_Array[ty][tx]; + + }); + }); + + }); + }); + */ + }); + + checkResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); +#endif + +//--------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_HIP) + //--------------------------------------------------------------------------// + std::cout << "\n Running RAJA - HIP matrix transpose example ...\n"; + + int *d_A = memoryManager::allocate_gpu(N_r * N_c); + int *d_At = memoryManager::allocate_gpu(N_r * N_c); + + // + // In the following implementations of matrix transpose, we + // use RAJA 'View' objects to access the matrix data. A RAJA view + // holds a pointer to a data array and enables multi-dimensional indexing + // into the data. 
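The two remaining TODOs in this exercise ask for the OpenMP loop policy and the CUDA loop policies. Hedged sketches of the usual choices follow; the exact policy names are assumptions (the angle-bracket arguments are stripped in this rendering), but the solution file below is organized the same way:

    // OpenMP: distribute one of the tile loops across the threads of the
    // launch's parallel region; keep the other loops sequential.
    using omp_pol_2  = RAJA::LoopPolicy<RAJA::omp_for_exec>;
    using loop_pol_2 = RAJA::LoopPolicy<RAJA::seq_exec>;

    // CUDA: map row/col tiles directly to blocks and tile-local iterations
    // directly to threads (the "_direct" policies add no extra striding).
    using cuda_teams_y   = RAJA::LoopPolicy<RAJA::cuda_block_y_direct>;
    using cuda_teams_x   = RAJA::LoopPolicy<RAJA::cuda_block_x_direct>;
    using cuda_threads_y = RAJA::LoopPolicy<RAJA::cuda_thread_y_direct>;
    using cuda_threads_x = RAJA::LoopPolicy<RAJA::cuda_thread_x_direct>;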
+ // + RAJA::View> d_Aview(d_A, N_r, N_c); + RAJA::View> d_Atview(d_At, N_c, N_r); + + std::memset(At, 0, N_r * N_c * sizeof(int)); + hipErrchk(hipMemcpy( d_A, A, N_r * N_c * sizeof(int), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy( d_At, At, N_r * N_c * sizeof(int), hipMemcpyHostToDevice )); + + constexpr int c_block_sz = TILE_DIM; + constexpr int r_block_sz = TILE_DIM; + const int n_blocks_c = RAJA_DIVIDE_CEILING_INT(N_c, c_block_sz); + const int n_blocks_r = RAJA_DIVIDE_CEILING_INT(N_r, r_block_sz); + + using hip_teams_y = RAJA::LoopPolicy; + using hip_teams_x = RAJA::LoopPolicy; + + using hip_threads_y = RAJA::LoopPolicy; + using hip_threads_x = RAJA::LoopPolicy; + + const bool hip_async = false; + using hip_launch_policy = RAJA::LaunchPolicy>; + + RAJA::launch + (RAJA::LaunchParams(RAJA::Teams(n_blocks_c, n_blocks_r), + RAJA::Threads(c_block_sz, r_block_sz)), + [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { + + RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_r), [&] (RAJA::TypedRangeSegment const &row_tile) { + + RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_c), [&] (RAJA::TypedRangeSegment const &col_tile) { + + RAJA_TEAM_SHARED double Tile_Array[TILE_DIM][TILE_DIM]; + + RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) { + RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx) { + + Tile_Array[ty][tx] = d_Aview(row, col); + + }); + }); + + RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx) { + RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) { + + d_Atview(col, row) = Tile_Array[ty][tx]; + + }); + }); + + }); + }); + + }); + + hipErrchk(hipMemcpy( At, d_At, N_r * N_c * sizeof(int), hipMemcpyDeviceToHost )); + checkResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); +#endif + +//--------------------------------------------------------------------------// + + return 0; +} + + +// +// Function to check result and report P/F. +// +template +void checkResult(RAJA::View> Atview, int N_r, int N_c) +{ + bool match = true; + for (int row = 0; row < N_r; ++row) { + for (int col = 0; col < N_c; ++col) { + if (Atview(row, col) != row) { + match = false; + } + } + } + if (match) { + std::cout << "\n\t result -- PASS\n"; + } else { + std::cout << "\n\t result -- FAIL\n"; + } +}; + +// +// Function to print result. +// +template +void printResult(RAJA::View> Atview, int N_r, int N_c) +{ + std::cout << std::endl; + for (int row = 0; row < N_r; ++row) { + for (int col = 0; col < N_c; ++col) { + std::cout << "At(" << row << "," << col << ") = " << Atview(row, col) + << std::endl; + } + std::cout << "" << std::endl; + } + std::cout << std::endl; +} diff --git a/exercises/launch-matrix-transpose-local-array_solution.cpp b/exercises/launch-matrix-transpose-local-array_solution.cpp new file mode 100644 index 0000000000..536d21bfbe --- /dev/null +++ b/exercises/launch-matrix-transpose-local-array_solution.cpp @@ -0,0 +1,437 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-22, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include +#include +#include +#include + +#include "RAJA/RAJA.hpp" +#include "memoryManager.hpp" + +/* + * Matrix Transpose Example + * + * In this example, an input matrix A of dimension N_r x N_c is + * transposed and returned as a second matrix At of size N_c x N_r. 
+ * + * This operation is carried out using a local memory tiling + * algorithm. The algorithm first loads matrix entries into an + * iteraion shared tile, a two-dimensional array, and then + * reads from the tile with row and column indices swapped for + * the output matrix. + * + * The algorithm is expressed as a collection of ``outer`` + * and ``inner`` for loops. Iterations of the inner loops will load/read + * data into the tile; while outer loops will iterate over the number + * of tiles needed to carry out the transpose. + * + * RAJA variants of the example use RAJA_TEAM_SHARED as tile memory. + * Furthermore, the tiling pattern is handled by RAJA's tile methods. + * For CPU execution, RAJA_TEAM_SHARED are used to improve + * performance via cache blocking. For CUDA GPU execution, + * RAJA shared memory is mapped to CUDA shared memory which + * enables threads in the same thread block to share data. + * + * RAJA features shown: + * - Basic usage of 'RAJA::launch' abstractions for nested loops + * - tile methods + * - loop_icount methods + * - RAJA_TEAM_SHARED + * + * If CUDA is enabled, CUDA unified memory is used. + */ + +// +// Define dimensionality of matrices and tile size +// +const int DIM = 2; +#define TILE_DIM (16) // #define to appease msvc + +// +// Function for checking results +// +template +void checkResult(RAJA::View> Atview, int N_r, int N_c); + +// +// Function for printing results +// +template +void printResult(RAJA::View> Atview, int N_r, int N_c); + + +int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +{ + + std::cout << "\n\nRAJA shared matrix transpose example...\n"; + + // + // Define num rows/cols in matrix, tile dimensions, and number of tiles + // + // _mattranspose_localarray_dims_start + constexpr int N_r = 267; + constexpr int N_c = 251; + + constexpr int outer_Dimc = (N_c - 1) / TILE_DIM + 1; + constexpr int outer_Dimr = (N_r - 1) / TILE_DIM + 1; + // _mattranspose_localarray_dims_end + + // + // Allocate matrix data + // + int *A = memoryManager::allocate(N_r * N_c); + int *At = memoryManager::allocate(N_r * N_c); + + // + // In the following implementations of matrix transpose, we + // use RAJA 'View' objects to access the matrix data. A RAJA view + // holds a pointer to a data array and enables multi-dimensional indexing + // into the data. + // + // _mattranspose_localarray_views_start + RAJA::View> Aview(A, N_r, N_c); + RAJA::View> Atview(At, N_c, N_r); + // _mattranspose_localarray_views_end + + // + // Initialize matrix data + // + for (int row = 0; row < N_r; ++row) { + for (int col = 0; col < N_c; ++col) { + Aview(row, col) = col; + } + } + // printResult(Aview, N_r, N_c); + + //----------------------------------------------------------------------------// + std::cout << "\n Running C-version of shared matrix transpose...\n"; + + std::memset(At, 0, N_r * N_c * sizeof(int)); + + // _mattranspose_localarray_cstyle_start + // + // (0) Outer loops to iterate over tiles + // + for (int by = 0; by < outer_Dimr; ++by) { + for (int bx = 0; bx < outer_Dimc; ++bx) { + + // Stack-allocated local array for data on a tile + int Tile[TILE_DIM][TILE_DIM]; + + // + // (1) Inner loops to read input matrix tile data into the array + // + // Note: loops are ordered so that input matrix data access + // is stride-1. 
+ // + for (int ty = 0; ty < TILE_DIM; ++ty) { + for (int tx = 0; tx < TILE_DIM; ++tx) { + + int col = bx * TILE_DIM + tx; // Matrix column index + int row = by * TILE_DIM + ty; // Matrix row index + + // Bounds check + if (row < N_r && col < N_c) { + Tile[ty][tx] = Aview(row, col); + } + } + } + + // + // (2) Inner loops to write array data into output array tile + // + // Note: loop order is swapped from above so that output matrix + // data access is stride-1. + // + for (int tx = 0; tx < TILE_DIM; ++tx) { + for (int ty = 0; ty < TILE_DIM; ++ty) { + + int col = bx * TILE_DIM + tx; // Matrix column index + int row = by * TILE_DIM + ty; // Matrix row index + + // Bounds check + if (row < N_r && col < N_c) { + Atview(col, row) = Tile[ty][tx]; + } + } + } + + } + } + // _mattranspose_localarray_cstyle_end + + checkResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); + + //----------------------------------------------------------------------------// + + std::cout << "\n Running RAJA - sequential matrix transpose example ...\n"; + + std::memset(At, 0, N_r * N_c * sizeof(int)); + + // _mattranspose_localarray_raja_start + using loop_pol_1 = RAJA::LoopPolicy; + using launch_policy_1 = RAJA::LaunchPolicy; + + RAJA::launch( + RAJA::LaunchParams(), //LaunchParams may be empty when only running on the cpu + [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { + + RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_r), [&] (RAJA::TypedRangeSegment const &row_tile) { + + RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_c), [&] (RAJA::TypedRangeSegment const &col_tile) { + + RAJA_TEAM_SHARED double Tile_Array[TILE_DIM][TILE_DIM]; + + RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) { + RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx) { + + Tile_Array[ty][tx] = Aview(row, col); + + }); + }); + + RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx) { + RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) { + + Atview(col, row) = Tile_Array[ty][tx]; + + }); + }); + + }); + }); + + }); + // _mattranspose_localarray_raja_end + + checkResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); + +#if defined(RAJA_ENABLE_OPENMP) + //--------------------------------------------------------------------------// + std::cout << "\n Running RAJA - OpenMP (parallel outer loop) matrix " + "transpose example ...\n"; + + std::memset(At, 0, N_r * N_c * sizeof(int)); + + // + // This policy loops over tiles sequentially while exposing parallelism on + // one of the inner loops. 
+ // + using omp_pol_2 = RAJA::LoopPolicy; + using loop_pol_2 = RAJA::LoopPolicy; + using launch_policy_2 = RAJA::LaunchPolicy; + + RAJA::launch( + RAJA::LaunchParams(), //LaunchParams may be empty when only running on the cpu + [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { + + RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_r), [&] (RAJA::TypedRangeSegment const &row_tile) { + + RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_c), [&] (RAJA::TypedRangeSegment const &col_tile) { + + RAJA_TEAM_SHARED double Tile_Array[TILE_DIM][TILE_DIM]; + + RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) { + RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx) { + + Tile_Array[ty][tx] = Aview(row, col); + + }); + }); + + RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx) { + RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) { + + Atview(col, row) = Tile_Array[ty][tx]; + + }); + }); + + }); + }); + + }); + + checkResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); +#endif + + //--------------------------------------------------------------------------// +#if defined(RAJA_ENABLE_CUDA) + std::cout << "\n Running RAJA - CUDA matrix transpose example ...\n"; + + std::memset(At, 0, N_r * N_c * sizeof(int)); + + constexpr int c_block_sz = TILE_DIM; + constexpr int r_block_sz = TILE_DIM; + const int n_blocks_c = RAJA_DIVIDE_CEILING_INT(N_c, c_block_sz); + const int n_blocks_r = RAJA_DIVIDE_CEILING_INT(N_r, r_block_sz); + + using cuda_teams_y = RAJA::LoopPolicy; + using cuda_teams_x = RAJA::LoopPolicy; + + using cuda_threads_y = RAJA::LoopPolicy; + using cuda_threads_x = RAJA::LoopPolicy; + + const bool cuda_async = false; + using cuda_launch_policy = RAJA::LaunchPolicy>; + + RAJA::launch( + RAJA::LaunchParams(RAJA::Teams(n_blocks_c, n_blocks_r), + RAJA::Threads(c_block_sz, r_block_sz)), + [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { + + RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_r), [&] (RAJA::TypedRangeSegment const &row_tile) { + + RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_c), [&] (RAJA::TypedRangeSegment const &col_tile) { + + RAJA_TEAM_SHARED double Tile_Array[TILE_DIM][TILE_DIM]; + + RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) { + RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx) { + + Tile_Array[ty][tx] = Aview(row, col); + + }); + }); + + RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx) { + RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) { + + Atview(col, row) = Tile_Array[ty][tx]; + + }); + }); + + }); + }); + + }); + + checkResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); +#endif + +//--------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_HIP) + //--------------------------------------------------------------------------// + std::cout << "\n Running RAJA - HIP matrix transpose example ...\n"; + + int *d_A = memoryManager::allocate_gpu(N_r * N_c); + int *d_At = memoryManager::allocate_gpu(N_r * N_c); + + // + // In the following implementations of matrix transpose, we + // use RAJA 'View' objects to access the matrix data. A RAJA view + // holds a pointer to a data array and enables multi-dimensional indexing + // into the data. 
+ // + RAJA::View> d_Aview(d_A, N_r, N_c); + RAJA::View> d_Atview(d_At, N_c, N_r); + + std::memset(At, 0, N_r * N_c * sizeof(int)); + hipErrchk(hipMemcpy( d_A, A, N_r * N_c * sizeof(int), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy( d_At, At, N_r * N_c * sizeof(int), hipMemcpyHostToDevice )); + + constexpr int c_block_sz = TILE_DIM; + constexpr int r_block_sz = TILE_DIM; + const int n_blocks_c = RAJA_DIVIDE_CEILING_INT(N_c, c_block_sz); + const int n_blocks_r = RAJA_DIVIDE_CEILING_INT(N_r, r_block_sz); + + using hip_teams_y = RAJA::LoopPolicy; + using hip_teams_x = RAJA::LoopPolicy; + + using hip_threads_y = RAJA::LoopPolicy; + using hip_threads_x = RAJA::LoopPolicy; + + const bool hip_async = false; + using hip_launch_policy = RAJA::LaunchPolicy>; + + RAJA::launch + (RAJA::LaunchParams(RAJA::Teams(n_blocks_c, n_blocks_r), + RAJA::Threads(c_block_sz, r_block_sz)), + [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { + + RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_r), [&] (RAJA::TypedRangeSegment const &row_tile) { + + RAJA::tile(ctx, TILE_DIM, RAJA::TypedRangeSegment(0, N_c), [&] (RAJA::TypedRangeSegment const &col_tile) { + + RAJA_TEAM_SHARED double Tile_Array[TILE_DIM][TILE_DIM]; + + RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) { + RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx) { + + Tile_Array[ty][tx] = d_Aview(row, col); + + }); + }); + + RAJA::loop_icount(ctx, col_tile, [&] (int col, int tx) { + RAJA::loop_icount(ctx, row_tile, [&] (int row, int ty) { + + d_Atview(col, row) = Tile_Array[ty][tx]; + + }); + }); + + }); + }); + + }); + + hipErrchk(hipMemcpy( At, d_At, N_r * N_c * sizeof(int), hipMemcpyDeviceToHost )); + checkResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); +#endif + +//--------------------------------------------------------------------------// + + return 0; +} + + +// +// Function to check result and report P/F. +// +template +void checkResult(RAJA::View> Atview, int N_r, int N_c) +{ + bool match = true; + for (int row = 0; row < N_r; ++row) { + for (int col = 0; col < N_c; ++col) { + if (Atview(row, col) != row) { + match = false; + } + } + } + if (match) { + std::cout << "\n\t result -- PASS\n"; + } else { + std::cout << "\n\t result -- FAIL\n"; + } +}; + +// +// Function to print result. +// +template +void printResult(RAJA::View> Atview, int N_r, int N_c) +{ + std::cout << std::endl; + for (int row = 0; row < N_r; ++row) { + for (int col = 0; col < N_c; ++col) { + std::cout << "At(" << row << "," << col << ") = " << Atview(row, col) + << std::endl; + } + std::cout << "" << std::endl; + } + std::cout << std::endl; +} diff --git a/exercises/launch-matrix-transpose-tiled.cpp b/exercises/launch-matrix-transpose-tiled.cpp new file mode 100644 index 0000000000..86a88413b7 --- /dev/null +++ b/exercises/launch-matrix-transpose-tiled.cpp @@ -0,0 +1,422 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-22, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include +#include +#include +#include + +#include "memoryManager.hpp" + +#include "RAJA/RAJA.hpp" + +/* + * Tiled Matrix Transpose Example + * + * In this example, an input matrix A of dimension N_r x N_c is + * transposed and returned as a second matrix At. + * + * This operation is carried out using a tiling algorithm. 
+ * The algorithm iterates over tiles of the matrix A and + * performs a transpose copy without explicitly storing the tile. + * + * The algorithm is expressed as a collection of ``outer`` + * and ``inner`` for loops. Iterations of the inner loop will + * tranpose tile entries; while outer loops will iterate over + * the number of tiles needed to carryout the transpose. + * We do not assume that tiles divide the number of rows and + * and columns of the matrix. + * + * RAJA features shown: + * - Basic usage of 'RAJA::launch' abstractions for nested loops + * - tiling method + * + * If CUDA is enabled, CUDA unified memory is used. + */ + +// +// Define dimensionality of matrices +// +constexpr int DIM = 2; + +// +// Function for checking results +// +template +void checkResult(RAJA::View> Atview, int N_r, int N_c); + +// +// Function for printing results +// +template +void printResult(RAJA::View> Atview, int N_r, int N_c); + + +int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +{ + + std::cout << "\n\nRAJA tiled matrix transpose example...\n"; + + // + // Define num rows/cols in matrix, tile dimensions, and number of tiles. + // + // _tiled_mattranspose_dims_start + constexpr int N_r = 56; + constexpr int N_c = 75; + + constexpr int TILE_DIM = 16; + + constexpr int outer_Dimc = (N_c - 1) / TILE_DIM + 1; + constexpr int outer_Dimr = (N_r - 1) / TILE_DIM + 1; + // _tiled_mattranspose_dims_end + + // + // Allocate matrix data + // + int *A = memoryManager::allocate(N_r * N_c); + int *At = memoryManager::allocate(N_r * N_c); + + // + // In the following implementations of tiled matrix transpose, we + // use RAJA 'View' objects to access the matrix data. A RAJA view + // holds a pointer to a data array and enables multi-dimensional indexing + // into the data. + // + // _tiled_mattranspose_views_start + RAJA::View> Aview(A, N_r, N_c); + RAJA::View> Atview(At, N_c, N_r); + // _tiled_mattranspose_views_end + + // + // Initialize matrix data + // + for (int row = 0; row < N_r; ++row) { + for (int col = 0; col < N_c; ++col) { + Aview(row, col) = col; + } + } + //printResult(Aview, N_r, N_c); + + + //----------------------------------------------------------------------------// + std::cout << "\n Running C-version of tiled matrix transpose...\n"; + + std::memset(At, 0, N_r * N_c * sizeof(int)); + + // _cstyle_tiled_mattranspose_start + // + // (0) Outer loops to iterate over tiles + // + for (int by = 0; by < outer_Dimr; ++by) { + for (int bx = 0; bx < outer_Dimc; ++bx) { + // + // (1) Loops to iterate over tile entries + // + for (int ty = 0; ty < TILE_DIM; ++ty) { + for (int tx = 0; tx < TILE_DIM; ++tx) { + + int col = bx * TILE_DIM + tx; // Matrix column index + int row = by * TILE_DIM + ty; // Matrix row index + + // Bounds check + if (row < N_r && col < N_c) { + Atview(col, row) = Aview(row, col); + } + } + } + + } + } + // _cstyle_tiled_mattranspose_end + + checkResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); + //----------------------------------------------------------------------------// + + // + // The following RAJA variants use the RAJA::kernel method to carryout the + // transpose. + // + // Here, we define RAJA range segments to establish the iteration spaces. + // Further partioning of the iteration space is carried out in the + // tile_fixed statements. Iterations inside a RAJA loop is given by their + // global iteration number. + // + +/// +/// TODO: Uncomment these range segments so you can use them in the +/// non-HIP exercises in this file. 
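The TODO above asks you to uncomment the row/column range segments before working the exercises in this file. With the stripped angle-bracket argument restored (an assumption of this sketch), they would read:

    RAJA::TypedRangeSegment<int> row_Range(0, N_r);
    RAJA::TypedRangeSegment<int> col_Range(0, N_c);

RAJA::tile then partitions each segment into TILE_DIM-sized sub-segments and clamps the final one to the segment end; that is why the RAJA variants below carry no explicit bounds check even though 56 and 75 are not multiples of 16, while the C-style version above must test row < N_r and col < N_c.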
+/* + RAJA::TypedRangeSegment row_Range(0, N_r); + RAJA::TypedRangeSegment col_Range(0, N_c); +*/ + + //----------------------------------------------------------------------------// + std::cout << "\n Running sequential tiled matrix transpose ...\n"; + std::memset(At, 0, N_r * N_c * sizeof(int)); + + // + // The following policy carries out the transpose + // using sequential loops. The template parameter inside + // tile_fixed corresponds to the dimension size of the tile. + // + // _raja_tiled_mattranspose_start + //using loop_pol_1 = RAJA::LoopPolicy; + using launch_policy_1 = RAJA::LaunchPolicy; + + RAJA::launch( + RAJA::LaunchParams(), //LaunchParams may be empty when running on the cpu + [=] RAJA_HOST_DEVICE (RAJA::LaunchContext /*ctx*/) { + + /* + RAJA::tile(ctx, TILE_DIM, row_Range, [&] (RAJA::TypedRangeSegment const &row_tile) { + + RAJA::tile(ctx, TILE_DIM, col_Range, [&] (RAJA::TypedRangeSegment const &col_tile) { + + RAJA::loop(ctx, row_tile, [&] (int row) { + + /// + /// TODO... + /// + /// EXERCISE: Implement a loop method that takes a col_tile and + /// returns the global index to the column iteration + /// + /// Uncomment the statement below to run the kernel and check the + /// result. + /// + + //Atview(col, row) = Aview(row, col); + + }); + + }); + }); + */ + }); + // _raja_tiled_mattranspose_end + + checkResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); + +//----------------------------------------------------------------------------// +#if defined(RAJA_ENABLE_OPENMP) + std::cout << "\n Running openmp tiled matrix transpose - parallel top inner loop...\n"; + + std::memset(At, 0, N_r * N_c * sizeof(int)); + + // + // This policy loops over tiles sequentially while exposing parallelism on + // one of the inner loops. + // + //using omp_for_pol_2 = RAJA::LoopPolicy; + //using loop_pol_2 = RAJA::LoopPolicy; + + /// + /// TODO... + /// + /// EXERCISE: Create a launch_policy_2 that will create an omp parallel region + /// + /// Uncomment the kernel below to run it and check the result. + /// + /// + + /* + RAJA::launch( + RAJA::LaunchParams(), //LaunchParams may be empty when running on the cpu + [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { + + RAJA::tile(ctx, TILE_DIM, row_Range, [&] (RAJA::TypedRangeSegment const &row_tile) { + + RAJA::tile(ctx, TILE_DIM, col_Range, [&] (RAJA::TypedRangeSegment const &col_tile) { + + RAJA::loop(ctx, row_tile, [&] (int row) { + RAJA::loop(ctx, col_tile, [&] (int col) { + + Atview(col, row) = Aview(row, col); + + }); + }); + + }); + }); + + }); +*/ + + checkResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); + +#endif + +//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_CUDA) + std::cout << "\n Running cuda tiled matrix transpose ...\n"; + + std::memset(At, 0, N_r * N_c * sizeof(int)); + + /* + constexpr int c_block_sz = TILE_DIM; + constexpr int r_block_sz = TILE_DIM; + const int n_blocks_c = RAJA_DIVIDE_CEILING_INT(N_c, c_block_sz); + const int n_blocks_r = RAJA_DIVIDE_CEILING_INT(N_r, r_block_sz); + + using cuda_teams_y = RAJA::LoopPolicy; + using cuda_teams_x = RAJA::LoopPolicy; + + using cuda_threads_y = RAJA::LoopPolicy; + using cuda_threads_x = RAJA::LoopPolicy; + */ + + /// TODO... + /// + /// EXERCISE: Implement the cuda launch policy to dispatch the kernel below + /// on the GPU + /// + /// When you uncomment kernel code below, you will also need to + /// uncomment variables above that are used within it. 
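Hedged sketches for the three TODOs above; they mirror the solution file later in this diff, and the policy names are assumptions where the rendering has stripped the template arguments:

    // (1) Sequential variant: the missing inner loop yields the global column
    //     index for each entry of the current column tile.
    RAJA::loop<loop_pol_1>(ctx, col_tile, [&] (int col) {
      Atview(col, row) = Aview(row, col);
    });

    // (2) OpenMP variant: the launch policy opens an OpenMP parallel region,
    //     and the commented-out omp_for_pol_2 above distributes the row tiles.
    using launch_policy_2 = RAJA::LaunchPolicy<RAJA::omp_launch_t>;

    // (3) CUDA variant: a GPU launch policy; the boolean selects a synchronous
    //     (false) or asynchronous (true) kernel launch.
    const bool cuda_async = false;
    using cuda_launch_policy = RAJA::LaunchPolicy<RAJA::cuda_launch_t<cuda_async>>;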
+ /// + +/* + RAJA::launch( + RAJA::LaunchParams(RAJA::Teams(n_blocks_c, n_blocks_r), + RAJA::Threads(c_block_sz, r_block_sz)), + [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { + + RAJA::tile(ctx, TILE_DIM, row_Range, [&] (RAJA::TypedRangeSegment const &row_tile) { + + RAJA::tile(ctx, TILE_DIM, col_Range, [&] (RAJA::TypedRangeSegment const &col_tile) { + + RAJA::loop(ctx, row_tile, [&] (int row) { + RAJA::loop(ctx, col_tile, [&] (int col) { + + Atview(col, row) = Aview(row, col); + + }); + }); + + }); + }); + + }); +*/ + + checkResult(Atview, N_c, N_r); + //printResult(Atview, N_c, N_r); +#endif + +//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_HIP) + std::cout << "\n Running hip tiled matrix transpose ...\n"; + + RAJA::TypedRangeSegment row_Range2(0, N_r); + RAJA::TypedRangeSegment col_Range2(0, N_c); + + int* d_A = memoryManager::allocate_gpu(N_r * N_c); + int* d_At = memoryManager::allocate_gpu(N_r * N_c); + + RAJA::View> d_Aview(d_A, N_r, N_c); + RAJA::View> d_Atview(d_At, N_c, N_r); + + std::memset(At, 0, N_r * N_c * sizeof(int)); + hipErrchk(hipMemcpy( d_A, A, N_r * N_c * sizeof(int), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy( d_At, At, N_r * N_c * sizeof(int), hipMemcpyHostToDevice )); + + constexpr int c_block_sz = TILE_DIM; + constexpr int r_block_sz = TILE_DIM; + const int n_blocks_c = RAJA_DIVIDE_CEILING_INT(N_c, c_block_sz); + const int n_blocks_r = RAJA_DIVIDE_CEILING_INT(N_r, r_block_sz); + + using hip_teams_y = RAJA::LoopPolicy; + using hip_teams_x = RAJA::LoopPolicy; + + using hip_threads_y = RAJA::LoopPolicy; + using hip_threads_x = RAJA::LoopPolicy; + + const bool hip_async = false; + using hip_launch_policy = RAJA::LaunchPolicy>; + + RAJA::launch + (RAJA::LaunchParams(RAJA::Teams(n_blocks_c, n_blocks_r), + RAJA::Threads(c_block_sz, r_block_sz)), + [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { + + RAJA::tile(ctx, TILE_DIM, row_Range2, [&] (RAJA::TypedRangeSegment const &row_tile) { + + RAJA::tile(ctx, TILE_DIM, col_Range2, [&] (RAJA::TypedRangeSegment const &col_tile) { + + RAJA::loop(ctx, row_tile, [&] (int row) { + RAJA::loop(ctx, col_tile, [&] (int col) { + + Atview(col, row) = Aview(row, col); + + }); + }); + + }); + }); + + }); + + hipErrchk(hipMemcpy( At, d_At, N_r * N_c * sizeof(int), hipMemcpyDeviceToHost )); + checkResult(Atview, N_c, N_r); + //printResult(Atview, N_c, N_r); +#endif + + //----------------------------------------------------------------------------// + + + // + // Clean up. + // + memoryManager::deallocate(A); + memoryManager::deallocate(At); + + std::cout << "\n DONE!...\n"; + + return 0; +} + +// +// Function to check result and report P/F. +// +template +void checkResult(RAJA::View> Atview, int N_r, int N_c) +{ + bool match = true; + for (int row = 0; row < N_r; ++row) { + for (int col = 0; col < N_c; ++col) { + if (Atview(row, col) != row) { + match = false; + } + } + } + if (match) { + std::cout << "\n\t result -- PASS\n"; + } else { + std::cout << "\n\t result -- FAIL\n"; + } +}; + +// +// Function to print result. 
+// +template +void printResult(RAJA::View> Atview, int N_r, int N_c) +{ + std::cout << std::endl; + for (int row = 0; row < N_r; ++row) { + for (int col = 0; col < N_c; ++col) { + // std::cout << "At(" << row << "," << col << ") = " << Atview(row, col) + // << std::endl; + std::cout< +#include +#include +#include + +#include "memoryManager.hpp" + +#include "RAJA/RAJA.hpp" + +/* + * Tiled Matrix Transpose Example + * + * In this example, an input matrix A of dimension N_r x N_c is + * transposed and returned as a second matrix At. + * + * This operation is carried out using a tiling algorithm. + * The algorithm iterates over tiles of the matrix A and + * performs a transpose copy without explicitly storing the tile. + * + * The algorithm is expressed as a collection of ``outer`` + * and ``inner`` for loops. Iterations of the inner loop will + * tranpose tile entries; while outer loops will iterate over + * the number of tiles needed to carryout the transpose. + * We do not assume that tiles divide the number of rows and + * and columns of the matrix. + * + * RAJA features shown: + * - Basic usage of 'RAJA::launch' abstractions for nested loops + * - tiling method + * + * If CUDA is enabled, CUDA unified memory is used. + */ + +// +// Define dimensionality of matrices +// +constexpr int DIM = 2; + +// +// Function for checking results +// +template +void checkResult(RAJA::View> Atview, int N_r, int N_c); + +// +// Function for printing results +// +template +void printResult(RAJA::View> Atview, int N_r, int N_c); + + +int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +{ + + std::cout << "\n\nRAJA tiled matrix transpose example...\n"; + + // + // Define num rows/cols in matrix, tile dimensions, and number of tiles. + // + // _tiled_mattranspose_dims_start + constexpr int N_r = 56; + constexpr int N_c = 75; + + constexpr int TILE_DIM = 16; + + constexpr int outer_Dimc = (N_c - 1) / TILE_DIM + 1; + constexpr int outer_Dimr = (N_r - 1) / TILE_DIM + 1; + // _tiled_mattranspose_dims_end + + // + // Allocate matrix data + // + int *A = memoryManager::allocate(N_r * N_c); + int *At = memoryManager::allocate(N_r * N_c); + + // + // In the following implementations of tiled matrix transpose, we + // use RAJA 'View' objects to access the matrix data. A RAJA view + // holds a pointer to a data array and enables multi-dimensional indexing + // into the data. 
+ // + // _tiled_mattranspose_views_start + RAJA::View> Aview(A, N_r, N_c); + RAJA::View> Atview(At, N_c, N_r); + // _tiled_mattranspose_views_end + + // + // Initialize matrix data + // + for (int row = 0; row < N_r; ++row) { + for (int col = 0; col < N_c; ++col) { + Aview(row, col) = col; + } + } + //printResult(Aview, N_r, N_c); + + + //----------------------------------------------------------------------------// + std::cout << "\n Running C-version of tiled matrix transpose...\n"; + + std::memset(At, 0, N_r * N_c * sizeof(int)); + + // _cstyle_tiled_mattranspose_start + // + // (0) Outer loops to iterate over tiles + // + for (int by = 0; by < outer_Dimr; ++by) { + for (int bx = 0; bx < outer_Dimc; ++bx) { + // + // (1) Loops to iterate over tile entries + // + for (int ty = 0; ty < TILE_DIM; ++ty) { + for (int tx = 0; tx < TILE_DIM; ++tx) { + + int col = bx * TILE_DIM + tx; // Matrix column index + int row = by * TILE_DIM + ty; // Matrix row index + + // Bounds check + if (row < N_r && col < N_c) { + Atview(col, row) = Aview(row, col); + } + } + } + + } + } + // _cstyle_tiled_mattranspose_end + + checkResult(Atview, N_c, N_r); + //printResult(Atview, N_c, N_r); + //----------------------------------------------------------------------------// + + // + // The following RAJA variants use the RAJA::kernel method to carryout the + // transpose. + // + // Here, we define RAJA range segments to establish the iteration spaces. + // Further partioning of the iteration space is carried out in the + // tile_fixed statements. Iterations inside a RAJA loop is given by their + // global iteration number. + // + RAJA::TypedRangeSegment row_Range(0, N_r); + RAJA::TypedRangeSegment col_Range(0, N_c); + + //----------------------------------------------------------------------------// + std::cout << "\n Running sequential tiled matrix transpose ...\n"; + std::memset(At, 0, N_r * N_c * sizeof(int)); + + // + // The following policy carries out the transpose + // using sequential loops. The template parameter inside + // tile_fixed corresponds to the dimension size of the tile. + // + // _raja_tiled_mattranspose_start + using loop_pol_1 = RAJA::LoopPolicy; + using launch_policy_1 = RAJA::LaunchPolicy; + + RAJA::launch(RAJA::LaunchParams(), //LaunchParams may be empty when running on the cpu + [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { + + RAJA::tile(ctx, TILE_DIM, row_Range, [&] (RAJA::TypedRangeSegment const &row_tile) { + + RAJA::tile(ctx, TILE_DIM, col_Range, [&] (RAJA::TypedRangeSegment const &col_tile) { + + RAJA::loop(ctx, row_tile, [&] (int row) { + RAJA::loop(ctx, col_tile, [&] (int col) { + + Atview(col, row) = Aview(row, col); + + }); + }); + + }); + }); + + }); + // _raja_tiled_mattranspose_end + + checkResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); + +//----------------------------------------------------------------------------// +#if defined(RAJA_ENABLE_OPENMP) + std::cout << "\n Running openmp tiled matrix transpose - parallel top inner loop...\n"; + + std::memset(At, 0, N_r * N_c * sizeof(int)); + + // + // This policy loops over tiles sequentially while exposing parallelism on + // one of the inner loops. 
+ // + using omp_for_pol_2 = RAJA::LoopPolicy; + using loop_pol_2 = RAJA::LoopPolicy; + using launch_policy_2 = RAJA::LaunchPolicy; + + RAJA::launch( + RAJA::LaunchParams(), //LaunchParams may be empty when running on the cpu + [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { + + RAJA::tile(ctx, TILE_DIM, row_Range, [&] (RAJA::TypedRangeSegment const &row_tile) { + + RAJA::tile(ctx, TILE_DIM, col_Range, [&] (RAJA::TypedRangeSegment const &col_tile) { + + RAJA::loop(ctx, row_tile, [&] (int row) { + RAJA::loop(ctx, col_tile, [&] (int col) { + + Atview(col, row) = Aview(row, col); + + }); + }); + + }); + }); + + }); + + checkResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); + +#endif + +//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_CUDA) + std::cout << "\n Running cuda tiled matrix transpose ...\n"; + + std::memset(At, 0, N_r * N_c * sizeof(int)); + + constexpr int c_block_sz = TILE_DIM; + constexpr int r_block_sz = TILE_DIM; + const int n_blocks_c = RAJA_DIVIDE_CEILING_INT(N_c, c_block_sz); + const int n_blocks_r = RAJA_DIVIDE_CEILING_INT(N_r, r_block_sz); + + // _raja_mattranspose_cuda_start + using cuda_teams_y = RAJA::LoopPolicy; + using cuda_teams_x = RAJA::LoopPolicy; + + using cuda_threads_y = RAJA::LoopPolicy; + using cuda_threads_x = RAJA::LoopPolicy; + + const bool cuda_async = false; + using cuda_launch_policy = RAJA::LaunchPolicy>; + + RAJA::launch( + RAJA::LaunchParams(RAJA::Teams(n_blocks_c, n_blocks_r), + RAJA::Threads(c_block_sz, r_block_sz)), + [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { + + RAJA::tile(ctx, TILE_DIM, row_Range, [&] (RAJA::TypedRangeSegment const &row_tile) { + + RAJA::tile(ctx, TILE_DIM, col_Range, [&] (RAJA::TypedRangeSegment const &col_tile) { + + RAJA::loop(ctx, row_tile, [&] (int row) { + RAJA::loop(ctx, col_tile, [&] (int col) { + + Atview(col, row) = Aview(row, col); + + }); + }); + + }); + }); + + }); + // _raja_mattranspose_cuda_end + + checkResult(Atview, N_c, N_r); + //printResult(Atview, N_c, N_r); +#endif + +//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_HIP) + std::cout << "\n Running hip tiled matrix transpose ...\n"; + + int* d_A = memoryManager::allocate_gpu(N_r * N_c); + int* d_At = memoryManager::allocate_gpu(N_r * N_c); + + RAJA::View> d_Aview(d_A, N_r, N_c); + RAJA::View> d_Atview(d_At, N_c, N_r); + + std::memset(At, 0, N_r * N_c * sizeof(int)); + hipErrchk(hipMemcpy( d_A, A, N_r * N_c * sizeof(int), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy( d_At, At, N_r * N_c * sizeof(int), hipMemcpyHostToDevice )); + + constexpr int c_block_sz = TILE_DIM; + constexpr int r_block_sz = TILE_DIM; + const int n_blocks_c = RAJA_DIVIDE_CEILING_INT(N_c, c_block_sz); + const int n_blocks_r = RAJA_DIVIDE_CEILING_INT(N_r, r_block_sz); + + using hip_teams_y = RAJA::LoopPolicy; + using hip_teams_x = RAJA::LoopPolicy; + + using hip_threads_y = RAJA::LoopPolicy; + using hip_threads_x = RAJA::LoopPolicy; + + const bool hip_async = false; + using hip_launch_policy = RAJA::LaunchPolicy>; + + RAJA::launch( + RAJA::LaunchParams(RAJA::Teams(n_blocks_c, n_blocks_r), + RAJA::Threads(c_block_sz, r_block_sz)), + [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { + + RAJA::tile(ctx, TILE_DIM, row_Range, [&] (RAJA::TypedRangeSegment const &row_tile) { + + RAJA::tile (ctx, TILE_DIM, col_Range, [&] (RAJA::TypedRangeSegment const &col_tile) { + + RAJA::loop(ctx, row_tile, [&] (int row) { + RAJA::loop(ctx, col_tile, [&] (int col) { + + 
d_Atview(col, row) = d_Aview(row, col); + + }); + }); + + }); + }); + + }); + + hipErrchk(hipMemcpy( At, d_At, N_r * N_c * sizeof(int), hipMemcpyDeviceToHost )); + checkResult(Atview, N_c, N_r); + //printResult(Atview, N_c, N_r); +#endif + + //----------------------------------------------------------------------------// + + + // + // Clean up. + // + memoryManager::deallocate(A); + memoryManager::deallocate(At); + + std::cout << "\n DONE!...\n"; + + return 0; +} + +// +// Function to check result and report P/F. +// +template +void checkResult(RAJA::View> Atview, int N_r, int N_c) +{ + bool match = true; + for (int row = 0; row < N_r; ++row) { + for (int col = 0; col < N_c; ++col) { + if (Atview(row, col) != row) { + match = false; + } + } + } + if (match) { + std::cout << "\n\t result -- PASS\n"; + } else { + std::cout << "\n\t result -- FAIL\n"; + } +}; + +// +// Function to print result. +// +template +void printResult(RAJA::View> Atview, int N_r, int N_c) +{ + std::cout << std::endl; + for (int row = 0; row < N_r; ++row) { + for (int col = 0; col < N_c; ++col) { + // std::cout << "At(" << row << "," << col << ") = " << Atview(row, col) + // << std::endl; + std::cout< +#include +#include +#include + +#include "memoryManager.hpp" + +#include "RAJA/RAJA.hpp" + +/* + * Tiled Matrix Transpose Exercise + * + * In this exercise, an input matrix A of dimension N_r x N_c is + * transposed and returned as a second matrix At. + * + * RAJA features shown: + * - Basic usage of 'RAJA::launch' abstractions for nested loops + * + * If CUDA is enabled, CUDA unified memory is used. + */ + +// +// Define dimensionality of matrices +// +constexpr int DIM = 2; + +// +// Function for checking results +// +template +void checkResult(RAJA::View> Atview, int N_r, int N_c); + +// +// Function for printing results +// +template +void printResult(RAJA::View> Atview, int N_r, int N_c); + + +int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +{ + + std::cout << "\n\nRAJA matrix transpose exercise...\n"; + + // + // Define num rows/cols in matrix. + // + // _mattranspose_dims_start + constexpr int N_r = 56; + constexpr int N_c = 75; + // _mattranspose_dims_end + + // + // Allocate matrix data + // + int *A = memoryManager::allocate(N_r * N_c); + int *At = memoryManager::allocate(N_r * N_c); + + // + // In the following implementations of matrix transpose, we + // use RAJA 'View' objects to access the matrix data. A RAJA view + // holds a pointer to a data array and enables multi-dimensional indexing + // into the data. + // + // _mattranspose_views_start + RAJA::View> Aview(A, N_r, N_c); + RAJA::View> Atview(At, N_c, N_r); + // _mattranspose_views_end + + // + // Initialize matrix data + // + for (int row = 0; row < N_r; ++row) { + for (int col = 0; col < N_c; ++col) { + Aview(row, col) = col; + } + } + //printResult(Aview, N_r, N_c); + + //----------------------------------------------------------------------------// + std::cout << "\n Running C-version of matrix transpose...\n"; + + std::memset(At, 0, N_r * N_c * sizeof(int)); + + // _cstyle_mattranspose_start + for (int row = 0; row < N_r; ++row) { + for (int col = 0; col < N_c; ++col) { + Atview(col, row) = Aview(row, col); + } + } + // _cstyle_mattranspose_end + + checkResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); + + //----------------------------------------------------------------------------// + + // + // The following RAJA variants use the RAJA::kernel method to carryout the + // transpose. 
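Since the View comments in these files are brief, a short sketch of what the two-dimensional views provide; the Layout template argument shown is an assumption (the rendering strips angle brackets), and the default layout is row-major, so Aview(row, col) addresses element row * N_c + col of the underlying buffer:

    RAJA::View<int, RAJA::Layout<DIM>> Aview(A, N_r, N_c);

    Aview(1, 2) = 42;          // writes A[1 * N_c + 2]
    int val = Aview(1, 2);     // reads the same element back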
+ // + // Here, we define RAJA range segments to establish the iteration spaces. + // Iterations inside a RAJA loop is given by their global iteration number. + // + RAJA::TypedRangeSegment row_Range(0, N_r); + RAJA::TypedRangeSegment col_Range(0, N_c); + + //----------------------------------------------------------------------------// + std::cout << "\n Running sequential matrix transpose ...\n"; + std::memset(At, 0, N_r * N_c * sizeof(int)); + + // + // The following policy carries out the transpose + // using sequential loops. + // + // _raja_mattranspose_start + using loop_policy_seq = RAJA::LoopPolicy; + using launch_policy_seq = RAJA::LaunchPolicy; + + RAJA::launch + (RAJA::LaunchParams(), //LaunchParams may be empty when running on the host + [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { + + RAJA::loop(ctx, row_Range, [&] (int /*row*/) { + RAJA::loop(ctx, col_Range, [&] (int /*col*/) { + + /// TODO... + /// + /// EXERCISE: Implement the kernel body for the transpose operation + /// + + }); + }); + + }); + // _raja_mattranspose_end + + checkResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); +//----------------------------------------------------------------------------// +#if defined(RAJA_ENABLE_OPENMP) + std::cout << "\n Running openmp matrix transpose - parallel top inner loop...\n"; + + std::memset(At, 0, N_r * N_c * sizeof(int)); + + // + // This policy loops sequentially while exposing parallelism on + // one of the inner loops. + + //uncomment to use in example below + //using loop_policy_omp = RAJA::LoopPolicy; + using launch_policy_omp = RAJA::LaunchPolicy; + + RAJA::launch(RAJA::LaunchParams(), //LaunchParams may be empty when running on the host + [=] RAJA_HOST_DEVICE (RAJA::LaunchContext /*ctx*/) { + + + /// TODO... + /// + /// EXERCISE: Implement the loops to apply omp parallism and sequential + /// execution on the column and row loops respectively + /// + + //Atview(col, row) = Aview(row, col); + + + }); + + checkResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); +#endif +//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_CUDA) + std::cout << "\n Running cuda matrix transpose ...\n"; + + std::memset(At, 0, N_r * N_c * sizeof(int)); + + // _raja_mattranspose_cuda_start + using cuda_thread_x = RAJA::LoopPolicy; + using cuda_thread_y = RAJA::LoopPolicy; + + const bool async = false; //execute asynchronously + using launch_policy_cuda = RAJA::LaunchPolicy>; + + RAJA::launch( + RAJA::LaunchParams(RAJA::Teams(1), RAJA::Threads(16,16)), + [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { + + RAJA::loop(ctx, row_Range, [&] (int row) { + RAJA::loop(ctx, col_Range, [&] (int col) { + + Atview(col, row) = Aview(row, col); + + }); + }); + + }); + // _raja_mattranspose_cuda_end + + checkResult(Atview, N_c, N_r); + //printResult(Atview, N_c, N_r); +#endif + +//----------------------------------------------------------------------------// + + // + // Clean up. + // + memoryManager::deallocate(A); + memoryManager::deallocate(At); + + std::cout << "\n DONE!...\n"; + + return 0; +} + +// +// Function to check result and report P/F. 
+// +template +void checkResult(RAJA::View> Atview, int N_r, int N_c) +{ + bool match = true; + for (int row = 0; row < N_r; ++row) { + for (int col = 0; col < N_c; ++col) { + if (Atview(row, col) != row) { + match = false; + } + } + } + if (match) { + std::cout << "\n\t result -- PASS\n"; + } else { + std::cout << "\n\t result -- FAIL\n"; + } +}; + +// +// Function to print result. +// +template +void printResult(RAJA::View> Atview, int N_r, int N_c) +{ + std::cout << std::endl; + for (int row = 0; row < N_r; ++row) { + for (int col = 0; col < N_c; ++col) { + // std::cout << "At(" << row << "," << col << ") = " << Atview(row, col) + // << std::endl; + std::cout< +#include +#include +#include + +#include "memoryManager.hpp" + +#include "RAJA/RAJA.hpp" + +/* + * Tiled Matrix Transpose Exercise + * + * In this exercise, an input matrix A of dimension N_r x N_c is + * transposed and returned as a second matrix At. + * + * RAJA features shown: + * - Basic usage of 'RAJA::launch' abstractions for nested loops + * + * If CUDA is enabled, CUDA unified memory is used. + */ + +// +// Define dimensionality of matrices +// +constexpr int DIM = 2; + +// +// Function for checking results +// +template +void checkResult(RAJA::View> Atview, int N_r, int N_c); + +// +// Function for printing results +// +template +void printResult(RAJA::View> Atview, int N_r, int N_c); + + +int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +{ + + std::cout << "\n\nRAJA matrix transpose exercise...\n"; + + // + // Define num rows/cols in matrix. + // + // _mattranspose_dims_start + constexpr int N_r = 56; + constexpr int N_c = 75; + // _mattranspose_dims_end + + // + // Allocate matrix data + // + int *A = memoryManager::allocate(N_r * N_c); + int *At = memoryManager::allocate(N_r * N_c); + + // + // In the following implementations of matrix transpose, we + // use RAJA 'View' objects to access the matrix data. A RAJA view + // holds a pointer to a data array and enables multi-dimensional indexing + // into the data. + // + // _mattranspose_views_start + RAJA::View> Aview(A, N_r, N_c); + RAJA::View> Atview(At, N_c, N_r); + // _mattranspose_views_end + + // + // Initialize matrix data + // + for (int row = 0; row < N_r; ++row) { + for (int col = 0; col < N_c; ++col) { + Aview(row, col) = col; + } + } + //printResult(Aview, N_r, N_c); + + //----------------------------------------------------------------------------// + std::cout << "\n Running C-version of matrix transpose...\n"; + + std::memset(At, 0, N_r * N_c * sizeof(int)); + + // _cstyle_mattranspose_start + for (int row = 0; row < N_r; ++row) { + for (int col = 0; col < N_c; ++col) { + Atview(col, row) = Aview(row, col); + } + } + // _cstyle_mattranspose_end + + checkResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); + + //----------------------------------------------------------------------------// + + // + // The following RAJA variants use the RAJA::kernel method to carryout the + // transpose. + // + // Here, we define RAJA range segments to establish the iteration spaces. + // Iterations inside a RAJA loop is given by their global iteration number. + // + RAJA::TypedRangeSegment row_Range(0, N_r); + RAJA::TypedRangeSegment col_Range(0, N_c); + + //----------------------------------------------------------------------------// + std::cout << "\n Running sequential matrix transpose ...\n"; + std::memset(At, 0, N_r * N_c * sizeof(int)); + + // + // The following policy carries out the transpose + // using sequential loops. 
+ // + // _raja_mattranspose_start + using loop_policy_seq = RAJA::LoopPolicy; + using launch_policy_seq = RAJA::LaunchPolicy; + + RAJA::launch( + RAJA::LaunchParams(), //LaunchParams may be empty when running on the host + [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { + + RAJA::loop(ctx, row_Range, [&] (int row) { + RAJA::loop(ctx, col_Range, [&] (int col) { + + Atview(col, row) = Aview(row, col); + + }); + }); + + }); + // _raja_mattranspose_end + + checkResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); +//----------------------------------------------------------------------------// +#if defined(RAJA_ENABLE_OPENMP) + std::cout << "\n Running openmp matrix transpose - parallel top inner loop...\n"; + + std::memset(At, 0, N_r * N_c * sizeof(int)); + + // + // This policy loops sequentially while exposing parallelism on + // one of the inner loops. + // + using loop_policy_omp = RAJA::LoopPolicy; + using launch_policy_omp = RAJA::LaunchPolicy; + + RAJA::launch( + RAJA::LaunchParams(), //LaunchParams may be empty when running on the host + [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { + + RAJA::loop(ctx, row_Range, [&] (int row) { + RAJA::loop(ctx, col_Range, [&] (int col) { + + Atview(col, row) = Aview(row, col); + + }); + }); + + }); + + checkResult(Atview, N_c, N_r); + // printResult(Atview, N_c, N_r); +#endif +//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_CUDA) + std::cout << "\n Running cuda matrix transpose ...\n"; + + std::memset(At, 0, N_r * N_c * sizeof(int)); + + // _raja_mattranspose_cuda_start + using cuda_thread_x = RAJA::LoopPolicy; + using cuda_thread_y = RAJA::LoopPolicy; + + const bool async = false; //execute asynchronously + using launch_policy_cuda = RAJA::LaunchPolicy>; + + RAJA::launch + (RAJA::LaunchParams(RAJA::Teams(1), RAJA::Threads(16,16)), + [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { + + RAJA::loop(ctx, row_Range, [&] (int row) { + RAJA::loop(ctx, col_Range, [&] (int col) { + + Atview(col, row) = Aview(row, col); + + }); + }); + + }); + // _raja_mattranspose_cuda_end + + checkResult(Atview, N_c, N_r); + //printResult(Atview, N_c, N_r); +#endif + +//----------------------------------------------------------------------------// + + // + // Clean up. + // + memoryManager::deallocate(A); + memoryManager::deallocate(At); + + std::cout << "\n DONE!...\n"; + + return 0; +} + +// +// Function to check result and report P/F. +// +template +void checkResult(RAJA::View> Atview, int N_r, int N_c) +{ + bool match = true; + for (int row = 0; row < N_r; ++row) { + for (int col = 0; col < N_c; ++col) { + if (Atview(row, col) != row) { + match = false; + } + } + } + if (match) { + std::cout << "\n\t result -- PASS\n"; + } else { + std::cout << "\n\t result -- FAIL\n"; + } +}; + +// +// Function to print result. +// +template +void printResult(RAJA::View> Atview, int N_r, int N_c) +{ + std::cout << std::endl; + for (int row = 0; row < N_r; ++row) { + for (int col = 0; col < N_c; ++col) { + // std::cout << "At(" << row << "," << col << ") = " << Atview(row, col) + // << std::endl; + std::cout< +#include +#include +#include +#include + +#include "RAJA/RAJA.hpp" + +#include "camp/resource.hpp" + +#include "memoryManager.hpp" + +/* + * RAJA::Launch execution policies + * + * In this exercise, you will use a variety of nested-loop execution + * policies to initalize entries in a three-dimensional tensor. 
The + * goal of the exercise is to gain familiarity with RAJA::Launch + * execution policies for various RAJA execution back-ends. + * + * RAJA features you will use: + * - `RAJA::Launch` kernel execution template method and exec policies + * - Simple RAJA View/Layout + * - RAJA Range segment + * + * If CUDA is enabled, CUDA unified memory is used. + * If HIP is enabled, HIP global device memory is used, with explicit + * host-device mem copy operations. + */ + +#if defined(RAJA_ENABLE_CUDA) +// _cuda_tensorinit_kernel_start +template< int i_block_size, int j_block_size, int k_block_size > +__launch_bounds__(i_block_size*j_block_size*k_block_size) +__global__ void nested_init(double* a, double c, int N) +{ + int i = blockIdx.x * i_block_size + threadIdx.x; + int j = blockIdx.y * j_block_size + threadIdx.y; + int k = blockIdx.z; + + if ( i < N && j < N && k < N ) { + a[i+N*(j+N*k)] = c * i * j * k ; + } +} +// _cuda_tensorinit_kernel_end +#endif + +// +// Function to check result. +// +void checkResult(double* a, double* aref, const int n); + + +int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +{ + + std::cout << "\n\nExercise: RAJA::kernel execution policies tensor init...\n"; + +// _init_define_start +// +// 3D tensor has N^3 entries +// + constexpr int N = 100; + constexpr int N_tot = N * N * N; + constexpr double c = 0.0001; + double* a = memoryManager::allocate(N_tot); + double* a_ref = memoryManager::allocate(N_tot); +// _init_define_end + +//----------------------------------------------------------------------------// +// C-style sequential variant establishes reference solution to compare with. +//----------------------------------------------------------------------------// + + std::cout << "\n Running C-style sequential tensor init: create reference solution ...\n"; + +// _cstyle_tensorinit_seq_start + for (int k = 0; k < N; ++k ) { + for (int j = 0; j < N; ++j ) { + for (int i = 0; i < N; ++i ) { + a_ref[i+N*(j+N*k)] = c * i * j * k ; + } + } + } +// _cstyle_tensorinit_seq_end + + +//----------------------------------------------------------------------------// +// We introduce a RAJA View to wrap the tensor data pointer and simplify +// multi-dimensional indexing. +// We use this in the rest of the examples in this file. +//----------------------------------------------------------------------------// + + std::cout << "\n Running C-style sequential tensor init...\n"; + +// _3D_raja_view_start + RAJA::View< double, RAJA::Layout<3, int> > aView(a, N, N, N); +// _3D_raja_view_end + +// _cstyle_tensorinit_view_seq_start + for (int k = 0; k < N; ++k ) { + for (int j = 0; j < N; ++j ) { + for (int i = 0; i < N; ++i ) { + aView(i, j, k) = c * i * j * k ; + } + } + } +// _cstyle_tensorinit_view_seq_end + + checkResult(a, a_ref, N_tot); + +//----------------------------------------------------------------------------// + + std::cout << "\n Running RAJA sequential tensor init...\n"; + + // set tensor data to zero to ensure we initializing it correctly. + std::memset(a, 0, N_tot * sizeof(double)); + + /// + /// TODO... + /// + /// EXERCISE: Complete sequential RAJA::launch based version of the + /// the tensor initialization kernel. 
+ /// + +// _raja_tensorinit_seq_start + //using loop_policy_1 = RAJA::LoopPolicy; + using launch_policy_1 = RAJA::LaunchPolicy; + + RAJA::launch + (RAJA::LaunchParams(), //LaunchParams may be empty when running on the host + [=] RAJA_HOST_DEVICE (RAJA::LaunchContext /*ctx*/) { + /* + RAJA::loop(ctx, RAJA::TypedRangeSegment(0, N), [&] (int k) { + + //Add additional loop methods to complete the kernel + + }); + */ + }); +// _raja_tensorinit_seq_end + + checkResult(a, a_ref, N_tot); + +#if defined(RAJA_ENABLE_OPENMP) + +//----------------------------------------------------------------------------// +// C-style and RAJA OpenMP multithreading variants. +//----------------------------------------------------------------------------// + + std::cout << "\n Running C-style OpenMP tensor init...\n"; + + // set tensor data to zero to ensure we initializing it correctly. + std::memset(a, 0, N_tot * sizeof(double)); + + // _cstyle_tensorinit_omp_outer_start + #pragma omp parallel for + for (int k = 0; k < N; ++k ) { + for (int j = 0; j < N; ++j ) { + for (int i = 0; i < N; ++i ) { + aView(i, j, k) = c * i * j * k ; + } + } + } +// _cstyle_tensorinit_omp_outer_end + + checkResult(a, a_ref, N_tot); + +//----------------------------------------------------------------------------// + + std::cout << "\n Running RAJA OpenMP tensor init...\n"; + + // set tensor data to zero to ensure we initializing it correctly. + std::memset(a, 0, N_tot * sizeof(double)); + + /// + /// TODO... + /// + /// EXERCISE: Complete an OpenMP RAJA::launch based version of the + /// kernel that creates a parallel outer loop. + /// + +// _raja_tensorinit_omp_outer_start + /* + using omp_policy_2 = RAJA::LoopPolicy; + using loop_policy_2 = RAJA::LoopPolicy; + */ + using launch_policy_2 = RAJA::LaunchPolicy; + + RAJA::launch + (RAJA::LaunchParams(), //LaunchParams may be empty when running on the host + [=] RAJA_HOST_DEVICE (RAJA::LaunchContext /*ctx*/) { + + //TODO: Use the omp_policy_2 to distribute loop iterations + //in a RAJA::loop method + /* + RAJA::loop(ctx, RAJA::TypedRangeSegment(0, N), [&] (int j) { + RAJA::loop(ctx, RAJA::TypedRangeSegment(0, N), [&] (int i) { + + + }); + }); + */ + + }); +// _raja_tensorinit_omp_outer_end + + checkResult(a, a_ref, N_tot); +#endif +//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_CUDA) + + // + // Define total thread-block size and size of each block dimension + // +// _cuda_blockdim_start + constexpr int block_size = 256; + constexpr int i_block_sz = 32; + constexpr int j_block_sz = block_size / i_block_sz; + constexpr int k_block_sz = 1; + + const int n_blocks_i = RAJA_DIVIDE_CEILING_INT(N,i_block_sz); + const int n_blocks_j = RAJA_DIVIDE_CEILING_INT(N,j_block_sz); + const int n_blocks_k = RAJA_DIVIDE_CEILING_INT(N,k_block_sz); +// _cuda_blockdim_end + +//----------------------------------------------------------------------------// +// C-style and RAJA CUDA GPU variants. +//----------------------------------------------------------------------------// + + std::cout << "\n Running RAJA CUDA tensor init...\n"; + + // set tensor data to zero to ensure we initializing it correctly. 
+ std::memset(a, 0, N_tot * sizeof(double)); + +// _raja_tensorinit_cuda_start + using cuda_teams_z_3 = RAJA::LoopPolicy; + using cuda_global_thread_y_3 = RAJA::LoopPolicy; + using cuda_global_thread_x_3 = RAJA::LoopPolicy; + + const bool async_3 = false; + using launch_policy_3 = RAJA::LaunchPolicy>; + + RAJA::launch + (RAJA::LaunchParams(RAJA::Teams(n_blocks_i ,n_blocks_j, n_blocks_k), + RAJA::Threads(i_block_sz, j_block_sz, k_block_sz)), + [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { + + RAJA::loop(ctx, RAJA::TypedRangeSegment(0, N), [&] (int k) { + RAJA::loop(ctx, RAJA::TypedRangeSegment(0, N), [&] (int j) { + RAJA::loop(ctx, RAJA::TypedRangeSegment(0, N), [&] (int i) { + + aView(i, j, k) = c * i * j * k ; + + }); + }); + }); + }); + +// _raja_tensorinit_cuda_end + + checkResult(a, a_ref, N_tot); + +//----------------------------------------------------------------------------// + + std::cout << "\n Running RAJA CUDA tensor init tiled-direct...\n"; + + // set tensor data to zero to ensure we initializing it correctly. + std::memset(a, 0, N_tot * sizeof(double)); + +// _raja_tensorinit_cuda_tiled_direct_start + using cuda_teams_z_4 = RAJA::LoopPolicy; + using cuda_teams_y_4 = RAJA::LoopPolicy; + using cuda_teams_x_4 = RAJA::LoopPolicy; + + using cuda_threads_y_4 = RAJA::LoopPolicy; + using cuda_threads_x_4 = RAJA::LoopPolicy; + + const bool async_4 = false; + using launch_policy_4 = RAJA::LaunchPolicy>; + + RAJA::launch + (RAJA::LaunchParams(RAJA::Teams(n_blocks_i, n_blocks_j, n_blocks_k), + RAJA::Threads(i_block_sz, j_block_sz, k_block_sz)), + [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { + + RAJA::loop(ctx, RAJA::TypedRangeSegment(0, N), [&] (int k) { + + RAJA::tile + (ctx, j_block_sz, RAJA::TypedRangeSegment(0, N), [&] (RAJA::TypedRangeSegment const &j_tile) { + + RAJA::tile + (ctx, i_block_sz, RAJA::TypedRangeSegment(0, N), [&] (RAJA::TypedRangeSegment const &i_tile) { + + RAJA::loop(ctx, j_tile, [&] (int j) { + RAJA::loop(ctx, i_tile, [&] (int i) { + + aView(i, j, k) = c * i * j * k ; + + }); + }); + + }); + }); + + }); + }); +// _raja_tensorinit_cuda_tiled_direct_end + + checkResult(a, a_ref, N_tot); + +//----------------------------------------------------------------------------// + + std::cout << "\n Running CUDA tensor init tiled-direct...\n"; + + // set tensor data to zero to ensure we initializing it correctly. 
+ std::memset(a, 0, N_tot * sizeof(double)); + +// _cuda_tensorinit_tiled_direct_start + dim3 nthreads_per_block(i_block_sz, j_block_sz, k_block_sz); + static_assert(i_block_sz*j_block_sz*k_block_sz == block_size, + "Invalid block_size"); + + dim3 nblocks(static_cast(RAJA_DIVIDE_CEILING_INT(N, i_block_sz)), + static_cast(RAJA_DIVIDE_CEILING_INT(N, j_block_sz)), + static_cast(RAJA_DIVIDE_CEILING_INT(N, k_block_sz))); + + nested_init + <<>>(a, c, N); + cudaErrchk( cudaGetLastError() ); + cudaErrchk(cudaDeviceSynchronize()); +// _cuda_tensorinit_tiled_direct_end + + checkResult(a, a_ref, N_tot); + +#endif // if defined(RAJA_ENABLE_CUDA) + + +#if defined(RAJA_ENABLE_HIP) + + // + // Define total thread-block size and size of each block dimension + // + constexpr int block_size = 256; + constexpr int i_block_sz = 32; + constexpr int j_block_sz = block_size / i_block_sz; + constexpr int k_block_sz = 1; + + const int n_blocks_i = RAJA_DIVIDE_CEILING_INT(N,i_block_sz); + const int n_blocks_j = RAJA_DIVIDE_CEILING_INT(N,j_block_sz); + const int n_blocks_k = RAJA_DIVIDE_CEILING_INT(N,k_block_sz); + +//----------------------------------------------------------------------------// +// RAJA HIP GPU variants. +//----------------------------------------------------------------------------// + + std::cout << "\n Running RAJA HIP tensor init...\n"; + + // set tensor data to zero to ensure we initializing it correctly. + std::memset(a, 0, N_tot * sizeof(double)); + double *d_a = memoryManager::allocate_gpu(N_tot); + +// _3D_raja_device_view_start + RAJA::View< double, RAJA::Layout<3, int> > d_aView(d_a, N, N, N); +// _3D_raja_deviceview_end + + hipErrchk(hipMemcpy( d_a, a, N_tot * sizeof(double), hipMemcpyHostToDevice )); + +// _raja_tensorinit_hip_start + using hip_teams_z_5 = RAJA::LoopPolicy; + using hip_global_thread_y_5 = RAJA::LoopPolicy; + using hip_global_thread_x_5 = RAJA::LoopPolicy; + + const bool async_5 = false; + using launch_policy_5 = RAJA::LaunchPolicy>; + + RAJA::launch + (RAJA::LaunchParams(RAJA::Teams(n_blocks_i, n_blocks_j, n_blocks_k), + RAJA::Threads(i_block_sz, j_block_sz, k_block_sz)), + [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { + + RAJA::loop(ctx, RAJA::TypedRangeSegment(0, N), [&] (int k) { + RAJA::loop(ctx, RAJA::TypedRangeSegment(0, N), [&] (int j) { + RAJA::loop(ctx, RAJA::TypedRangeSegment(0, N), [&] (int i) { + + d_aView(i, j, k) = c * i * j * k ; + + }); + }); + }); + + }); +// _raja_tensorinit_hip_end + + hipErrchk(hipMemcpy( a, d_a, N_tot * sizeof(double), hipMemcpyDeviceToHost )); + checkResult(a, a_ref, N_tot); + +//----------------------------------------------------------------------------// + + std::cout << "\n Running RAJA HIP tensor init tiled-direct...\n"; + + // set tensor data to zero to ensure we initializing it correctly. 
+ std::memset(a, 0, N_tot * sizeof(double)); + hipErrchk(hipMemcpy( d_a, a, N_tot * sizeof(double), hipMemcpyHostToDevice )); + +// _raja_tensorinit_hip_tiled_direct_start + using hip_teams_z_6 = RAJA::LoopPolicy; + using hip_teams_y_6 = RAJA::LoopPolicy; + using hip_teams_x_6 = RAJA::LoopPolicy; + + using hip_threads_y_6 = RAJA::LoopPolicy; + using hip_threads_x_6 = RAJA::LoopPolicy; + + const bool async_6 = false; + using launch_policy_6 = RAJA::LaunchPolicy>; + + RAJA::launch + (RAJA::LaunchParams(RAJA::Teams(n_blocks_i, n_blocks_j, n_blocks_k), + RAJA::Threads(i_block_sz, j_block_sz, k_block_sz)), + [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { + + RAJA::loop(ctx, RAJA::TypedRangeSegment(0, N), [&] (int k) { + + RAJA::tile + (ctx, j_block_sz, RAJA::TypedRangeSegment(0, N), [&] (RAJA::TypedRangeSegment const &j_tile) { + + RAJA::tile + (ctx, i_block_sz, RAJA::TypedRangeSegment(0, N), [&] (RAJA::TypedRangeSegment const &i_tile) { + + RAJA::loop(ctx, j_tile, [&] (int j) { + RAJA::loop(ctx, i_tile, [&] (int i) { + + d_aView(i, j, k) = c * i * j * k ; + + }); + }); + + }); + }); + + }); + }); +// _raja_tensorinit_hip_tiled_direct_end + + hipErrchk(hipMemcpy( a, d_a, N_tot * sizeof(double), hipMemcpyDeviceToHost )); + checkResult(a, a_ref, N_tot); + + memoryManager::deallocate_gpu(d_a); + +#endif // if defined(RAJA_ENABLE_HIP) + +//----------------------------------------------------------------------------// + + // Clean up... + memoryManager::deallocate(a); + memoryManager::deallocate(a_ref); + + std::cout << "\n DONE!...\n"; + + return 0; +} + +// +// Function to compare result to reference and print result P/F. +// +void checkResult(double* a, double* aref, const int n) +{ + bool correct = true; + + int i = 0; + while ( correct && (i < n) ) { + correct = std::abs(a[i] - aref[i]) < 10e-12; + i++; + } + + if ( correct ) { + std::cout << "\n\t result -- PASS\n"; + } else { + std::cout << "\n\t result -- FAIL\n"; + } +} diff --git a/exercises/launchintro-execpols_solution.cpp b/exercises/launchintro-execpols_solution.cpp new file mode 100644 index 0000000000..0dfda9f9f0 --- /dev/null +++ b/exercises/launchintro-execpols_solution.cpp @@ -0,0 +1,473 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-22, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include +#include +#include +#include +#include + +#include "RAJA/RAJA.hpp" + +#include "camp/resource.hpp" + +#include "memoryManager.hpp" + +/* + * RAJA::Launch execution policies + * + * In this exercise, you will use a variety of nested-loop execution + * policies to initalize entries in a three-dimensional tensor. The + * goal of the exercise is to gain familiarity with RAJA::Launch + * execution policies for various RAJA execution back-ends. + * + * RAJA features you will use: + * - `RAJA::Launch` kernel execution template method and exec policies + * - Simple RAJA View/Layout + * - RAJA Range segment + * + * If CUDA is enabled, CUDA unified memory is used. + * If HIP is enabled, HIP global device memory is used, with explicit + * host-device mem copy operations. 
+ */ + +#if defined(RAJA_ENABLE_CUDA) +// _cuda_tensorinit_kernel_start +template< int i_block_size, int j_block_size, int k_block_size > +__launch_bounds__(i_block_size*j_block_size*k_block_size) +__global__ void nested_init(double* a, double c, int N) +{ + int i = blockIdx.x * i_block_size + threadIdx.x; + int j = blockIdx.y * j_block_size + threadIdx.y; + int k = blockIdx.z; + + if ( i < N && j < N && k < N ) { + a[i+N*(j+N*k)] = c * i * j * k ; + } +} +// _cuda_tensorinit_kernel_end +#endif + +// +// Function to check result. +// +void checkResult(double* a, double* aref, const int n); + + +int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +{ + + std::cout << "\n\nExercise: RAJA::kernel execution policies tensor init...\n"; + +// _init_define_start +// +// 3D tensor has N^3 entries +// + constexpr int N = 100; + constexpr int N_tot = N * N * N; + constexpr double c = 0.0001; + double* a = memoryManager::allocate(N_tot); + double* a_ref = memoryManager::allocate(N_tot); +// _init_define_end + +//----------------------------------------------------------------------------// +// C-style sequential variant establishes reference solution to compare with. +//----------------------------------------------------------------------------// + + std::cout << "\n Running C-style sequential tensor init: create reference solution ...\n"; + +// _cstyle_tensorinit_seq_start + for (int k = 0; k < N; ++k ) { + for (int j = 0; j < N; ++j ) { + for (int i = 0; i < N; ++i ) { + a_ref[i+N*(j+N*k)] = c * i * j * k ; + } + } + } +// _cstyle_tensorinit_seq_end + + +//----------------------------------------------------------------------------// +// We introduce a RAJA View to wrap the tensor data pointer and simplify +// multi-dimensional indexing. +// We use this in the rest of the examples in this file. +//----------------------------------------------------------------------------// + + std::cout << "\n Running C-style sequential tensor init...\n"; + +// _3D_raja_view_start + RAJA::View< double, RAJA::Layout<3, int> > aView(a, N, N, N); +// _3D_raja_view_end + +// _cstyle_tensorinit_view_seq_start + for (int k = 0; k < N; ++k ) { + for (int j = 0; j < N; ++j ) { + for (int i = 0; i < N; ++i ) { + aView(i, j, k) = c * i * j * k ; + } + } + } +// _cstyle_tensorinit_view_seq_end + + checkResult(a, a_ref, N_tot); + +//----------------------------------------------------------------------------// + + std::cout << "\n Running RAJA sequential tensor init...\n"; + + // set tensor data to zero to ensure we initializing it correctly. + std::memset(a, 0, N_tot * sizeof(double)); + +// _raja_tensorinit_seq_start + using loop_policy_1 = RAJA::LoopPolicy; + using launch_policy_1 = RAJA::LaunchPolicy; + + RAJA::launch + (RAJA::LaunchParams(), //LaunchParams may be empty when running on the host + [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { + + RAJA::loop(ctx, RAJA::TypedRangeSegment(0, N), [&] (int k) { + RAJA::loop(ctx, RAJA::TypedRangeSegment(0, N), [&] (int j) { + RAJA::loop(ctx, RAJA::TypedRangeSegment(0, N), [&] (int i) { + + aView(i, j, k) = c * i * j * k ; + + }); + }); + }); + }); +// _raja_tensorinit_seq_end + + checkResult(a, a_ref, N_tot); + +#if defined(RAJA_ENABLE_OPENMP) + +//----------------------------------------------------------------------------// +// C-style and RAJA OpenMP multithreading variants. 
+//----------------------------------------------------------------------------// + + std::cout << "\n Running C-style OpenMP tensor init...\n"; + + // set tensor data to zero to ensure we initializing it correctly. + std::memset(a, 0, N_tot * sizeof(double)); + + // _cstyle_tensorinit_omp_outer_start + #pragma omp parallel for + for (int k = 0; k < N; ++k ) { + for (int j = 0; j < N; ++j ) { + for (int i = 0; i < N; ++i ) { + aView(i, j, k) = c * i * j * k ; + } + } + } +// _cstyle_tensorinit_omp_outer_end + + checkResult(a, a_ref, N_tot); + +//----------------------------------------------------------------------------// + + std::cout << "\n Running RAJA OpenMP tensor init...\n"; + + // set tensor data to zero to ensure we initializing it correctly. + std::memset(a, 0, N_tot * sizeof(double)); + +// _raja_tensorinit_omp_outer_start + using omp_policy_2 = RAJA::LoopPolicy; + using loop_policy_2 = RAJA::LoopPolicy; + using launch_policy_2 = RAJA::LaunchPolicy; + + RAJA::launch + (RAJA::LaunchParams(), //LaunchParams may be empty when running on the host + [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { + + RAJA::loop(ctx, RAJA::TypedRangeSegment(0, N), [&] (int k) { + RAJA::loop(ctx, RAJA::TypedRangeSegment(0, N), [&] (int j) { + RAJA::loop(ctx, RAJA::TypedRangeSegment(0, N), [&] (int i) { + + aView(i, j, k) = c * i * j * k ; + + }); + }); + }); + }); +// _raja_tensorinit_omp_outer_end + + checkResult(a, a_ref, N_tot); +#endif +//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_CUDA) + + // + // Define total thread-block size and size of each block dimension + // +// _cuda_blockdim_start + constexpr int block_size = 256; + constexpr int i_block_sz = 32; + constexpr int j_block_sz = block_size / i_block_sz; + constexpr int k_block_sz = 1; + + const int n_blocks_i = RAJA_DIVIDE_CEILING_INT(N,i_block_sz); + const int n_blocks_j = RAJA_DIVIDE_CEILING_INT(N,j_block_sz); + const int n_blocks_k = RAJA_DIVIDE_CEILING_INT(N,k_block_sz); +// _cuda_blockdim_end + +//----------------------------------------------------------------------------// +// C-style and RAJA CUDA GPU variants. +//----------------------------------------------------------------------------// + + std::cout << "\n Running RAJA CUDA tensor init...\n"; + + // set tensor data to zero to ensure we initializing it correctly. + std::memset(a, 0, N_tot * sizeof(double)); + +// _raja_tensorinit_cuda_start + using cuda_teams_z_3 = RAJA::LoopPolicy; + using cuda_global_thread_y_3 = RAJA::LoopPolicy; + using cuda_global_thread_x_3 = RAJA::LoopPolicy; + + const bool async_3 = false; + using launch_policy_3 = RAJA::LaunchPolicy>; + + RAJA::launch + (RAJA::LaunchParams(RAJA::Teams(n_blocks_i ,n_blocks_j, n_blocks_k), + RAJA::Threads(i_block_sz, j_block_sz, k_block_sz)), + [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { + + RAJA::loop(ctx, RAJA::TypedRangeSegment(0, N), [&] (int k) { + RAJA::loop(ctx, RAJA::TypedRangeSegment(0, N), [&] (int j) { + RAJA::loop(ctx, RAJA::TypedRangeSegment(0, N), [&] (int i) { + + aView(i, j, k) = c * i * j * k ; + + }); + }); + }); + }); + +// _raja_tensorinit_cuda_end + + checkResult(a, a_ref, N_tot); + +//----------------------------------------------------------------------------// + + std::cout << "\n Running RAJA CUDA tensor init tiled-direct...\n"; + + // set tensor data to zero to ensure we initializing it correctly. 
+ std::memset(a, 0, N_tot * sizeof(double)); + +// _raja_tensorinit_cuda_tiled_direct_start + using cuda_teams_z_4 = RAJA::LoopPolicy; + using cuda_teams_y_4 = RAJA::LoopPolicy; + using cuda_teams_x_4 = RAJA::LoopPolicy; + + using cuda_threads_y_4 = RAJA::LoopPolicy; + using cuda_threads_x_4 = RAJA::LoopPolicy; + + const bool async_4 = false; + using launch_policy_4 = RAJA::LaunchPolicy>; + + RAJA::launch + (RAJA::LaunchParams(RAJA::Teams(n_blocks_i, n_blocks_j, n_blocks_k), + RAJA::Threads(i_block_sz, j_block_sz, k_block_sz)), + [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { + + RAJA::loop(ctx, RAJA::TypedRangeSegment(0, N), [&] (int k) { + + RAJA::tile + (ctx, j_block_sz, RAJA::TypedRangeSegment(0, N), [&] (RAJA::TypedRangeSegment const &j_tile) { + + RAJA::tile + (ctx, i_block_sz, RAJA::TypedRangeSegment(0, N), [&] (RAJA::TypedRangeSegment const &i_tile) { + + RAJA::loop(ctx, j_tile, [&] (int j) { + RAJA::loop(ctx, i_tile, [&] (int i) { + + aView(i, j, k) = c * i * j * k ; + + }); + }); + + }); + }); + + }); + }); +// _raja_tensorinit_cuda_tiled_direct_end + + checkResult(a, a_ref, N_tot); + +//----------------------------------------------------------------------------// + + std::cout << "\n Running CUDA tensor init tiled-direct...\n"; + + // set tensor data to zero to ensure we initializing it correctly. + std::memset(a, 0, N_tot * sizeof(double)); + +// _cuda_tensorinit_tiled_direct_start + dim3 nthreads_per_block(i_block_sz, j_block_sz, k_block_sz); + static_assert(i_block_sz*j_block_sz*k_block_sz == block_size, + "Invalid block_size"); + + dim3 nblocks(static_cast(RAJA_DIVIDE_CEILING_INT(N, i_block_sz)), + static_cast(RAJA_DIVIDE_CEILING_INT(N, j_block_sz)), + static_cast(RAJA_DIVIDE_CEILING_INT(N, k_block_sz))); + + nested_init + <<>>(a, c, N); + cudaErrchk( cudaGetLastError() ); + cudaErrchk(cudaDeviceSynchronize()); +// _cuda_tensorinit_tiled_direct_end + + checkResult(a, a_ref, N_tot); + +#endif // if defined(RAJA_ENABLE_CUDA) + + +#if defined(RAJA_ENABLE_HIP) + + // + // Define total thread-block size and size of each block dimension + // + constexpr int block_size = 256; + constexpr int i_block_sz = 32; + constexpr int j_block_sz = block_size / i_block_sz; + constexpr int k_block_sz = 1; + + const int n_blocks_i = RAJA_DIVIDE_CEILING_INT(N,i_block_sz); + const int n_blocks_j = RAJA_DIVIDE_CEILING_INT(N,j_block_sz); + const int n_blocks_k = RAJA_DIVIDE_CEILING_INT(N,k_block_sz); + +//----------------------------------------------------------------------------// +// RAJA HIP GPU variants. +//----------------------------------------------------------------------------// + + std::cout << "\n Running RAJA HIP tensor init...\n"; + + // set tensor data to zero to ensure we initializing it correctly. 
+ std::memset(a, 0, N_tot * sizeof(double)); + double *d_a = memoryManager::allocate_gpu(N_tot); + +// _3D_raja_device_view_start + RAJA::View< double, RAJA::Layout<3, int> > d_aView(d_a, N, N, N); +// _3D_raja_deviceview_end + + hipErrchk(hipMemcpy( d_a, a, N_tot * sizeof(double), hipMemcpyHostToDevice )); + +// _raja_tensorinit_hip_start + using hip_teams_z_5 = RAJA::LoopPolicy; + using hip_global_thread_y_5 = RAJA::LoopPolicy; + using hip_global_thread_x_5 = RAJA::LoopPolicy; + + const bool async_5 = false; + using launch_policy_5 = RAJA::LaunchPolicy>; + + RAJA::launch + (RAJA::LaunchParams(RAJA::Teams(n_blocks_i, n_blocks_j, n_blocks_k), + RAJA::Threads(i_block_sz, j_block_sz, k_block_sz)), + [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { + + RAJA::loop(ctx, RAJA::TypedRangeSegment(0, N), [&] (int k) { + RAJA::loop(ctx, RAJA::TypedRangeSegment(0, N), [&] (int j) { + RAJA::loop(ctx, RAJA::TypedRangeSegment(0, N), [&] (int i) { + + d_aView(i, j, k) = c * i * j * k ; + + }); + }); + }); + + }); +// _raja_tensorinit_hip_end + + hipErrchk(hipMemcpy( a, d_a, N_tot * sizeof(double), hipMemcpyDeviceToHost )); + checkResult(a, a_ref, N_tot); + +//----------------------------------------------------------------------------// + + std::cout << "\n Running RAJA HIP tensor init tiled-direct...\n"; + + // set tensor data to zero to ensure we initializing it correctly. + std::memset(a, 0, N_tot * sizeof(double)); + hipErrchk(hipMemcpy( d_a, a, N_tot * sizeof(double), hipMemcpyHostToDevice )); + +// _raja_tensorinit_hip_tiled_direct_start + using hip_teams_z_6 = RAJA::LoopPolicy; + using hip_teams_y_6 = RAJA::LoopPolicy; + using hip_teams_x_6 = RAJA::LoopPolicy; + + using hip_threads_y_6 = RAJA::LoopPolicy; + using hip_threads_x_6 = RAJA::LoopPolicy; + + const bool async_6 = false; + using launch_policy_6 = RAJA::LaunchPolicy>; + + RAJA::launch + (RAJA::LaunchParams(RAJA::Teams(n_blocks_i, n_blocks_j, n_blocks_k), + RAJA::Threads(i_block_sz, j_block_sz, k_block_sz)), + [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { + + RAJA::loop(ctx, RAJA::TypedRangeSegment(0, N), [&] (int k) { + + RAJA::tile + (ctx, j_block_sz, RAJA::TypedRangeSegment(0, N), [&] (RAJA::TypedRangeSegment const &j_tile) { + + RAJA::tile + (ctx, i_block_sz, RAJA::TypedRangeSegment(0, N), [&] (RAJA::TypedRangeSegment const &i_tile) { + + RAJA::loop(ctx, j_tile, [&] (int j) { + RAJA::loop(ctx, i_tile, [&] (int i) { + + d_aView(i, j, k) = c * i * j * k ; + + }); + }); + + }); + }); + + }); + }); +// _raja_tensorinit_hip_tiled_direct_end + + hipErrchk(hipMemcpy( a, d_a, N_tot * sizeof(double), hipMemcpyDeviceToHost )); + checkResult(a, a_ref, N_tot); + + memoryManager::deallocate_gpu(d_a); + +#endif // if defined(RAJA_ENABLE_HIP) + +//----------------------------------------------------------------------------// + + // Clean up... + memoryManager::deallocate(a); + memoryManager::deallocate(a_ref); + + std::cout << "\n DONE!...\n"; + + return 0; +} + +// +// Function to compare result to reference and print result P/F. 
+// +void checkResult(double* a, double* aref, const int n) +{ + bool correct = true; + + int i = 0; + while ( correct && (i < n) ) { + correct = std::abs(a[i] - aref[i]) < 10e-12; + i++; + } + + if ( correct ) { + std::cout << "\n\t result -- PASS\n"; + } else { + std::cout << "\n\t result -- FAIL\n"; + } +} diff --git a/exercises/memoryManager.hpp b/exercises/memoryManager.hpp new file mode 100644 index 0000000000..ef0a430f1b --- /dev/null +++ b/exercises/memoryManager.hpp @@ -0,0 +1,101 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-22, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#ifndef EXAMPLES_MEMORYMANAGER_HPP +#define EXAMPLES_MEMORYMANAGER_HPP + +#include "RAJA/RAJA.hpp" + +#if defined(RAJA_ENABLE_CUDA) +#include "RAJA/policy/cuda/raja_cudaerrchk.hpp" +#endif + +#if defined(RAJA_ENABLE_HIP) +#include "RAJA/policy/hip/raja_hiperrchk.hpp" +#endif + +/* + As RAJA does not manage memory we include a general purpose memory + manager which may be used to perform c++ style allocation/deallocation + or allocate/deallocate CUDA unified memory. The type of memory allocated + is dependent on how RAJA was configured. +*/ +namespace memoryManager +{ + +#if defined(RAJA_ENABLE_SYCL) + static camp::resources::Resource* sycl_res; +#endif + +template +T *allocate(RAJA::Index_type size) +{ + T *ptr; +#if defined(RAJA_ENABLE_CUDA) + cudaErrchk( + cudaMallocManaged((void **)&ptr, sizeof(T) * size, cudaMemAttachGlobal)); +#elif defined(RAJA_ENABLE_HIP) + hipErrchk(hipMalloc((void **)&ptr, sizeof(T) * size)); +#elif defined(RAJA_ENABLE_SYCL) + ptr = sycl_res->allocate(size, camp::resources::MemoryAccess::Managed); +#else + ptr = new T[size]; +#endif + return ptr; +} + +template +void deallocate(T *&ptr) +{ + if (ptr) { +#if defined(RAJA_ENABLE_CUDA) + cudaErrchk(cudaFree(ptr)); +#elif defined(RAJA_ENABLE_HIP) + hipErrchk(hipFree(ptr)); +#elif defined(RAJA_ENABLE_SYCL) + sycl_res->deallocate(ptr); +#else + delete[] ptr; +#endif + ptr = nullptr; + } +} + +#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) || defined(RAJA_ENABLE_SYCL) + template + T *allocate_gpu(RAJA::Index_type size) + { + T *ptr; +#if defined(RAJA_ENABLE_CUDA) + cudaErrchk(cudaMalloc((void **)&ptr, sizeof(T) * size)); +#elif defined(RAJA_ENABLE_HIP) + hipErrchk(hipMalloc((void **)&ptr, sizeof(T) * size)); +#elif defined(RAJA_ENABLE_SYCL) + auto qu = sycl_res->get().get_queue(); + ptr = cl::sycl::malloc_device(size, *qu); +#endif + return ptr; + } + + template + void deallocate_gpu(T *&ptr) + { + if (ptr) { +#if defined(RAJA_ENABLE_CUDA) + cudaErrchk(cudaFree(ptr)); +#elif defined(RAJA_ENABLE_HIP) + hipErrchk(hipFree(ptr)); +#elif defined(RAJA_ENABLE_SYCL) + sycl_res->deallocate(ptr); +#endif + ptr = nullptr; + } + } +#endif + +}; // namespace memoryManager +#endif diff --git a/exercises/offset-layout-stencil.cpp b/exercises/offset-layout-stencil.cpp new file mode 100644 index 0000000000..e12fc0a268 --- /dev/null +++ b/exercises/offset-layout-stencil.cpp @@ -0,0 +1,399 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-22, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include +#include +#include +#include + +#include "RAJA/RAJA.hpp" +#include "memoryManager.hpp" + +/* + * Offset Layout Stencil Exercise + * + * This exercise applies a five-point stencil to the interior cells of a + * lattice and stores the resulting sums in a second lattice of equal size. + * You can think of the lattice as representing the centers of cells on a + * two-dimensional Cartesian mesh. + * + * The five-point stencil accumulates values of a cell and its four neighbors. + * Assuming the cells of a lattice may be accessed through a row/col fashion, + * the stencil may be expressed as the following sum: + * + * output(row, col) = input(row, col) + + * input(row - 1, col) + input(row + 1, col) + + * input(row, col - 1) + input(row, col + 1) + * + * We assume a lattice has N_r x N_c interior nodes and a padded edge of zeros + * for a lattice of size (N_r + 2) x (N_c + 2). + * + * In the case of N_r = N_c = 3, the input lattice values are: + * + * --------------------- + * | 0 | 0 | 0 | 0 | 0 | + * --------------------- + * | 0 | 1 | 1 | 1 | 0 | + * --------------------- + * | 0 | 1 | 1 | 1 | 0 | + * --------------------- + * | 0 | 1 | 1 | 1 | 0 | + * --------------------- + * | 0 | 0 | 0 | 0 | 0 | + * --------------------- + * + * after the computation, we expect the output lattice to have values: + * + * --------------------- + * | 0 | 0 | 0 | 0 | 0 | + * --------------------- + * | 0 | 3 | 4 | 3 | 0 | + * --------------------- + * | 0 | 4 | 5 | 4 | 0 | + * --------------------- + * | 0 | 3 | 4 | 3 | 0 | + * --------------------- + * | 0 | 0 | 0 | 0 | 0 | + * --------------------- + * + * In this exercise, we use RAJA::OffsetLayout and RAJA::View objects to + * simplify the indexing to perform the stencil calculation. For the + * purposes of discussion, we enumerate the lattice in the following manner: + * + * -------------------------------------------------- + * | (-1, 3) | (0, 3) | (1, 3) | (2, 3) | (3, 3) | + * -------------------------------------------------- + * | (-1, 2) | (0, 2) | (1, 2) | (2, 2) | (3, 2) | + * -------------------------------------------------- + * | (-1, 1) | (0, 1) | (1, 1) | (2, 1) | (3, 1) | + * -------------------------------------------------- + * | (-1, 0) | (0, 0) | (1, 0) | (2, 0) | (3, 0) | + * --------------------------------------------------- + * | (-1,-1) | (0, -1) | (1, -1) | (2, -1) | (3, -1) | + * --------------------------------------------------- + * + * Notably (0, 0) corresponds to the bottom left corner of the stencil + * interior region to which we apply stencil. + * + * RAJA features shown: + * - RAJA::kernel kernel execution method and execution policies + * - RAJA::View + * - RAJA::Layout + * + * For the CUDA implementation, we use unified memory to hold the lattice data. + * For HIP, we use explicit host-device memory and manually copy data between + * the two. 
+ */ + +/* + * Define number of threads in x and y dimensions of a GPU thread block + */ +#if defined(RAJA_ENABLE_CUDA) +#define CUDA_BLOCK_SIZE 16 +#endif + +#if defined(RAJA_ENABLE_HIP) +#define HIP_BLOCK_SIZE 16 +#endif + +// +// Functions for printing and checking results +// +void printLattice(int* lattice, int N_r, int N_c); +void checkResult(int* compLattice, int* refLattice, int totCells); + +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) +{ + + std::cout << "\n\nFive-point stencil example...\n"; + +// _stencil_define_start +// +// Define num of interior cells in row/cols in a lattice +// + constexpr int N_r = 5; + constexpr int N_c = 4; + +// +// Define total num of cells in rows/cols in a lattice +// + constexpr int totCellsInRow = N_r + 2; + constexpr int totCellsInCol = N_c + 2; + +// +// Define total num of cells in a lattice +// + constexpr int totCells = totCellsInRow * totCellsInCol; +// _stencil_define_end + +// +// Allocate and initialize lattice +// + int* input = memoryManager::allocate(totCells * sizeof(int)); + int* output = memoryManager::allocate(totCells * sizeof(int)); + int* output_ref = memoryManager::allocate(totCells * sizeof(int)); + + std::memset(input, 0, totCells * sizeof(int)); + std::memset(output, 0, totCells * sizeof(int)); + std::memset(output_ref, 0, totCells * sizeof(int)); + +// +// C-Style intialization +// +// _stencil_input_init_start + for (int row = 1; row <= N_r; ++row) { + for (int col = 1; col <= N_c; ++col) { + int id = col + totCellsInCol * row; + input[id] = 1; + } + } +// _stencil_input_init_end + + std::cout << "\ninput lattice:\n"; + printLattice(input, totCellsInRow, totCellsInCol); + +// +// Generate reference solution +// +// _stencil_output_ref_start + for (int row = 1; row <= N_r; ++row) { + for (int col = 1; col <= N_c; ++col) { + + int id = col + totCellsInCol * row; + output_ref[id] = input[id] + input[id + 1] + + input[id - 1] + + input[id + totCellsInCol] + + input[id - totCellsInCol]; + } + } +// _stencil_output_ref_end + + std::cout << "\noutput reference lattice:\n"; + printLattice(output_ref, totCellsInRow, totCellsInCol); + +//----------------------------------------------------------------------------// + +// +// The following code illustrates pairing an offset layout and a RAJA view +// object to simplify multidimensional indexing. +// An offset layout is constructed by using the make_offset_layout method. +// The first argument of the layout is an array object with the coordinates of +// the bottom left corner of the lattice, and the second argument is an array +// object of the coordinates of the top right corner plus 1. +// The example uses double braces to initiate the array object and its +// subobjects. 
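The layout and view declarations that follow appear in this listing with their template arguments stripped. Restored with the arguments one would normally write, they look roughly like the sketch below; the <DIM> rank and the int element type are assumptions based on how the views are used in this file.

    const int DIM = 2;

    RAJA::OffsetLayout<DIM> layout =
        RAJA::make_offset_layout<DIM>({{-1, -1}}, {{N_r+1, N_c+1}});

    RAJA::View<int, RAJA::OffsetLayout<DIM>> inputView(input, layout);
    RAJA::View<int, RAJA::OffsetLayout<DIM>> outputView(output, layout);

With this offset layout, inputView(0, 0) refers to the bottom-left interior cell and inputView(-1, -1) to the halo corner, so the stencil can reference its four neighbors directly without manual offset arithmetic.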
+// + // _offsetlayout_views_start + const int DIM = 2; + + RAJA::OffsetLayout layout = + RAJA::make_offset_layout({{-1, -1}}, {{N_r+1, N_c+1}}); + + RAJA::View> inputView(input, layout); + RAJA::View> outputView(output, layout); + // _offsetlayout_views_end + +// +// Create range segments used in kernels +// + // _offsetlayout_ranges_start + RAJA::TypedRangeSegment col_range(0, N_c); + RAJA::TypedRangeSegment row_range(0, N_r); + // _offsetlayout_ranges_end + +//----------------------------------------------------------------------------// + + std::cout << "\n Running five-point stencil (RAJA-Kernel sequential)...\n"; + + std::memset(output, 0, totCells * sizeof(int)); + + // _offsetlayout_rajaseq_start + using NESTED_EXEC_POL1 = + RAJA::KernelPolicy< + RAJA::statement::For<1, RAJA::loop_exec, // row + RAJA::statement::For<0, RAJA::loop_exec, // col + RAJA::statement::Lambda<0> + > + > + >; + + RAJA::kernel(RAJA::make_tuple(col_range, row_range), + [=](int col, int row) { + + outputView(row, col) = + inputView(row, col) + + inputView(row - 1, col) + + inputView(row + 1, col) + + inputView(row, col - 1) + + inputView(row, col + 1); + + }); + // _offsetlayout_rajaseq_end + + std::cout << "\noutput lattice:\n"; + printLattice(output, totCellsInRow, totCellsInCol); + checkResult(output, output_ref, totCells); + +//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_OPENMP) + + std::cout << "\n Running five-point stencil (RAJA-Kernel OpenMP)...\n"; + + std::memset(output, 0, totCells * sizeof(int)); + + /// + /// TODO... + /// + /// EXERCISE: Implement an OpenMP RAJA::kernel based version of the + /// the stencil operation where you collapse both loops to + /// parallelize the entire computation. Hint: recall the + /// kernelintro-execpols.cpp exercise file used in an + /// earlier tutorial section. 
+ /// + + std::cout << "\noutput lattice:\n"; + printLattice(output, totCellsInRow, totCellsInCol); + checkResult(output, output_ref, totCells); +#endif + +//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_CUDA) + + std::cout << "\n Running five-point stencil (RAJA-Kernel CUDA)...\n"; + + std::memset(output, 0, totCells * sizeof(int)); + + // _offsetlayout_rajacuda_start + using NESTED_EXEC_POL3 = + RAJA::KernelPolicy< + RAJA::statement::CudaKernel< + RAJA::statement::For<1, RAJA::cuda_block_x_loop, //row + RAJA::statement::For<0, RAJA::cuda_thread_x_loop, //col + RAJA::statement::Lambda<0> + > + > + > + >; + + RAJA::kernel(RAJA::make_tuple(col_range, row_range), + [=] RAJA_DEVICE(int col, int row) { + + outputView(row, col) = + inputView(row, col) + + inputView(row - 1, col) + + inputView(row + 1, col) + + inputView(row, col - 1) + + inputView(row, col + 1); + + }); + // _offsetlayout_rajacuda_end + + std::cout << "\noutput lattice:\n"; + printLattice(output, totCellsInRow, totCellsInCol); + checkResult(output, output_ref, totCells); +#endif + +//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_HIP) + + std::cout << "\n Running five-point stencil (RAJA-Kernel - " + "hip)...\n"; + + int* d_input = memoryManager::allocate_gpu(totCells * sizeof(int)); + int* d_output = memoryManager::allocate_gpu(totCells * sizeof(int)); + + hipErrchk(hipMemcpy( d_input, input, totCells * sizeof(int), hipMemcpyHostToDevice )); + + RAJA::View> d_inputView (d_input, layout); + RAJA::View> d_outputView(d_output, layout); + + // _offsetlayout_rajahip_start + using NESTED_EXEC_POL4 = + RAJA::KernelPolicy< + RAJA::statement::HipKernel< + RAJA::statement::For<1, RAJA::hip_block_x_loop, //row + RAJA::statement::For<0, RAJA::hip_thread_x_loop, //col + RAJA::statement::Lambda<0> + > + > + > + >; + + RAJA::kernel(RAJA::make_tuple(col_range, row_range), + [=] RAJA_DEVICE(int col, int row) { + + d_outputView(row, col) = + d_inputView(row, col) + + d_inputView(row - 1, col) + + d_inputView(row + 1, col) + + d_inputView(row, col - 1) + + d_inputView(row, col + 1); + }); + // _offsetlayout_rajahip_end + + hipErrchk(hipMemcpy( output, d_output, totCells * sizeof(int), hipMemcpyDeviceToHost )); + + std::cout << "\noutput lattice:\n"; + printLattice(output, totCellsInRow, totCellsInCol); + checkResult(output, output_ref, totCells); + + memoryManager::deallocate_gpu(d_input); + memoryManager::deallocate_gpu(d_output); +#endif + +//----------------------------------------------------------------------------// + +// +// Clean up. 
+// + memoryManager::deallocate(input); + memoryManager::deallocate(output); + memoryManager::deallocate(output_ref); + + std::cout << "\n DONE!...\n"; + return 0; +} + +// +// Print Lattice +// +void printLattice(int* lattice, int totCellsInRow, int totCellsInCol) +{ + std::cout << std::endl; + for (int row = 0; row < totCellsInRow; ++row) { + for (int col = 0; col < totCellsInCol; ++col) { + + const int id = col + totCellsInCol * row; + std::cout << lattice[id] << " "; + } + std::cout << " " << std::endl; + } + std::cout << std::endl; +} + +// +// Check Result +// +void checkResult(int* compLattice, int* refLattice, int totCells) +{ + bool correct = true; + + int i = 0; + while ( correct && (i < totCells) ) { + correct = (compLattice[i] == refLattice[i]); + i++; + } + + if ( correct ) { + std::cout << "\n\t result -- PASS\n"; + } else { + std::cout << "\n\t result -- FAIL\n"; + } +} diff --git a/examples/tut_offset-layout.cpp b/exercises/offset-layout-stencil_solution.cpp similarity index 67% rename from examples/tut_offset-layout.cpp rename to exercises/offset-layout-stencil_solution.cpp index f5d4befa3a..a9abc8bcb1 100644 --- a/examples/tut_offset-layout.cpp +++ b/exercises/offset-layout-stencil_solution.cpp @@ -14,27 +14,25 @@ #include "memoryManager.hpp" /* - * Offset Layout example + * Offset Layout Stencil Exercise * - * This example applies a five-cell stencil to the - * interior cells of a lattice and stores the - * resulting sums in a second lattice of equal size. + * This exercise applies a five-point stencil to the interior cells of a + * lattice and stores the resulting sums in a second lattice of equal size. + * You can think of the lattice as representing the centers of cells on a + * two-dimensional Cartesian mesh. * - * The five-cell stencil accumulates values of a cell - * and its four neighbors. Assuming the cells of a - * lattice may be accessed through a row/col fashion, - * the stencil may be expressed as the following sum + * The five-point stencil accumulates values of a cell and its four neighbors. + * Assuming the cells of a lattice may be accessed through a row/col fashion, + * the stencil may be expressed as the following sum: * * output(row, col) = input(row, col) + * input(row - 1, col) + input(row + 1, col) + * input(row, col - 1) + input(row, col + 1) * - * We assume a lattice has N x N interior nodes - * and a padded edge of zeros for a lattice - * of size (N_r + 2) x (N_c + 2). + * We assume a lattice has N_r x N_c interior nodes and a padded edge of zeros + * for a lattice of size (N_r + 2) x (N_c + 2). * - * In the case of N = 3, the input lattice generated - * takes the form + * In the case of N_r = N_c = 3, the input lattice values are: * * --------------------- * | 0 | 0 | 0 | 0 | 0 | @@ -48,8 +46,7 @@ * | 0 | 0 | 0 | 0 | 0 | * --------------------- * - * after the computation, we expect the output - * lattice to take the form + * after the computation, we expect the output lattice to have values: * * --------------------- * | 0 | 0 | 0 | 0 | 0 | @@ -63,13 +60,9 @@ * | 0 | 0 | 0 | 0 | 0 | * --------------------- * - * In this example, we use RAJA's make_offset_layout - * method and view object to simplify applying - * the stencil to interior cells. - * The make_offset_layout method enables developers - * to create layouts which offset - * the enumeration of values in an array. 
Here we - * choose to enumerate the lattice in the following manner: + * In this exercise, we use RAJA::OffsetLayout and RAJA::View objects to + * simplify the indexing to perform the stencil calculation. For the + * purposes of discussion, we enumerate the lattice in the following manner: * * -------------------------------------------------- * | (-1, 3) | (0, 3) | (1, 3) | (2, 3) | (3, 3) | @@ -83,18 +76,22 @@ * | (-1,-1) | (0, -1) | (1, -1) | (2, -1) | (3, -1) | * --------------------------------------------------- * - * Notably (0, 0) corresponds to the bottom left - * corner of the region to which we wish to apply stencil. + * Notably (0, 0) corresponds to the bottom left corner of the stencil + * interior region to which we apply stencil. * * RAJA features shown: - * - `forall` loop iteration template method - * - Offset-layouts for RAJA Views - * - Index range segment - * - Execution policies + * - RAJA::kernel kernel execution method and execution policies + * - RAJA::View + * - RAJA::OffsetLayout + * - RAJA::make_offset_layout method + * + * For the CUDA implementation, we use unified memory to hold the lattice data. + * For HIP, we use explicit host-device memory and manually copy data between + * the two. */ /* - * Define number of threads in x and y dimensions of a CUDA thread block + * Define number of threads in x and y dimensions of a GPU thread block */ #if defined(RAJA_ENABLE_CUDA) #define CUDA_BLOCK_SIZE 16 @@ -113,24 +110,26 @@ void checkResult(int* compLattice, int* refLattice, int totCells); int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) { - std::cout << "\n\nRAJA five-cell stencil example...\n"; + std::cout << "\n\nFive-point stencil example...\n"; +// _stencil_define_start // // Define num of interior cells in row/cols in a lattice // - const int N_r = 3; - const int N_c = 3; + constexpr int N_r = 5; + constexpr int N_c = 4; // // Define total num of cells in rows/cols in a lattice // - const int totCellsInRow = N_r + 2; - const int totCellsInCol = N_c + 2; + constexpr int totCellsInRow = N_r + 2; + constexpr int totCellsInCol = N_c + 2; // // Define total num of cells in a lattice // - const int totCells = totCellsInRow * totCellsInCol; + constexpr int totCells = totCellsInRow * totCellsInCol; +// _stencil_define_end // // Allocate and initialize lattice @@ -146,17 +145,22 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) // // C-Style intialization // +// _stencil_input_init_start for (int row = 1; row <= N_r; ++row) { for (int col = 1; col <= N_c; ++col) { int id = col + totCellsInCol * row; input[id] = 1; } } -// printLattice(input, totCellsInRow, totCellsInCol); +// _stencil_input_init_end + + std::cout << "\ninput lattice:\n"; + printLattice(input, totCellsInRow, totCellsInCol); // // Generate reference solution // +// _stencil_output_ref_start for (int row = 1; row <= N_r; ++row) { for (int col = 1; col <= N_c; ++col) { @@ -167,7 +171,10 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) + input[id - totCellsInCol]; } } -// printLattice(output_ref, totCellsInRow, totCellsInCol); +// _stencil_output_ref_end + + std::cout << "\noutput reference lattice:\n"; + printLattice(output_ref, totCellsInRow, totCellsInCol); //----------------------------------------------------------------------------// @@ -184,31 +191,32 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) // _offsetlayout_views_start const int DIM = 2; - RAJA::OffsetLayout layout = - RAJA::make_offset_layout({{-1, -1}}, 
{{N_r+1, N_c+1}}); + RAJA::OffsetLayout layout = + RAJA::make_offset_layout({{-1, -1}}, {{N_r+1, N_c+1}}); - RAJA::View> inputView(input, layout); - RAJA::View> outputView(output, layout); + RAJA::View> inputView(input, layout); + RAJA::View> outputView(output, layout); // _offsetlayout_views_end // // Create range segments used in kernels // // _offsetlayout_ranges_start - RAJA::RangeSegment col_range(0, N_r); - RAJA::RangeSegment row_range(0, N_c); + RAJA::TypedRangeSegment col_range(0, N_c); + RAJA::TypedRangeSegment row_range(0, N_r); // _offsetlayout_ranges_end //----------------------------------------------------------------------------// - std::cout << "\n Running five-cell stencil (RAJA-Kernel - " - "sequential)...\n"; + std::cout << "\n Running five-point stencil (RAJA-Kernel sequential)...\n"; + + std::memset(output, 0, totCells * sizeof(int)); // _offsetlayout_rajaseq_start using NESTED_EXEC_POL1 = RAJA::KernelPolicy< - RAJA::statement::For<1, RAJA::seq_exec, // row - RAJA::statement::For<0, RAJA::seq_exec, // col + RAJA::statement::For<1, RAJA::loop_exec, // row + RAJA::statement::For<0, RAJA::loop_exec, // col RAJA::statement::Lambda<0> > > @@ -223,25 +231,28 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) + inputView(row + 1, col) + inputView(row, col - 1) + inputView(row, col + 1); + }); // _offsetlayout_rajaseq_end - //printLattice(output_ref, totCellsInRow, totCellsInCol); + std::cout << "\noutput lattice:\n"; + printLattice(output, totCellsInRow, totCellsInCol); checkResult(output, output_ref, totCells); //----------------------------------------------------------------------------// #if defined(RAJA_ENABLE_OPENMP) - std::cout << "\n Running five-cell stencil (RAJA-Kernel - omp " - "parallel for)...\n"; + std::cout << "\n Running five-point stencil (RAJA-Kernel OpenMP)...\n"; + + std::memset(output, 0, totCells * sizeof(int)); + // _offsetlayout_rajaomp_start using NESTED_EXEC_POL2 = RAJA::KernelPolicy< - RAJA::statement::For<1, RAJA::omp_parallel_for_exec, // row - RAJA::statement::For<0, RAJA::seq_exec, // col - RAJA::statement::Lambda<0> - > + RAJA::statement::Collapse, // row, col + RAJA::statement::Lambda<0> > >; @@ -254,9 +265,12 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) + inputView(row + 1, col) + inputView(row, col - 1) + inputView(row, col + 1); + }); + // _offsetlayout_rajaomp_end - //printLattice(output_ref, totCellsInRow, totCellsInCol); + std::cout << "\noutput lattice:\n"; + printLattice(output, totCellsInRow, totCellsInCol); checkResult(output, output_ref, totCells); #endif @@ -264,9 +278,11 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) #if defined(RAJA_ENABLE_CUDA) - std::cout << "\n Running five-cell stencil (RAJA-Kernel - " - "cuda)...\n"; + std::cout << "\n Running five-point stencil (RAJA-Kernel CUDA)...\n"; + + std::memset(output, 0, totCells * sizeof(int)); + // _offsetlayout_rajacuda_start using NESTED_EXEC_POL3 = RAJA::KernelPolicy< RAJA::statement::CudaKernel< @@ -287,9 +303,12 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) + inputView(row + 1, col) + inputView(row, col - 1) + inputView(row, col + 1); + }); + // _offsetlayout_rajacuda_end - //printLattice(output, totCellsInRow, totCellsInCol); + std::cout << "\noutput lattice:\n"; + printLattice(output, totCellsInRow, totCellsInCol); checkResult(output, output_ref, totCells); #endif @@ -297,18 +316,22 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) #if 
defined(RAJA_ENABLE_HIP) - std::cout << "\n Running five-cell stencil (RAJA-Kernel - " + std::cout << "\n Running five-point stencil (RAJA-Kernel - " "hip)...\n"; - int* d_input = memoryManager::allocate_gpu(totCells * sizeof(int)); - int* d_output = memoryManager::allocate_gpu(totCells * sizeof(int)); + std::memset(output, 0, totCells * sizeof(int)); + + int* d_input = memoryManager::allocate_gpu(totCells); + int* d_output = memoryManager::allocate_gpu(totCells); hipErrchk(hipMemcpy( d_input, input, totCells * sizeof(int), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy( d_output, output, totCells * sizeof(int), hipMemcpyHostToDevice )); - RAJA::View> d_inputView (d_input, layout); - RAJA::View> d_outputView(d_output, layout); + RAJA::View> d_inputView (d_input, layout); + RAJA::View> d_outputView(d_output, layout); - using NESTED_EXEC_POL3 = + // _offsetlayout_rajahip_start + using NESTED_EXEC_POL4 = RAJA::KernelPolicy< RAJA::statement::HipKernel< RAJA::statement::For<1, RAJA::hip_block_x_loop, //row @@ -319,7 +342,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) > >; - RAJA::kernel(RAJA::make_tuple(col_range, row_range), + RAJA::kernel(RAJA::make_tuple(col_range, row_range), [=] RAJA_DEVICE(int col, int row) { d_outputView(row, col) = @@ -329,10 +352,12 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) + d_inputView(row, col - 1) + d_inputView(row, col + 1); }); + // _offsetlayout_rajahip_end hipErrchk(hipMemcpy( output, d_output, totCells * sizeof(int), hipMemcpyDeviceToHost )); - //printLattice(output, totCellsInRow, totCellsInCol); + std::cout << "\noutput lattice:\n"; + printLattice(output, totCellsInRow, totCellsInCol); checkResult(output, output_ref, totCells); memoryManager::deallocate_gpu(d_input); @@ -374,14 +399,15 @@ void printLattice(int* lattice, int totCellsInRow, int totCellsInCol) // void checkResult(int* compLattice, int* refLattice, int totCells) { + bool correct = true; - bool pass = true; - - for (int i = 0; i < totCells; ++i) { - if (compLattice[i] != refLattice[i]) pass = false; + int i = 0; + while ( correct && (i < totCells) ) { + correct = (compLattice[i] == refLattice[i]); + i++; } - if (pass) { + if ( correct ) { std::cout << "\n\t result -- PASS\n"; } else { std::cout << "\n\t result -- FAIL\n"; diff --git a/exercises/permuted-layout-batch-matrix-multiply.cpp b/exercises/permuted-layout-batch-matrix-multiply.cpp new file mode 100644 index 0000000000..a1805c3e35 --- /dev/null +++ b/exercises/permuted-layout-batch-matrix-multiply.cpp @@ -0,0 +1,737 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-22, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include +#include +#include +#include + +#include "RAJA/RAJA.hpp" +#include "RAJA/util/Timer.hpp" + +#include "memoryManager.hpp" + +/* + * Batched Matrix Multiply Example + * + * This example performs batched matrix multiplication + * for matrices of dimension 3 x 3 using two different + * data layouts. + * + * Matrices are stored in arrays A and B. Results + * are stored in a third array, C. + * We introduce the notation A^{e}_rc + * to correspond to the matrix entry in the row, r, + * column, c, of matrix, e. Below we describe the two + * layouts for the case of two (N=2) 3 x 3 matrices. 
+ * + * Layout 1: + * Matrix entries are grouped together so that each + * matrix is in a row major ordering. + * i.e. A = [A^{0}_{00}, A^{0}_{01}, A^{0}_{02}, + * A^{0}_{10}, A^{0}_{11}, A^{0}_{12}, + * A^{0}_{20}, A^{0}_{21}, A^{0}_{22}, + * A^{1}_{00}, A^{1}_{01}, A^{1}_{02}, + * A^{1}_{10}, A^{1}_{11}, A^{1}_{12}, + * A^{1}_{20}, A^{1}_{21}, A^{1}_{22}]; + * + * Layout 2: + * Matrix entries are first ordered by matrix number, + * then by column number, and finally by row number. + * i.e. A = [A^{0}_{00}, A^{1}_{00}, A^{0}_{01}, + * A^{1}_{01}, A^{0}_{02}, A^{1}_{02}, + * A^{0}_{10}, A^{1}_{10}, A^{0}_{11}, + * A^{1}_{11}, A^{0}_{12}, A^{1}_{12}, + * A^{0}_{20}, A^{1}_{20}, A^{0}_{21}, + * A^{1}_{21}, A^{0}_{22}, A^{1}_{22}]; + * + * The extension to N > 2 matrices follows by direct + * extension. By exploring different data layouts, + * we can assess which performs best under a given + * execution policy and architecture. + * + * RAJA features shown: + * - RAJA::forall kernel execution method + * - RAJA::View + * - RAJA::Layout + * - RAJA::make_permuted_layout method + * + * If CUDA is enabled, CUDA unified memory is used. + */ + +/* + * Define number of threads in a GPU thread block + */ +#if defined(RAJA_ENABLE_CUDA) +constexpr int CUDA_BLOCK_SIZE = 256; +#endif + +#if defined(RAJA_ENABLE_HIP) +constexpr int HIP_BLOCK_SIZE = 256; +#endif + +// +//Function for checking results +// +template +void checkResult(T C, int nMat, int nRows, int nCols); + +int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +{ + + std::cout << "\n\nRAJA batched matrix multiplication exercise...\n"; + +// Dimensions of matrices + constexpr int N_c = 3; + constexpr int N_r = 3; + +// Number of matrices + constexpr int N = 8000000; + +// Number of iterations + constexpr int NITER = 20; + + std::cout << "\n Number of matrices to be multiplied: " << N << " \n \n"; + +// +// Initialize a RAJA timer object +// and variable to store minimum run time +// + auto timer = RAJA::Timer(); + double minRun = std::numeric_limits::max(); + +// +// Allocate space for data in layout 1 +// + double *A = memoryManager::allocate(N_c * N_r * N); + double *B = memoryManager::allocate(N_c * N_r * N); + double *C = memoryManager::allocate(N_c * N_r * N); + +// +// Layout 1 +// +// make_permuted_layout takes the number of entries in each dimension and a +// templated array indicating index arguments with slowest to fastest stride. +// Standard C++ arrays are used to hold the number of entries in each component. +// This example uses double braces to initalize the array and its subobjects. +// The layout object will index into the array as the following C macro would +// #define Aview(e, r, c) A[c + N_c*(r + N_r*e)]. +// +// RAJA::Layout objects may be templated on dimension, argument type, and +// index with unit stride. Here, the column index has unit stride (argument 2). 
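+//
+// (Illustrative aside: with perm {0, 1, 2} below, the strides for the
+// (e, r, c) indices are {N_r*N_c, N_c, 1}, so the layout can be checked
+// directly against the macro above, e.g.
+//
+//   assert( layout1(e, r, c) == c + N_c * (r + N_r * e) );
+//
+// for any in-range e, r, c. This is a sketch only, not part of the exercise
+// code, and would require including <cassert>.)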
+//
+  // _permutedlayout_defviews_start
+  std::array<RAJA::idx_t, 3> perm1 {{0, 1, 2}};
+  auto layout1 =
+      RAJA::make_permuted_layout( {{N, N_r, N_c}}, perm1 );
+
+  RAJA::View<double, RAJA::Layout<3, int, 2>> Aview(A, layout1);
+  RAJA::View<double, RAJA::Layout<3, int, 2>> Bview(B, layout1);
+  RAJA::View<double, RAJA::Layout<3, int, 2>> Cview(C, layout1);
+  // _permutedlayout_defviews_end
+
+//
+// Allocate space for data in layout 2
+//
+  double *A2 = memoryManager::allocate<double>(N_c * N_r * N);
+  double *B2 = memoryManager::allocate<double>(N_c * N_r * N);
+  double *C2 = memoryManager::allocate<double>(N_c * N_r * N);
+
+//
+// Permuted layout - equivalent to indexing using the following macro
+// #define Aview2(e, r, c) A2[e + N*(c + N_c*r)]
+// In this case the element index has unit stride (argument 0).
+//
+
+  ///
+  /// TODO...
+  ///
+  /// EXERCISE: Define a permuted layout object (layout2) with the appropriate
+  ///           permutation so that the matrix index has unit stride,
+  ///           the column index has stride N (the number of matrices),
+  ///           and the row index has stride N * N_c.
+  ///
+  ///           Then, create views for the A2, B2, C2 arrays using the
+  ///           layout object; i.e., Aview2, Bview2, and Cview2.
+  ///
+  ///           Hint: You will use the same indexing to access the array data
+  ///           via these Views as for the Views above, which are created
+  ///           using layout1 (see kernels in the code below).
+  ///
+  ///           When you are done with the Views, test them out by
+  ///           uncommenting the kernels in the code below that use the
+  ///           Aview2, Bview2, and Cview2 views.
+  ///
+
+//
+// Initialize data for layout 1 and layout 2 arrays/views.
+//
+// When OpenMP is enabled, we use an OpenMP exec policy for
+// "first touch" initialization.
+//
+#if defined(RAJA_ENABLE_OPENMP)
+  using INIT_POL = RAJA::omp_parallel_for_exec;
+#else
+  using INIT_POL = RAJA::loop_exec;
+#endif
+
+  RAJA::forall<INIT_POL>(RAJA::TypedRangeSegment<int>(0, N), [=](int e) {
+    for (int row = 0; row < N_r; ++row) {
+      for (int col = 0; col < N_c; ++col) {
+        Aview(e, row, col) = row;
+        Bview(e, row, col) = col;
+        Cview(e, row, col) = 0;
+
+//        Aview2(e, row, col) = row;
+//        Bview2(e, row, col) = col;
+//        Cview2(e, row, col) = 0;
+      }
+    }
+  });
+
+
+//----------------------------------------------------------------------------//
+
+  std::cout << " \n Running batched matrix multiplication"
+            << " with layout 1 (RAJA - sequential) ... 
" << std::endl; + + minRun = std::numeric_limits::max(); + for (int i = 0; i < NITER; ++i) { + + timer.start(); + // _permutedlayout_batchedmatmult_loop_start + RAJA::forall(RAJA::TypedRangeSegment(0, N), + [=](int e) { + + Cview(e, 0, 0) = Aview(e, 0, 0) * Bview(e, 0, 0) + + Aview(e, 0, 1) * Bview(e, 1, 0) + + Aview(e, 0, 2) * Bview(e, 2, 0); + Cview(e, 0, 1) = Aview(e, 0, 0) * Bview(e, 0, 1) + + Aview(e, 0, 1) * Bview(e, 1, 1) + + Aview(e, 0, 2) * Bview(e, 2, 1); + Cview(e, 0, 2) = Aview(e, 0, 0) * Bview(e, 0, 2) + + Aview(e, 0, 1) * Bview(e, 1, 2) + + Aview(e, 0, 2) * Bview(e, 2, 2); + + Cview(e, 1, 0) = Aview(e, 1, 0) * Bview(e, 0, 0) + + Aview(e, 1, 1) * Bview(e, 1, 0) + + Aview(e, 1, 2) * Bview(e, 2, 0); + Cview(e, 1, 1) = Aview(e, 1, 0) * Bview(e, 0, 1) + + Aview(e, 1, 1) * Bview(e, 1, 1) + + Aview(e, 1, 2) * Bview(e, 2, 1); + Cview(e, 1, 2) = Aview(e, 1, 0) * Bview(e, 0, 2) + + Aview(e, 1, 1) * Bview(e, 1, 2) + + Aview(e, 1, 2) * Bview(e, 2, 2); + + Cview(e, 2, 0) = Aview(e, 2, 0) * Bview(e, 0, 0) + + Aview(e, 2, 1) * Bview(e, 1, 0) + + Aview(e, 2, 2) * Bview(e, 2, 0); + Cview(e, 2, 1) = Aview(e, 2, 0) * Bview(e, 0, 1) + + Aview(e, 2, 1) * Bview(e, 1, 1) + + Aview(e, 2, 2) * Bview(e, 2, 1); + Cview(e, 2, 2) = Aview(e, 2, 0) * Bview(e, 0, 2) + + Aview(e, 2, 1) * Bview(e, 1, 2) + + Aview(e, 2, 2) * Bview(e, 2, 2); + } + ); + // _permutedlayout_batchedmatmult_loop_end + timer.stop(); + + RAJA::Timer::ElapsedType tMin = timer.elapsed(); + if (tMin < minRun) minRun = tMin; + timer.reset(); + } + + std::cout << "\trun time : " << minRun << " seconds" << std::endl; + checkResult(Cview, N, N_r, N_c); + +//----------------------------------------------------------------------------// + + std::cout << " \n Running batched matrix multiplication" + << " with layout 2 (RAJA - sequential) ... 
" << std::endl; + +/* + timer.start(); + minRun = std::numeric_limits::max(); + for (int i = 0; i < NITER; ++i) { + + // _permutedlayout2_batchedmatmult_loop_start + RAJA::forall(RAJA::TypedRangeSegment(0, N), + [=](int e) { + + Cview2(e, 0, 0) = Aview2(e, 0, 0) * Bview2(e, 0, 0) + + Aview2(e, 0, 1) * Bview2(e, 1, 0) + + Aview2(e, 0, 2) * Bview2(e, 2, 0); + Cview2(e, 0, 1) = Aview2(e, 0, 0) * Bview2(e, 0, 1) + + Aview2(e, 0, 1) * Bview2(e, 1, 1) + + Aview2(e, 0, 2) * Bview2(e, 2, 1); + Cview2(e, 0, 2) = Aview2(e, 0, 0) * Bview2(e, 0, 2) + + Aview2(e, 0, 1) * Bview2(e, 1, 2) + + Aview2(e, 0, 2) * Bview2(e, 2, 2); + + Cview2(e, 1, 0) = Aview2(e, 1, 0) * Bview2(e, 0, 0) + + Aview2(e, 1, 1) * Bview2(e, 1, 0) + + Aview2(e, 1, 2) * Bview2(e, 2, 0); + Cview2(e, 1, 1) = Aview2(e, 1, 0) * Bview2(e, 0, 1) + + Aview2(e, 1, 1) * Bview2(e, 1, 1) + + Aview2(e, 1, 2) * Bview2(e, 2, 1); + Cview2(e, 1, 2) = Aview2(e, 1, 0) * Bview2(e, 0, 2) + + Aview2(e, 1, 1) * Bview2(e, 1, 2) + + Aview2(e, 1, 2) * Bview2(e, 2, 2); + + Cview2(e, 2, 0) = Aview2(e, 2, 0) * Bview2(e, 0, 0) + + Aview2(e, 2, 1) * Bview2(e, 1, 0) + + Aview2(e, 2, 2) * Bview2(e, 2, 0); + Cview2(e, 2, 1) = Aview2(e, 2, 0) * Bview2(e, 0, 1) + + Aview2(e, 2, 1) * Bview2(e, 1, 1) + + Aview2(e, 2, 2) * Bview2(e, 2, 1); + Cview2(e, 2, 2) = Aview2(e, 2, 0) * Bview2(e, 0, 2) + + Aview2(e, 2, 1) * Bview2(e, 1, 2) + + Aview2(e, 2, 2) * Bview2(e, 2, 2); + + } + ); + // _permutedlayout2_batchedmatmult_loop_end + timer.stop(); + + RAJA::Timer::ElapsedType tMin = timer.elapsed(); + if (tMin < minRun) minRun = tMin; + timer.reset(); + } + std::cout<< "\trun time : "<< minRun << " seconds" << std::endl; + checkResult(Cview2, N, N_r, N_c); +*/ + +//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_OPENMP) + + std::cout << " \n Running batched matrix multiplication" + << " with layout 1 (RAJA - omp parallel for) ... 
" << std::endl; + + std::memset(C, 0, N_c * N_r * N * sizeof(double)); + + minRun = std::numeric_limits::max(); + for (int i = 0; i < NITER; ++i) { + + timer.start(); + // _permutedlayout_batchedmatmult_omp_start + RAJA::forall(RAJA::TypedRangeSegment(0, N), + [=](int e) { + + Cview(e, 0, 0) = Aview(e, 0, 0) * Bview(e, 0, 0) + + Aview(e, 0, 1) * Bview(e, 1, 0) + + Aview(e, 0, 2) * Bview(e, 2, 0); + Cview(e, 0, 1) = Aview(e, 0, 0) * Bview(e, 0, 1) + + Aview(e, 0, 1) * Bview(e, 1, 1) + + Aview(e, 0, 2) * Bview(e, 2, 1); + Cview(e, 0, 2) = Aview(e, 0, 0) * Bview(e, 0, 2) + + Aview(e, 0, 1) * Bview(e, 1, 2) + + Aview(e, 0, 2) * Bview(e, 2, 2); + + Cview(e, 1, 0) = Aview(e, 1, 0) * Bview(e, 0, 0) + + Aview(e, 1, 1) * Bview(e, 1, 0) + + Aview(e, 1, 2) * Bview(e, 2, 0); + Cview(e, 1, 1) = Aview(e, 1, 0) * Bview(e, 0, 1) + + Aview(e, 1, 1) * Bview(e, 1, 1) + + Aview(e, 1, 2) * Bview(e, 2, 1); + Cview(e, 1, 2) = Aview(e, 1, 0) * Bview(e, 0, 2) + + Aview(e, 1, 1) * Bview(e, 1, 2) + + Aview(e, 1, 2) * Bview(e, 2, 2); + + Cview(e, 2, 0) = Aview(e, 2, 0) * Bview(e, 0, 0) + + Aview(e, 2, 1) * Bview(e, 1, 0) + + Aview(e, 2, 2) * Bview(e, 2, 0); + Cview(e, 2, 1) = Aview(e, 2, 0) * Bview(e, 0, 1) + + Aview(e, 2, 1) * Bview(e, 1, 1) + + Aview(e, 2, 2) * Bview(e, 2, 1); + Cview(e, 2, 2) = Aview(e, 2, 0) * Bview(e, 0, 2) + + Aview(e, 2, 1) * Bview(e, 1, 2) + + Aview(e, 2, 2) * Bview(e, 2, 2); + + } + ); + // _permutedlayout_batchedmatmult_omp_end + timer.stop(); + + RAJA::Timer::ElapsedType tMin = timer.elapsed(); + if (tMin < minRun) minRun = tMin; + timer.reset(); + } + + std::cout<< "\trun time : " << minRun << " seconds" << std::endl; + checkResult(Cview, N, N_r, N_c); + +//----------------------------------------------------------------------------// + + std::cout << " \n Running batched matrix multiplication" + << " with layout 2 (RAJA - omp parallel for) ... 
" << std::endl; + + std::memset(C2, 0, N_c * N_r * N * sizeof(double)); + +/* + minRun = std::numeric_limits::max(); + for (int i = 0; i < NITER; ++i) { + + timer.start(); + RAJA::forall(RAJA::TypedRangeSegment(0, N), + [=](int e) { + + Cview2(e, 0, 0) = Aview2(e, 0, 0) * Bview2(e, 0, 0) + + Aview2(e, 0, 1) * Bview2(e, 1, 0) + + Aview2(e, 0, 2) * Bview2(e, 2, 0); + Cview2(e, 0, 1) = Aview2(e, 0, 0) * Bview2(e, 0, 1) + + Aview2(e, 0, 1) * Bview2(e, 1, 1) + + Aview2(e, 0, 2) * Bview2(e, 2, 1); + Cview2(e, 0, 2) = Aview2(e, 0, 0) * Bview2(e, 0, 2) + + Aview2(e, 0, 1) * Bview2(e, 1, 2) + + Aview2(e, 0, 2) * Bview2(e, 2, 2); + + Cview2(e, 1, 0) = Aview2(e, 1, 0) * Bview2(e, 0, 0) + + Aview2(e, 1, 1) * Bview2(e, 1, 0) + + Aview2(e, 1, 2) * Bview2(e, 2, 0); + Cview2(e, 1, 1) = Aview2(e, 1, 0) * Bview2(e, 0, 1) + + Aview2(e, 1, 1) * Bview2(e, 1, 1) + + Aview2(e, 1, 2) * Bview2(e, 2, 1); + Cview2(e, 1, 2) = Aview2(e, 1, 0) * Bview2(e, 0, 2) + + Aview2(e, 1, 1) * Bview2(e, 1, 2) + + Aview2(e, 1, 2) * Bview2(e, 2, 2); + + Cview2(e, 2, 0) = Aview2(e, 2, 0) * Bview2(e, 0, 0) + + Aview2(e, 2, 1) * Bview2(e, 1, 0) + + Aview2(e, 2, 2) * Bview2(e, 2, 0); + Cview2(e, 2, 1) = Aview2(e, 2, 0) * Bview2(e, 0, 1) + + Aview2(e, 2, 1) * Bview2(e, 1, 1) + + Aview2(e, 2, 2) * Bview2(e, 2, 1); + Cview2(e, 2, 2) = Aview2(e, 2, 0) * Bview2(e, 0, 2) + + Aview2(e, 2, 1) * Bview2(e, 1, 2) + + Aview2(e, 2, 2) * Bview2(e, 2, 2); + + } + ); + timer.stop(); + + RAJA::Timer::ElapsedType tMin = timer.elapsed(); + if (tMin < minRun) minRun = tMin; + timer.reset(); + } + std::cout<< "\trun time : " << minRun << " seconds" << std::endl; + checkResult(Cview2, N, N_r, N_c); +*/ + +#endif + + +//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_CUDA) + + std::cout << " \n Running batched matrix multiplication" + << " with layout 1 (RAJA - cuda) ... 
" << std::endl; + + std::memset(C, 0, N_c * N_r * N * sizeof(double)); + + minRun = std::numeric_limits::max(); + for (int i = 0; i < NITER; ++i) { + + timer.start(); + RAJA::forall>(RAJA::TypedRangeSegment(0, N), + [=] RAJA_DEVICE(int e) { + + Cview(e, 0, 0) = Aview(e, 0, 0) * Bview(e, 0, 0) + + Aview(e, 0, 1) * Bview(e, 1, 0) + + Aview(e, 0, 2) * Bview(e, 2, 0); + Cview(e, 0, 1) = Aview(e, 0, 0) * Bview(e, 0, 1) + + Aview(e, 0, 1) * Bview(e, 1, 1) + + Aview(e, 0, 2) * Bview(e, 2, 1); + Cview(e, 0, 2) = Aview(e, 0, 0) * Bview(e, 0, 2) + + Aview(e, 0, 1) * Bview(e, 1, 2) + + Aview(e, 0, 2) * Bview(e, 2, 2); + + Cview(e, 1, 0) = Aview(e, 1, 0) * Bview(e, 0, 0) + + Aview(e, 1, 1) * Bview(e, 1, 0) + + Aview(e, 1, 2) * Bview(e, 2, 0); + Cview(e, 1, 1) = Aview(e, 1, 0) * Bview(e, 0, 1) + + Aview(e, 1, 1) * Bview(e, 1, 1) + + Aview(e, 1, 2) * Bview(e, 2, 1); + Cview(e, 1, 2) = Aview(e, 1, 0) * Bview(e, 0, 2) + + Aview(e, 1, 1) * Bview(e, 1, 2) + + Aview(e, 1, 2) * Bview(e, 2, 2); + + Cview(e, 2, 0) = Aview(e, 2, 0) * Bview(e, 0, 0) + + Aview(e, 2, 1) * Bview(e, 1, 0) + + Aview(e, 2, 2) * Bview(e, 2, 0); + Cview(e, 2, 1) = Aview(e, 2, 0) * Bview(e, 0, 1) + + Aview(e, 2, 1) * Bview(e, 1, 1) + + Aview(e, 2, 2) * Bview(e, 2, 1); + Cview(e, 2, 2) = Aview(e, 2, 0) * Bview(e, 0, 2) + + Aview(e, 2, 1) * Bview(e, 1, 2) + + Aview(e, 2, 2) * Bview(e, 2, 2); + + } + ); + timer.stop(); + + RAJA::Timer::ElapsedType tMin = timer.elapsed(); + if (tMin < minRun) minRun = tMin; + timer.reset(); + } + + std::cout<< "\trun time: "<< minRun << " seconds" << std::endl; + checkResult(Cview, N, N_r, N_c); + +//----------------------------------------------------------------------------// + + std::cout << " \n Running batched matrix multiplication" + << " with layout 2 (RAJA - cuda) ... 
" << std::endl; + + std::memset(C2, 0, N_c * N_r * N * sizeof(double)); + +/* + minRun = std::numeric_limits::max(); + for (int i = 0; i < NITER; ++i) { + + timer.start(); + RAJA::forall>(RAJA::TypedRangeSegment(0, N), + [=] RAJA_DEVICE(int e) { + + Cview2(e, 0, 0) = Aview2(e, 0, 0) * Bview2(e, 0, 0) + + Aview2(e, 0, 1) * Bview2(e, 1, 0) + + Aview2(e, 0, 2) * Bview2(e, 2, 0); + Cview2(e, 0, 1) = Aview2(e, 0, 0) * Bview2(e, 0, 1) + + Aview2(e, 0, 1) * Bview2(e, 1, 1) + + Aview2(e, 0, 2) * Bview2(e, 2, 1); + Cview2(e, 0, 2) = Aview2(e, 0, 0) * Bview2(e, 0, 2) + + Aview2(e, 0, 1) * Bview2(e, 1, 2) + + Aview2(e, 0, 2) * Bview2(e, 2, 2); + + Cview2(e, 1, 0) = Aview2(e, 1, 0) * Bview2(e, 0, 0) + + Aview2(e, 1, 1) * Bview2(e, 1, 0) + + Aview2(e, 1, 2) * Bview2(e, 2, 0); + Cview2(e, 1, 1) = Aview2(e, 1, 0) * Bview2(e, 0, 1) + + Aview2(e, 1, 1) * Bview2(e, 1, 1) + + Aview2(e, 1, 2) * Bview2(e, 2, 1); + Cview2(e, 1, 2) = Aview2(e, 1, 0) * Bview2(e, 0, 2) + + Aview2(e, 1, 1) * Bview2(e, 1, 2) + + Aview2(e, 1, 2) * Bview2(e, 2, 2); + + Cview2(e, 2, 0) = Aview2(e, 2, 0) * Bview2(e, 0, 0) + + Aview2(e, 2, 1) * Bview2(e, 1, 0) + + Aview2(e, 2, 2) * Bview2(e, 2, 0); + Cview2(e, 2, 1) = Aview2(e, 2, 0) * Bview2(e, 0, 1) + + Aview2(e, 2, 1) * Bview2(e, 1, 1) + + Aview2(e, 2, 2) * Bview2(e, 2, 1); + Cview2(e, 2, 2) = Aview2(e, 2, 0) * Bview2(e, 0, 2) + + Aview2(e, 2, 1) * Bview2(e, 1, 2) + + Aview2(e, 2, 2) * Bview2(e, 2, 2); + + } + ); + timer.stop(); + + RAJA::Timer::ElapsedType tMin = timer.elapsed(); + if (tMin < minRun) minRun = tMin; + timer.reset(); + } + std::cout<< "\trun time : "<< minRun << " seconds" << std::endl; + checkResult(Cview2, N, N_r, N_c); +*/ +#endif + +//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_HIP) + + std::cout << " \n Running batched matrix multiplication" + << " with layout 1 (RAJA - hip) ... 
" << std::endl; + + double *d_A = memoryManager::allocate_gpu(N_c * N_r * N); + double *d_B = memoryManager::allocate_gpu(N_c * N_r * N); + double *d_C = memoryManager::allocate_gpu(N_c * N_r * N); + + RAJA::View> d_Aview(d_A, layout1); + RAJA::View> d_Bview(d_B, layout1); + RAJA::View> d_Cview(d_C, layout1); + + hipErrchk(hipMemcpy( d_A, A, N_c * N_r * N * sizeof(double), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy( d_B, B, N_c * N_r * N * sizeof(double), hipMemcpyHostToDevice )); + + minRun = std::numeric_limits::max(); + for (int i = 0; i < NITER; ++i) { + + timer.start(); + RAJA::forall>(RAJA::TypedRangeSegment(0, N), + [=] RAJA_DEVICE(int e) { + + d_Cview(e, 0, 0) = d_Aview(e, 0, 0) * d_Bview(e, 0, 0) + + d_Aview(e, 0, 1) * d_Bview(e, 1, 0) + + d_Aview(e, 0, 2) * d_Bview(e, 2, 0); + d_Cview(e, 0, 1) = d_Aview(e, 0, 0) * d_Bview(e, 0, 1) + + d_Aview(e, 0, 1) * d_Bview(e, 1, 1) + + d_Aview(e, 0, 2) * d_Bview(e, 2, 1); + d_Cview(e, 0, 2) = d_Aview(e, 0, 0) * d_Bview(e, 0, 2) + + d_Aview(e, 0, 1) * d_Bview(e, 1, 2) + + d_Aview(e, 0, 2) * d_Bview(e, 2, 2); + + d_Cview(e, 1, 0) = d_Aview(e, 1, 0) * d_Bview(e, 0, 0) + + d_Aview(e, 1, 1) * d_Bview(e, 1, 0) + + d_Aview(e, 1, 2) * d_Bview(e, 2, 0); + d_Cview(e, 1, 1) = d_Aview(e, 1, 0) * d_Bview(e, 0, 1) + + d_Aview(e, 1, 1) * d_Bview(e, 1, 1) + + d_Aview(e, 1, 2) * d_Bview(e, 2, 1); + d_Cview(e, 1, 2) = d_Aview(e, 1, 0) * d_Bview(e, 0, 2) + + d_Aview(e, 1, 1) * d_Bview(e, 1, 2) + + d_Aview(e, 1, 2) * d_Bview(e, 2, 2); + + d_Cview(e, 2, 0) = d_Aview(e, 2, 0) * d_Bview(e, 0, 0) + + d_Aview(e, 2, 1) * d_Bview(e, 1, 0) + + d_Aview(e, 2, 2) * d_Bview(e, 2, 0); + d_Cview(e, 2, 1) = d_Aview(e, 2, 0) * d_Bview(e, 0, 1) + + d_Aview(e, 2, 1) * d_Bview(e, 1, 1) + + d_Aview(e, 2, 2) * d_Bview(e, 2, 1); + d_Cview(e, 2, 2) = d_Aview(e, 2, 0) * d_Bview(e, 0, 2) + + d_Aview(e, 2, 1) * d_Bview(e, 1, 2) + + d_Aview(e, 2, 2) * d_Bview(e, 2, 2); + + } + ); + timer.stop(); + + RAJA::Timer::ElapsedType tMin = timer.elapsed(); + if (tMin < minRun) minRun = tMin; + timer.reset(); + } + + hipErrchk(hipMemcpy( C, d_C, N_c * N_r * N * sizeof(double), hipMemcpyDeviceToHost )); + + std::cout<< "\trun time: "<< minRun << " seconds" << std::endl; + checkResult(Cview, N, N_r, N_c); + +// +// Clean up. +// + memoryManager::deallocate_gpu(d_A); + memoryManager::deallocate_gpu(d_B); + memoryManager::deallocate_gpu(d_C); + +//----------------------------------------------------------------------------// + + std::cout << " \n Running batched matrix multiplication" + << " with layout 2 (RAJA - hip) ... 
" << std::endl; + +/* + double *d_A2 = memoryManager::allocate_gpu(N_c * N_r * N); + double *d_B2 = memoryManager::allocate_gpu(N_c * N_r * N); + double *d_C2 = memoryManager::allocate_gpu(N_c * N_r * N); + + RAJA::View> d_Aview2(d_A2, layout2); + RAJA::View> d_Bview2(d_B2, layout2); + RAJA::View> d_Cview2(d_C2, layout2); + + hipErrchk(hipMemcpy( d_A2, A2, N_c * N_r * N * sizeof(double), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy( d_B2, B2, N_c * N_r * N * sizeof(double), hipMemcpyHostToDevice )); + + minRun = std::numeric_limits::max(); + for (int i = 0; i < NITER; ++i) { + + timer.start(); + RAJA::forall>(RAJA::TypedRangeSegment(0, N), + [=] RAJA_DEVICE(int e) { + + d_Cview2(e, 0, 0) = d_Aview2(e, 0, 0) * d_Bview2(e, 0, 0) + + d_Aview2(e, 0, 1) * d_Bview2(e, 1, 0) + + d_Aview2(e, 0, 2) * d_Bview2(e, 2, 0); + d_Cview2(e, 0, 1) = d_Aview2(e, 0, 0) * d_Bview2(e, 0, 1) + + d_Aview2(e, 0, 1) * d_Bview2(e, 1, 1) + + d_Aview2(e, 0, 2) * d_Bview2(e, 2, 1); + d_Cview2(e, 0, 2) = d_Aview2(e, 0, 0) * d_Bview2(e, 0, 2) + + d_Aview2(e, 0, 1) * d_Bview2(e, 1, 2) + + d_Aview2(e, 0, 2) * d_Bview2(e, 2, 2); + + d_Cview2(e, 1, 0) = d_Aview2(e, 1, 0) * d_Bview2(e, 0, 0) + + d_Aview2(e, 1, 1) * d_Bview2(e, 1, 0) + + d_Aview2(e, 1, 2) * d_Bview2(e, 2, 0); + d_Cview2(e, 1, 1) = d_Aview2(e, 1, 0) * d_Bview2(e, 0, 1) + + d_Aview2(e, 1, 1) * d_Bview2(e, 1, 1) + + d_Aview2(e, 1, 2) * d_Bview2(e, 2, 1); + d_Cview2(e, 1, 2) = d_Aview2(e, 1, 0) * d_Bview2(e, 0, 2) + + d_Aview2(e, 1, 1) * d_Bview2(e, 1, 2) + + d_Aview2(e, 1, 2) * d_Bview2(e, 2, 2); + + d_Cview2(e, 2, 0) = d_Aview2(e, 2, 0) * d_Bview2(e, 0, 0) + + d_Aview2(e, 2, 1) * d_Bview2(e, 1, 0) + + d_Aview2(e, 2, 2) * d_Bview2(e, 2, 0); + d_Cview2(e, 2, 1) = d_Aview2(e, 2, 0) * d_Bview2(e, 0, 1) + + d_Aview2(e, 2, 1) * d_Bview2(e, 1, 1) + + d_Aview2(e, 2, 2) * d_Bview2(e, 2, 1); + d_Cview2(e, 2, 2) = d_Aview2(e, 2, 0) * d_Bview2(e, 0, 2) + + d_Aview2(e, 2, 1) * d_Bview2(e, 1, 2) + + d_Aview2(e, 2, 2) * d_Bview2(e, 2, 2); + + } + ); + timer.stop(); + + RAJA::Timer::ElapsedType tMin = timer.elapsed(); + if (tMin < minRun) minRun = tMin; + timer.reset(); + } + + hipErrchk(hipMemcpy( C2, d_C2, N_c * N_r * N * sizeof(double), hipMemcpyDeviceToHost )); + + std::cout<< "\trun time : "<< minRun << " seconds" << std::endl; + checkResult(Cview2, N, N_r, N_c); + +// +// Clean up. +// + memoryManager::deallocate_gpu(d_A2); + memoryManager::deallocate_gpu(d_B2); + memoryManager::deallocate_gpu(d_C2); +*/ +#endif + +//----------------------------------------------------------------------------// + +// +// Clean up. 
+// + memoryManager::deallocate(A); + memoryManager::deallocate(B); + memoryManager::deallocate(C); + memoryManager::deallocate(A2); + memoryManager::deallocate(B2); + memoryManager::deallocate(C2); + + std::cout << "\n DONE!...\n"; + return 0; +} + +// +// check result +// +template +void checkResult(T C, int nMat, int nRows, int nCols) +{ + + bool status = true; + for (int e = 0; e < nMat; ++e) { + for (int row = 0; row < nRows; ++row) { + for (int col = 0; col < nCols; ++col) { + if (std::abs(C(e, row, col) - row * col * nCols) > 10e-12) { + status = false; + } + } + } + } + + if ( status ) { + std::cout << "\tresult -- PASS\n"; + } else { + std::cout << "\tresult -- FAIL\n"; + } +} diff --git a/exercises/permuted-layout-batch-matrix-multiply_solution.cpp b/exercises/permuted-layout-batch-matrix-multiply_solution.cpp new file mode 100644 index 0000000000..0cdb06d1b0 --- /dev/null +++ b/exercises/permuted-layout-batch-matrix-multiply_solution.cpp @@ -0,0 +1,711 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-22, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include +#include +#include +#include + +#include "RAJA/RAJA.hpp" +#include "RAJA/util/Timer.hpp" + + +#include "memoryManager.hpp" + +/* + * Batched Matrix Multiply Example + * + * This example performs batched matrix multiplication + * for matrices of dimension 3 x 3 using two different + * data layouts. + * + * Matrices are stored in arrays A and B. Results + * are stored in a third array, C. + * We introduce the notation A^{e}_rc + * to correspond to the matrix entry in the row, r, + * column, c, of matrix, e. Below we describe the two + * layouts for the case of two (N=2) 3 x 3 matrices. + * + * Layout 1: + * Matrix entries are grouped together so that each + * matrix is in a row major ordering. + * i.e. A = [A^{0}_{00}, A^{0}_{01}, A^{0}_{02}, + * A^{0}_{10}, A^{0}_{11}, A^{0}_{12}, + * A^{0}_{20}, A^{0}_{21}, A^{0}_{22}, + * A^{1}_{00}, A^{1}_{01}, A^{1}_{02}, + * A^{1}_{10}, A^{1}_{11}, A^{1}_{12}, + * A^{1}_{20}, A^{1}_{21}, A^{1}_{22}]; + * + * Layout 2: + * Matrix entries are first ordered by matrix number, + * then by column number, and finally by row number. + * i.e. A = [A^{0}_{00}, A^{1}_{00}, A^{0}_{01}, + * A^{1}_{01}, A^{0}_{02}, A^{1}_{02}, + * A^{0}_{10}, A^{1}_{10}, A^{0}_{11}, + * A^{1}_{11}, A^{0}_{12}, A^{1}_{12}, + * A^{0}_{20}, A^{1}_{20}, A^{0}_{21}, + * A^{1}_{21}, A^{0}_{22}, A^{1}_{22}]; + * + * The extension to N > 2 matrices follows by direct + * extension. By exploring different data layouts, + * we can assess which performs best under a given + * execution policy and architecture. + * + * RAJA features shown: + * - RAJA::forall kernel execution method + * - RAJA::View + * - RAJA::Layout + * - RAJA::make_permuted_layout method + * + * If CUDA is enabled, CUDA unified memory is used. 
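+ *
+ * For Layout 2 the permutation used below is {1, 2, 0}, which gives the
+ * matrix (element) index unit stride. As a sketch (with the template
+ * arguments written out), the layout and one of its views look like:
+ *
+ *   std::array<RAJA::idx_t, 3> perm2 {{1, 2, 0}};
+ *   auto layout2 = RAJA::make_permuted_layout( {{N, N_r, N_c}}, perm2 );
+ *   RAJA::View<double, RAJA::Layout<3, int, 0>> Aview2(A2, layout2);
+ *
+ * which indexes the data as A2[e + N*(c + N_c*r)].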
+ */ + +/* + * Define number of threads in a GPU thread block + */ +#if defined(RAJA_ENABLE_CUDA) +constexpr int CUDA_BLOCK_SIZE = 256; +#endif + +#if defined(RAJA_ENABLE_HIP) +constexpr int HIP_BLOCK_SIZE = 256; +#endif + +// +//Function for checking results +// +template +void checkResult(T C, int nMat, int nRows, int nCols); + +int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +{ + + std::cout << "\n\nRAJA batched matrix multiplication exercise...\n"; + +// Dimensions of matrices + constexpr int N_c = 3; + constexpr int N_r = 3; + +// Number of matrices + constexpr int N = 8000000; + +// Number of iterations + constexpr int NITER = 20; + + std::cout << "\n Number of matrices to be multiplied: " << N << " \n \n"; + +// +// Initialize a RAJA timer object +// and variable to store minimum run time +// + auto timer = RAJA::Timer(); + double minRun = std::numeric_limits::max(); + +// +// Allocate space for data in layout 1 +// + double *A = memoryManager::allocate(N_c * N_r * N); + double *B = memoryManager::allocate(N_c * N_r * N); + double *C = memoryManager::allocate(N_c * N_r * N); + +// +// Layout 1 +// +// make_permuted_layout takes the number of entries in each dimension and a +// templated array indicating index arguments with slowest to fastest stride. +// Standard C++ arrays are used to hold the number of entries in each component. +// This example uses double braces to initalize the array and its subobjects. +// The layout object will index into the array as the following C macro would +// #define Aview(e, r, c) A[c + N_c*(r + N_r*e)]. +// +// RAJA::Layout objects may be templated on dimension, argument type, and +// index with unit stride. Here, the column index has unit stride (argument 2). +// + // _permutedlayout_defviews_start + std::array perm1 {{0, 1, 2}}; + auto layout1 = + RAJA::make_permuted_layout( {{N, N_r, N_c}}, perm1 ); + + RAJA::View> Aview(A, layout1); + RAJA::View> Bview(B, layout1); + RAJA::View> Cview(C, layout1); + // _permutedlayout_defviews_end + +// +// Allocate space for data in layout 2 +// + double *A2 = memoryManager::allocate(N_c * N_r * N); + double *B2 = memoryManager::allocate(N_c * N_r * N); + double *C2 = memoryManager::allocate(N_c * N_r * N); + +// +// Permuted layout - equivalent to indexing using the following macro +// #define Aview2(e, r, c) A2[e + N*(c + N_c*r)] +// In this case the element index has unit stride (argument 0). +// + // _permutedlayout_permviews_start + std::array perm2 {{1, 2, 0}}; + auto layout2 = + RAJA::make_permuted_layout( {{N, N_r, N_c}}, perm2 ); + + RAJA::View> Aview2(A2, layout2); + RAJA::View> Bview2(B2, layout2); + RAJA::View> Cview2(C2, layout2); + // _permutedlayout_permviews_end + +// +// Initialize data for layout 1 and layout 2 arrays/views. +// +// When OpenMP is enabled, we use an OpenMP exec policy for +// "first touch" initialization. +// +#if defined(RAJA_ENABLE_OPENMP) + using INIT_POL = RAJA::omp_parallel_for_exec; +#else + using INIT_POL = RAJA::loop_exec; +#endif + + RAJA::forall(RAJA::TypedRangeSegment(0, N), [=](int e) { + for (int row = 0; row < N_r; ++row) { + for (int col = 0; col < N_c; ++col) { + Aview(e, row, col) = row; + Bview(e, row, col) = col; + Cview(e, row, col) = 0; + + Aview2(e, row, col) = row; + Bview2(e, row, col) = col; + Cview2(e, row, col) = 0; + } + } + }); + + +//----------------------------------------------------------------------------// + + std::cout << " \n Running batched matrix multiplication" + << " with layout 1 (RAJA - sequential) ... 
" << std::endl; + + minRun = std::numeric_limits::max(); + for (int i = 0; i < NITER; ++i) { + + timer.start(); + // _permutedlayout_batchedmatmult_loop_start + RAJA::forall(RAJA::TypedRangeSegment(0, N), + [=](int e) { + + Cview(e, 0, 0) = Aview(e, 0, 0) * Bview(e, 0, 0) + + Aview(e, 0, 1) * Bview(e, 1, 0) + + Aview(e, 0, 2) * Bview(e, 2, 0); + Cview(e, 0, 1) = Aview(e, 0, 0) * Bview(e, 0, 1) + + Aview(e, 0, 1) * Bview(e, 1, 1) + + Aview(e, 0, 2) * Bview(e, 2, 1); + Cview(e, 0, 2) = Aview(e, 0, 0) * Bview(e, 0, 2) + + Aview(e, 0, 1) * Bview(e, 1, 2) + + Aview(e, 0, 2) * Bview(e, 2, 2); + + Cview(e, 1, 0) = Aview(e, 1, 0) * Bview(e, 0, 0) + + Aview(e, 1, 1) * Bview(e, 1, 0) + + Aview(e, 1, 2) * Bview(e, 2, 0); + Cview(e, 1, 1) = Aview(e, 1, 0) * Bview(e, 0, 1) + + Aview(e, 1, 1) * Bview(e, 1, 1) + + Aview(e, 1, 2) * Bview(e, 2, 1); + Cview(e, 1, 2) = Aview(e, 1, 0) * Bview(e, 0, 2) + + Aview(e, 1, 1) * Bview(e, 1, 2) + + Aview(e, 1, 2) * Bview(e, 2, 2); + + Cview(e, 2, 0) = Aview(e, 2, 0) * Bview(e, 0, 0) + + Aview(e, 2, 1) * Bview(e, 1, 0) + + Aview(e, 2, 2) * Bview(e, 2, 0); + Cview(e, 2, 1) = Aview(e, 2, 0) * Bview(e, 0, 1) + + Aview(e, 2, 1) * Bview(e, 1, 1) + + Aview(e, 2, 2) * Bview(e, 2, 1); + Cview(e, 2, 2) = Aview(e, 2, 0) * Bview(e, 0, 2) + + Aview(e, 2, 1) * Bview(e, 1, 2) + + Aview(e, 2, 2) * Bview(e, 2, 2); + } + ); + // _permutedlayout_batchedmatmult_loop_end + timer.stop(); + + RAJA::Timer::ElapsedType tMin = timer.elapsed(); + if (tMin < minRun) minRun = tMin; + timer.reset(); + } + + std::cout << "\trun time : " << minRun << " seconds" << std::endl; + checkResult(Cview, N, N_r, N_c); + +//----------------------------------------------------------------------------// + + std::cout << " \n Running batched matrix multiplication" + << " with layout 2 (RAJA - sequential) ... 
" << std::endl; + + minRun = std::numeric_limits::max(); + for (int i = 0; i < NITER; ++i) { + + timer.start(); + // _permutedlayout2_batchedmatmult_loop_start + RAJA::forall(RAJA::TypedRangeSegment(0, N), + [=](int e) { + + Cview2(e, 0, 0) = Aview2(e, 0, 0) * Bview2(e, 0, 0) + + Aview2(e, 0, 1) * Bview2(e, 1, 0) + + Aview2(e, 0, 2) * Bview2(e, 2, 0); + Cview2(e, 0, 1) = Aview2(e, 0, 0) * Bview2(e, 0, 1) + + Aview2(e, 0, 1) * Bview2(e, 1, 1) + + Aview2(e, 0, 2) * Bview2(e, 2, 1); + Cview2(e, 0, 2) = Aview2(e, 0, 0) * Bview2(e, 0, 2) + + Aview2(e, 0, 1) * Bview2(e, 1, 2) + + Aview2(e, 0, 2) * Bview2(e, 2, 2); + + Cview2(e, 1, 0) = Aview2(e, 1, 0) * Bview2(e, 0, 0) + + Aview2(e, 1, 1) * Bview2(e, 1, 0) + + Aview2(e, 1, 2) * Bview2(e, 2, 0); + Cview2(e, 1, 1) = Aview2(e, 1, 0) * Bview2(e, 0, 1) + + Aview2(e, 1, 1) * Bview2(e, 1, 1) + + Aview2(e, 1, 2) * Bview2(e, 2, 1); + Cview2(e, 1, 2) = Aview2(e, 1, 0) * Bview2(e, 0, 2) + + Aview2(e, 1, 1) * Bview2(e, 1, 2) + + Aview2(e, 1, 2) * Bview2(e, 2, 2); + + Cview2(e, 2, 0) = Aview2(e, 2, 0) * Bview2(e, 0, 0) + + Aview2(e, 2, 1) * Bview2(e, 1, 0) + + Aview2(e, 2, 2) * Bview2(e, 2, 0); + Cview2(e, 2, 1) = Aview2(e, 2, 0) * Bview2(e, 0, 1) + + Aview2(e, 2, 1) * Bview2(e, 1, 1) + + Aview2(e, 2, 2) * Bview2(e, 2, 1); + Cview2(e, 2, 2) = Aview2(e, 2, 0) * Bview2(e, 0, 2) + + Aview2(e, 2, 1) * Bview2(e, 1, 2) + + Aview2(e, 2, 2) * Bview2(e, 2, 2); + + } + ); + // _permutedlayout2_batchedmatmult_loop_end + timer.stop(); + + RAJA::Timer::ElapsedType tMin = timer.elapsed(); + if (tMin < minRun) minRun = tMin; + timer.reset(); + } + std::cout<< "\trun time : "<< minRun << " seconds" << std::endl; + checkResult(Cview2, N, N_r, N_c); + +//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_OPENMP) + + std::cout << " \n Running batched matrix multiplication" + << " with layout 1 (RAJA - omp parallel for) ... 
" << std::endl; + + std::memset(C, 0, N_c * N_r * N * sizeof(double)); + + minRun = std::numeric_limits::max(); + for (int i = 0; i < NITER; ++i) { + + timer.start(); + // _permutedlayout_batchedmatmult_omp_start + RAJA::forall(RAJA::TypedRangeSegment(0, N), + [=](int e) { + + Cview(e, 0, 0) = Aview(e, 0, 0) * Bview(e, 0, 0) + + Aview(e, 0, 1) * Bview(e, 1, 0) + + Aview(e, 0, 2) * Bview(e, 2, 0); + Cview(e, 0, 1) = Aview(e, 0, 0) * Bview(e, 0, 1) + + Aview(e, 0, 1) * Bview(e, 1, 1) + + Aview(e, 0, 2) * Bview(e, 2, 1); + Cview(e, 0, 2) = Aview(e, 0, 0) * Bview(e, 0, 2) + + Aview(e, 0, 1) * Bview(e, 1, 2) + + Aview(e, 0, 2) * Bview(e, 2, 2); + + Cview(e, 1, 0) = Aview(e, 1, 0) * Bview(e, 0, 0) + + Aview(e, 1, 1) * Bview(e, 1, 0) + + Aview(e, 1, 2) * Bview(e, 2, 0); + Cview(e, 1, 1) = Aview(e, 1, 0) * Bview(e, 0, 1) + + Aview(e, 1, 1) * Bview(e, 1, 1) + + Aview(e, 1, 2) * Bview(e, 2, 1); + Cview(e, 1, 2) = Aview(e, 1, 0) * Bview(e, 0, 2) + + Aview(e, 1, 1) * Bview(e, 1, 2) + + Aview(e, 1, 2) * Bview(e, 2, 2); + + Cview(e, 2, 0) = Aview(e, 2, 0) * Bview(e, 0, 0) + + Aview(e, 2, 1) * Bview(e, 1, 0) + + Aview(e, 2, 2) * Bview(e, 2, 0); + Cview(e, 2, 1) = Aview(e, 2, 0) * Bview(e, 0, 1) + + Aview(e, 2, 1) * Bview(e, 1, 1) + + Aview(e, 2, 2) * Bview(e, 2, 1); + Cview(e, 2, 2) = Aview(e, 2, 0) * Bview(e, 0, 2) + + Aview(e, 2, 1) * Bview(e, 1, 2) + + Aview(e, 2, 2) * Bview(e, 2, 2); + + } + ); + // _permutedlayout_batchedmatmult_omp_end + timer.stop(); + + RAJA::Timer::ElapsedType tMin = timer.elapsed(); + if (tMin < minRun) minRun = tMin; + timer.reset(); + } + + std::cout<< "\trun time : " << minRun << " seconds" << std::endl; + checkResult(Cview, N, N_r, N_c); + +//----------------------------------------------------------------------------// + + std::cout << " \n Running batched matrix multiplication" + << " with layout 2 (RAJA - omp parallel for) ... 
" << std::endl; + + std::memset(C2, 0, N_c * N_r * N * sizeof(double)); + + minRun = std::numeric_limits::max(); + for (int i = 0; i < NITER; ++i) { + + timer.start(); + RAJA::forall(RAJA::TypedRangeSegment(0, N), + [=](int e) { + + Cview2(e, 0, 0) = Aview2(e, 0, 0) * Bview2(e, 0, 0) + + Aview2(e, 0, 1) * Bview2(e, 1, 0) + + Aview2(e, 0, 2) * Bview2(e, 2, 0); + Cview2(e, 0, 1) = Aview2(e, 0, 0) * Bview2(e, 0, 1) + + Aview2(e, 0, 1) * Bview2(e, 1, 1) + + Aview2(e, 0, 2) * Bview2(e, 2, 1); + Cview2(e, 0, 2) = Aview2(e, 0, 0) * Bview2(e, 0, 2) + + Aview2(e, 0, 1) * Bview2(e, 1, 2) + + Aview2(e, 0, 2) * Bview2(e, 2, 2); + + Cview2(e, 1, 0) = Aview2(e, 1, 0) * Bview2(e, 0, 0) + + Aview2(e, 1, 1) * Bview2(e, 1, 0) + + Aview2(e, 1, 2) * Bview2(e, 2, 0); + Cview2(e, 1, 1) = Aview2(e, 1, 0) * Bview2(e, 0, 1) + + Aview2(e, 1, 1) * Bview2(e, 1, 1) + + Aview2(e, 1, 2) * Bview2(e, 2, 1); + Cview2(e, 1, 2) = Aview2(e, 1, 0) * Bview2(e, 0, 2) + + Aview2(e, 1, 1) * Bview2(e, 1, 2) + + Aview2(e, 1, 2) * Bview2(e, 2, 2); + + Cview2(e, 2, 0) = Aview2(e, 2, 0) * Bview2(e, 0, 0) + + Aview2(e, 2, 1) * Bview2(e, 1, 0) + + Aview2(e, 2, 2) * Bview2(e, 2, 0); + Cview2(e, 2, 1) = Aview2(e, 2, 0) * Bview2(e, 0, 1) + + Aview2(e, 2, 1) * Bview2(e, 1, 1) + + Aview2(e, 2, 2) * Bview2(e, 2, 1); + Cview2(e, 2, 2) = Aview2(e, 2, 0) * Bview2(e, 0, 2) + + Aview2(e, 2, 1) * Bview2(e, 1, 2) + + Aview2(e, 2, 2) * Bview2(e, 2, 2); + + } + ); + timer.stop(); + + RAJA::Timer::ElapsedType tMin = timer.elapsed(); + if (tMin < minRun) minRun = tMin; + timer.reset(); + } + std::cout<< "\trun time : " << minRun << " seconds" << std::endl; + checkResult(Cview2, N, N_r, N_c); + +#endif + + +//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_CUDA) + + std::cout << " \n Running batched matrix multiplication" + << " with layout 1 (RAJA - cuda) ... 
" << std::endl; + + std::memset(C, 0, N_c * N_r * N * sizeof(double)); + + minRun = std::numeric_limits::max(); + for (int i = 0; i < NITER; ++i) { + + timer.start(); + RAJA::forall>(RAJA::TypedRangeSegment(0, N), + [=] RAJA_DEVICE(int e) { + + Cview(e, 0, 0) = Aview(e, 0, 0) * Bview(e, 0, 0) + + Aview(e, 0, 1) * Bview(e, 1, 0) + + Aview(e, 0, 2) * Bview(e, 2, 0); + Cview(e, 0, 1) = Aview(e, 0, 0) * Bview(e, 0, 1) + + Aview(e, 0, 1) * Bview(e, 1, 1) + + Aview(e, 0, 2) * Bview(e, 2, 1); + Cview(e, 0, 2) = Aview(e, 0, 0) * Bview(e, 0, 2) + + Aview(e, 0, 1) * Bview(e, 1, 2) + + Aview(e, 0, 2) * Bview(e, 2, 2); + + Cview(e, 1, 0) = Aview(e, 1, 0) * Bview(e, 0, 0) + + Aview(e, 1, 1) * Bview(e, 1, 0) + + Aview(e, 1, 2) * Bview(e, 2, 0); + Cview(e, 1, 1) = Aview(e, 1, 0) * Bview(e, 0, 1) + + Aview(e, 1, 1) * Bview(e, 1, 1) + + Aview(e, 1, 2) * Bview(e, 2, 1); + Cview(e, 1, 2) = Aview(e, 1, 0) * Bview(e, 0, 2) + + Aview(e, 1, 1) * Bview(e, 1, 2) + + Aview(e, 1, 2) * Bview(e, 2, 2); + + Cview(e, 2, 0) = Aview(e, 2, 0) * Bview(e, 0, 0) + + Aview(e, 2, 1) * Bview(e, 1, 0) + + Aview(e, 2, 2) * Bview(e, 2, 0); + Cview(e, 2, 1) = Aview(e, 2, 0) * Bview(e, 0, 1) + + Aview(e, 2, 1) * Bview(e, 1, 1) + + Aview(e, 2, 2) * Bview(e, 2, 1); + Cview(e, 2, 2) = Aview(e, 2, 0) * Bview(e, 0, 2) + + Aview(e, 2, 1) * Bview(e, 1, 2) + + Aview(e, 2, 2) * Bview(e, 2, 2); + + } + ); + timer.stop(); + + RAJA::Timer::ElapsedType tMin = timer.elapsed(); + if (tMin < minRun) minRun = tMin; + timer.reset(); + } + + std::cout<< "\trun time: "<< minRun << " seconds" << std::endl; + checkResult(Cview, N, N_r, N_c); + +//----------------------------------------------------------------------------// + + std::cout << " \n Running batched matrix multiplication" + << " with layout 2 (RAJA - cuda) ... 
" << std::endl; + + std::memset(C2, 0, N_c * N_r * N * sizeof(double)); + + minRun = std::numeric_limits::max(); + for (int i = 0; i < NITER; ++i) { + + timer.start(); + RAJA::forall>(RAJA::TypedRangeSegment(0, N), + [=] RAJA_DEVICE(int e) { + + Cview2(e, 0, 0) = Aview2(e, 0, 0) * Bview2(e, 0, 0) + + Aview2(e, 0, 1) * Bview2(e, 1, 0) + + Aview2(e, 0, 2) * Bview2(e, 2, 0); + Cview2(e, 0, 1) = Aview2(e, 0, 0) * Bview2(e, 0, 1) + + Aview2(e, 0, 1) * Bview2(e, 1, 1) + + Aview2(e, 0, 2) * Bview2(e, 2, 1); + Cview2(e, 0, 2) = Aview2(e, 0, 0) * Bview2(e, 0, 2) + + Aview2(e, 0, 1) * Bview2(e, 1, 2) + + Aview2(e, 0, 2) * Bview2(e, 2, 2); + + Cview2(e, 1, 0) = Aview2(e, 1, 0) * Bview2(e, 0, 0) + + Aview2(e, 1, 1) * Bview2(e, 1, 0) + + Aview2(e, 1, 2) * Bview2(e, 2, 0); + Cview2(e, 1, 1) = Aview2(e, 1, 0) * Bview2(e, 0, 1) + + Aview2(e, 1, 1) * Bview2(e, 1, 1) + + Aview2(e, 1, 2) * Bview2(e, 2, 1); + Cview2(e, 1, 2) = Aview2(e, 1, 0) * Bview2(e, 0, 2) + + Aview2(e, 1, 1) * Bview2(e, 1, 2) + + Aview2(e, 1, 2) * Bview2(e, 2, 2); + + Cview2(e, 2, 0) = Aview2(e, 2, 0) * Bview2(e, 0, 0) + + Aview2(e, 2, 1) * Bview2(e, 1, 0) + + Aview2(e, 2, 2) * Bview2(e, 2, 0); + Cview2(e, 2, 1) = Aview2(e, 2, 0) * Bview2(e, 0, 1) + + Aview2(e, 2, 1) * Bview2(e, 1, 1) + + Aview2(e, 2, 2) * Bview2(e, 2, 1); + Cview2(e, 2, 2) = Aview2(e, 2, 0) * Bview2(e, 0, 2) + + Aview2(e, 2, 1) * Bview2(e, 1, 2) + + Aview2(e, 2, 2) * Bview2(e, 2, 2); + + } + ); + timer.stop(); + + RAJA::Timer::ElapsedType tMin = timer.elapsed(); + if (tMin < minRun) minRun = tMin; + timer.reset(); + } + std::cout<< "\trun time : "<< minRun << " seconds" << std::endl; + checkResult(Cview2, N, N_r, N_c); +#endif + +//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_HIP) + + std::cout << " \n Running batched matrix multiplication" + << " with layout 1 (RAJA - hip) ... 
" << std::endl; + + double *d_A = memoryManager::allocate_gpu(N_c * N_r * N); + double *d_B = memoryManager::allocate_gpu(N_c * N_r * N); + double *d_C = memoryManager::allocate_gpu(N_c * N_r * N); + + double *d_A2 = memoryManager::allocate_gpu(N_c * N_r * N); + double *d_B2 = memoryManager::allocate_gpu(N_c * N_r * N); + double *d_C2 = memoryManager::allocate_gpu(N_c * N_r * N); + + RAJA::View> d_Aview(d_A, layout1); + RAJA::View> d_Bview(d_B, layout1); + RAJA::View> d_Cview(d_C, layout1); + + RAJA::View> d_Aview2(d_A2, layout2); + RAJA::View> d_Bview2(d_B2, layout2); + RAJA::View> d_Cview2(d_C2, layout2); + + hipErrchk(hipMemcpy( d_A, A, N_c * N_r * N * sizeof(double), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy( d_B, B, N_c * N_r * N * sizeof(double), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy( d_A2, A2, N_c * N_r * N * sizeof(double), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy( d_B2, B2, N_c * N_r * N * sizeof(double), hipMemcpyHostToDevice )); + + minRun = std::numeric_limits::max(); + for (int i = 0; i < NITER; ++i) { + + timer.start(); + RAJA::forall>(RAJA::TypedRangeSegment(0, N), + [=] RAJA_DEVICE(int e) { + + d_Cview(e, 0, 0) = d_Aview(e, 0, 0) * d_Bview(e, 0, 0) + + d_Aview(e, 0, 1) * d_Bview(e, 1, 0) + + d_Aview(e, 0, 2) * d_Bview(e, 2, 0); + d_Cview(e, 0, 1) = d_Aview(e, 0, 0) * d_Bview(e, 0, 1) + + d_Aview(e, 0, 1) * d_Bview(e, 1, 1) + + d_Aview(e, 0, 2) * d_Bview(e, 2, 1); + d_Cview(e, 0, 2) = d_Aview(e, 0, 0) * d_Bview(e, 0, 2) + + d_Aview(e, 0, 1) * d_Bview(e, 1, 2) + + d_Aview(e, 0, 2) * d_Bview(e, 2, 2); + + d_Cview(e, 1, 0) = d_Aview(e, 1, 0) * d_Bview(e, 0, 0) + + d_Aview(e, 1, 1) * d_Bview(e, 1, 0) + + d_Aview(e, 1, 2) * d_Bview(e, 2, 0); + d_Cview(e, 1, 1) = d_Aview(e, 1, 0) * d_Bview(e, 0, 1) + + d_Aview(e, 1, 1) * d_Bview(e, 1, 1) + + d_Aview(e, 1, 2) * d_Bview(e, 2, 1); + d_Cview(e, 1, 2) = d_Aview(e, 1, 0) * d_Bview(e, 0, 2) + + d_Aview(e, 1, 1) * d_Bview(e, 1, 2) + + d_Aview(e, 1, 2) * d_Bview(e, 2, 2); + + d_Cview(e, 2, 0) = d_Aview(e, 2, 0) * d_Bview(e, 0, 0) + + d_Aview(e, 2, 1) * d_Bview(e, 1, 0) + + d_Aview(e, 2, 2) * d_Bview(e, 2, 0); + d_Cview(e, 2, 1) = d_Aview(e, 2, 0) * d_Bview(e, 0, 1) + + d_Aview(e, 2, 1) * d_Bview(e, 1, 1) + + d_Aview(e, 2, 2) * d_Bview(e, 2, 1); + d_Cview(e, 2, 2) = d_Aview(e, 2, 0) * d_Bview(e, 0, 2) + + d_Aview(e, 2, 1) * d_Bview(e, 1, 2) + + d_Aview(e, 2, 2) * d_Bview(e, 2, 2); + + } + ); + timer.stop(); + + RAJA::Timer::ElapsedType tMin = timer.elapsed(); + if (tMin < minRun) minRun = tMin; + timer.reset(); + } + + hipErrchk(hipMemcpy( C, d_C, N_c * N_r * N * sizeof(double), hipMemcpyDeviceToHost )); + + std::cout<< "\trun time: "<< minRun << " seconds" << std::endl; + checkResult(Cview, N, N_r, N_c); + +//----------------------------------------------------------------------------// + + std::cout << " \n Running batched matrix multiplication" + << " with layout 2 (RAJA - hip) ... 
" << std::endl; + + minRun = std::numeric_limits::max(); + for (int i = 0; i < NITER; ++i) { + + timer.start(); + RAJA::forall>(RAJA::TypedRangeSegment(0, N), + [=] RAJA_DEVICE(int e) { + + d_Cview2(e, 0, 0) = d_Aview2(e, 0, 0) * d_Bview2(e, 0, 0) + + d_Aview2(e, 0, 1) * d_Bview2(e, 1, 0) + + d_Aview2(e, 0, 2) * d_Bview2(e, 2, 0); + d_Cview2(e, 0, 1) = d_Aview2(e, 0, 0) * d_Bview2(e, 0, 1) + + d_Aview2(e, 0, 1) * d_Bview2(e, 1, 1) + + d_Aview2(e, 0, 2) * d_Bview2(e, 2, 1); + d_Cview2(e, 0, 2) = d_Aview2(e, 0, 0) * d_Bview2(e, 0, 2) + + d_Aview2(e, 0, 1) * d_Bview2(e, 1, 2) + + d_Aview2(e, 0, 2) * d_Bview2(e, 2, 2); + + d_Cview2(e, 1, 0) = d_Aview2(e, 1, 0) * d_Bview2(e, 0, 0) + + d_Aview2(e, 1, 1) * d_Bview2(e, 1, 0) + + d_Aview2(e, 1, 2) * d_Bview2(e, 2, 0); + d_Cview2(e, 1, 1) = d_Aview2(e, 1, 0) * d_Bview2(e, 0, 1) + + d_Aview2(e, 1, 1) * d_Bview2(e, 1, 1) + + d_Aview2(e, 1, 2) * d_Bview2(e, 2, 1); + d_Cview2(e, 1, 2) = d_Aview2(e, 1, 0) * d_Bview2(e, 0, 2) + + d_Aview2(e, 1, 1) * d_Bview2(e, 1, 2) + + d_Aview2(e, 1, 2) * d_Bview2(e, 2, 2); + + d_Cview2(e, 2, 0) = d_Aview2(e, 2, 0) * d_Bview2(e, 0, 0) + + d_Aview2(e, 2, 1) * d_Bview2(e, 1, 0) + + d_Aview2(e, 2, 2) * d_Bview2(e, 2, 0); + d_Cview2(e, 2, 1) = d_Aview2(e, 2, 0) * d_Bview2(e, 0, 1) + + d_Aview2(e, 2, 1) * d_Bview2(e, 1, 1) + + d_Aview2(e, 2, 2) * d_Bview2(e, 2, 1); + d_Cview2(e, 2, 2) = d_Aview2(e, 2, 0) * d_Bview2(e, 0, 2) + + d_Aview2(e, 2, 1) * d_Bview2(e, 1, 2) + + d_Aview2(e, 2, 2) * d_Bview2(e, 2, 2); + + } + ); + timer.stop(); + + RAJA::Timer::ElapsedType tMin = timer.elapsed(); + if (tMin < minRun) minRun = tMin; + timer.reset(); + } + + hipErrchk(hipMemcpy( C2, d_C2, N_c * N_r * N * sizeof(double), hipMemcpyDeviceToHost )); + + std::cout<< "\trun time : "<< minRun << " seconds" << std::endl; + checkResult(Cview2, N, N_r, N_c); + + memoryManager::deallocate_gpu(d_A); + memoryManager::deallocate_gpu(d_B); + memoryManager::deallocate_gpu(d_C); + memoryManager::deallocate_gpu(d_A2); + memoryManager::deallocate_gpu(d_B2); + memoryManager::deallocate_gpu(d_C2); +#endif + +//----------------------------------------------------------------------------// + +// +// Clean up. +// + memoryManager::deallocate(A); + memoryManager::deallocate(B); + memoryManager::deallocate(C); + memoryManager::deallocate(A2); + memoryManager::deallocate(B2); + memoryManager::deallocate(C2); + + std::cout << "\n DONE!...\n"; + return 0; +} + +// +// check result +// +template +void checkResult(T C, int nMat, int nRows, int nCols) +{ + + bool status = true; + for (int e = 0; e < nMat; ++e) { + for (int row = 0; row < nRows; ++row) { + for (int col = 0; col < nCols; ++col) { + if (std::abs(C(e, row, col) - row * col * nCols) > 10e-12) { + status = false; + } + } + } + } + + if ( status ) { + std::cout << "\tresult -- PASS\n"; + } else { + std::cout << "\tresult -- FAIL\n"; + } +} diff --git a/exercises/reductions.cpp b/exercises/reductions.cpp new file mode 100644 index 0000000000..40d58c287b --- /dev/null +++ b/exercises/reductions.cpp @@ -0,0 +1,289 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-22, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include +#include +#include + +#include "memoryManager.hpp" + +#include "RAJA/RAJA.hpp" + +/* + * Reduction Example + * + * This example illustrates use of the RAJA reduction types: min, max, + * sum, min-loc, and max-loc. + * + * RAJA features shown: + * - `forall` loop iteration template method + * - Index range segment + * - Execution policies + * - Reduction types + * + * If CUDA is enabled, CUDA unified memory is used. + */ + +/* + Specify the number of threads in a GPU thread block +*/ +#if defined(RAJA_ENABLE_CUDA) +//constexpr int CUDA_BLOCK_SIZE = 256; +#endif + +#if defined(RAJA_ENABLE_HIP) +constexpr int HIP_BLOCK_SIZE = 256; +#endif + +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) +{ + + std::cout << "\n\nRAJA reductions example...\n"; + + // _reductions_array_init_start +// +// Define array length +// + constexpr int N = 1000000; + +// +// Allocate array data and initialize data to alternating sequence of 1, -1. +// + int* a = memoryManager::allocate(N); + + for (int i = 0; i < N; ++i) { + if ( i % 2 == 0 ) { + a[i] = 1; + } else { + a[i] = -1; + } + } + +// +// Set min and max loc values +// + constexpr int minloc_ref = N / 2; + a[minloc_ref] = -100; + + constexpr int maxloc_ref = N / 2 + 1; + a[maxloc_ref] = 100; + // _reductions_array_init_end + +// +// Note: with this data initialization scheme, the following results will +// be observed for all reduction kernels below: +// +// - the sum will be zero +// - the min will be -100 +// - the max will be 100 +// - the min loc will be N/2 +// - the max loc will be N/2 + 1 +// +// + +// +// Define index range for iterating over a elements in all examples +// + // _reductions_range_start +//RAJA::TypedRangeSegment arange(0, N); + // _reductions_range_end + +//----------------------------------------------------------------------------// + + std::cout << "\n Running RAJA sequential reductions...\n"; + + /// + /// TODO... + /// + /// EXERCISE: Define EXEC_POL1 and REDCUE_POL1 for executing sequentially. + /// + + /// TODO... + /// + /// EXERCISE: Remove comments for remainder of sequential section. + /// + /// Uncomment 'arange' variable above so it can be used in kernel. + /// + /* + RAJA::ReduceSum seq_sum(0); + RAJA::ReduceMin seq_min(std::numeric_limits::max()); + RAJA::ReduceMax seq_max(std::numeric_limits::min()); + RAJA::ReduceMinLoc seq_minloc(std::numeric_limits::max(), -1); + RAJA::ReduceMaxLoc seq_maxloc(std::numeric_limits::min(), -1); + + RAJA::forall(arange, [=](int i) { + + seq_sum += a[i]; + + seq_min.min(a[i]); + seq_max.max(a[i]); + + seq_minloc.minloc(a[i], i); + seq_maxloc.maxloc(a[i], i); + + }); + + std::cout << "\tsum = " << seq_sum.get() << std::endl; + std::cout << "\tmin = " << seq_min.get() << std::endl; + std::cout << "\tmax = " << seq_max.get() << std::endl; + std::cout << "\tmin, loc = " << seq_minloc.get() << " , " + << seq_minloc.getLoc() << std::endl; + std::cout << "\tmax, loc = " << seq_maxloc.get() << " , " + << seq_maxloc.getLoc() << std::endl; + */ + + +//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_OPENMP) + std::cout << "\n Running RAJA OpenMP reductions...\n"; + + // _reductions_raja_omppolicy_start + /* + using EXEC_POL2 = RAJA::omp_parallel_for_exec; + using REDUCE_POL2 = RAJA::omp_reduce; + */ + // _reductions_raja_omppolicy_end + + /// + /// TODO... 
+ /// + /// EXERCISE: Define Reduce(Sum, Min, Max, MinLoc, MaxLoc) to complete this exercise. + /// + /// Uncomment 'arange' variable above so it can be used in kernel. + /// + + /// TODO... + /// + /// EXERCISE: Remove comments for remainder of openmp section and uncomment + /// policy types above to use in kernel. + /// + /* + RAJA::forall(arange, [=](int i) { + + omp_sum += a[i]; + + omp_min.min(a[i]); + omp_max.max(a[i]); + + omp_minloc.minloc(a[i], i); + omp_maxloc.maxloc(a[i], i); + + }); + + std::cout << "\tsum = " << omp_sum.get() << std::endl; + std::cout << "\tmin = " << omp_min.get() << std::endl; + std::cout << "\tmax = " << omp_max.get() << std::endl; + std::cout << "\tmin, loc = " << omp_minloc.get() << " , " + << omp_minloc.getLoc() << std::endl; + std::cout << "\tmax, loc = " << omp_maxloc.get() << " , " + << omp_maxloc.getLoc() << std::endl; + */ +#endif + + +//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_CUDA) + std::cout << "\n Running RAJA CUDA reductions...\n"; + + /// + /// TODO... + /// + /// EXERCISE: Define EXEC_POL3 and REDCUE_POL3 for executing on a CUDA device. + /// + + /// + /// TODO... + /// + /// EXERCISE: Define Reduce(Sum, Min, Max, MinLoc, MaxLoc) to complete this exercise. + /// + /// Uncomment 'arange' variable above so it can be used in kernel. + /// + + /// TODO... + /// + /// EXERCISE: Remove comments for remainder of cuda section. + /// + /* + RAJA::forall(arange, [=] RAJA_DEVICE (int i) { + + cuda_sum += a[i]; + + cuda_min.min(a[i]); + cuda_max.max(a[i]); + + cuda_minloc.minloc(a[i], i); + cuda_maxloc.maxloc(a[i], i); + + }); + + std::cout << "\tsum = " << cuda_sum.get() << std::endl; + std::cout << "\tmin = " << cuda_min.get() << std::endl; + std::cout << "\tmax = " << cuda_max.get() << std::endl; + std::cout << "\tmin, loc = " << cuda_minloc.get() << " , " + << cuda_minloc.getLoc() << std::endl; + std::cout << "\tmax, loc = " << cuda_maxloc.get() << " , " + << cuda_maxloc.getLoc() << std::endl; + */ +#endif + +//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_HIP) + std::cout << "\n Running RAJA HIP reductions...\n"; + + RAJA::TypedRangeSegment arange1(0, N); + + int* d_a = memoryManager::allocate_gpu(N); + hipErrchk(hipMemcpy( d_a, a, N * sizeof(int), hipMemcpyHostToDevice )); + + // _reductions_raja_hippolicy_start + using EXEC_POL3 = RAJA::hip_exec; + using REDUCE_POL3 = RAJA::hip_reduce; + // _reductions_raja_hippolicy_end + + RAJA::ReduceSum hip_sum(0); + RAJA::ReduceMin hip_min(std::numeric_limits::max()); + RAJA::ReduceMax hip_max(std::numeric_limits::min()); + RAJA::ReduceMinLoc hip_minloc(std::numeric_limits::max(), -1); + RAJA::ReduceMaxLoc hip_maxloc(std::numeric_limits::min(), -1); + + RAJA::forall(arange1, [=] RAJA_DEVICE (int i) { + + hip_sum += d_a[i]; + + hip_min.min(d_a[i]); + hip_max.max(d_a[i]); + + hip_minloc.minloc(d_a[i], i); + hip_maxloc.maxloc(d_a[i], i); + + }); + + std::cout << "\tsum = " << hip_sum.get() << std::endl; + std::cout << "\tmin = " << hip_min.get() << std::endl; + std::cout << "\tmax = " << hip_max.get() << std::endl; + std::cout << "\tmin, loc = " << hip_minloc.get() << " , " + << hip_minloc.getLoc() << std::endl; + std::cout << "\tmax, loc = " << hip_maxloc.get() << " , " + << hip_maxloc.getLoc() << std::endl; + + memoryManager::deallocate_gpu(d_a); +#endif + +//----------------------------------------------------------------------------// + +// +// Clean up. 
+// + memoryManager::deallocate(a); + + std::cout << "\n DONE!...\n"; + + return 0; +} diff --git a/examples/tut_reductions.cpp b/exercises/reductions_solution.cpp similarity index 96% rename from examples/tut_reductions.cpp rename to exercises/reductions_solution.cpp index aa35f44e8f..068e8a0986 100644 --- a/examples/tut_reductions.cpp +++ b/exercises/reductions_solution.cpp @@ -29,14 +29,14 @@ */ /* - CUDA_BLOCK_SIZE - specifies the number of threads in a CUDA thread block + Specify the number of threads in a GPU thread block */ #if defined(RAJA_ENABLE_CUDA) -const int CUDA_BLOCK_SIZE = 256; +constexpr int CUDA_BLOCK_SIZE = 256; #endif #if defined(RAJA_ENABLE_HIP) -const int HIP_BLOCK_SIZE = 256; +constexpr int HIP_BLOCK_SIZE = 256; #endif int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) @@ -48,7 +48,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) // // Define array length // - const int N = 1000000; + constexpr int N = 1000000; // // Allocate array data and initialize data to alternating sequence of 1, -1. @@ -66,10 +66,10 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) // // Set min and max loc values // - const int minloc_ref = N / 2; + constexpr int minloc_ref = N / 2; a[minloc_ref] = -100; - const int maxloc_ref = N / 2 + 1; + constexpr int maxloc_ref = N / 2 + 1; a[maxloc_ref] = 100; // _reductions_array_init_end @@ -89,7 +89,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) // Define index range for iterating over a elements in all examples // // _reductions_range_start - RAJA::RangeSegment arange(0, N); + RAJA::TypedRangeSegment arange(0, N); // _reductions_range_end //----------------------------------------------------------------------------// diff --git a/exercises/scan.cpp b/exercises/scan.cpp new file mode 100644 index 0000000000..e00b80d330 --- /dev/null +++ b/exercises/scan.cpp @@ -0,0 +1,409 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-22, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#define OP_PLUS_INT RAJA::operators::plus +#define OP_MIN_INT RAJA::operators::minimum +#define OP_MAX_INT RAJA::operators::maximum +#define CHECK_INC_SCAN_RESULTS(X) checkInclusiveScanResult(in, out, N); +#define CHECK_EXC_SCAN_RESULTS(X) checkExclusiveScanResult(in, out, N); + +#include +#include +#include +#include + +#include "memoryManager.hpp" + +#include "RAJA/RAJA.hpp" + +/* + * Scan Exercise + * + * This exercise demonstrates RAJA inclusive and exclusive scan operations + * for integer arrays, including in-place, using different operators. + * Other array data types, operators, etc. are similar + * + * RAJA features shown: + * - `RAJA::inclusive_scan` and `RAJA::inclusive_scan_inplace` methods + * - `RAJA::exclusive_scan` and `RAJA::exclusive_scan_inplace` methods + * - RAJA operators + * - Execution policies + * + * If CUDA is enabled, CUDA unified memory is used. 
+ */ + +/* + Specify the number of threads in a GPU thread block +*/ +#if defined(RAJA_ENABLE_CUDA) +//constexpr int CUDA_BLOCK_SIZE = 16; +#endif + +#if defined(RAJA_ENABLE_HIP) +//constexpr int HIP_BLOCK_SIZE = 16; +#endif + +// +// Functions for checking results and printing vectors +// +template +void checkInclusiveScanResult(const T* in, const T* out, int N); +// +template +void checkExclusiveScanResult(const T* in, const T* out, int N); +// +template +void printArray(const T* v, int N); + + +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) +{ + + std::cout << "\n\nRAJA scan example...\n"; + + // _scan_array_init_start +// +// Define array length +// + constexpr int N = 20; + +// +// Allocate and initialize vector data +// + int* in = memoryManager::allocate(N); + int* out = memoryManager::allocate(N); + + std::iota(in, in + N, -1); + + std::cout << "\n in values...\n"; + printArray(in, N); + std::cout << "\n"; + // _scan_array_init_end + + + +//----------------------------------------------------------------------------// +// Perform various sequential scans to illustrate inclusive/exclusive, +// in-place, default scans with different operators +//----------------------------------------------------------------------------// + + std::cout << "\n Running sequential inclusive_scan (default)...\n"; + + /// + /// TODO... + /// + /// EXERCISE: Implement an inclusive RAJA scan with RAJA::seq_exec + /// execution policy type. + /// + /// NOTE: We've done this one for you to help you get started... + /// + + // _scan_inclusive_seq_start + RAJA::inclusive_scan(RAJA::make_span(in, N), + RAJA::make_span(out, N)); + // _scan_inclusive_seq_end + + CHECK_INC_SCAN_RESULTS(OP_PLUS_INT) + printArray(out, N); + std::cout << "\n"; + +//----------------------------------------------------------------------------// + + std::cout << "\n Running sequential inclusive_scan (plus)...\n"; + + std::copy_n(in, N, out); + + /// + /// TODO... + /// + /// EXERCISE: Implement an inclusive RAJA scan with RAJA::seq_exec + /// execution policy type and an explicit plus operator. + /// + + CHECK_INC_SCAN_RESULTS(OP_PLUS_INT) + printArray(out, N); + std::cout << "\n"; + +//----------------------------------------------------------------------------// + + std::cout << "\n Running sequential exclusive_scan (plus)...\n"; + + std::copy_n(in, N, out); + + /// + /// TODO... + /// + /// EXERCISE: Implement an exclusive RAJA scan with RAJA::seq_exec + /// execution policy type and an explicit plus operator. + /// + + CHECK_EXC_SCAN_RESULTS(OP_PLUS_INT) + printArray(out, N); + std::cout << "\n"; + +//----------------------------------------------------------------------------// + + std::cout << "\n Running sequential inclusive_scan_inplace (minimum)...\n"; + + std::copy_n(in, N, out); + + /// + /// TODO... + /// + /// EXERCISE: Implement an inclusive inplace RAJA scan with RAJA::seq_exec + /// execution policy type and an explicit minimum operator. + /// + + CHECK_INC_SCAN_RESULTS(OP_MIN_INT) + printArray(out, N); + std::cout << "\n"; + +//----------------------------------------------------------------------------// + + std::cout << "\n Running sequential exclusive_scan_inplace (maximum)...\n"; + + std::copy_n(in, N, out); + + /// + /// TODO... + /// + /// EXERCISE: Implement an exclusive inplace RAJA scan with RAJA::seq_exec + /// execution policy type and an explicit maximum operator. 
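  //
  // A minimal sketch (one possible form, not the official solution) of the
  // sequential scan exercises in this section, following the corresponding
  // calls in scan_solution.cpp:
  //
  /*
  RAJA::inclusive_scan<RAJA::seq_exec>(RAJA::make_span(in, N),
                                       RAJA::make_span(out, N),
                                       RAJA::operators::plus<int>{});

  RAJA::exclusive_scan<RAJA::seq_exec>(RAJA::make_span(in, N),
                                       RAJA::make_span(out, N),
                                       RAJA::operators::plus<int>{});

  RAJA::inclusive_scan_inplace<RAJA::seq_exec>(RAJA::make_span(out, N),
                                               RAJA::operators::minimum<int>{});

  RAJA::exclusive_scan_inplace<RAJA::seq_exec>(RAJA::make_span(out, N),
                                               RAJA::operators::maximum<int>{});
  */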
+ /// + + CHECK_EXC_SCAN_RESULTS(OP_MAX_INT) + printArray(out, N); + std::cout << "\n"; + + +#if defined(RAJA_ENABLE_OPENMP) + +//----------------------------------------------------------------------------// +// Perform a couple of OpenMP scans... +//----------------------------------------------------------------------------// + + std::cout << "\n Running OpenMP inclusive_scan (plus)...\n"; + + /// + /// TODO... + /// + /// EXERCISE: Implement an inclusive RAJA scan with RAJA::omp_parallel_for_exec + /// execution policy type and an explicit plus operator. + /// + + CHECK_INC_SCAN_RESULTS(OP_PLUS_INT) + printArray(out, N); + std::cout << "\n"; + +//----------------------------------------------------------------------------// + + std::cout << "\n Running OpenMP exclusive_scan_inplace (plus)...\n"; + + std::copy_n(in, N, out); + + /// + /// TODO... + /// + /// EXERCISE: Implement an exclusive inplace RAJA scan with RAJA::omp_parallel_for_exec + /// execution policy type and an explicit plus operator. + /// + + CHECK_EXC_SCAN_RESULTS(OP_PLUS_INT) + printArray(out, N); + std::cout << "\n"; + +#endif + +//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_CUDA) + +//----------------------------------------------------------------------------// +// Perform a couple of CUDA scans... +//----------------------------------------------------------------------------// + + std::cout << "\n Running CUDA inclusive_scan_inplace (plus)...\n"; + + std::copy_n(in, N, out); + + /// + /// TODO... + /// + /// EXERCISE: Implement an inclusive inplace RAJA scan with RAJA::cuda_exec + /// execution policy type and an explicit plus operator. + /// + /// NOTE: You will have to uncomment 'CUDA_BLOCK_SIZE' near the top + /// of the file if you want to use it here. + /// + + CHECK_INC_SCAN_RESULTS(OP_PLUS_INT) + printArray(out, N); + std::cout << "\n"; + +//----------------------------------------------------------------------------// + + std::cout << "\n Running CUDA exclusive_scan_inplace (plus)...\n"; + + std::copy_n(in, N, out); + + /// + /// TODO... + /// + /// EXERCISE: Implement an exclusive inplace RAJA scan with RAJA::cuda_exec + /// execution policy type and an explicit plus operator. + /// + /// NOTE: You will have to uncomment 'CUDA_BLOCK_SIZE' near the top + /// of the file if you want to use it here. + /// + + CHECK_EXC_SCAN_RESULTS(OP_PLUS_INT) + printArray(out, N); + std::cout << "\n"; + +//----------------------------------------------------------------------------// + + std::cout << "\n Running CUDA exclusive_scan (plus)...\n"; + + std::copy_n(in, N, out); + + /// + /// TODO... + /// + /// EXERCISE: Implement an exclusive RAJA scan with RAJA::cuda_exec + /// execution policy type and an explicit plus operator. + /// + /// NOTE: You will have to uncomment 'CUDA_BLOCK_SIZE' near the top + /// of the file if you want to use it here. + /// + + CHECK_EXC_SCAN_RESULTS(OP_PLUS_INT) + printArray(out, N); + std::cout << "\n"; + +#endif + +//----------------------------------------------------------------------------// + + +#if defined(RAJA_ENABLE_HIP) + +//----------------------------------------------------------------------------// +// Perform a couple of HIP scans... 
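//
// A minimal sketch (one possible form, not the official solution) of the
// OpenMP and CUDA scan exercises above; CUDA_BLOCK_SIZE near the top of the
// file would need to be uncommented. The HIP exercises below follow the same
// pattern with RAJA::hip_exec<HIP_BLOCK_SIZE> applied to the d_in/d_out spans:
//
/*
RAJA::inclusive_scan<RAJA::omp_parallel_for_exec>(RAJA::make_span(in, N),
                                                  RAJA::make_span(out, N),
                                                  RAJA::operators::plus<int>{});

RAJA::inclusive_scan_inplace<RAJA::cuda_exec<CUDA_BLOCK_SIZE>>(
    RAJA::make_span(out, N), RAJA::operators::plus<int>{});

RAJA::exclusive_scan<RAJA::cuda_exec<CUDA_BLOCK_SIZE>>(
    RAJA::make_span(in, N), RAJA::make_span(out, N),
    RAJA::operators::plus<int>{});
*/
//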
+//----------------------------------------------------------------------------// + + std::cout << "\n Running HIP inclusive_scan_inplace (plus)...\n"; + + std::copy_n(in, N, out); + int* d_in = memoryManager::allocate_gpu(N); + int* d_out = memoryManager::allocate_gpu(N); + + hipErrchk(hipMemcpy( d_out, out, N * sizeof(int), hipMemcpyHostToDevice )); + + /// + /// TODO... + /// + /// EXERCISE: Implement an inclusive inplace RAJA scan with RAJA::hip_exec + /// execution policy type and an explicit plus operator. + /// + /// NOTE: You will have to uncomment 'HIP_BLOCK_SIZE' near the top + /// of the file if you want to use it here. + /// + + hipErrchk(hipMemcpy( out, d_out, N * sizeof(int), hipMemcpyDeviceToHost )); + + CHECK_INC_SCAN_RESULTS(OP_PLUS_INT) + printArray(out, N); + std::cout << "\n"; + +//----------------------------------------------------------------------------// + + std::cout << "\n Running HIP exclusive_scan (plus)...\n"; + + hipErrchk(hipMemcpy( d_in, in, N * sizeof(int), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy( d_out, out, N * sizeof(int), hipMemcpyHostToDevice )); + + /// + /// TODO... + /// + /// EXERCISE: Implement an exclusive RAJA scan with RAJA::hip_exec + /// execution policy type and an explicit plus operator. + /// + /// NOTE: You will have to uncomment 'HIP_BLOCK_SIZE' near the top + /// of the file if you want to use it here. + /// + + hipErrchk(hipMemcpy( out, d_out, N * sizeof(int), hipMemcpyDeviceToHost )); + + CHECK_EXC_SCAN_RESULTS(OP_PLUS_INT) + printArray(out, N); + std::cout << "\n"; + + memoryManager::deallocate_gpu(d_in); + memoryManager::deallocate_gpu(d_out); + +#endif + +//----------------------------------------------------------------------------// + +// +// Clean up. +// + memoryManager::deallocate(in); + memoryManager::deallocate(out); + + std::cout << "\n DONE!...\n"; + + return 0; +} + + +// +// Function to check inclusive scan result +// +template +void checkInclusiveScanResult(const T* in, const T* out, int N) +{ + T val = Function::identity(); + for (int i = 0; i < N; ++i) { + val = Function()(val, in[i]); + if (out[i] != val) { + std::cout << "\n\t result -- WRONG\n"; + std::cout << "\t" << out[i] << " != " << val + << " (at index " << i << ")\n"; + } + } + std::cout << "\n\t result -- CORRECT\n"; +} + +// +// Function to check exclusive scan result +// +template +void checkExclusiveScanResult(const T* in, const T* out, int N) +{ + T val = Function::identity(); + for (int i = 0; i < N; ++i) { + if (out[i] != val) { + std::cout << "\n\t result -- WRONG\n"; + std::cout << "\t" << out[i] << " != " << val + << " (at index " << i << ")\n"; + } + val = Function()(val, in[i]); + } + std::cout << "\n\t result -- CORRECT\n"; +} + +// +// Function to print vector. 
+// +template +void printArray(const T* v, int N) +{ + std::cout << std::endl; + for (int i = 0; i < N; ++i) { std::cout << " " << v[i]; } + std::cout << std::endl; +} diff --git a/examples/tut_scan.cpp b/exercises/scan_solution.cpp similarity index 79% rename from examples/tut_scan.cpp rename to exercises/scan_solution.cpp index 1c89c16107..a72a4cdebd 100644 --- a/examples/tut_scan.cpp +++ b/exercises/scan_solution.cpp @@ -5,6 +5,12 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +#define OP_PLUS_INT RAJA::operators::plus +#define OP_MIN_INT RAJA::operators::minimum +#define OP_MAX_INT RAJA::operators::maximum +#define CHECK_INC_SCAN_RESULTS(X) checkInclusiveScanResult(in, out, N); +#define CHECK_EXC_SCAN_RESULTS(X) checkExclusiveScanResult(in, out, N); + #include #include #include @@ -15,9 +21,9 @@ #include "RAJA/RAJA.hpp" /* - * Scan Example + * Scan Exercise * - * Example shows how to perform RAJA inclusive and exclusive scan operations + * This exercise demonstrates RAJA inclusive and exclusive scan operations * for integer arrays, including in-place, using different operators. * Other array data types, operators, etc. are similar * @@ -31,14 +37,14 @@ */ /* - CUDA_BLOCK_SIZE - specifies the number of threads in a CUDA thread block + Specify the number of threads in a GPU thread block */ #if defined(RAJA_ENABLE_CUDA) -const int CUDA_BLOCK_SIZE = 16; + constexpr int CUDA_BLOCK_SIZE = 16; #endif #if defined(RAJA_ENABLE_HIP) -const int HIP_BLOCK_SIZE = 16; + constexpr int HIP_BLOCK_SIZE = 16; #endif // @@ -63,7 +69,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) // // Define array length // - const int N = 20; + constexpr int N = 20; // // Allocate and initialize vector data @@ -73,11 +79,11 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::iota(in, in + N, -1); - // _scan_array_init_end - std::cout << "\n in values...\n"; printArray(in, N); std::cout << "\n"; + // _scan_array_init_end + //----------------------------------------------------------------------------// @@ -92,7 +98,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) RAJA::make_span(out, N)); // _scan_inclusive_seq_end - checkInclusiveScanResult>(in, out, N); + CHECK_INC_SCAN_RESULTS(OP_PLUS_INT) printArray(out, N); std::cout << "\n"; @@ -108,7 +114,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) RAJA::operators::plus{}); // _scan_inclusive_seq_plus_end - checkInclusiveScanResult>(in, out, N); + CHECK_INC_SCAN_RESULTS(OP_PLUS_INT) printArray(out, N); std::cout << "\n"; @@ -124,7 +130,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) RAJA::operators::plus{}); // _scan_exclusive_seq_plus_end - checkExclusiveScanResult>(in, out, N); + CHECK_EXC_SCAN_RESULTS(OP_PLUS_INT) printArray(out, N); std::cout << "\n"; @@ -132,14 +138,14 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::cout << "\n Running sequential inclusive_scan_inplace (minimum)...\n"; + // _scan_inclusive_inplace_seq_min_start std::copy_n(in, N, out); - // _scan_inclusive_inplace_seq_min_start RAJA::inclusive_scan_inplace(RAJA::make_span(out, N), RAJA::operators::minimum{}); // _scan_inclusive_inplace_seq_min_end - checkInclusiveScanResult>(in, out, N); + CHECK_INC_SCAN_RESULTS(OP_MIN_INT) printArray(out, N); std::cout << "\n"; @@ -154,7 +160,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) RAJA::operators::maximum{}); // 
_scan_exclusive_inplace_seq_max_end - checkExclusiveScanResult>(in, out, N); + CHECK_EXC_SCAN_RESULTS(OP_MAX_INT) printArray(out, N); std::cout << "\n"; @@ -173,7 +179,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) RAJA::operators::plus{}); // _scan_inclusive_omp_plus_end - checkInclusiveScanResult>(in, out, N); + CHECK_INC_SCAN_RESULTS(OP_PLUS_INT) printArray(out, N); std::cout << "\n"; @@ -184,11 +190,12 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::copy_n(in, N, out); // _scan_exclusive_inplace_omp_plus_start - RAJA::exclusive_scan_inplace(RAJA::make_span(out, N), - RAJA::operators::plus{}); + RAJA::exclusive_scan_inplace( + RAJA::make_span(out, N), + RAJA::operators::plus{}); // _scan_exclusive_inplace_omp_plus_end - checkExclusiveScanResult>(in, out, N); + CHECK_EXC_SCAN_RESULTS(OP_PLUS_INT) printArray(out, N); std::cout << "\n"; @@ -199,7 +206,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) #if defined(RAJA_ENABLE_CUDA) //----------------------------------------------------------------------------// -// Perform a couple of CUDA scans... +// Perform a few CUDA scans... //----------------------------------------------------------------------------// std::cout << "\n Running CUDA inclusive_scan_inplace (plus)...\n"; @@ -207,11 +214,28 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::copy_n(in, N, out); // _scan_inclusive_inplace_cuda_plus_start - RAJA::inclusive_scan_inplace>(RAJA::make_span(out, N), - RAJA::operators::plus{}); + RAJA::inclusive_scan_inplace>( + RAJA::make_span(out, N), + RAJA::operators::plus{}); // _scan_inclusive_inplace_cuda_plus_end - checkInclusiveScanResult>(in, out, N); + CHECK_INC_SCAN_RESULTS(OP_PLUS_INT) + printArray(out, N); + std::cout << "\n"; + +//----------------------------------------------------------------------------// + + std::cout << "\n Running CUDA exclusive_scan_inplace (plus)...\n"; + + std::copy_n(in, N, out); + + // _scan_exclusive_inplace_cuda_plus_start + RAJA::exclusive_scan_inplace>( + RAJA::make_span(out, N), + RAJA::operators::plus{}); + // _scan_exclusive_inplace_cuda_plus_end + + CHECK_EXC_SCAN_RESULTS(OP_PLUS_INT) printArray(out, N); std::cout << "\n"; @@ -222,12 +246,13 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::copy_n(in, N, out); // _scan_exclusive_cuda_plus_start - RAJA::exclusive_scan>(RAJA::make_span(in, N), - RAJA::make_span(out, N), - RAJA::operators::plus{}); + RAJA::exclusive_scan>( + RAJA::make_span(in, N), + RAJA::make_span(out, N), + RAJA::operators::plus{}); // _scan_exclusive_cuda_plus_end - checkExclusiveScanResult>(in, out, N); + CHECK_EXC_SCAN_RESULTS(OP_PLUS_INT) printArray(out, N); std::cout << "\n"; @@ -235,6 +260,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) //----------------------------------------------------------------------------// + #if defined(RAJA_ENABLE_HIP) //----------------------------------------------------------------------------// @@ -249,28 +275,33 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) hipErrchk(hipMemcpy( d_out, out, N * sizeof(int), hipMemcpyHostToDevice )); - RAJA::inclusive_scan_inplace>(RAJA::make_span(d_out, N), - RAJA::operators::plus{}); + // _scan_inclusive_inplace_hip_plus_start + RAJA::inclusive_scan_inplace>( + RAJA::make_span(d_out, N), + RAJA::operators::plus{}); + // _scan_inclusive_inplace_hip_plus_end hipErrchk(hipMemcpy( out, d_out, N * sizeof(int), hipMemcpyDeviceToHost )); - 
checkInclusiveScanResult>(in, out, N); + CHECK_INC_SCAN_RESULTS(OP_PLUS_INT) printArray(out, N); std::cout << "\n"; //----------------------------------------------------------------------------// + std::cout << "\n Running HIP exclusive_scan (plus)...\n"; + hipErrchk(hipMemcpy( d_in, in, N * sizeof(int), hipMemcpyHostToDevice )); hipErrchk(hipMemcpy( d_out, out, N * sizeof(int), hipMemcpyHostToDevice )); - std::cout << "\n Running HIP exclusive_scan (plus)...\n"; - RAJA::exclusive_scan>(RAJA::make_span(d_in, N), - RAJA::make_span(d_out, N), - RAJA::operators::plus{}); + RAJA::exclusive_scan>( + RAJA::make_span(d_in, N), + RAJA::make_span(d_out, N), + RAJA::operators::plus{}); hipErrchk(hipMemcpy( out, d_out, N * sizeof(int), hipMemcpyDeviceToHost )); - checkExclusiveScanResult>(in, out, N); + CHECK_EXC_SCAN_RESULTS(OP_PLUS_INT) printArray(out, N); std::cout << "\n"; diff --git a/exercises/segment-indexset-basics.cpp b/exercises/segment-indexset-basics.cpp new file mode 100644 index 0000000000..44546fb940 --- /dev/null +++ b/exercises/segment-indexset-basics.cpp @@ -0,0 +1,284 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-22, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include +#include +#include +#include +#include + +#include "RAJA/RAJA.hpp" + +#include "camp/resource.hpp" + +/* + * Segments and Index Sets exercise + * + * In this exercise, you will learn how to create RAJA segments and index sets + * and use them to execute kernels. There are no computations performed in the + * exercises and no parallel execution. The kernels contain only print + * statements to illustrate various iteration patterns. Thus, all kernels + * look the same. The only thing that changes in these versions is the object + * passed to the 'forall' method that defines the iteration space. + * + * RAJA features shown: + * - `forall` loop iteration template method + * - TypedRangeSegment iteration space + * - TypedRangeStrideSegment iteration space + * - TypedListSegment iteration space + * - TypedIndexSet segment container + * - Hierarchical execution policies + */ + +//----------------------------------------------------------------------------// +// Define aliases for types used in the exercises +// (so example code is less verbose) +//----------------------------------------------------------------------------// +// _raja_segment_type_start +using IdxType = int; +using RangeSegType = RAJA::TypedRangeSegment; +using RangeStrideSegType = RAJA::TypedRangeStrideSegment; +using ListSegType = RAJA::TypedListSegment; +using IndexSetType = RAJA::TypedIndexSet< RangeSegType, ListSegType >; +// _raja_segment_type_end + + +int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +{ + + std::cout << "\n\nRAJA segments index sets and index sets...\n"; + +// Resource object used to construct list segment objects with indices +// living in host (CPU) memory. 
+ camp::resources::Resource host_res{camp::resources::Host()}; + + +//----------------------------------------------------------------------------// +// Stride-1 iteration spaces +//----------------------------------------------------------------------------// + + std::cout << "\n Running C-version range kernel...\n"; + + // _cstyle_range1_start + for (IdxType i = 0; i < 20; i++) { + std::cout << i << " "; + } + // _cstyle_range1_end + + std::cout << std::endl; + +//----------------------------------// + + std::cout << "\n Running RAJA range kernel...\n"; + + // _raja_range1_start + RAJA::forall(RangeSegType(0, 20), [=] (IdxType i) { + std::cout << i << " "; + }); + // _raja_range1_end + + std::cout << std::endl; + +//----------------------------------// + + std::cout << "\n Running RAJA stride-1 range kernel...\n"; + + // _raja_striderange1_start + RAJA::forall(RangeStrideSegType(0, 20, 1), [=] (IdxType i) { + std::cout << i << " "; + }); + // _raja_striderange1_end + + std::cout << std::endl; + +//----------------------------------// + + std::cout << "\n Running RAJA stride-1 list kernel...\n"; + + // _raja_list1_start + // + // Collect indices in a vector to create list segment + // + std::vector idx; + for (IdxType i = 0; i < 20; ++i) { + idx.push_back(i); + } + + ListSegType idx_list1( idx, host_res ); + + RAJA::forall(idx_list1, [=] (IdxType i) { + std::cout << i << " "; + }); + // _raja_list1_end + + std::cout << std::endl; + +//----------------------------------// + + std::cout << "\n Running C-style stride-1 list kernel...\n"; + + // _cstyle_list1_start + IdxType iis = static_cast(idx.size()); // to avoid compiler warning + for (IdxType ii = 0; ii < iis; ++ii) { + std::cout << idx[ ii ] << " "; + } + // _cstyle_list1_end + + std::cout << std::endl; + +//----------------------------------------------------------------------------// +// Negative stride iteration spaces +//----------------------------------------------------------------------------// + + std::cout << "\n Running C-version negative stride kernel...\n"; + + // _cstyle_negstriderange1_start + for (IdxType i = 19; i > -1; i--) { + std::cout << i << " "; + } + // _cstyle_negstriderange1_end + + std::cout << std::endl; + +//----------------------------------// + + std::cout << "\n Running RAJA negative stride kernel...\n"; + + /// + /// TODO... + /// + /// EXERCISE: Make a RAJA -1 stride version of the kernel. 
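  //
  // A minimal sketch of the -1 stride version, following
  // segment-indexset-basics_solution.cpp: iterate from 19 down to, but not
  // including, -1:
  //
  /*
  RAJA::forall<RAJA::seq_exec>(RangeStrideSegType(19, -1, -1), [=] (IdxType i) {
    std::cout << i << " ";
  });
  */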
+ /// + + std::cout << std::endl; + +//----------------------------------// +// List variant +//----------------------------------// + + std::cout << "\n Running RAJA negative stride list kernel...\n"; + + // _raja_negstridelist1_start + // + // Reverse the order of indices in the vector + // + std::reverse( idx.begin(), idx.end() ); + ListSegType idx_list1_reverse( &idx[0], idx.size(), host_res ); + + RAJA::forall(idx_list1_reverse, [=] (IdxType i) { + std::cout << i << " "; + }); + // _raja_negstridelist1_end + + std::cout << std::endl; + +//----------------------------------------------------------------------------// +// Non-unit uniform stride iteration spaces +//----------------------------------------------------------------------------// + + std::cout << "\n Running C-version stride-2 range kernel...\n"; + + // _cstyle_range2_start + for (IdxType i = 0; i < 20; i += 2) { + std::cout << i << " "; + } + // _cstyle_range2_end + + std::cout << std::endl; + +//----------------------------------// + + std::cout << "\n Running RAJA stride-2 range kernel...\n"; + + // _raja_range2_start + RAJA::forall(RangeStrideSegType(0, 20, 2), [=] (IdxType i) { + std::cout << i << " "; + }); + // _raja_range2_end + + std::cout << std::endl; + +//----------------------------------// + + std::cout << "\n Running RAJA stride-3 range kernel...\n"; + + /// + /// TODO... + /// + /// EXERCISE: Make a RAJA stride-3 version of the kernel. + /// + + std::cout << std::endl; + +//----------------------------------------------------------------------------// +// IndexSets: complex iteration spaces +//----------------------------------------------------------------------------// + +// +// Sequential index set execution policy used in several of the following +// example implementations. +// + + // _raja_seq_indexset_policy_start + using SEQ_ISET_EXECPOL = RAJA::ExecPolicy; + // _raja_seq_indexset_policy__end + + std::cout << "\n Running RAJA index set (2 RangeSegments) kernel...\n"; + + // _raja_indexset_2ranges_start + IndexSetType is2; + is2.push_back( RangeSegType(0, 10) ); + is2.push_back( RangeSegType(15, 20) ); + + RAJA::forall(is2, [=] (IdxType i) { + std::cout << i << " "; + }); + // _raja_indexset_2ranges_end + + std::cout << std::endl; + +//----------------------------------// + + std::cout << "\n Running C-version of two segment kernel...\n"; + + // _cstyle_2ranges_start + for (IdxType i = 0; i < 10; ++i) { + std::cout << i << " "; + } + for (IdxType i = 15; i < 20; ++i) { + std::cout << i << " "; + } + // _cstyle_2ranges_end + + std::cout << std::endl; + +//----------------------------------// + + std::cout << "\n Running RAJA index set (3 segments) kernel...\n"; + + /// + /// TODO... + /// + /// EXERCISE: Make a RAJA version of a kernel that prints the sequence + /// + /// 0 1 2 3 4 5 6 7 10 11 14 20 22 24 25 26 27 + /// + /// using a RAJA::TypedIndexSet containing two + /// RAJA::TypedRangeSegment objects and on + /// RAJA::TypedListSegment object. 
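  //
  // A minimal sketch of one way to build that iteration space, following
  // segment-indexset-basics_solution.cpp: a range segment [0, 8), a list
  // segment holding {10, 11, 14, 20, 22}, and a second range segment [24, 28):
  //
  /*
  IndexSetType is3;
  is3.push_back( RangeSegType(0, 8) );

  IdxType indx[ ] = {10, 11, 14, 20, 22};
  ListSegType list2( indx, 5, host_res );
  is3.push_back( list2 );

  is3.push_back( RangeSegType(24, 28) );

  RAJA::forall<SEQ_ISET_EXECPOL>(is3, [=] (IdxType i) {
    std::cout << i << " ";
  });
  */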
+ /// + + std::cout << std::endl; + +//----------------------------------------------------------------------------// + + std::cout << "\n DONE!...\n"; + + return 0; +} + diff --git a/exercises/segment-indexset-basics_solution.cpp b/exercises/segment-indexset-basics_solution.cpp new file mode 100644 index 0000000000..4e736bb9f7 --- /dev/null +++ b/exercises/segment-indexset-basics_solution.cpp @@ -0,0 +1,286 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-22, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include +#include +#include +#include +#include + +#include "RAJA/RAJA.hpp" + +#include "camp/resource.hpp" + +/* + * Segments and Index Sets exercise + * + * In this exercise, you will learn how to create RAJA segments and index sets + * and use them to execute kernels. There are no computations performed in the + * exercises and no parallel execution. The kernels contain only print + * statements to illustrate various iteration patterns. Thus, all kernels + * look the same. The only thing that changes in these versions is the object + * passed to the 'forall' method that defines the iteration space. + * + * RAJA features shown: + * - `forall` loop iteration template method + * - TypedRangeSegment iteration space + * - TypedRangeStrideSegment iteration space + * - TypedListSegment iteration space + * - TypedIndexSet segment container + * - Hierarchical execution policies + */ + +//----------------------------------------------------------------------------// +// Define aliases for types used in the exercises +// (so example code is less verbose) +//----------------------------------------------------------------------------// +// _raja_segment_type_start +using IdxType = int; +using RangeSegType = RAJA::TypedRangeSegment; +using RangeStrideSegType = RAJA::TypedRangeStrideSegment; +using ListSegType = RAJA::TypedListSegment; +using IndexSetType = RAJA::TypedIndexSet< RangeSegType, ListSegType >; +// _raja_segment_type_end + + +int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +{ + + std::cout << "\n\nRAJA segments index sets and index sets...\n"; + +// Resource object used to construct list segment objects with indices +// living in host (CPU) memory. 
+ camp::resources::Resource host_res{camp::resources::Host()}; + + +//----------------------------------------------------------------------------// +// Stride-1 iteration spaces +//----------------------------------------------------------------------------// + + std::cout << "\n Running C-version range kernel...\n"; + +// _cstyle_range1_start + for (IdxType i = 0; i < 20; i++) { + std::cout << i << " "; + } +// _cstyle_range1_end + + std::cout << std::endl; + +//----------------------------------// + + std::cout << "\n Running RAJA range kernel...\n"; + + // _raja_range1_start + RAJA::forall(RangeSegType(0, 20), [=] (IdxType i) { + std::cout << i << " "; + }); + // _raja_range1_end + + std::cout << std::endl; + +//----------------------------------// + + std::cout << "\n Running RAJA stride-1 range kernel...\n"; + + // _raja_striderange1_start + RAJA::forall(RangeStrideSegType(0, 20, 1), [=] (IdxType i) { + std::cout << i << " "; + }); + // _raja_striderange1_end + + std::cout << std::endl; + +//----------------------------------// + + std::cout << "\n Running RAJA stride-1 list kernel...\n"; + + // _raja_list1_start + // + // Collect indices in a vector to create list segment + // + std::vector idx; + for (IdxType i = 0; i < 20; ++i) { + idx.push_back(i); + } + + ListSegType idx_list1( idx, host_res ); + + RAJA::forall(idx_list1, [=] (IdxType i) { + std::cout << i << " "; + }); + // _raja_list1_end + + std::cout << std::endl; + +//----------------------------------// + + std::cout << "\n Running C-style stride-1 list kernel...\n"; + + // _cstyle_list1_start + IdxType iis = static_cast(idx.size()); // to avoid compiler warning + for (IdxType ii = 0; ii < iis; ++ii) { + std::cout << idx[ ii ] << " "; + } + // _cstyle_list1_end + + std::cout << std::endl; + +//----------------------------------------------------------------------------// +// Negative stride iteration spaces +//----------------------------------------------------------------------------// + + std::cout << "\n Running C-version negative stride kernel...\n"; + + // _cstyle_negstriderange1_start + for (IdxType i = 19; i > -1; i--) { + std::cout << i << " "; + } + // _cstyle_negstriderange1_end + + std::cout << std::endl; + +//----------------------------------// + + std::cout << "\n Running RAJA negative stride kernel...\n"; + + // _raja_negstriderange1_start + RAJA::forall(RangeStrideSegType(19, -1, -1), [=] (IdxType i) { + std::cout << i << " "; + }); + // _raja_negstriderange1_end + + std::cout << std::endl; + +//----------------------------------// +// List variant +//----------------------------------// + + std::cout << "\n Running RAJA negative stride list kernel...\n"; + + // _raja_negstridelist1_start + // + // Reverse the order of indices in the vector + // + std::reverse( idx.begin(), idx.end() ); + ListSegType idx_list1_reverse( &idx[0], idx.size(), host_res ); + + RAJA::forall(idx_list1_reverse, [=] (IdxType i) { + std::cout << i << " "; + }); + // _raja_negstridelist1_end + + std::cout << std::endl; + +//----------------------------------------------------------------------------// +// Non-unit uniform stride iteration spaces +//----------------------------------------------------------------------------// + + std::cout << "\n Running C-version stride-2 range kernel...\n"; + + // _cstyle_range2_start + for (IdxType i = 0; i < 20; i += 2) { + std::cout << i << " "; + } + // _cstyle_range2_end + + std::cout << std::endl; + +//----------------------------------// + + std::cout << "\n Running RAJA stride-2 range 
kernel...\n"; + + // _raja_range2_start + RAJA::forall(RangeStrideSegType(0, 20, 2), [=] (IdxType i) { + std::cout << i << " "; + }); + // _raja_range2_end + + std::cout << std::endl; + +//----------------------------------// + + std::cout << "\n Running RAJA stride-3 range kernel...\n"; + + // _raja_range3_start + RAJA::forall(RangeStrideSegType(0, 20, 3), [=] (IdxType i) { + std::cout << i << " "; + }); + // _raja_range3_end + + std::cout << std::endl; + +//----------------------------------------------------------------------------// +// IndexSets: complex iteration spaces +//----------------------------------------------------------------------------// + +// +// Sequential index set execution policy used in several of the following +// example implementations. +// + + std::cout << "\n Running RAJA index set (2 RangeSegments) kernel...\n"; + + // _raja_indexset_2ranges_start + using SEQ_ISET_EXECPOL = RAJA::ExecPolicy; + + IndexSetType is2; + is2.push_back( RangeSegType(0, 10) ); + is2.push_back( RangeSegType(15, 20) ); + + RAJA::forall(is2, [=] (IdxType i) { + std::cout << i << " "; + }); + // _raja_indexset_2ranges_end + + std::cout << std::endl; + +//----------------------------------// + + std::cout << "\n Running C-version of two segment kernel...\n"; + + // _cstyle_2ranges_start + for (IdxType i = 0; i < 10; ++i) { + std::cout << i << " "; + } + for (IdxType i = 15; i < 20; ++i) { + std::cout << i << " "; + } + // _cstyle_2ranges_end + + std::cout << std::endl; + +//----------------------------------// + + std::cout << "\n Running RAJA index set (3 segments) kernel...\n"; + + // _raja_indexset_3segs_start + IndexSetType is3; + + is3.push_back( RangeSegType(0, 8) ); + + IdxType indx[ ] = {10, 11, 14, 20, 22}; + ListSegType list2( indx, 5, host_res ); + is3.push_back( list2 ); + + is3.push_back( RangeSegType(24, 28) ); + + RAJA::forall(is3, [=] (IdxType i) { + std::cout << i << " "; + }); + // _raja_indexset_3segs_end + + std::cout << std::endl; + +//----------------------------------------------------------------------------// + + std::cout << "\n DONE!...\n"; + + return 0; +} + diff --git a/exercises/sort.cpp b/exercises/sort.cpp new file mode 100644 index 0000000000..26a0e6e1f4 --- /dev/null +++ b/exercises/sort.cpp @@ -0,0 +1,702 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-22, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#define OP_GREATER RAJA::operators::greater +#define OP_LESS RAJA::operators::less + +#define CHECK_UNSTABLE_SORT_RESULT(X) checkUnstableSortResult(in, out, N) +#define CHECK_UNSTABLE_SORT_PAIR_RESULT(X) checkUnstableSortResult(in, out, in_vals, out_vals, N) +#define CHECK_STABLE_SORT_RESULT(X) checkStableSortResult(in, out, N) +#define CHECK_STABLE_SORT_PAIR_RESULT(X) checkStableSortResult(in, out, in_vals, out_vals, N) + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "memoryManager.hpp" + +#include "RAJA/RAJA.hpp" + +/* + * Sort Exercise + * + * Exercise demonstrates how to perform RAJA unstable and stable sort operations + * for integer arrays, including pairs variant, using different comparators. + * Other array data types, comparators, etc. 
are similar + * + * RAJA features shown: + * - `RAJA::sort` and `RAJA::sort_pairs` methods + * - `RAJA::stable_sort` and `RAJA::stable_sort_pairs` methods + * - RAJA operators + * - Execution policies + * + * If CUDA is enabled, CUDA unified memory is used. + */ + +/* + Specify the number of threads in a GPU thread block +*/ +#if defined(RAJA_ENABLE_CUDA) +//constexpr int CUDA_BLOCK_SIZE = 16; +#endif + +#if defined(RAJA_ENABLE_HIP) +//constexpr int HIP_BLOCK_SIZE = 16; +#endif + +// +// Functions for checking results and printing vectors +// +template +void checkUnstableSortResult(const T* in, const T* out, int N); +template +void checkUnstableSortResult(const T* in, const T* out, + const U* in_vals, const U* out_vals, int N); +// +template +void checkStableSortResult(const T* in, const T* out, int N); +template +void checkStableSortResult(const T* in, const T* out, + const U* in_vals, const U* out_vals, int N); +// +template +void printArray(const T* k, int N); +template +void printArray(const T* k, const U* v, int N); + + +int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) +{ + + std::cout << "\n\nRAJA sort example...\n"; + + // _sort_array_init_start +// +// Define array length +// + constexpr int N = 20; + +// +// Allocate and initialize vector data +// + int* in = memoryManager::allocate(N); + int* out = memoryManager::allocate(N); + + unsigned* in_vals = memoryManager::allocate(N); + unsigned* out_vals = memoryManager::allocate(N); + + std::iota(in , in + N/2, 0); + std::iota(in + N/2, in + N , 0); + std::shuffle(in , in + N/2, std::mt19937{12345u}); + std::shuffle(in + N/2, in + N , std::mt19937{67890u}); + + std::fill(in_vals , in_vals + N/2, 0); + std::fill(in_vals + N/2, in_vals + N , 1); + + std::cout << "\n in keys...\n"; + printArray(in, N); + std::cout << "\n in (key, value) pairs...\n"; + printArray(in, in_vals, N); + std::cout << "\n"; + + // _sort_array_init_end + + +//----------------------------------------------------------------------------// +// Perform various sequential sorts to illustrate unstable/stable, +// pairs, default sorts with different comparators +//----------------------------------------------------------------------------// + + std::cout << "\n Running sequential sort (default)...\n"; + + /// + /// TODO... + /// + /// EXERCISE: Implement a RAJA sort with RAJA::seq_exec + /// execution policy type. + /// + /// NOTE: We've done this one for you to help you get started... + /// + + // _sort_seq_start + std::copy_n(in, N, out); + + RAJA::sort(RAJA::make_span(out, N)); + // _sort_seq_end + + //checkUnstableSortResult>(in, out, N); + CHECK_UNSTABLE_SORT_RESULT(OP_LESS); + printArray(out, N); + std::cout << "\n"; + +//----------------------------------------------------------------------------// + + std::cout << "\n Running sequential sort (non-decreasing)...\n"; + + std::copy_n(in, N, out); + + /// + /// TODO... + /// + /// EXERCISE: Implement a RAJA sort with RAJA::seq_exec execution + /// policy type and an explicit less operation. + /// + + //checkUnstableSortResult>(in, out, N); + CHECK_UNSTABLE_SORT_RESULT(OP_LESS); + printArray(out, N); + std::cout << "\n"; + +//----------------------------------------------------------------------------// + + std::cout << "\n Running sequential stable_sort (non-decreasing)...\n"; + + std::copy_n(in, N, out); + + /// + /// TODO... + /// + /// EXERCISE: Implement a stable RAJA sort with RAJA::seq_exec execution + /// policy type and an explicit less operation. 
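  //
  // A minimal sketch (one possible form, following sort_solution.cpp) of the
  // sequential sort exercises in this section:
  //
  /*
  RAJA::sort<RAJA::seq_exec>(RAJA::make_span(out, N),
                             RAJA::operators::less<int>{});

  RAJA::stable_sort<RAJA::seq_exec>(RAJA::make_span(out, N),
                                    RAJA::operators::greater<int>{});

  RAJA::sort_pairs<RAJA::seq_exec>(RAJA::make_span(out, N),
                                   RAJA::make_span(out_vals, N),
                                   RAJA::operators::less<int>{});

  RAJA::stable_sort_pairs<RAJA::seq_exec>(RAJA::make_span(out, N),
                                          RAJA::make_span(out_vals, N),
                                          RAJA::operators::greater<int>{});
  */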
+ /// + + //checkStableSortResult>(in, out, N); + CHECK_STABLE_SORT_RESULT(OP_LESS); + printArray(out, N); + std::cout << "\n"; + +//----------------------------------------------------------------------------// + + std::cout << "\n Running sequential stable_sort (non-increasing)...\n"; + + std::copy_n(in, N, out); + + /// + /// TODO... + /// + /// EXERCISE: Implement a stable RAJA sort with RAJA::seq_exec execution + /// policy type and an explicit greater operation. + /// + + //checkStableSortResult>(in, out, N); + CHECK_STABLE_SORT_RESULT(OP_GREATER); + printArray(out, N); + std::cout << "\n"; + +//----------------------------------------------------------------------------// + + std::cout << "\n Running sequential sort_pairs (non-decreasing)...\n"; + + std::copy_n(in, N, out); + std::copy_n(in_vals, N, out_vals); + + /// + /// TODO... + /// + /// EXERCISE: Implement a RAJA pair sort with RAJA::seq_exec execution + /// policy type and an explicit less operation. + /// + + //checkUnstableSortResult>(in, out, in_vals, out_vals, N); + CHECK_UNSTABLE_SORT_PAIR_RESULT(OP_LESS); + printArray(out, out_vals, N); + std::cout << "\n"; + +//----------------------------------------------------------------------------// + + std::cout << "\n Running sequential stable_sort_pairs (non-increasing)...\n"; + + std::copy_n(in, N, out); + std::copy_n(in_vals, N, out_vals); + + /// + /// TODO... + /// + /// EXERCISE: Implement a stable RAJA pair sort with RAJA::seq_exec execution + /// policy type and an explicit greater operation. + /// + + //checkStableSortResult>(in, out, in_vals, out_vals, N); + CHECK_STABLE_SORT_PAIR_RESULT(OP_GREATER); + printArray(out, out_vals, N); + std::cout << "\n"; + + +#if defined(RAJA_ENABLE_OPENMP) + +//----------------------------------------------------------------------------// +// Perform a couple of OpenMP sorts... +//----------------------------------------------------------------------------// + + std::cout << "\n Running OpenMP sort (non-decreasing)...\n"; + + std::copy_n(in, N, out); + + /// + /// TODO... + /// + /// EXERCISE: Implement a RAJA sort with RAJA::omp_parallel_for_exec execution + /// policy type and an explicit less operation. + /// + + //checkUnstableSortResult>(in, out, N); + CHECK_UNSTABLE_SORT_RESULT(OP_LESS); + printArray(out, N); + std::cout << "\n"; + +//----------------------------------------------------------------------------// + + std::cout << "\n Running OpenMP stable_sort_pairs (non-increasing)...\n"; + + std::copy_n(in, N, out); + std::copy_n(in_vals, N, out_vals); + + /// + /// TODO... + /// + /// EXERCISE: Implement a stable RAJA sort with RAJA::omp_parallel_for_exec execution + /// policy type and an explicit greater operation. + /// + + //checkStableSortResult>(in, out, in_vals, out_vals, N); + CHECK_STABLE_SORT_PAIR_RESULT(OP_GREATER); + printArray(out, out_vals, N); + std::cout << "\n"; + +#endif + +//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_CUDA) + +//----------------------------------------------------------------------------// +// Perform a couple of CUDA sorts... +//----------------------------------------------------------------------------// + + std::cout << "\n Running CUDA sort_pairs (non-increasing)...\n"; + + std::copy_n(in, N, out); + std::copy_n(in_vals, N, out_vals); + + /// + /// TODO... + /// + /// EXERCISE: Implement a RAJA pair sort with RAJA::cuda_exec execution + /// policy type and an explicit greater operation. 
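  //
  // A minimal sketch (one possible form, following sort_solution.cpp) of the
  // OpenMP and CUDA sort exercises above; CUDA_BLOCK_SIZE near the top of the
  // file would need to be uncommented for the CUDA variants:
  //
  /*
  RAJA::sort<RAJA::omp_parallel_for_exec>(RAJA::make_span(out, N),
                                          RAJA::operators::less<int>{});

  RAJA::stable_sort_pairs<RAJA::omp_parallel_for_exec>(
      RAJA::make_span(out, N),
      RAJA::make_span(out_vals, N),
      RAJA::operators::greater<int>{});

  RAJA::sort_pairs<RAJA::cuda_exec<CUDA_BLOCK_SIZE>>(
      RAJA::make_span(out, N),
      RAJA::make_span(out_vals, N),
      RAJA::operators::greater<int>{});
  */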
+ /// + /// NOTE: You will need to uncomment 'CUDA_BLOCK_SIZE' near the + /// top of the file if you want to use it here. + /// + + //checkUnstableSortResult>(in, out, in_vals, out_vals, N); + CHECK_UNSTABLE_SORT_PAIR_RESULT(OP_GREATER); + printArray(out, out_vals, N); + std::cout << "\n"; + +//----------------------------------------------------------------------------// + + std::cout << "\n Running CUDA stable_sort (non-decreasing)...\n"; + + std::copy_n(in, N, out); + + /// + /// TODO... + /// + /// EXERCISE: Implement a stable RAJA pair sort with RAJA::cuda_exec execution + /// policy type and an explicit less operation. + /// + /// NOTE: You will need to uncomment 'CUDA_BLOCK_SIZE' near the + /// top of the file if you want to use it here. + /// + + //checkStableSortResult>(in, out, N); + CHECK_STABLE_SORT_RESULT(OP_LESS); + printArray(out, N); + std::cout << "\n"; + +#endif + +//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_HIP) + +//----------------------------------------------------------------------------// +// Perform a couple of HIP sorts... +//----------------------------------------------------------------------------// + + std::cout << "\n Running HIP sort_pairs (non-decreasing)...\n"; + + std::copy_n(in, N, out); + std::copy_n(in_vals, N, out_vals); + + int* d_out = memoryManager::allocate_gpu(N); + int* d_out_vals = memoryManager::allocate_gpu(N); + + hipErrchk(hipMemcpy( d_out, out, N * sizeof(int), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy( d_out_vals, out_vals, N * sizeof(int), hipMemcpyHostToDevice )); + + /// + /// TODO... + /// + /// EXERCISE: Implement a RAJA pair sort with RAJA::hip_exec execution + /// policy type and an explicit less operation. + /// + /// NOTE: You will need to uncomment 'CUDA_BLOCK_SIZE' near the + /// top of the file if you want to use it here. + /// + + hipErrchk(hipMemcpy( out, d_out, N * sizeof(int), hipMemcpyDeviceToHost )); + hipErrchk(hipMemcpy( out_vals, d_out_vals, N * sizeof(int), hipMemcpyDeviceToHost )); + + //checkUnstableSortResult>(in, out, in_vals, out_vals, N); + CHECK_UNSTABLE_SORT_PAIR_RESULT(OP_LESS); + printArray(out, out_vals, N); + std::cout << "\n"; + +//----------------------------------------------------------------------------// + + std::cout << "\n Running HIP stable_sort (non-increasing)...\n"; + + std::copy_n(in, N, out); + + hipErrchk(hipMemcpy( d_out, out, N * sizeof(int), hipMemcpyHostToDevice )); + + /// + /// TODO... + /// + /// EXERCISE: Implement a stable RAJA sort with RAJA::hip_exec execution + /// policy type and an explicit less operation. + /// + /// NOTE: You will need to uncomment 'CUDA_BLOCK_SIZE' near the + /// top of the file if you want to use it here. + /// + + hipErrchk(hipMemcpy( out, d_out, N * sizeof(int), hipMemcpyDeviceToHost )); + + //checkStableSortResult>(in, out, N); + CHECK_STABLE_SORT_RESULT(OP_GREATER); + printArray(out, N); + std::cout << "\n"; + + memoryManager::deallocate_gpu(d_out); + memoryManager::deallocate_gpu(d_out_vals); + +#endif + + +//----------------------------------------------------------------------------// + +// +// Clean up. 
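//
// A minimal sketch (one possible form) of the HIP sort exercises above; they
// operate on the device arrays, and the block-size constant to uncomment for
// HIP is presumably HIP_BLOCK_SIZE rather than CUDA_BLOCK_SIZE:
//
/*
RAJA::sort_pairs<RAJA::hip_exec<HIP_BLOCK_SIZE>>(
    RAJA::make_span(d_out, N),
    RAJA::make_span(d_out_vals, N),
    RAJA::operators::less<int>{});

RAJA::stable_sort<RAJA::hip_exec<HIP_BLOCK_SIZE>>(
    RAJA::make_span(d_out, N),
    RAJA::operators::greater<int>{});
*/
//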
+// + memoryManager::deallocate(in); + memoryManager::deallocate(out); + + memoryManager::deallocate(in_vals); + memoryManager::deallocate(out_vals); + + std::cout << "\n DONE!...\n"; + + return 0; +} + +template +bool equivalent(T const& a, T const& b, Comparator comp) +{ + return !comp(a, b) && !comp(b, a); +} + +// +// Function to check unstable sort result +// +template +void checkUnstableSortResult(const T* in, const T* out, int N) +{ + Comparator comp; + bool correct = true; + + // make map of keys to keys + using val_map = std::unordered_multiset; + std::unordered_map keys; + for (RAJA::Index_type i = 0; i < N; i++) { + auto key_iter = keys.find(in[i]); + if (key_iter == keys.end()) { + auto ret = keys.emplace(in[i], val_map{}); + assert(ret.second); + key_iter = ret.first; + } + key_iter->second.emplace(in[i]); + } + + for (RAJA::Index_type i = 0; i < N; i++) { + // test ordering + if (i > 0 && comp(out[i], out[i-1])) { + if (correct) { + std::cout << "\n\t result -- WRONG\n"; + correct = false; + } + std::cout << "\t" + << out[i-1] << ", " << out[i] + << " out of order" + << " (at index " << i-1 << ")\n"; + } + // test there is an item with this + auto key_iter = keys.find(out[i]); + if (key_iter == keys.end()) { + if (correct) { + std::cout << "\n\t result -- WRONG\n"; + correct = false; + } + std::cout << "\t" + << out[i] + << " unknown or duplicate key" + << " (at index " << i << ")\n"; + } + auto val_iter = key_iter->second.find(out[i]); + if (val_iter == key_iter->second.end()) { + if (correct) { + std::cout << "\n\t result -- WRONG\n"; + correct = false; + } + std::cout << "\t" + << out[i] + << " unknown or duplicate val" + << " (at index " << i << ")\n"; + } + key_iter->second.erase(val_iter); + if (key_iter->second.size() == 0) { + keys.erase(key_iter); + } + } + if (correct) { + std::cout << "\n\t result -- CORRECT\n"; + } +} + +template +void checkUnstableSortResult(const T* in, const T* out, + const U* in_vals, const U* out_vals, int N) +{ + Comparator comp; + bool correct = true; + + // make map of keys to vals + using val_map = std::unordered_multiset; + std::unordered_map keys_to_vals; + for (RAJA::Index_type i = 0; i < N; i++) { + auto key_iter = keys_to_vals.find(in[i]); + if (key_iter == keys_to_vals.end()) { + auto ret = keys_to_vals.emplace(in[i], val_map{}); + assert(ret.second); + key_iter = ret.first; + } + key_iter->second.emplace(in_vals[i]); + } + + for (RAJA::Index_type i = 0; i < N; i++) { + // test ordering + if (i > 0 && comp(out[i], out[i-1])) { + if (correct) { + std::cout << "\n\t result -- WRONG\n"; + correct = false; + } + std::cout << "\t" + << "(" << out[i-1] << "," << out_vals[i-1] << ")," + << " (" << out[i] << "," << out_vals[i] << ")" + << " out of order" + << " (at index " << i-1 << ")\n"; + } + // test there is a pair with this key and val + auto key_iter = keys_to_vals.find(out[i]); + if (key_iter == keys_to_vals.end()) { + if (correct) { + std::cout << "\n\t result -- WRONG\n"; + correct = false; + } + std::cout << "\t" + << "(" << out[i] << "," << out_vals[i] << ")" + << " unknown or duplicate key" + << " (at index " << i << ")\n"; + } + auto val_iter = key_iter->second.find(out_vals[i]); + if (val_iter == key_iter->second.end()) { + if (correct) { + std::cout << "\n\t result -- WRONG\n"; + correct = false; + } + std::cout << "\t" + << "(" << out[i] << "," << out_vals[i] << ")" + << " unknown or duplicate val" + << " (at index " << i << ")\n"; + } + key_iter->second.erase(val_iter); + if (key_iter->second.size() == 0) { + 
keys_to_vals.erase(key_iter); + } + } + if (correct) { + std::cout << "\n\t result -- CORRECT\n"; + } +} + +// +// Function to check stable sort result +// +template +void checkStableSortResult(const T* in, const T* out, int N) +{ + Comparator comp; + bool correct = true; + + // make map of keys to keys + using val_map = std::list; + std::unordered_map keys; + for (RAJA::Index_type i = 0; i < N; i++) { + auto key_iter = keys.find(in[i]); + if (key_iter == keys.end()) { + auto ret = keys.emplace(in[i], val_map{}); + assert(ret.second); + key_iter = ret.first; + } + key_iter->second.emplace_back(in[i]); + } + + for (RAJA::Index_type i = 0; i < N; i++) { + // test ordering + if (i > 0 && comp(out[i], out[i-1])) { + if (correct) { + std::cout << "\n\t result -- WRONG\n"; + correct = false; + } + std::cout << "\t" + << out[i-1] << ", " << out[i] + << " out of order " + << " (at index " << i-1 << ")\n"; + } + // test there is an item with this + auto key_iter = keys.find(out[i]); + if (key_iter == keys.end()) { + if (correct) { + std::cout << "\n\t result -- WRONG\n"; + correct = false; + } + std::cout << "\t" + << out[i] + << " unknown or duplicate key " + << " (at index " << i << ")\n"; + } + if (key_iter->second.front() != out[i]) { + if (correct) { + std::cout << "\n\t result -- WRONG\n"; + correct = false; + } + std::cout << "\t" + << out[i] + << " out of stable order or unknown val " + << " (at index " << i << ")\n"; + } + key_iter->second.pop_front(); + if (key_iter->second.size() == 0) { + keys.erase(key_iter); + } + } + if (correct) { + std::cout << "\n\t result -- CORRECT\n"; + } +} + +template +void checkStableSortResult(const T* in, const T* out, + const U* in_vals, const U* out_vals, int N) +{ + Comparator comp; + bool correct = true; + + // make map of keys to vals + using val_map = std::list; + std::unordered_map keys_to_vals; + for (RAJA::Index_type i = 0; i < N; i++) { + auto key_iter = keys_to_vals.find(in[i]); + if (key_iter == keys_to_vals.end()) { + auto ret = keys_to_vals.emplace(in[i], val_map{}); + assert(ret.second); + key_iter = ret.first; + } + key_iter->second.emplace_back(in_vals[i]); + } + + for (RAJA::Index_type i = 0; i < N; i++) { + // test ordering + if (i > 0 && comp(out[i], out[i-1])) { + if (correct) { + std::cout << "\n\t result -- WRONG\n"; + correct = false; + } + std::cout << "\t" + << "(" << out[i-1] << "," << out_vals[i-1] << ")," + << " (" << out[i] << "," << out_vals[i] << ")" + << " out of order " + << " (at index " << i-1 << ")\n"; + } + // test there is a pair with this key and val + auto key_iter = keys_to_vals.find(out[i]); + if (key_iter == keys_to_vals.end()) { + if (correct) { + std::cout << "\n\t result -- WRONG\n"; + correct = false; + } + std::cout << "\t" + << "(" << out[i] << "," << out_vals[i] << ")" + << " unknown or duplicate key " + << " (at index " << i << ")\n"; + } + if (key_iter->second.front() != out_vals[i]) { + if (correct) { + std::cout << "\n\t result -- WRONG\n"; + correct = false; + } + std::cout << "\t" + << "(" << out[i] << "," << out_vals[i] << ")" + << " out of stable order or unknown val " + << " (at index " << i << ")\n"; + } + key_iter->second.pop_front(); + if (key_iter->second.size() == 0) { + keys_to_vals.erase(key_iter); + } + } + if (correct) { + std::cout << "\n\t result -- CORRECT\n"; + } +} + + +// +// Function to print vector. 
+// +template +void printArray(const T* k, int N) +{ + std::cout << std::endl; + for (int i = 0; i < N; ++i) { std::cout << " " << k[i]; } + std::cout << std::endl; +} +/// +template +void printArray(const T* k, const U* v, int N) +{ + std::cout << std::endl; + for (int i = 0; i < N; ++i) { std::cout << " (" << k[i] << "," << v[i] << ")"; } + std::cout << std::endl; +} + diff --git a/examples/tut_sort.cpp b/exercises/sort_solution.cpp similarity index 88% rename from examples/tut_sort.cpp rename to exercises/sort_solution.cpp index 18ec192de0..d86cd72b70 100644 --- a/examples/tut_sort.cpp +++ b/exercises/sort_solution.cpp @@ -5,6 +5,14 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +#define OP_GREATER RAJA::operators::greater +#define OP_LESS RAJA::operators::less + +#define CHECK_UNSTABLE_SORT_RESULT(X) checkUnstableSortResult(in, out, N) +#define CHECK_UNSTABLE_SORT_PAIR_RESULT(X) checkUnstableSortResult(in, out, in_vals, out_vals, N) +#define CHECK_STABLE_SORT_RESULT(X) checkStableSortResult(in, out, N) +#define CHECK_STABLE_SORT_PAIR_RESULT(X) checkStableSortResult(in, out, in_vals, out_vals, N) + #include #include #include @@ -20,9 +28,9 @@ #include "RAJA/RAJA.hpp" /* - * Sort Example + * Sort Exercise * - * Example shows how to perform RAJA unstable and stable sort operations + * Exercise demonstrates how to perform RAJA unstable and stable sort operations * for integer arrays, including pairs variant, using different comparators. * Other array data types, comparators, etc. are similar * @@ -36,14 +44,14 @@ */ /* - CUDA_BLOCK_SIZE - specifies the number of threads in a CUDA thread block + Specify the number of threads in a GPU thread block */ #if defined(RAJA_ENABLE_CUDA) -const int CUDA_BLOCK_SIZE = 16; +constexpr int CUDA_BLOCK_SIZE = 16; #endif #if defined(RAJA_ENABLE_HIP) -const int HIP_BLOCK_SIZE = 16; +constexpr int HIP_BLOCK_SIZE = 16; #endif // @@ -76,7 +84,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) // // Define array length // - const int N = 20; + constexpr int N = 20; // // Allocate and initialize vector data @@ -95,14 +103,14 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::fill(in_vals , in_vals + N/2, 0); std::fill(in_vals + N/2, in_vals + N , 1); - // _sort_array_init_end - std::cout << "\n in keys...\n"; printArray(in, N); std::cout << "\n in (key, value) pairs...\n"; printArray(in, in_vals, N); std::cout << "\n"; + // _sort_array_init_end + //----------------------------------------------------------------------------// // Perform various sequential sorts to illustrate unstable/stable, @@ -111,13 +119,14 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) std::cout << "\n Running sequential sort (default)...\n"; + // _sort_seq_start std::copy_n(in, N, out); - // _sort_seq_start RAJA::sort(RAJA::make_span(out, N)); // _sort_seq_end - checkUnstableSortResult>(in, out, N); + //checkUnstableSortResult>(in, out, N); + CHECK_UNSTABLE_SORT_RESULT(OP_LESS); printArray(out, N); std::cout << "\n"; @@ -132,7 +141,8 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) RAJA::operators::less{}); // _sort_seq_less_end - checkUnstableSortResult>(in, out, N); + //checkUnstableSortResult>(in, out, N); + CHECK_UNSTABLE_SORT_RESULT(OP_LESS); printArray(out, N); std::cout << "\n"; @@ -147,13 +157,14 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) RAJA::operators::less{}); // 
_sort_stable_seq_less_end - checkStableSortResult>(in, out, N); + //checkStableSortResult>(in, out, N); + CHECK_STABLE_SORT_RESULT(OP_LESS); printArray(out, N); std::cout << "\n"; //----------------------------------------------------------------------------// - std::cout << "\n Running sequential stable_sort (non-decreasing)...\n"; + std::cout << "\n Running sequential stable_sort (non-increasing)...\n"; std::copy_n(in, N, out); @@ -162,7 +173,8 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) RAJA::operators::greater{}); // _sort_stable_seq_greater_end - checkStableSortResult>(in, out, N); + //checkStableSortResult>(in, out, N); + CHECK_STABLE_SORT_RESULT(OP_GREATER); printArray(out, N); std::cout << "\n"; @@ -179,7 +191,8 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) RAJA::operators::less{}); // _sort_pairs_seq_less_end - checkUnstableSortResult>(in, out, in_vals, out_vals, N); + //checkUnstableSortResult>(in, out, in_vals, out_vals, N); + CHECK_UNSTABLE_SORT_PAIR_RESULT(OP_LESS); printArray(out, out_vals, N); std::cout << "\n"; @@ -196,7 +209,8 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) RAJA::operators::greater{}); // _sort_stable_pairs_seq_greater_end - checkStableSortResult>(in, out, in_vals, out_vals, N); + //checkStableSortResult>(in, out, in_vals, out_vals, N); + CHECK_STABLE_SORT_PAIR_RESULT(OP_GREATER); printArray(out, out_vals, N); std::cout << "\n"; @@ -216,7 +230,8 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) RAJA::operators::less{}); // _sort_omp_less_end - checkUnstableSortResult>(in, out, N); + //checkUnstableSortResult>(in, out, N); + CHECK_UNSTABLE_SORT_RESULT(OP_LESS); printArray(out, N); std::cout << "\n"; @@ -233,7 +248,8 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) RAJA::operators::greater{}); // _sort_stable_pairs_omp_greater_end - checkStableSortResult>(in, out, in_vals, out_vals, N); + //checkStableSortResult>(in, out, in_vals, out_vals, N); + CHECK_STABLE_SORT_PAIR_RESULT(OP_GREATER); printArray(out, out_vals, N); std::cout << "\n"; @@ -258,7 +274,8 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) RAJA::operators::greater{}); // _sort_pairs_cuda_greater_end - checkUnstableSortResult>(in, out, in_vals, out_vals, N); + //checkUnstableSortResult>(in, out, in_vals, out_vals, N); + CHECK_UNSTABLE_SORT_PAIR_RESULT(OP_GREATER); printArray(out, out_vals, N); std::cout << "\n"; @@ -273,7 +290,8 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) RAJA::operators::less{}); // _sort_stable_cuda_less_end - checkStableSortResult>(in, out, N); + //checkStableSortResult>(in, out, N); + CHECK_STABLE_SORT_RESULT(OP_LESS); printArray(out, N); std::cout << "\n"; @@ -305,7 +323,8 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) hipErrchk(hipMemcpy( out, d_out, N * sizeof(int), hipMemcpyDeviceToHost )); hipErrchk(hipMemcpy( out_vals, d_out_vals, N * sizeof(int), hipMemcpyDeviceToHost )); - checkUnstableSortResult>(in, out, in_vals, out_vals, N); + //checkUnstableSortResult>(in, out, in_vals, out_vals, N); + CHECK_UNSTABLE_SORT_PAIR_RESULT(OP_LESS); printArray(out, out_vals, N); std::cout << "\n"; @@ -317,12 +336,16 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) hipErrchk(hipMemcpy( d_out, out, N * sizeof(int), hipMemcpyHostToDevice )); - RAJA::stable_sort>(RAJA::make_span(d_out, N), - RAJA::operators::greater{}); + // _sort_stable_hip_greater_start + RAJA::stable_sort>( + 
RAJA::make_span(d_out, N), + RAJA::operators::greater{}); + // _sort_stable_hip_greater_end hipErrchk(hipMemcpy( out, d_out, N * sizeof(int), hipMemcpyDeviceToHost )); - checkStableSortResult>(in, out, N); + //checkStableSortResult>(in, out, N); + CHECK_STABLE_SORT_RESULT(OP_GREATER); printArray(out, N); std::cout << "\n"; @@ -331,6 +354,7 @@ int main(int RAJA_UNUSED_ARG(argc), char** RAJA_UNUSED_ARG(argv[])) #endif + //----------------------------------------------------------------------------// // @@ -419,7 +443,7 @@ void checkUnstableSortResult(const T* in, const T* out, int N) std::cout << "\n\t result -- CORRECT\n"; } } -/// + template void checkUnstableSortResult(const T* in, const T* out, const U* in_vals, const U* out_vals, int N) @@ -551,7 +575,7 @@ void checkStableSortResult(const T* in, const T* out, int N) std::cout << "\n\t result -- CORRECT\n"; } } -/// + template void checkStableSortResult(const T* in, const T* out, const U* in_vals, const U* out_vals, int N) @@ -628,7 +652,7 @@ void printArray(const T* k, int N) for (int i = 0; i < N; ++i) { std::cout << " " << k[i]; } std::cout << std::endl; } -/// + template void printArray(const T* k, const U* v, int N) { diff --git a/exercises/supervisord.conf b/exercises/supervisord.conf new file mode 100644 index 0000000000..f40fe78616 --- /dev/null +++ b/exercises/supervisord.conf @@ -0,0 +1,10 @@ +[supervisord] +nodaemon = true +user = XXX +logfile = /tmp/supervisord.log + +[program:openvscode-server] +environment=HOME="/home/XXX",USER="XXX" +redirect_stderr = true +stdout_logfile = /var/log/openvscode-server.log +command = /opt/archives/openvscode-server-v1.69.1-linux-x64/bin/openvscode-server --without-connection-token --host 0.0.0.0 diff --git a/exercises/tutorial_halfday/CMakeLists.txt b/exercises/tutorial_halfday/CMakeLists.txt index 2c9a0ab86b..7fbaa2437b 100644 --- a/exercises/tutorial_halfday/CMakeLists.txt +++ b/exercises/tutorial_halfday/CMakeLists.txt @@ -5,14 +5,6 @@ # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################### -raja_add_executable( - NAME ex1_vector-addition - SOURCES ex1_vector-addition.cpp) - -raja_add_executable( - NAME ex1_vector-addition_solution - SOURCES ex1_vector-addition_solution.cpp) - raja_add_executable( NAME ex2_approx-pi SOURCES ex2_approx-pi.cpp) @@ -21,22 +13,6 @@ raja_add_executable( NAME ex2_approx-pi_solution SOURCES ex2_approx-pi_solution.cpp) -raja_add_executable( - NAME ex3_colored-indexset - SOURCES ex3_colored-indexset.cpp) - -raja_add_executable( - NAME ex3_colored-indexset_solution - SOURCES ex3_colored-indexset_solution.cpp) - -raja_add_executable( - NAME ex4_atomic-histogram - SOURCES ex4_atomic-histogram.cpp) - -raja_add_executable( - NAME ex4_atomic-histogram_solution - SOURCES ex4_atomic-histogram_solution.cpp) - raja_add_executable( NAME ex5_line-of-sight SOURCES ex5_line-of-sight.cpp) @@ -53,14 +29,6 @@ raja_add_executable( NAME ex6_stencil-offset-layout_solution SOURCES ex6_stencil-offset-layout_solution.cpp) -raja_add_executable( - NAME ex7_nested-loop-reorder - SOURCES ex7_nested-loop-reorder.cpp) - -raja_add_executable( - NAME ex7_nested-loop-reorder_solution - SOURCES ex7_nested-loop-reorder_solution.cpp) - raja_add_executable( NAME ex8_tiled-matrix-transpose SOURCES ex8_tiled-matrix-transpose.cpp) diff --git a/exercises/tutorial_halfday/ex7_nested-loop-reorder.cpp b/exercises/tutorial_halfday/ex7_nested-loop-reorder.cpp deleted file mode 100644 index 2c33f46344..0000000000 --- 
a/exercises/tutorial_halfday/ex7_nested-loop-reorder.cpp +++ /dev/null @@ -1,158 +0,0 @@ -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2016-22, Lawrence Livermore National Security, LLC -// and RAJA project contributors. See the RAJA/LICENSE file for details. -// -// SPDX-License-Identifier: (BSD-3-Clause) -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - -#include -#include - -#include "RAJA/RAJA.hpp" - -/* - * EXERCISE #6: Nested Loop Reordering - * - * In this exercise, you will use RAJA::kernel execution policies - * to permute the order of loops in a triple loop nest. In particular, - * you will reorder loop statements in execution policies. The exercise - * does no actual computation and just prints out the loop indices to show - * the different orderings. - * - * To avoid the complexity of interpreting parallel output, the execution - * policies you will write will use sequential execution. - * - * RAJA features shown: - * - Index range segment - * - 'RAJA::kernel' loop abstractions and execution policies - * - Nested loop reordering - * - Strongly-typed loop indices - */ - -// -// Define three named loop index types used in the triply-nested loops. -// These will trigger compilation errors if lambda index argument ordering -// and types do not match the typed range index ordering. See final -// example in this file. -// -RAJA_INDEX_VALUE(KIDX, "KIDX"); -RAJA_INDEX_VALUE(JIDX, "JIDX"); -RAJA_INDEX_VALUE(IIDX, "IIDX"); - - -int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) -{ - - std::cout << "\n\nExercise #7: RAJA nested loop reorder example...\n"; - - std::cout << "\n Running C-style loop nest with loop ordering: K-outer, J-middle, I-inner" - << "...\n\n" << " (I, J, K)\n" << " ---------\n"; - - for (int k = 2; k < 4; ++k) { - for (int j = 1; j < 3; ++j) { - for (int i = 0; i < 2; ++i) { - printf( " (%d, %d, %d) \n", i, j, k); - } - } - } - -// -// The RAJA variants of the loop nest used following typed range segments -// based on the typed indices defined above, outside of main(). -// - RAJA::TypedRangeSegment KRange(2, 4); - RAJA::TypedRangeSegment JRange(1, 3); - RAJA::TypedRangeSegment IRange(0, 2); - -//----------------------------------------------------------------------------// - - std::cout << "\n Running RAJA nested loop example (K-outer, J-middle, I-inner)" - << "...\n\n" << " (I, J, K)\n" << " ---------\n"; - - using KJI_EXECPOL = RAJA::KernelPolicy< - RAJA::statement::For<2, RAJA::seq_exec, // k - RAJA::statement::For<1, RAJA::seq_exec, // j - RAJA::statement::For<0, RAJA::seq_exec,// i - RAJA::statement::Lambda<0> - > - > - > - >; - - RAJA::kernel( RAJA::make_tuple(IRange, JRange, KRange), - [=] (IIDX i, JIDX j, KIDX k) { - printf( " (%d, %d, %d) \n", (int)(*i), (int)(*j), (int)(*k)); - }); - - -//----------------------------------------------------------------------------// - - std::cout << "\n Running RAJA nested loop example (J-outer, I-middle, K-inner)" - << "...\n\n" << " (I, J, K)\n" << " ---------\n"; - - /// - /// TODO... - /// - /// EXERCISE: Define an execution policy (JIK_EXECPOL) that reorders the - /// loop nest so that the outer loop is the j-loop (slowest - /// running index), the inner loop is the k-loop (fastest - /// running index), and the i-loop is the middle loop. - /// - /// NOTE: You will have to enable this code section to compile and run it. 
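For the JIK exercise described above, one policy that produces j-outer, i-middle, k-inner ordering is sketched below; it only permutes the For statements of the KJI policy shown earlier in this file and is offered as a possible answer, not the shipped solution.

  // Possible JIK_EXECPOL sketch: outer loop over j (tuple index 1),
  // middle loop over i (index 0), inner loop over k (index 2).
  using JIK_EXECPOL = RAJA::KernelPolicy<
    RAJA::statement::For<1, RAJA::seq_exec,     // j
      RAJA::statement::For<0, RAJA::seq_exec,   // i
        RAJA::statement::For<2, RAJA::seq_exec, // k
          RAJA::statement::Lambda<0>
        >
      >
    >
  >;

  RAJA::kernel<JIK_EXECPOL>( RAJA::make_tuple(IRange, JRange, KRange),
    [=] (IIDX i, JIDX j, KIDX k) {
      printf( " (%d, %d, %d) \n", (int)(*i), (int)(*j), (int)(*k));
  });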
- /// -#if 0 - using JIK_EXECPOL = - - RAJA::kernel( RAJA::make_tuple(IRange, JRange, KRange), - [=] (IIDX i, JIDX j, KIDX k) { - printf( " (%d, %d, %d) \n", (int)(*i), (int)(*j), (int)(*k)); - }); -#endif - - -//----------------------------------------------------------------------------// - - std::cout << "\n Running RAJA nested loop example (I-outer, K-middle, J-inner)" - << "...\n\n" << " (I, J, K)\n" << " ---------\n"; - - /// - /// TODO... - /// - /// EXERCISE: Define an execution policy (IKJ_EXECPOL) that reorders the - /// loop nest so that the outer loop is the i-loop (slowest - /// running index), the inner loop is the j-loop (fastest - /// running index), and the k-loop is the middle loop. - /// - /// NOTE: You will have to enable this code section to compile and run it. - /// - -#if 0 - using IKJ_EXECPOL = - - RAJA::kernel( RAJA::make_tuple(IRange, JRange, KRange), - [=] (IIDX i, JIDX j, KIDX k) { - printf( " (%d, %d, %d) \n", (int)(*i), (int)(*j), (int)(*k)); - }); -#endif - - -#if 0 -//----------------------------------------------------------------------------// -// The following demonstrates that code will not compile if lambda argument -// types/order do not match the types/order For statements in the execution -// policy. To see this, enable this code section and try to compile this file. -//----------------------------------------------------------------------------// - - RAJA::kernel( RAJA::make_tuple(IRange, JRange, KRange), - [=] (JIDX i, IIDX j, KIDX k) { - printf( " (%d, %d, %d) \n", (int)(*i), (int)(*j), (int)(*k)); - }); - -#endif - - std::cout << "\n DONE!...\n"; - - return 0; -} - diff --git a/exercises/user-data.sh b/exercises/user-data.sh new file mode 100644 index 0000000000..dd557ae116 --- /dev/null +++ b/exercises/user-data.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +yum update -y +amazon-linux-extras install docker +systemctl start docker +systemctl enable docker + +wget https://raw.githubusercontent.com/LLNL/RAJA/task/tut-reorg-aws/exercises/Dockerfile +wget https://raw.githubusercontent.com/LLNL/RAJA/task/tut-reorg-aws/exercises/supervisord.conf + +env DOCKER_BUILDKIT=1 docker build . -t raja-aws-tut +docker run --init --gpus all -p 3000:3000 raja-aws-tut diff --git a/exercises/tutorial_halfday/ex1_vector-addition.cpp b/exercises/vector-addition.cpp similarity index 59% rename from exercises/tutorial_halfday/ex1_vector-addition.cpp rename to exercises/vector-addition.cpp index ae6a85c403..89b6e45fc4 100644 --- a/exercises/tutorial_halfday/ex1_vector-addition.cpp +++ b/exercises/vector-addition.cpp @@ -14,7 +14,7 @@ #include "memoryManager.hpp" /* - * EXERCISE #1: Vector Addition + * Vector Addition Exercise * * In this exercise, you will compute c = a + b, where a, b, c are * integer vectors. @@ -36,14 +36,19 @@ */ /* - CUDA_BLOCK_SIZE - specifies the number of threads in a CUDA thread block + Specify the number of threads in a GPU thread block +*/ +#if defined(RAJA_ENABLE_CUDA) +//constexpr int CUDA_BLOCK_SIZE = 256; +#endif - Uncomment to use when filling in exercises. 
+#if defined(RAJA_ENABLE_HIP) +//constexpr int HIP_BLOCK_SIZE = 256; +#endif -#if defined(RAJA_ENABLE_CUDA) -const int CUDA_BLOCK_SIZE = 256; +#if defined(RAJA_ENABLE_SYCL) +//constexpr int SYCL_BLOCK_SIZE = 256; #endif -*/ // // Functions for checking and printing arrays @@ -55,12 +60,17 @@ void printArray(int* v, int len); int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) { - std::cout << "\n\nExercise #1: RAJA Vector Addition...\n"; + std::cout << "\n\nExercise: RAJA Vector Addition...\n"; + +#if defined(RAJA_ENABLE_SYCL) + memoryManager::sycl_res = new camp::resources::Resource{camp::resources::Sycl()}; + ::RAJA::sycl::detail::setQueue(memoryManager::sycl_res); +#endif // // Define vector length // - const int N = 1000000; + constexpr int N = 1000000; // // Allocate and initialize vector data to random numbers in [1, 10]. @@ -84,9 +94,11 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::cout << "\n Running C-style sequential vector addition...\n"; + // _cstyle_vector_add_start for (int i = 0; i < N; ++i) { c_ref[i] = a[i] + b[i]; } + // _cstyle_vector_add_end //printArray(c_ref, N); @@ -108,11 +120,11 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) /// NOTE: We've done this one for you to help you get started... /// - using EXEC_POL1 = RAJA::seq_exec; - - RAJA::forall< EXEC_POL1 >(RAJA::RangeSegment(0, N), [=] (int i) { - c[i] = a[i] + b[i]; + // _rajaseq_vector_add_start + RAJA::forall(RAJA::TypedRangeSegment(0, N), [=] (int i) { + c[i] = a[i] + b[i]; }); + // _rajaseq_vector_add_end checkResult(c, c_ref, N); //printArray(c, N); @@ -212,15 +224,120 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::cout << "\n Running RAJA CUDA vector addition...\n"; + int *d_a = memoryManager::allocate_gpu(N); + int *d_b = memoryManager::allocate_gpu(N); + int *d_c = memoryManager::allocate_gpu(N); + + cudaErrchk(cudaMemcpy( d_a, a, N * sizeof(int), cudaMemcpyHostToDevice )); + cudaErrchk(cudaMemcpy( d_b, b, N * sizeof(int), cudaMemcpyHostToDevice )); + /// /// TODO... /// /// EXERCISE: Implement the vector addition kernel using a RAJA::forall /// method and RAJA::cuda_exec execution policy type. /// + /// NOTE: You will have to uncomment 'CUDA_BLOCK_SIZE' near the + /// top of the file if you want to use it here. + /// + + cudaErrchk(cudaMemcpy( c, d_c, N * sizeof(int), cudaMemcpyDeviceToHost )); checkResult(c, c_ref, N); //printArray(c, N); + +//----------------------------------------------------------------------------// +// RAJA::cuda_exec policy runs the loop as a CUDA kernel asynchronously on a +// GPU device with 2 blocks per SM. +//----------------------------------------------------------------------------// + + std::memset(c, 0, N * sizeof(int)); + + std::cout << "\n Running RAJA CUDA explicit (2 blocks per SM) vector addition...\n"; + + /// + /// TODO... + /// + /// EXERCISE: Implement the vector addition kernel using a RAJA::forall + /// method and RAJA::cuda_exec execution policy type with + /// arguments defining 2 blocks per SM and asynchronous execution. + /// + /// NOTE: You will have to uncomment 'CUDA_BLOCK_SIZE' near the + /// top of the file if you want to use it here. + /// + + cudaErrchk(cudaMemcpy( c, d_c, N * sizeof(int), cudaMemcpyDeviceToHost )); + + checkResult(c, c_ref, N); +//printResult(c, N); +#endif + +//----------------------------------------------------------------------------// +// RAJA::hip_exec policy runs the loop as a HIP kernel on a GPU device. 
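One way the CUDA TODO above could be filled in, assuming the commented-out CUDA_BLOCK_SIZE constant has been re-enabled; the solution file later in this diff follows the same pattern.

  // Sketch: RAJA CUDA vector addition over the device copies d_a, d_b, d_c.
  RAJA::forall< RAJA::cuda_exec<CUDA_BLOCK_SIZE> >(
    RAJA::TypedRangeSegment<int>(0, N), [=] RAJA_DEVICE (int i) {
      d_c[i] = d_a[i] + d_b[i];
  });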
+//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_HIP) + std::cout << "\n Running RAJA HIP vector addition...\n"; + + int *d_a = memoryManager::allocate_gpu(N); + int *d_b = memoryManager::allocate_gpu(N); + int *d_c = memoryManager::allocate_gpu(N); + + hipErrchk(hipMemcpy( d_a, a, N * sizeof(int), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy( d_b, b, N * sizeof(int), hipMemcpyHostToDevice )); + + /// + /// TODO... + /// + /// EXERCISE: Implement the vector addition kernel using a RAJA::forall + /// method and RAJA::hip_exec execution policy type. + /// + /// NOTE: You will have to uncomment 'HIP_BLOCK_SIZE' near the + /// top of the file if you want to use it here. + /// + + hipErrchk(hipMemcpy( c, d_c, N * sizeof(int), hipMemcpyDeviceToHost )); + + checkResult(c, c_ref, N); +//printResult(c, N); + + memoryManager::deallocate_gpu(d_a); + memoryManager::deallocate_gpu(d_b); + memoryManager::deallocate_gpu(d_c); +#endif + +//----------------------------------------------------------------------------// +// RAJA::sycl_exec policy runs the loop as a SYCL kernel. +//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_SYCL) + std::cout << "\n Running RAJA SYCL vector addition...\n"; + + int *d_a = memoryManager::allocate_gpu(N); + int *d_b = memoryManager::allocate_gpu(N); + int *d_c = memoryManager::allocate_gpu(N); + + memoryManager::sycl_res->memcpy(d_a, a, N * sizeof(int)); + memoryManager::sycl_res->memcpy(d_b, b, N * sizeof(int)); + + /// + /// TODO... + /// + /// EXERCISE: Implement the vector addition kernel using a RAJA::forall + /// method and RAJA::hip_exec execution policy type. + /// + /// NOTE: You will have to uncomment 'SYCL_BLOCK_SIZE' near the + /// top of the file if you want to use it here. + /// + + memoryManager::sycl_res->memcpy(c, d_c, N * sizeof(int)); + + checkResult(c, c_ref, N); +//printResult(c, N); + + memoryManager::deallocate_gpu(d_a); + memoryManager::deallocate_gpu(d_b); + memoryManager::deallocate_gpu(d_c); #endif //----------------------------------------------------------------------------// diff --git a/exercises/tutorial_halfday/ex1_vector-addition_solution.cpp b/exercises/vector-addition_solution.cpp similarity index 54% rename from exercises/tutorial_halfday/ex1_vector-addition_solution.cpp rename to exercises/vector-addition_solution.cpp index d02c2cb26f..31bf643488 100644 --- a/exercises/tutorial_halfday/ex1_vector-addition_solution.cpp +++ b/exercises/vector-addition_solution.cpp @@ -14,7 +14,7 @@ #include "memoryManager.hpp" /* - * EXERCISE #1: Vector Addition + * Vector Addition Exercise * * In this exercise, you will compute c = a + b, where a, b, c are * integer vectors. 
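The HIP and SYCL TODOs above follow the same shape. Note that the comment in the SYCL block mentions RAJA::hip_exec; given the SYCL_BLOCK_SIZE hint and the solution file below, the intended policy is presumably RAJA::sycl_exec. A sketch, assuming the corresponding block-size constants are uncommented:

  // Sketch: HIP variant of the vector-addition kernel.
  RAJA::forall< RAJA::hip_exec<HIP_BLOCK_SIZE> >(
    RAJA::TypedRangeSegment<int>(0, N), [=] RAJA_DEVICE (int i) {
      d_c[i] = d_a[i] + d_b[i];
  });

  // Sketch: SYCL variant (sycl_exec, despite the hip_exec wording in the
  // exercise comment).
  RAJA::forall< RAJA::sycl_exec<SYCL_BLOCK_SIZE> >(
    RAJA::TypedRangeSegment<int>(0, N), [=] RAJA_DEVICE (int i) {
      d_c[i] = d_a[i] + d_b[i];
  });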
@@ -36,10 +36,18 @@ */ /* - CUDA_BLOCK_SIZE - specifies the number of threads in a CUDA thread block + Specify the number of threads in a GPU thread block */ #if defined(RAJA_ENABLE_CUDA) -const int CUDA_BLOCK_SIZE = 256; +constexpr int CUDA_BLOCK_SIZE = 256; +#endif + +#if defined(RAJA_ENABLE_HIP) +constexpr int HIP_BLOCK_SIZE = 256; +#endif + +#if defined(RAJA_ENABLE_SYCL) +constexpr int SYCL_BLOCK_SIZE = 256; #endif // @@ -52,12 +60,17 @@ void printArray(int* v, int len); int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) { - std::cout << "\n\nExercise #1: RAJA Vector Addition...\n"; + std::cout << "\n\nExercise: RAJA Vector Addition...\n"; + +#if defined(RAJA_ENABLE_SYCL) + memoryManager::sycl_res = new camp::resources::Resource{camp::resources::Sycl()}; + ::RAJA::sycl::detail::setQueue(memoryManager::sycl_res); +#endif // // Define vector length // - const int N = 1000000; + constexpr int N = 1000000; // // Allocate and initialize vector data to random numbers in [1, 10]. @@ -81,9 +94,11 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::cout << "\n Running C-style sequential vector addition...\n"; + // _cstyle_vector_add_start for (int i = 0; i < N; ++i) { c_ref[i] = a[i] + b[i]; } + // _cstyle_vector_add_end //printArray(c_ref, N); @@ -96,11 +111,13 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::cout << "\n Running RAJA sequential vector addition...\n"; - using EXEC_POL1 = RAJA::seq_exec; - - RAJA::forall< EXEC_POL1 >(RAJA::RangeSegment(0, N), [=] (int i) { - c[i] = a[i] + b[i]; - }); + // _rajaseq_vector_add_start + RAJA::forall< RAJA::seq_exec >( + RAJA::TypedRangeSegment(0, N), [=] (int i) { + c[i] = a[i] + b[i]; + } + ); + // _rajaseq_vector_add_end checkResult(c, c_ref, N); //printArray(c, N); @@ -115,11 +132,11 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::cout << "\n Running RAJA SIMD vector addition...\n"; - using EXEC_POL2 = RAJA::simd_exec; - - RAJA::forall< EXEC_POL2 >(RAJA::RangeSegment(0, N), [=] (int i) { - c[i] = a[i] + b[i]; - }); + RAJA::forall( + RAJA::TypedRangeSegment(0, N), [=] (int i) { + c[i] = a[i] + b[i]; + } + ); checkResult(c, c_ref, N); //printArray(c, N); @@ -134,11 +151,11 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::cout << "\n Running RAJA loop-exec vector addition...\n"; - using EXEC_POL3 = RAJA::loop_exec; - - RAJA::forall< EXEC_POL3 >(RAJA::RangeSegment(0, N), [=] (int i) { - c[i] = a[i] + b[i]; - }); + RAJA::forall< RAJA::loop_exec >( + RAJA::TypedRangeSegment(0, N), [=] (int i) { + c[i] = a[i] + b[i]; + } + ); checkResult(c, c_ref, N); //printArray(c, N); @@ -176,11 +193,13 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::cout << "\n Running RAJA OpenMP multithreaded vector addition...\n"; - using EXEC_POL4 = RAJA::omp_parallel_for_exec; - - RAJA::forall< EXEC_POL4 >(RAJA::RangeSegment(0, N), [=] (int i) { - c[i] = a[i] + b[i]; - }); + // _rajaomp_vector_add_start + RAJA::forall< RAJA::omp_parallel_for_exec >( + RAJA::TypedRangeSegment(0, N), [=] (int i) { + c[i] = a[i] + b[i]; + } + ); + // _rajaomp_vector_add_end checkResult(c, c_ref, N); //printArray(c, N); @@ -197,14 +216,109 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::cout << "\n Running RAJA CUDA vector addition...\n"; - using EXEC_POL5 = RAJA::cuda_exec; + int *d_a = memoryManager::allocate_gpu(N); + int *d_b = memoryManager::allocate_gpu(N); + int *d_c = memoryManager::allocate_gpu(N); - RAJA::forall< 
EXEC_POL5 >(RAJA::RangeSegment(0, N), [=] RAJA_DEVICE (int i) { - c[i] = a[i] + b[i]; + cudaErrchk(cudaMemcpy( d_a, a, N * sizeof(int), cudaMemcpyHostToDevice )); + cudaErrchk(cudaMemcpy( d_b, b, N * sizeof(int), cudaMemcpyHostToDevice )); + + // _rajacuda_vector_add_start + RAJA::forall< RAJA::cuda_exec >(RAJA::TypedRangeSegment(0, N), + [=] RAJA_DEVICE (int i) { + d_c[i] = d_a[i] + d_b[i]; }); + // _rajacuda_vector_add_end + + cudaErrchk(cudaMemcpy( c, d_c, N * sizeof(int), cudaMemcpyDeviceToHost )); checkResult(c, c_ref, N); //printArray(c, N); + +//----------------------------------------------------------------------------// +// RAJA::cuda_exec policy runs the loop as a CUDA kernel asynchronously on a +// GPU device with 2 blocks per SM. +//----------------------------------------------------------------------------// + + std::memset(c, 0, N * sizeof(int)); + + std::cout << "\n Running RAJA CUDA explicit (2 blocks per SM) vector addition...\n"; + + // _rajacuda_explicit_vector_add_start + const bool Asynchronous = true; + + RAJA::forall>(RAJA::TypedRangeSegment(0, N), + [=] RAJA_DEVICE (int i) { + d_c[i] = d_a[i] + d_b[i]; + }); + // _rajacuda_explicit_vector_add_end + + cudaErrchk(cudaMemcpy( c, d_c, N * sizeof(int), cudaMemcpyDeviceToHost )); + + checkResult(c, c_ref, N); +//printResult(c, N); +#endif + +//----------------------------------------------------------------------------// +// RAJA::hip_exec policy runs the loop as a HIP kernel on a GPU device. +//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_HIP) + std::cout << "\n Running RAJA HIP vector addition...\n"; + + int *d_a = memoryManager::allocate_gpu(N); + int *d_b = memoryManager::allocate_gpu(N); + int *d_c = memoryManager::allocate_gpu(N); + + hipErrchk(hipMemcpy( d_a, a, N * sizeof(int), hipMemcpyHostToDevice )); + hipErrchk(hipMemcpy( d_b, b, N * sizeof(int), hipMemcpyHostToDevice )); + + // _rajahip_vector_add_start + RAJA::forall>(RAJA::TypedRangeSegment(0, N), + [=] RAJA_DEVICE (int i) { + d_c[i] = d_a[i] + d_b[i]; + }); + // _rajahip_vector_add_end + + hipErrchk(hipMemcpy( c, d_c, N * sizeof(int), hipMemcpyDeviceToHost )); + + checkResult(c, c_ref, N); +//printResult(c, N); + + memoryManager::deallocate_gpu(d_a); + memoryManager::deallocate_gpu(d_b); + memoryManager::deallocate_gpu(d_c); +#endif + +//----------------------------------------------------------------------------// +// RAJA::sycl_exec policy runs the loop as a SYCL kernel. 
+//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_SYCL) + std::cout << "\n Running RAJA SYCL vector addition...\n"; + + int *d_a = memoryManager::allocate_gpu(N); + int *d_b = memoryManager::allocate_gpu(N); + int *d_c = memoryManager::allocate_gpu(N); + + memoryManager::sycl_res->memcpy(d_a, a, N * sizeof(int)); + memoryManager::sycl_res->memcpy(d_b, b, N * sizeof(int)); + + // _rajasycl_vector_add_start + RAJA::forall>(RAJA::TypedRangeSegment(0, N), + [=] RAJA_DEVICE (int i) { + d_c[i] = d_a[i] + d_b[i]; + }); + // _rajasycl_vector_add_end + + memoryManager::sycl_res->memcpy(c, d_c, N * sizeof(int)); + + checkResult(c, c_ref, N); +//printResult(c, N); + + memoryManager::deallocate_gpu(d_a); + memoryManager::deallocate_gpu(d_b); + memoryManager::deallocate_gpu(d_c); #endif //----------------------------------------------------------------------------// diff --git a/exercises/tutorial_halfday/ex3_colored-indexset.cpp b/exercises/vertexsum-indexset.cpp similarity index 51% rename from exercises/tutorial_halfday/ex3_colored-indexset.cpp rename to exercises/vertexsum-indexset.cpp index f42d047648..60709ddee9 100644 --- a/exercises/tutorial_halfday/ex3_colored-indexset.cpp +++ b/exercises/vertexsum-indexset.cpp @@ -18,10 +18,10 @@ #include "memoryManager.hpp" /* - * EXERCISE #3: Mesh vertex area with "colored" TypedIndexSet + * Mesh vertex area exercise * * In this exercise, you will use a RAJA TypedIndexSet containing 4 - * ListSegments to parallelize the mesh vertex area computation. + * TypedListSegments to parallelize the mesh vertex area computation. * A sum is computed at each vertex on a logically-Cartesian 2D mesh * where the sum represents the vertex "area" as an average of the 4 * element areas surrounding the vertex. The computation is written as @@ -29,7 +29,7 @@ * contributions may be written to the same vertex value at the same time, * the elements are partitioned into 4 subsets, where no two elements in * each subset share a vertex. A ListSegment enumerates the elements in - * each subset. When the ListSegments are put into an TypedIndexSet, the entire + * each subset. When the ListSegments are put into an IndexSet, the entire * computation can be executed with one RAJA::forall() statement, where * you iterate over the segments sequentially and execute each segment in * parallel. This exercise illustrates how RAJA can be used to enable one @@ -43,22 +43,23 @@ * * RAJA features you will use: * - `forall` loop iteration template method - * - Index list segment - * - TypedIndexSet segment container + * - List segment + * - IndexSet segment container * - Hierarchical execution policies * * If CUDA is enabled, CUDA unified memory is used. */ /* - CUDA_BLOCK_SIZE - specifies the number of threads in a CUDA thread block - - Uncomment to use when filling in exercises. - + Specify the number of threads in a GPU thread block +*/ #if defined(RAJA_ENABLE_CUDA) -const int CUDA_BLOCK_SIZE = 256; +constexpr int CUDA_BLOCK_SIZE = 256; +#endif + +#if defined(RAJA_ENABLE_HIP) +constexpr int HIP_BLOCK_SIZE = 256; #endif -*/ // // Functions to check and print result. @@ -70,47 +71,50 @@ void printMeshData(double* v, int n, int joff); int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) { - std::cout << "\n\nExercise #3: Mesh vertex area with 'colored' TypedIndexSet...\n"; + std::cout << "\n\nExercise: Mesh vertex area with 'colored' IndexSet...\n"; +// _vertexsum_define_start // // 2D mesh has N^2 elements (N+1)^2 vertices. 
// - const int N = 1000; - const int Nelem = N; - const int Nvert = N + 1; - double* areae = memoryManager::allocate(Nelem*Nelem); - double* areav = memoryManager::allocate(Nvert*Nvert); - double* areav_ref = memoryManager::allocate(Nvert*Nvert); - int* e2v_map = memoryManager::allocate(4*Nelem*Nelem); - + constexpr int N = 1000; + constexpr int Nelem = N; + constexpr int Nelem_tot = Nelem * Nelem; + constexpr int Nvert = N + 1; + constexpr int Nvert_tot = Nvert * Nvert; +// _vertexsum_define_end + double* areae = memoryManager::allocate(Nelem_tot); + double* areav = memoryManager::allocate(Nvert_tot); + double* areav_ref = memoryManager::allocate(Nvert_tot); + int* e2v_map = memoryManager::allocate(4*Nelem_tot); + +// _vertexsum_elemarea_start // // Define mesh spacing factor 'h' and set up elem to vertex mapping array. // - double h = 0.1; - - for (int j = 0 ; j < Nelem ; ++j) { - for (int i = 0 ; i < Nelem ; ++i) { - int ielem = i + j*Nelem ; - int imap = 4 * ielem ; - e2v_map[imap] = ielem + j; - e2v_map[imap+1] = ielem + j + 1; - e2v_map[imap+2] = ielem + j + Nvert; - e2v_map[imap+3] = ielem + j + 1 + Nvert; - } + constexpr double h = 0.1; + + for (int ie = 0; ie < Nelem_tot; ++ie) { + int j = ie / Nelem; + int imap = 4 * ie ; + e2v_map[imap] = ie + j; + e2v_map[imap+1] = ie + j + 1; + e2v_map[imap+2] = ie + j + Nvert; + e2v_map[imap+3] = ie + j + 1 + Nvert; } // // Initialize element areas so each element area // depends on the i,j coordinates of the element. // - std::memset(areae, 0, Nelem*Nelem * sizeof(double)); + std::memset(areae, 0, Nelem_tot * sizeof(double)); - for (int j = 0 ; j < Nelem ; ++j) { - for (int i = 0 ; i < Nelem ; ++i) { - int ielem = i + j*Nelem ; - areae[ielem] = h*(i+1) * h*(j+1); - } + for (int ie = 0; ie < Nelem_tot; ++ie) { + int i = ie % Nelem; + int j = ie / Nelem; + areae[ie] = h*(i+1) * h*(j+1); } +// _vertexsum_elemarea_end //std::cout << "\n Element areas...\n"; //printMeshData(areae, Nelem, Nelem); @@ -121,15 +125,17 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::cout << "\n Running sequential C-style version of vertex sum...\n"; - std::memset(areav_ref, 0, Nvert*Nvert * sizeof(double)); +// _cstyle_vertexarea_seq_start + std::memset(areav_ref, 0, Nvert_tot * sizeof(double)); - for (int ie = 0; ie < Nelem*Nelem; ++ie) { - int* iv = &(e2v_map[4*ie]); - areav_ref[ iv[0] ] += areae[ie] / 4.0 ; - areav_ref[ iv[1] ] += areae[ie] / 4.0 ; - areav_ref[ iv[2] ] += areae[ie] / 4.0 ; - areav_ref[ iv[3] ] += areae[ie] / 4.0 ; + for (int ie = 0; ie < Nelem_tot; ++ie) { + int* iv = &(e2v_map[4*ie]); + areav_ref[ iv[0] ] += areae[ie] / 4.0 ; + areav_ref[ iv[1] ] += areae[ie] / 4.0 ; + areav_ref[ iv[2] ] += areae[ie] / 4.0 ; + areav_ref[ iv[3] ] += areae[ie] / 4.0 ; } +// _cstyle_vertexarea_seq_end //std::cout << "\n Vertex areas (reference)...\n"; //printMeshData(areav_ref, Nvert, jvoff); @@ -153,33 +159,34 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // Since none of the elements with the same number share a common vertex, // we can iterate over each subset ("color") in parallel. // -// We use RAJA ListSegments and a RAJA TypedIndexSet to define the element +// We use RAJA ListSegments and a RAJA IndexSet to define the element // partitioning. // +// _vertexarea_color_start // // Gather the element indices for each color in a vector. 
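The flattened element-to-vertex formula introduced earlier in this file (ie + j, ie + j + 1, ie + j + Nvert, ie + j + 1 + Nvert) is equivalent to the usual (i, j) vertex indexing because ie = i + j*Nelem and Nvert = Nelem + 1, so ie + j = i + j*Nvert. A small standalone check of that identity follows; the helper name checkE2VFormula is ours, not part of the exercise.

// Sketch: verify the four corner-vertex indices of element ie on an
// Nelem x Nelem mesh with (Nelem+1) x (Nelem+1) vertices.
#include <cassert>

void checkE2VFormula(int Nelem)
{
  const int Nvert = Nelem + 1;
  for (int ie = 0; ie < Nelem * Nelem; ++ie) {
    const int i = ie % Nelem;
    const int j = ie / Nelem;
    assert(ie + j             == i     + j     * Nvert);   // lower-left
    assert(ie + j + 1         == (i+1) + j     * Nvert);   // lower-right
    assert(ie + j + Nvert     == i     + (j+1) * Nvert);   // upper-left
    assert(ie + j + 1 + Nvert == (i+1) + (j+1) * Nvert);   // upper-right
  }
}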
// std::vector< std::vector > idx(4); - for (int j = 0 ; j < Nelem ; ++j) { - for (int i = 0 ; i < Nelem ; ++i) { - int ie = i + j*Nelem ; - if ( i % 2 == 0 ) { - if ( j % 2 == 0 ) { - idx[0].push_back(ie); - } else { - idx[2].push_back(ie); - } + for (int ie = 0; ie < Nelem_tot; ++ie) { + int i = ie % Nelem; + int j = ie / Nelem; + if ( i % 2 == 0 ) { + if ( j % 2 == 0 ) { + idx[0].push_back(ie); + } else { + idx[2].push_back(ie); + } + } else { + if ( j % 2 == 0 ) { + idx[1].push_back(ie); } else { - if ( j % 2 == 0 ) { - idx[1].push_back(ie); - } else { - idx[3].push_back(ie); - } + idx[3].push_back(ie); } } } +// _vertexarea_color_end //----------------------------------------------------------------------------// @@ -191,7 +198,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::cout << "\n Running C-style OpenMP vertex sum...\n"; - std::memset(areav, 0, Nvert*Nvert * sizeof(double)); + +// _cstyle_vertexarea_omp_start + std::memset(areav, 0, Nvert_tot * sizeof(double)); for (int icol = 0; icol < 4; ++icol) { const std::vector& ievec = idx[icol]; @@ -208,6 +217,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } } +// _cstyle_vertexarea_omp_end checkResult(areav, areav_ref, Nvert); //std::cout << "\n Vertex areas (reference)...\n"; @@ -216,13 +226,13 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) #endif -// The TypedIndexSet is a variadic template, where the template arguments -// are the segment types that the TypedIndexSet can hold. -// -#if defined(RAJA_ENABLE_OPENMP) || defined(RAJA_ENABLE_CUDA) -#if 0 // needed for exercises, but if-def'd out to quiet compiler warnings. +// The IndexSet is a variadic template, where the template arguments +// are the segment types that the IndexSet can hold. +// +#if defined(RAJA_ENABLE_OPENMP) || defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) +// _vertexarea_listsegtype_start using SegmentType = RAJA::TypedListSegment; -#endif +// _vertexarea_listsegtype_end #endif #if defined(RAJA_ENABLE_OPENMP) @@ -233,25 +243,25 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // camp::resources::Resource host_res{camp::resources::Host()}; -// -// Create a RAJA TypedIndexSet with four ListSegments, one for the indices of -// the elements in each subsut. This will be used in the RAJA OpenMP and CUDA +// +// Create a RAJA IndexSet with four ListSegments, one for the indices of +// the elements in each subsut. This will be used in the RAJA OpenMP and CUDA // variants of the vertex sum calculation. -#if 0 // needed for exercises, but if-def'd out to quiet compiler warnings. + RAJA::TypedIndexSet colorset; -#endif + + colorset.push_back( SegmentType(&idx[0][0], idx[0].size(), host_res) ); /// /// TODO... /// - /// EXERCISE: Add four SegmentType objects to the coloret, one for each of - /// the 'idx' arrays above. Remember to pass the 'host_res' - /// object to the SegmentType constructor. + /// EXERCISE: Add the three list segments to the index set to account + /// for all mesh elements. Then, run the OpenMP kernel variant + /// below to check if it's correct. 
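For reference, the remaining segments asked for in the TODO above follow the same pattern as the idx[0] segment already added; this is the form used by the solution file later in this diff.

  colorset.push_back( SegmentType(&idx[1][0], idx[1].size(), host_res) );
  colorset.push_back( SegmentType(&idx[2][0], idx[2].size(), host_res) );
  colorset.push_back( SegmentType(&idx[3][0], idx[3].size(), host_res) );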
/// - //----------------------------------------------------------------------------// -// RAJA OpenMP vertex sum calculation using TypedIndexSet (sequential iteration +// RAJA OpenMP vertex sum calculation using IndexSet (sequential iteration // over segments, OpenMP parallel iteration of each segment) //----------------------------------------------------------------------------// @@ -259,19 +269,18 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::memset(areav, 0, Nvert*Nvert * sizeof(double)); - /// - /// TODO... - /// - /// EXERCISE: Implement the vertex sum kernel a RAJA::forall - /// method with execution policy type - /// - /// RAJA::ExecPolicy - /// - /// so that the kernel iterates over the segments sequentially - /// and executes each segment in parallel using OpenMP. - /// +// _raja_vertexarea_omp_start + using EXEC_POL1 = RAJA::ExecPolicy; + RAJA::forall(colorset, [=](int ie) { + int* iv = &(e2v_map[4*ie]); + areav[ iv[0] ] += areae[ie] / 4.0 ; + areav[ iv[1] ] += areae[ie] / 4.0 ; + areav[ iv[2] ] += areae[ie] / 4.0 ; + areav[ iv[3] ] += areae[ie] / 4.0 ; + }); +// _raja_vertexarea_omp_end checkResult(areav, areav_ref, Nvert); //std::cout << "\n Vertex volumes...\n"; @@ -281,7 +290,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) //----------------------------------------------------------------------------// -// RAJA CUDA vertex sum calculation using TypedIndexSet (sequential iteration +// RAJA CUDA vertex sum calculation using IndexSet (sequential iteration // over segments, CUDA kernel launched for each segment) //----------------------------------------------------------------------------// @@ -289,58 +298,112 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Resource object used to construct list segment objects with indices -// living in host (CPU) memory. +// living in device (GPU) memory. // camp::resources::Resource cuda_res{camp::resources::Cuda()}; -#if 0 // needed for exercises, but if-def'd out to quiet compiler warnings. - RAJA::TypedIndexSet cuda_colorset; -#endif +// +// Create a RAJA IndexSet with four ListSegments, one for the indices of +// the elements in each subsut. This will be used in the RAJA OpenMP and CUDA +// variants of the vertex sum calculation. - /// - /// TODO... - /// - /// EXERCISE: Add four SegmentType objects to the cuda_coloret, one for - /// each of the 'idx' arrays above. Remember to pass the 'cuda_res' - /// object to the SegmentType constructor. - /// + RAJA::TypedIndexSet cuda_colorset; + cuda_colorset.push_back( SegmentType(&idx[0][0], idx[0].size(), cuda_res) ); /// /// TODO... /// - /// EXERCISE: Implement the vertex sum kernel a RAJA::forall - /// method with execution policy type - /// - /// RAJA::ExecPolicy> + /// EXERCISE: Add the three list segments to the index set to account + /// for all mesh elements. Then, run the CUDA kernel variant + /// below to check if it's correct. /// - /// so that the kernel iterates over the segments sequentially - /// and executes each segment in parallel as a CUDA kernel. - std::cout << "\n Running RAJA CUDA index set vertex sum...\n"; std::memset(areav, 0, Nvert*Nvert * sizeof(double)); - /// - /// TODO... - /// - /// EXERCISE: Implement the vertex sum kernel a RAJA::forall - /// method with execution policy type - /// - /// RAJA::ExecPolicy> - /// - /// so that the kernel iterates over the segments sequentially - /// and executes each segment in parallel as a CUDA kernel. 
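Similarly, the CUDA index set TODO can be completed by adding the remaining three segments with the device resource, so the segment indices live in GPU memory; a sketch mirroring the HIP variant shown below:

  cuda_colorset.push_back( SegmentType(&idx[1][0], idx[1].size(), cuda_res) );
  cuda_colorset.push_back( SegmentType(&idx[2][0], idx[2].size(), cuda_res) );
  cuda_colorset.push_back( SegmentType(&idx[3][0], idx[3].size(), cuda_res) );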
- /// +// _raja_vertexarea_cuda_start + using EXEC_POL2 = RAJA::ExecPolicy>; + + RAJA::forall(cuda_colorset, [=] RAJA_DEVICE (int ie) { + int* iv = &(e2v_map[4*ie]); + areav[ iv[0] ] += areae[ie] / 4.0 ; + areav[ iv[1] ] += areae[ie] / 4.0 ; + areav[ iv[2] ] += areae[ie] / 4.0 ; + areav[ iv[3] ] += areae[ie] / 4.0 ; + }); +// _raja_vertexarea_cuda_end + + checkResult(areav, areav_ref, Nvert); +//std::cout << "\n Vertex volumes...\n"; +//printMeshData(areav, Nvert, jvoff); + +#endif + +//----------------------------------------------------------------------------// +// RAJA HIP vertex sum calculation using IndexSet (sequential iteration +// over segments, HIP kernel launched for each segment) +//----------------------------------------------------------------------------// +#if defined(RAJA_ENABLE_HIP) +// +// Allocate and initialize device memory arrays +// + double* d_areae = memoryManager::allocate_gpu(Nelem_tot); + double* d_areav = memoryManager::allocate_gpu(Nvert_tot); + int* d_e2v_map = memoryManager::allocate_gpu(4*Nelem_tot); + + hipMemcpy(d_areae, areae, Nelem_tot*sizeof(double), hipMemcpyHostToDevice); + hipMemcpy(d_e2v_map, e2v_map, 4*Nelem_tot*sizeof(int), hipMemcpyHostToDevice); + + std::memset(areav, 0, Nvert_tot * sizeof(double)); + hipMemcpy(d_areav, areav, Nvert_tot*sizeof(double), hipMemcpyHostToDevice); + +// +// Resource object used to construct list segment objects with indices +// living in device (GPU) memory. +// + camp::resources::Resource hip_res{camp::resources::Hip()}; + +// +// Create a RAJA IndexSet with four ListSegments, one for the indices of +// the elements in each subsut. This will be used in the RAJA OpenMP and CUDA +// variants of the vertex sum calculation. + + RAJA::TypedIndexSet hip_colorset; + + hip_colorset.push_back( SegmentType(&idx[0][0], idx[0].size(), hip_res) ); + hip_colorset.push_back( SegmentType(&idx[1][0], idx[1].size(), hip_res) ); + hip_colorset.push_back( SegmentType(&idx[2][0], idx[2].size(), hip_res) ); + hip_colorset.push_back( SegmentType(&idx[3][0], idx[3].size(), hip_res) ); + + std::cout << "\n Running RAJA HIP index set vertex sum...\n"; + +// _raja_vertexarea_hip_start + using EXEC_POL3 = RAJA::ExecPolicy>; + + RAJA::forall(hip_colorset, [=] RAJA_DEVICE (int ie) { + int* iv = &(d_e2v_map[4*ie]); + d_areav[ iv[0] ] += d_areae[ie] / 4.0 ; + d_areav[ iv[1] ] += d_areae[ie] / 4.0 ; + d_areav[ iv[2] ] += d_areae[ie] / 4.0 ; + d_areav[ iv[3] ] += d_areae[ie] / 4.0 ; + }); +// _raja_vertexarea_hip_end + + hipMemcpy(areav, d_areav, Nvert_tot*sizeof(double), hipMemcpyDeviceToHost); checkResult(areav, areav_ref, Nvert); //std::cout << "\n Vertex volumes...\n"; //printMeshData(areav, Nvert, jvoff); + memoryManager::deallocate_gpu(d_areae); + memoryManager::deallocate_gpu(d_areav); + memoryManager::deallocate_gpu(d_e2v_map); + #endif //----------------------------------------------------------------------------// diff --git a/exercises/tutorial_halfday/ex3_colored-indexset_solution.cpp b/exercises/vertexsum-indexset_solution.cpp similarity index 59% rename from exercises/tutorial_halfday/ex3_colored-indexset_solution.cpp rename to exercises/vertexsum-indexset_solution.cpp index 98804fb933..e941c7ec51 100644 --- a/exercises/tutorial_halfday/ex3_colored-indexset_solution.cpp +++ b/exercises/vertexsum-indexset_solution.cpp @@ -18,10 +18,10 @@ #include "memoryManager.hpp" /* - * EXERCISE #3: Mesh vertex area with "colored" TypedIndexSet + * Mesh vertex area exercise * * In this exercise, you will use a RAJA TypedIndexSet containing 4 - * 
ListSegments to parallelize the mesh vertex area computation. + * TypedListSegments to parallelize the mesh vertex area computation. * A sum is computed at each vertex on a logically-Cartesian 2D mesh * where the sum represents the vertex "area" as an average of the 4 * element areas surrounding the vertex. The computation is written as @@ -29,7 +29,7 @@ * contributions may be written to the same vertex value at the same time, * the elements are partitioned into 4 subsets, where no two elements in * each subset share a vertex. A ListSegment enumerates the elements in - * each subset. When the ListSegments are put into an TypedIndexSet, the entire + * each subset. When the ListSegments are put into an IndexSet, the entire * computation can be executed with one RAJA::forall() statement, where * you iterate over the segments sequentially and execute each segment in * parallel. This exercise illustrates how RAJA can be used to enable one @@ -43,18 +43,22 @@ * * RAJA features you will use: * - `forall` loop iteration template method - * - Index list segment - * - TypedIndexSet segment container + * - List segment + * - IndexSet segment container * - Hierarchical execution policies * * If CUDA is enabled, CUDA unified memory is used. */ /* - CUDA_BLOCK_SIZE - specifies the number of threads in a CUDA thread block + Specify the number of threads in a GPU thread block */ #if defined(RAJA_ENABLE_CUDA) -const int CUDA_BLOCK_SIZE = 256; +constexpr int CUDA_BLOCK_SIZE = 256; +#endif + +#if defined(RAJA_ENABLE_HIP) +constexpr int HIP_BLOCK_SIZE = 256; #endif // @@ -67,47 +71,50 @@ void printMeshData(double* v, int n, int joff); int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) { - std::cout << "\n\nExercise #3: Mesh vertex area with 'colored' TypedIndexSet...\n"; + std::cout << "\n\nExercise: Mesh vertex area with 'colored' IndexSet...\n"; +// _vertexsum_define_start // // 2D mesh has N^2 elements (N+1)^2 vertices. // - const int N = 1000; - const int Nelem = N; - const int Nvert = N + 1; - double* areae = memoryManager::allocate(Nelem*Nelem); - double* areav = memoryManager::allocate(Nvert*Nvert); - double* areav_ref = memoryManager::allocate(Nvert*Nvert); - int* e2v_map = memoryManager::allocate(4*Nelem*Nelem); - + constexpr int N = 1000; + constexpr int Nelem = N; + constexpr int Nelem_tot = Nelem * Nelem; + constexpr int Nvert = N + 1; + constexpr int Nvert_tot = Nvert * Nvert; +// _vertexsum_define_end + double* areae = memoryManager::allocate(Nelem_tot); + double* areav = memoryManager::allocate(Nvert_tot); + double* areav_ref = memoryManager::allocate(Nvert_tot); + int* e2v_map = memoryManager::allocate(4*Nelem_tot); + +// _vertexsum_elemarea_start // // Define mesh spacing factor 'h' and set up elem to vertex mapping array. // - double h = 0.1; - - for (int j = 0 ; j < Nelem ; ++j) { - for (int i = 0 ; i < Nelem ; ++i) { - int ielem = i + j*Nelem ; - int imap = 4 * ielem ; - e2v_map[imap] = ielem + j; - e2v_map[imap+1] = ielem + j + 1; - e2v_map[imap+2] = ielem + j + Nvert; - e2v_map[imap+3] = ielem + j + 1 + Nvert; - } + constexpr double h = 0.1; + + for (int ie = 0; ie < Nelem_tot; ++ie) { + int j = ie / Nelem; + int imap = 4 * ie ; + e2v_map[imap] = ie + j; + e2v_map[imap+1] = ie + j + 1; + e2v_map[imap+2] = ie + j + Nvert; + e2v_map[imap+3] = ie + j + 1 + Nvert; } // // Initialize element areas so each element area // depends on the i,j coordinates of the element. 
// - std::memset(areae, 0, Nelem*Nelem * sizeof(double)); + std::memset(areae, 0, Nelem_tot * sizeof(double)); - for (int j = 0 ; j < Nelem ; ++j) { - for (int i = 0 ; i < Nelem ; ++i) { - int ielem = i + j*Nelem ; - areae[ielem] = h*(i+1) * h*(j+1); - } + for (int ie = 0; ie < Nelem_tot; ++ie) { + int i = ie % Nelem; + int j = ie / Nelem; + areae[ie] = h*(i+1) * h*(j+1); } +// _vertexsum_elemarea_end //std::cout << "\n Element areas...\n"; //printMeshData(areae, Nelem, Nelem); @@ -118,15 +125,17 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::cout << "\n Running sequential C-style version of vertex sum...\n"; - std::memset(areav_ref, 0, Nvert*Nvert * sizeof(double)); +// _cstyle_vertexarea_seq_start + std::memset(areav_ref, 0, Nvert_tot * sizeof(double)); - for (int ie = 0; ie < Nelem*Nelem; ++ie) { - int* iv = &(e2v_map[4*ie]); - areav_ref[ iv[0] ] += areae[ie] / 4.0 ; - areav_ref[ iv[1] ] += areae[ie] / 4.0 ; - areav_ref[ iv[2] ] += areae[ie] / 4.0 ; - areav_ref[ iv[3] ] += areae[ie] / 4.0 ; + for (int ie = 0; ie < Nelem_tot; ++ie) { + int* iv = &(e2v_map[4*ie]); + areav_ref[ iv[0] ] += areae[ie] / 4.0 ; + areav_ref[ iv[1] ] += areae[ie] / 4.0 ; + areav_ref[ iv[2] ] += areae[ie] / 4.0 ; + areav_ref[ iv[3] ] += areae[ie] / 4.0 ; } +// _cstyle_vertexarea_seq_end //std::cout << "\n Vertex areas (reference)...\n"; //printMeshData(areav_ref, Nvert, jvoff); @@ -150,33 +159,34 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // Since none of the elements with the same number share a common vertex, // we can iterate over each subset ("color") in parallel. // -// We use RAJA ListSegments and a RAJA TypedIndexSet to define the element +// We use RAJA ListSegments and a RAJA IndexSet to define the element // partitioning. // +// _vertexarea_color_start // // Gather the element indices for each color in a vector. // std::vector< std::vector > idx(4); - for (int j = 0 ; j < Nelem ; ++j) { - for (int i = 0 ; i < Nelem ; ++i) { - int ie = i + j*Nelem ; - if ( i % 2 == 0 ) { - if ( j % 2 == 0 ) { - idx[0].push_back(ie); - } else { - idx[2].push_back(ie); - } + for (int ie = 0; ie < Nelem_tot; ++ie) { + int i = ie % Nelem; + int j = ie / Nelem; + if ( i % 2 == 0 ) { + if ( j % 2 == 0 ) { + idx[0].push_back(ie); } else { - if ( j % 2 == 0 ) { - idx[1].push_back(ie); - } else { - idx[3].push_back(ie); - } + idx[2].push_back(ie); + } + } else { + if ( j % 2 == 0 ) { + idx[1].push_back(ie); + } else { + idx[3].push_back(ie); } } } +// _vertexarea_color_end //----------------------------------------------------------------------------// @@ -188,7 +198,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::cout << "\n Running C-style OpenMP vertex sum...\n"; - std::memset(areav, 0, Nvert*Nvert * sizeof(double)); + +// _cstyle_vertexarea_omp_start + std::memset(areav, 0, Nvert_tot * sizeof(double)); for (int icol = 0; icol < 4; ++icol) { const std::vector& ievec = idx[icol]; @@ -205,6 +217,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } } +// _cstyle_vertexarea_omp_end checkResult(areav, areav_ref, Nvert); //std::cout << "\n Vertex areas (reference)...\n"; @@ -213,11 +226,13 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) #endif -// The TypedIndexSet is a variadic template, where the template arguments -// are the segment types that the TypedIndexSet can hold. +// The IndexSet is a variadic template, where the template arguments +// are the segment types that the IndexSet can hold. 
// -#if defined(RAJA_ENABLE_OPENMP) || defined(RAJA_ENABLE_CUDA) +#if defined(RAJA_ENABLE_OPENMP) || defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) +// _vertexarea_listsegtype_start using SegmentType = RAJA::TypedListSegment; +// _vertexarea_listsegtype_end #endif #if defined(RAJA_ENABLE_OPENMP) @@ -229,19 +244,21 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) camp::resources::Resource host_res{camp::resources::Host()}; // -// Create a RAJA TypedIndexSet with four ListSegments, one for the indices of +// Create a RAJA IndexSet with four ListSegments, one for the indices of // the elements in each subsut. This will be used in the RAJA OpenMP and CUDA // variants of the vertex sum calculation. +// _vertexarea_indexset_start RAJA::TypedIndexSet colorset; colorset.push_back( SegmentType(&idx[0][0], idx[0].size(), host_res) ); colorset.push_back( SegmentType(&idx[1][0], idx[1].size(), host_res) ); colorset.push_back( SegmentType(&idx[2][0], idx[2].size(), host_res) ); - colorset.push_back( SegmentType(&idx[3][0], idx[3].size(), host_res) ); + colorset.push_back( SegmentType(&idx[3][0], idx[3].size(), host_res) ); +// _vertexarea_indexset_end //----------------------------------------------------------------------------// -// RAJA OpenMP vertex sum calculation using TypedIndexSet (sequential iteration +// RAJA OpenMP vertex sum calculation using IndexSet (sequential iteration // over segments, OpenMP parallel iteration of each segment) //----------------------------------------------------------------------------// @@ -249,16 +266,18 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::memset(areav, 0, Nvert*Nvert * sizeof(double)); - using EXEC_POL3 = RAJA::ExecPolicy; - RAJA::forall(colorset, [=](int ie) { + RAJA::forall(colorset, [=](int ie) { int* iv = &(e2v_map[4*ie]); areav[ iv[0] ] += areae[ie] / 4.0 ; areav[ iv[1] ] += areae[ie] / 4.0 ; areav[ iv[2] ] += areae[ie] / 4.0 ; areav[ iv[3] ] += areae[ie] / 4.0 ; }); +// _raja_vertexarea_omp_end checkResult(areav, areav_ref, Nvert); //std::cout << "\n Vertex volumes...\n"; @@ -268,7 +287,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) //----------------------------------------------------------------------------// -// RAJA CUDA vertex sum calculation using TypedIndexSet (sequential iteration +// RAJA CUDA vertex sum calculation using IndexSet (sequential iteration // over segments, CUDA kernel launched for each segment) //----------------------------------------------------------------------------// @@ -276,12 +295,12 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) // // Resource object used to construct list segment objects with indices -// living in host (CPU) memory. +// living in device (GPU) memory. // camp::resources::Resource cuda_res{camp::resources::Cuda()}; // -// Create a RAJA TypedIndexSet with four ListSegments, one for the indices of +// Create a RAJA IndexSet with four ListSegments, one for the indices of // the elements in each subsut. This will be used in the RAJA OpenMP and CUDA // variants of the vertex sum calculation. 
@@ -296,16 +315,18 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::memset(areav, 0, Nvert*Nvert * sizeof(double)); - using EXEC_POL4 = RAJA::ExecPolicy>; - RAJA::forall(cuda_colorset, [=] RAJA_DEVICE (int ie) { + RAJA::forall(cuda_colorset, [=] RAJA_DEVICE (int ie) { int* iv = &(e2v_map[4*ie]); areav[ iv[0] ] += areae[ie] / 4.0 ; areav[ iv[1] ] += areae[ie] / 4.0 ; areav[ iv[2] ] += areae[ie] / 4.0 ; areav[ iv[3] ] += areae[ie] / 4.0 ; }); +// _raja_vertexarea_cuda_end checkResult(areav, areav_ref, Nvert); //std::cout << "\n Vertex volumes...\n"; @@ -313,6 +334,70 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) #endif +//----------------------------------------------------------------------------// +// RAJA HIP vertex sum calculation using IndexSet (sequential iteration +// over segments, HIP kernel launched for each segment) +//----------------------------------------------------------------------------// + +#if defined(RAJA_ENABLE_HIP) + +// +// Allocate and initialize device memory arrays +// + double* d_areae = memoryManager::allocate_gpu(Nelem_tot); + double* d_areav = memoryManager::allocate_gpu(Nvert_tot); + int* d_e2v_map = memoryManager::allocate_gpu(4*Nelem_tot); + + hipMemcpy(d_areae, areae, Nelem_tot*sizeof(double), hipMemcpyHostToDevice); + hipMemcpy(d_e2v_map, e2v_map, 4*Nelem_tot*sizeof(int), hipMemcpyHostToDevice); + + std::memset(areav, 0, Nvert_tot * sizeof(double)); + hipMemcpy(d_areav, areav, Nvert_tot*sizeof(double), hipMemcpyHostToDevice); + +// +// Resource object used to construct list segment objects with indices +// living in device (GPU) memory. +// + camp::resources::Resource hip_res{camp::resources::Hip()}; + +// +// Create a RAJA IndexSet with four ListSegments, one for the indices of +// the elements in each subsut. This will be used in the RAJA OpenMP and CUDA +// variants of the vertex sum calculation. + + RAJA::TypedIndexSet hip_colorset; + + hip_colorset.push_back( SegmentType(&idx[0][0], idx[0].size(), hip_res) ); + hip_colorset.push_back( SegmentType(&idx[1][0], idx[1].size(), hip_res) ); + hip_colorset.push_back( SegmentType(&idx[2][0], idx[2].size(), hip_res) ); + hip_colorset.push_back( SegmentType(&idx[3][0], idx[3].size(), hip_res) ); + + std::cout << "\n Running RAJA HIP index set vertex sum...\n"; + +// _raja_vertexarea_hip_start + using EXEC_POL3 = RAJA::ExecPolicy>; + + RAJA::forall(hip_colorset, [=] RAJA_DEVICE (int ie) { + int* iv = &(d_e2v_map[4*ie]); + d_areav[ iv[0] ] += d_areae[ie] / 4.0 ; + d_areav[ iv[1] ] += d_areae[ie] / 4.0 ; + d_areav[ iv[2] ] += d_areae[ie] / 4.0 ; + d_areav[ iv[3] ] += d_areae[ie] / 4.0 ; + }); +// _raja_vertexarea_hip_end + + hipMemcpy(areav, d_areav, Nvert_tot*sizeof(double), hipMemcpyDeviceToHost); + checkResult(areav, areav_ref, Nvert); +//std::cout << "\n Vertex volumes...\n"; +//printMeshData(areav, Nvert, jvoff); + + memoryManager::deallocate_gpu(d_areae); + memoryManager::deallocate_gpu(d_areav); + memoryManager::deallocate_gpu(d_e2v_map); + +#endif + //----------------------------------------------------------------------------// // Clean up... diff --git a/exercises/view-layout.cpp b/exercises/view-layout.cpp new file mode 100644 index 0000000000..a96998d349 --- /dev/null +++ b/exercises/view-layout.cpp @@ -0,0 +1,625 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-22, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include +#include +#include +#include + +#include "memoryManager.hpp" + +#include "RAJA/RAJA.hpp" + +/* + * View and Layout Exercise + * + * Examples illustrate the use of RAJA View and Layout types. + * + * RAJA features shown: + * - RAJA::View + * - RAJA::Layout + * - Layout permutations + * - OffsetLayout + * - OffsetLayout permutations + * + * NOTE: no RAJA kernel execution methods are used in these examples. + */ + +// +// Functions to check and print arrays +// +template +void checkResult(T* C, T* Cref, int N); + +template +void printValues(T* C, int N); + +int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +{ + + std::cout << "\n\nRAJA view & layout exercises...\n"; + +//----------------------------------------------------------------------------// +// +// Matrix-matrix multiplication: default layout +// +//----------------------------------------------------------------------------// + + // _matmult_init_start + // + // Define the size of N x N of matrices. + // + constexpr int N = 4; + + // + // Allocate storage for matrices and initialize matrix entries + // + double *A = new double[ N * N ]; + double *B = new double[ N * N ]; + double *C = new double[ N * N ]; + double *Cref = new double[ N * N ]; + + for (int row = 0; row < N; ++row) { + for (int col = 0; col < N; ++col) { + A[ col + N*row ] = row + 1; + B[ col + N*row ] = col + 1; + C[ col + N*row ] = 0.0; + Cref[ col + N*row ] = 0.0; + } + } + // _matmult_init_end + +//printValues(A, N*N); +//printValues(B, N*N); +//printValues(C, N*N); +//printValues(Cref, N*N); + +//----------------------------------------------------------------------------// + + std::cout << "\n Running matrix multiplication reference solution...\n"; + + // _cstyle_matmult_start + for (int row = 0; row < N; ++row) { + for (int col = 0; col < N; ++col) { + for (int k = 0; k < N; ++k) { + Cref[col + N*row] += A[k + N*row] * B[col + N*k]; + } + } + } + // _cstyle_matmult_end + +//printValues(Cref, N*N); + + +//----------------------------------------------------------------------------// + + std::cout << "\n Running matrix multiplication w/Views...\n"; + + // + // Define RAJA View objects to simplify access to the matrix entries. + // + // Note: we use default Layout + // + // _matmult_views_start + RAJA::View< double, RAJA::Layout<2, int> > Aview(A, N, N); + RAJA::View< double, RAJA::Layout<2, int> > Bview(B, N, N); + RAJA::View< double, RAJA::Layout<2, int> > Cview(C, N, N); + // _matmult_views_end + + // _cstyle_matmult_views_start + for (int row = 0; row < N; ++row) { + for (int col = 0; col < N; ++col) { + for (int k = 0; k < N; ++k) { + Cview(row, col) += Aview(row, k) * Bview(k, col); + } + } + } + // _cstyle_matmult_views_end + + checkResult(C, Cref, N*N); +//printValues(C, N*N); + +// +// Clean up. 
+// + delete [] A; + delete [] B; + delete [] C; + delete [] Cref; + +//----------------------------------------------------------------------------// +// +// Default layouts use row-major data ordering +// +//----------------------------------------------------------------------------// + + // + // Define dimensions and allocate arrays + // + // _default_views_init_start + constexpr int Nx = 3; + constexpr int Ny = 5; + constexpr int Nz = 2; + constexpr int Ntot = Nx*Ny*Nz; + int* a = new int[ Ntot ]; + int* aref = new int[ Ntot ]; + + for (int i = 0; i < Ntot; ++i) + { + aref[i] = i; + } + // _default_views_init_end + +//printValues(ref, Ntot); + +//----------------------------------------// + + std::cout << "\n Running default layout view cases...\n"; + + std::cout << "\n\t Running 1D view case...\n"; + + std::memset(a, 0, Ntot * sizeof(int)); + + // _default_view1D_start + RAJA::View< int, RAJA::Layout<1, int> > view_1D(a, Ntot); + + for (int i = 0; i < Ntot; ++i) { + view_1D(i) = i; + } + // _default_view1D_end + + checkResult(a, aref, Ntot); +//printValues(a, Ntot); + +//----------------------------------------// + + std::cout << "\n\t Running 2D default layout view case...\n"; + + std::memset(a, 0, Ntot * sizeof(int)); + + // _default_view2D_start + RAJA::View< int, RAJA::Layout<2, int> > view_2D(a, Nx, Ny); + + int iter{0}; + for (int i = 0; i < Nx; ++i) { + for (int j = 0; j < Ny; ++j) { + view_2D(i, j) = iter; + ++iter; + } + } + // _default_view2D_end + + checkResult(a, aref, Nx*Ny); +//printValues(a, Nx*Ny); + +//----------------------------------------// + + std::cout << "\n\t Running 3D default layout view case...\n"; + + std::memset(a, 0, Ntot * sizeof(int)); + + /// + /// TODO... + /// + /// EXERCISE: Implement a triple loop nest using a RAJA::View and + /// three-dimensional RAJA::Layout that iterates over the + /// data array 'a' with unit stride. + /// + + checkResult(a, aref, Nx*Ny*Nz); +//printValues(a, Nx*Ny*Nz); + +//----------------------------------------------------------------------------// +// +// Permuted layouts change the data striding order +// +//----------------------------------------------------------------------------// + + std::cout << "\n Running permuted layout cases...\n"; + +//----------------------------------------// + + std::cout << "\n\t Running 2D default permutation view case...\n"; + + std::memset(a, 0, Ntot * sizeof(int)); + + // _default_perm_view2D_start + std::array defperm2 {{0, 1}}; + RAJA::Layout< 2, int > defperm2_layout = + RAJA::make_permuted_layout( {{Nx, Ny}}, defperm2); + RAJA::View< int, RAJA::Layout<2, int> > defperm_view_2D(a, defperm2_layout); + + iter = 0; + for (int i = 0; i < Nx; ++i) { + for (int j = 0; j < Ny; ++j) { + defperm_view_2D(i, j) = iter; + ++iter; + } + } + // _default_perm_view2D_end + + checkResult(a, aref, Nx*Ny); +//printValues(a, Nx*Ny); + +//----------------------------------------// + + std::cout << "\n\t Running 3D default permutation view case...\n"; + + std::memset(a, 0, Ntot * sizeof(int)); + + /// + /// TODO... + /// + /// EXERCISE: Implement a triple loop nest using a RAJA::View and + /// three-dimensional RAJA::Layout with the identity permutation. 
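  /// Hint: as shown in view-layout_solution.cpp later in this diff, the
  /// identity permutation can be built with something like
  ///   std::array<RAJA::idx_t, 3> defperm3 {{0, 1, 2}};
  ///   RAJA::Layout<3, int> defperm3_layout =
  ///     RAJA::make_permuted_layout( {{Nx, Ny, Nz}}, defperm3);
  /// (the RAJA::idx_t element type is an assumption here); it reproduces the
  /// default row-major striding.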
+ /// + + checkResult(a, aref, Nx*Ny*Nz); +//printValues(a, Nx*Ny*Nz); + +//----------------------------------------// +//----------------------------------------// + + std::cout << "\n\t Running 2D permuted layout view case...\n"; + + std::memset(a, 0, Ntot * sizeof(int)); + + // _perm_2D_start + std::array perm2 {{1, 0}}; + RAJA::Layout< 2, int > perm2_layout = + RAJA::make_permuted_layout( {{Nx, Ny}}, perm2); + RAJA::View< int, RAJA::Layout<2, int> > perm_view_2D(a, perm2_layout); + + iter = 0; + for (int j = 0; j < Ny; ++j) { + for (int i = 0; i < Nx; ++i) { + perm_view_2D(i, j) = iter; + ++iter; + } + } + // _perm_2D_end + + checkResult(a, aref, Nx*Ny); +//printValues(a, Nx*Ny); + +//----------------------------------------// + + std::cout << "\n\t Running 3D perma layout view case...\n"; + + std::memset(a, 0, Ntot * sizeof(int)); + + /// + /// TODO... + /// + /// EXERCISE: Implement a triple loop nest using a RAJA::View and + /// three-dimensional RAJA::Layout with the permutation + /// {2, 1, 0}. + /// + /// Name the Layout object 'perm3a_layout' so it can be used + /// with the index conversion methods in the section below. + /// Uncomment those methods if you want to try them with the + /// Layout object you create here. + /// + + checkResult(a, aref, Nx*Ny*Nz); +//printValues(a, Nx*Ny*Nz); + +//----------------------------------------// + + std::cout << "\n\t Running 3D permb layout view case...\n"; + + std::memset(a, 0, Ntot * sizeof(int)); + + // _permb_view3D_start + std::array perm3b {{1, 2, 0}}; + RAJA::Layout< 3, int > perm3b_layout = + RAJA::make_permuted_layout( {{Nx, Ny, Nz}}, perm3b); + RAJA::View< int, RAJA::Layout<3, int> > perm3b_view_3D(a, perm3b_layout); + + iter = 0; + for (int j = 0; j < Ny; ++j) { + for (int k = 0; k < Nz; ++k) { + for (int i = 0; i < Nx; ++i) { + perm3b_view_3D(i, j, k) = iter; + ++iter; + } + } + } + // _permb_view3D_end + + checkResult(a, aref, Nx*Ny*Nz); +//printValues(a, Nx*Ny*Nz); + +// +// Clean up. +// + delete [] a; + delete [] aref; + +//----------------------------------------------------------------------------// +// +// Layouts: multi-dimensional indices vs. linear indicies +// +// RAJA::Layout type has methods that can be used to convert between +// multi-dimensional and linear indices. We show these below using the +// three-dimensional layouts in the examples above. 
Recall the Nx, Ny, Nz +// sizes defined earlier: +// +// constexpr int Nx = 3; +// constexpr int Ny = 5; +// constexpr int Nz = 2; +// +//----------------------------------------------------------------------------// + + std::cout << "\n Multi-dimensional indices to linear indices...\n"; + + + std::cout << "\nperm3a_layout...\n" << std::endl; + + int lin = -1; + int i = -1; + int j = -1; + int k = -1; + +/* + // _perm3d_layout_start + lin = perm3a_layout(1, 2, 0); + std::cout << "\tperm3a_layout(1, 2, 0) = " << lin << std::endl; + std::cout << "\t Should be 7 = 1 + 2 * Nx + 0 * Nx * Ny " + << "(since perm is {2, 1, 0})" << std::endl; + + perm3a_layout.toIndices(7, i, j, k); + std::cout << "\tperm3a_layout.toIndices(7, i, j, k) --> (i, j, k) = " + << "(" << i << ", " << j << ", " << k << ")\n" << std::endl; + // _perm3d_layout_end + + + lin = perm3a_layout(2, 3, 1); + std::cout << "\tperm3a_layout(2, 3, 1) = " << lin << std::endl; + std::cout << "\t Should be 26 = 2 + 3 * Nx + 1 * Nx * Ny " + << "(since perm is {2, 1, 0})" << std::endl; + + perm3a_layout.toIndices(26, i, j, k); + std::cout << "\tperm3a_layout.toIndices(26, i, j, k) --> (i, j, k) = " + << "(" << i << ", " << j << ", " << k << ")\n" << std::endl; + + + lin = perm3a_layout(0, 2, 1); + std::cout << "\tperm3a_layout(0, 2, 1) = " << lin << std::endl; + std::cout << "\t Should be 21 = 0 + 2 * Nx + 1 * Nx * Ny " + << "(since perm is {2, 1, 0})" << std::endl; + + perm3a_layout.toIndices(21, i, j, k); + std::cout << "\tperm3a_layout.toIndices(21, i, j, k) --> (i, j, k) = " + << "(" << i << ", " << j << ", " << k << ")\n" << std::endl; +*/ + +//----------------------------------------------------------------------------// + + std::cout << "\nperm3b_layout...\n" << std::endl; + + lin = perm3b_layout(1, 2, 0); + std::cout << "\tperm3b_layout(1, 2, 0) = " << lin << std::endl; + std::cout << "\t Should be 13 = 1 + 0 * Nx + 2 * Nx * Nz " + << "(since perm is {1, 2, 0})" << std::endl; + + perm3b_layout.toIndices(13, i, j, k); + std::cout << "\tperm3b_layout.toIndices(13, i, j, k) --> (i, j, k) = " + << "(" << i << ", " << j << ", " << k << ")\n" << std::endl; + + + lin = perm3b_layout(2, 3, 1); + std::cout << "\tperm3b_layout(2, 3, 1) = " << lin << std::endl; + std::cout << "\t Should be 23 = 2 + 1 * Nx + 3 * Nx * Nz " + << "(since perm is {1, 2, 0})" << std::endl; + + perm3b_layout.toIndices(23, i, j, k); + std::cout << "\tperm3b_layout.toIndices(23, i, j, k) --> (i, j, k) = " + << "(" << i << ", " << j << ", " << k << ")\n" << std::endl; + + + lin = perm3b_layout(0, 2, 1); + std::cout << "\tperm3b_layout(0, 2, 1) = " << lin << std::endl; + std::cout << "\t Should be 15 = 0 + 1 * Nx + 2 * Nx * Nz " + << "(since perm is {1, 2, 0})" << std::endl; + perm3b_layout.toIndices(15, i, j, k); + std::cout << "\tperm3b_layout.toIndices(15, i, j, k) --> (i, j, k) = " + << "(" << i << ", " << j << ", " << k << ")\n" << std::endl; + + /// + /// TODO... + /// + /// EXERCISE: Implement a triple loop nest using a RAJA::View and + /// three-dimensional RAJA::Layout that iterates over the + /// data array 'a' with unit stride. 
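  //
  // Illustrative sketch only (not part of the original exercise): every
  // linear index produced by perm3b_layout maps back to the same (i, j, k)
  // triple via toIndices(), regardless of the permutation used.
  //
  {
    bool roundtrip_ok = true;
    for (int kk = 0; kk < Nz; ++kk) {
      for (int jj = 0; jj < Ny; ++jj) {
        for (int ii = 0; ii < Nx; ++ii) {
          int lin_idx = perm3b_layout(ii, jj, kk);
          int i2 = -1, j2 = -1, k2 = -1;
          perm3b_layout.toIndices(lin_idx, i2, j2, k2);
          if (i2 != ii || j2 != jj || k2 != kk) { roundtrip_ok = false; }
        }
      }
    }
    std::cout << "\n perm3b_layout index round-trip check: "
              << (roundtrip_ok ? "PASS" : "FAIL") << "\n";
  }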
+ /// + +//----------------------------------------------------------------------------// +// +// Offset layouts apply offsets to indices +// +//----------------------------------------------------------------------------// + + std::cout << "\n Running offset layout cases...\n"; + + // + // Define some dimensions, and allocate arrays + // + constexpr int Ntot_ao = 40; + int* ao = new int[ Ntot_ao ]; + int* ao_ref = new int[ Ntot_ao ]; + +//----------------------------------------// + + std::cout << "\n\t Running 1D offset layout case...\n"; + + // + // Set reference solution to compare with + // + + std::memset(ao_ref, 0, Ntot_ao * sizeof(int)); + + // _cstyle_offlayout1D_start + int imin = -5; + int imax = 6; + + for (int i = imin; i < imax; ++i) { + ao_ref[ i-imin ] = i; + } + // _cstyle_offlayout1D_end + +//printValues(ao_ref, imax-imin); + +//----------------------------------------// + + std::memset(ao, 0, Ntot_ao * sizeof(int)); + + // _raja_offlayout1D_start + RAJA::OffsetLayout<1, int> offlayout_1D = + RAJA::make_offset_layout<1, int>( {{imin}}, {{imax}} ); + + RAJA::View< int, RAJA::OffsetLayout<1, int> > aoview_1Doff(ao, + offlayout_1D); + + for (int i = imin; i < imax; ++i) { + aoview_1Doff(i) = i; + } + // _raja_offlayout1D_end + + checkResult(ao, ao_ref, imax-imin); +//printValues(ao, 11); + +//----------------------------------------// + + std::cout << "\n\t Running 2D offset layout case...\n"; + + // + // Set reference solution to compare with + // + + std::memset(ao_ref, 0, Ntot_ao * sizeof(int)); + + // _cstyle_offlayout2D_start + imin = -1; + imax = 2; + int jmin = -5; + int jmax = 5; + + iter = 0; + for (int i = imin; i < imax; ++i) { + for (int j = jmin; j < jmax; ++j) { + ao_ref[ (j-jmin) + (i-imin) * (jmax-jmin) ] = iter; + iter++; + } + } + // _cstyle_offlayout2D_end + +//printValues(ao_ref, (imax-imin)*(jmax-jmin)); + +//----------------------------------------// + + std::memset(ao, 0, Ntot_ao * sizeof(int)); + + /// + /// TODO... + /// + /// EXERCISE: Implement a double loop nest using a RAJA::View and + /// two-dimensional RAJA::OffsetLayout which performs the + /// same operations as the C-style example above. + /// + + checkResult(ao, ao_ref, (imax-imin)*(jmax-jmin)); +//printValues(ao, (imax-imin)*(jmax-jmin)); + +//----------------------------------------// + + std::cout << "\n\t Running 2D permuted offset layout case...\n"; + + // + // Set reference solution to compare with + // + + std::memset(ao, 0, Ntot_ao * sizeof(int)); + + // _cstyle_permofflayout2D_start + iter = 0; + for (int j = jmin; j < jmax; ++j) { + for (int i = imin; i < imax; ++i) { + ao_ref[ (i-imin) + (j-jmin) * (imax-imin) ] = iter; + iter++; + } + } + // _cstyle_permofflayout2D_end + +//printValues(ao_ref, (imax-imin)*(jmax-jmin)); + +//----------------------------------------// + + std::memset(ao, 0, Ntot_ao * sizeof(int)); + + // _raja_permofflayout2D_start + std::array perm1D {{1, 0}}; + RAJA::OffsetLayout<2> permofflayout_2D = + RAJA::make_permuted_offset_layout<2>( {{imin, jmin}}, + {{imax, jmax}}, + perm1D ); + + RAJA::View< int, RAJA::OffsetLayout<2> > aoview_2Dpermoff(ao, + permofflayout_2D); + + iter = 0; + for (int j = jmin; j < jmax; ++j) { + for (int i = imin; i < imax; ++i) { + aoview_2Dpermoff(i, j) = iter; + iter++; + } + } + // _raja_permofflayout2D_end + + checkResult(ao, ao_ref, (imax-imin)*(jmax-jmin)); +//printValues(ao, (imax-imin)*(jmax-jmin)); + +// +// Clean up. 
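  // Illustrative sketch only (not part of the original exercise): with the
  // {1, 0} permutation, 'i' becomes the stride-1 index of the offset view,
  // which is why the reference fills ao_ref[ (i-imin) + (j-jmin)*(imax-imin) ].
  // Pointer differences through the view expose those strides directly.
  {
    auto stride_i = &aoview_2Dpermoff(imin+1, jmin) - &aoview_2Dpermoff(imin, jmin);
    auto stride_j = &aoview_2Dpermoff(imin, jmin+1) - &aoview_2Dpermoff(imin, jmin);
    std::cout << "\n aoview_2Dpermoff strides: i -> " << stride_i
              << ", j -> " << stride_j
              << " (expected 1 and " << (imax - imin) << ")\n";
  }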
+// + delete [] ao; + delete [] ao_ref; + +//----------------------------------------------------------------------------// +//----------------------------------------------------------------------------// + + std::cout << "\n DONE!...\n"; + + return 0; +} + +// +// Function to check result and report P/F. +// +template +void checkResult(T* C, T* Cref, int N) +{ + bool match = true; + for (int i = 0; i < N; ++i) { + if ( std::abs( C[i] - Cref[i] ) > 10e-12 ) { + match = false; + } + } + if ( match ) { + std::cout << "\n\t result -- PASS\n"; + } else { + std::cout << "\n\t result -- FAIL\n"; + } +}; + +template +void printValues(T* C, int N) +{ + for (int i = 0; i < N; ++i) { + std::cout << "array[" << i << "] = " << C[i] << std::endl; + } +}; diff --git a/exercises/view-layout_solution.cpp b/exercises/view-layout_solution.cpp new file mode 100644 index 0000000000..3da033953e --- /dev/null +++ b/exercises/view-layout_solution.cpp @@ -0,0 +1,643 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-22, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include +#include +#include +#include + +#include "memoryManager.hpp" + +#include "RAJA/RAJA.hpp" + +/* + * View and Layout Exercise + * + * Examples illustrate the use of RAJA View and Layout types. + * + * RAJA features shown: + * - RAJA::View + * - RAJA::Layout + * - Layout permutations + * - OffsetLayout + * - OffsetLayout permutations + * + * NOTE: no RAJA kernel execution methods are used in these examples. + */ + +// +// Functions to check and print arrays +// +template +void checkResult(T* C, T* Cref, int N); + +template +void printValues(T* C, int N); + +int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +{ + + std::cout << "\n\nRAJA view & layout exercises...\n"; + +//----------------------------------------------------------------------------// +// +// Matrix-matrix multiplication: default layout +// +//----------------------------------------------------------------------------// + + // _matmult_init_start + // + // Define the size of N x N of matrices. + // + constexpr int N = 4; + + // + // Allocate storage for matrices and initialize matrix entries + // + double *A = new double[ N * N ]; + double *B = new double[ N * N ]; + double *C = new double[ N * N ]; + double *Cref = new double[ N * N ]; + + for (int row = 0; row < N; ++row) { + for (int col = 0; col < N; ++col) { + A[ col + N*row ] = row + 1; + B[ col + N*row ] = col + 1; + C[ col + N*row ] = 0.0; + Cref[ col + N*row ] = 0.0; + } + } + // _matmult_init_end + +//printValues(A, N*N); +//printValues(B, N*N); +//printValues(C, N*N); +//printValues(Cref, N*N); + +//----------------------------------------------------------------------------// + + std::cout << "\n Running matrix multiplication reference solution...\n"; + + // _cstyle_matmult_start + for (int row = 0; row < N; ++row) { + for (int col = 0; col < N; ++col) { + for (int k = 0; k < N; ++k) { + Cref[col + N*row] += A[k + N*row] * B[col + N*k]; + } + } + } + // _cstyle_matmult_end + +//printValues(Cref, N*N); + + +//----------------------------------------------------------------------------// + + std::cout << "\n Running matrix multiplication w/Views...\n"; + + // + // Define RAJA View objects to simplify access to the matrix entries. 
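  // (With the default two-dimensional Layout, Aview(row, col) refers to the
  //  same element as A[col + N*row] in the C-style reference loop above.)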
+ // + // Note: we use default Layout + // + // _matmult_views_start + RAJA::View< double, RAJA::Layout<2, int> > Aview(A, N, N); + RAJA::View< double, RAJA::Layout<2, int> > Bview(B, N, N); + RAJA::View< double, RAJA::Layout<2, int> > Cview(C, N, N); + // _matmult_views_end + + // _cstyle_matmult_views_start + for (int row = 0; row < N; ++row) { + for (int col = 0; col < N; ++col) { + for (int k = 0; k < N; ++k) { + Cview(row, col) += Aview(row, k) * Bview(k, col); + } + } + } + // _cstyle_matmult_views_end + + checkResult(C, Cref, N*N); +//printValues(C, N*N); + +// +// Clean up. +// + delete [] A; + delete [] B; + delete [] C; + delete [] Cref; + +//----------------------------------------------------------------------------// +// +// Default layouts use row-major data ordering +// +//----------------------------------------------------------------------------// + + // + // Define dimensions and allocate arrays + // + // _default_views_init_start + constexpr int Nx = 3; + constexpr int Ny = 5; + constexpr int Nz = 2; + constexpr int Ntot = Nx*Ny*Nz; + int* a = new int[ Ntot ]; + int* aref = new int[ Ntot ]; + + for (int i = 0; i < Ntot; ++i) + { + aref[i] = i; + } + // _default_views_init_end + +//printValues(ref, Ntot); + +//----------------------------------------// + + std::cout << "\n Running default layout view cases...\n"; + + std::cout << "\n\t Running 1D view case...\n"; + + std::memset(a, 0, Ntot * sizeof(int)); + + // _default_view1D_start + RAJA::View< int, RAJA::Layout<1, int> > view_1D(a, Ntot); + + for (int i = 0; i < Ntot; ++i) { + view_1D(i) = i; + } + // _default_view1D_end + + checkResult(a, aref, Ntot); +//printValues(a, Ntot); + +//----------------------------------------// + + std::cout << "\n\t Running 2D default layout view case...\n"; + + std::memset(a, 0, Ntot * sizeof(int)); + + // _default_view2D_start + RAJA::View< int, RAJA::Layout<2, int> > view_2D(a, Nx, Ny); + + int iter{0}; + for (int i = 0; i < Nx; ++i) { + for (int j = 0; j < Ny; ++j) { + view_2D(i, j) = iter; + ++iter; + } + } + // _default_view2D_end + + checkResult(a, aref, Nx*Ny); +//printValues(a, Nx*Ny); + +//----------------------------------------// + + std::cout << "\n\t Running 3D default layout view case...\n"; + + std::memset(a, 0, Ntot * sizeof(int)); + + // _default_view3D_start + RAJA::View< int, RAJA::Layout<3, int> > view_3D(a, Nx, Ny, Nz); + + iter = 0; + for (int i = 0; i < Nx; ++i) { + for (int j = 0; j < Ny; ++j) { + for (int k = 0; k < Nz; ++k) { + view_3D(i, j, k) = iter; + ++iter; + } + } + } + // _default_view3D_end + + checkResult(a, aref, Nx*Ny*Nz); +//printValues(a, Nx*Ny*Nz); + +//----------------------------------------------------------------------------// +// +// Permuted layouts change the data striding order +// +//----------------------------------------------------------------------------// + + std::cout << "\n Running permuted layout cases...\n"; + +//----------------------------------------// + + std::cout << "\n\t Running 2D default permutation view case...\n"; + + std::memset(a, 0, Ntot * sizeof(int)); + + // _default_perm_view2D_start + std::array defperm2 {{0, 1}}; + RAJA::Layout< 2, int > defperm2_layout = + RAJA::make_permuted_layout( {{Nx, Ny}}, defperm2); + RAJA::View< int, RAJA::Layout<2, int> > defperm_view_2D(a, defperm2_layout); + + iter = 0; + for (int i = 0; i < Nx; ++i) { + for (int j = 0; j < Ny; ++j) { + defperm_view_2D(i, j) = iter; + ++iter; + } + } + // _default_perm_view2D_end + + checkResult(a, aref, Nx*Ny); +//printValues(a, Nx*Ny); + 
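  // Illustrative sketch only (not part of the original solution): the
  // identity permutation {0, 1} reproduces the default row-major striding,
  // so defperm_view_2D and the earlier view_2D address 'a' identically.
  {
    bool same_mapping = true;
    for (int i = 0; i < Nx; ++i) {
      for (int j = 0; j < Ny; ++j) {
        if ( &defperm_view_2D(i, j) != &view_2D(i, j) ) { same_mapping = false; }
      }
    }
    std::cout << "\n identity-permuted layout matches default layout: "
              << (same_mapping ? "yes" : "no") << "\n";
  }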
+//----------------------------------------// + + std::cout << "\n\t Running 3D default permutation view case...\n"; + + std::memset(a, 0, Ntot * sizeof(int)); + + // _default_perm_view3D_start + std::array defperm3 {{0, 1, 2}}; + RAJA::Layout< 3, int > defperm3_layout = + RAJA::make_permuted_layout( {{Nx, Ny, Nz}}, defperm3); + RAJA::View< int, RAJA::Layout<3, int> > defperm_view_3D(a, defperm3_layout); + + iter = 0; + for (int i = 0; i < Nx; ++i) { + for (int j = 0; j < Ny; ++j) { + for (int k = 0; k < Nz; ++k) { + defperm_view_3D(i, j, k) = iter; + ++iter; + } + } + } + // _default_perm_view3D_end + + checkResult(a, aref, Nx*Ny*Nz); +//printValues(a, Nx*Ny*Nz); + +//----------------------------------------// +//----------------------------------------// + + std::cout << "\n\t Running 2D permuted layout view case...\n"; + + std::memset(a, 0, Ntot * sizeof(int)); + + // _perm_2D_start + std::array perm2 {{1, 0}}; + RAJA::Layout< 2, int > perm2_layout = + RAJA::make_permuted_layout( {{Nx, Ny}}, perm2); + RAJA::View< int, RAJA::Layout<2, int> > perm_view_2D(a, perm2_layout); + + iter = 0; + for (int j = 0; j < Ny; ++j) { + for (int i = 0; i < Nx; ++i) { + perm_view_2D(i, j) = iter; + ++iter; + } + } + // _perm_2D_end + + checkResult(a, aref, Nx*Ny); +//printValues(a, Nx*Ny); + +//----------------------------------------// + + std::cout << "\n\t Running 3D perma layout view case...\n"; + + std::memset(a, 0, Ntot * sizeof(int)); + + // _perma_view3D_start + std::array perm3a {{2, 1, 0}}; + RAJA::Layout< 3, int > perm3a_layout = + RAJA::make_permuted_layout( {{Nx, Ny, Nz}}, perm3a); + RAJA::View< int, RAJA::Layout<3, int> > perm3a_view_3D(a, perm3a_layout); + + iter = 0; + for (int k = 0; k < Nz; ++k) { + for (int j = 0; j < Ny; ++j) { + for (int i = 0; i < Nx; ++i) { + perm3a_view_3D(i, j, k) = iter; + ++iter; + } + } + } + // _perma_view3D_end + + checkResult(a, aref, Nx*Ny*Nz); +//printValues(a, Nx*Ny*Nz); + +//----------------------------------------// + + std::cout << "\n\t Running 3D permb layout view case...\n"; + + std::memset(a, 0, Ntot * sizeof(int)); + + // _permb_view3D_start + std::array perm3b {{1, 2, 0}}; + RAJA::Layout< 3, int > perm3b_layout = + RAJA::make_permuted_layout( {{Nx, Ny, Nz}}, perm3b); + RAJA::View< int, RAJA::Layout<3, int> > perm3b_view_3D(a, perm3b_layout); + + iter = 0; + for (int j = 0; j < Ny; ++j) { + for (int k = 0; k < Nz; ++k) { + for (int i = 0; i < Nx; ++i) { + perm3b_view_3D(i, j, k) = iter; + ++iter; + } + } + } + // _permb_view3D_end + + checkResult(a, aref, Nx*Ny*Nz); +//printValues(a, Nx*Ny*Nz); + +// +// Clean up. +// + delete [] a; + delete [] aref; + +//----------------------------------------------------------------------------// +// +// Layouts: multi-dimensional indices vs. linear indicies +// +// RAJA::Layout type has methods that can be used to convert between +// multi-dimensional and linear indices. We show these below using the +// three-dimensional layouts in the examples above. 
Recall the Nx, Ny, Nz +// sizes defined earlier: +// +// constexpr int Nx = 3; +// constexpr int Ny = 5; +// constexpr int Nz = 2; +// +//----------------------------------------------------------------------------// + + std::cout << "\n Multi-dimensional indices to linear indices...\n"; + + + std::cout << "\nperm3a_layout...\n" << std::endl; + + int lin = -1; + int i = -1; + int j = -1; + int k = -1; + + // _perm3d_layout_start + lin = perm3a_layout(1, 2, 0); + std::cout << "\tperm3a_layout(1, 2, 0) = " << lin << std::endl; + std::cout << "\t Should be 7 = 1 + 2 * Nx + 0 * Nx * Ny " + << "(since perm is {2, 1, 0})" << std::endl; + + perm3a_layout.toIndices(7, i, j, k); + std::cout << "\tperm3a_layout.toIndices(7, i, j, k) --> (i, j, k) = " + << "(" << i << ", " << j << ", " << k << ")\n" << std::endl; + // _perm3d_layout_end + + + lin = perm3a_layout(2, 3, 1); + std::cout << "\tperm3a_layout(2, 3, 1) = " << lin << std::endl; + std::cout << "\t Should be 26 = 2 + 3 * Nx + 1 * Nx * Ny " + << "(since perm is {2, 1, 0})" << std::endl; + + perm3a_layout.toIndices(26, i, j, k); + std::cout << "\tperm3a_layout.toIndices(26, i, j, k) --> (i, j, k) = " + << "(" << i << ", " << j << ", " << k << ")\n" << std::endl; + + + lin = perm3a_layout(0, 2, 1); + std::cout << "\tperm3a_layout(0, 2, 1) = " << lin << std::endl; + std::cout << "\t Should be 21 = 0 + 2 * Nx + 1 * Nx * Ny " + << "(since perm is {2, 1, 0})" << std::endl; + + perm3a_layout.toIndices(21, i, j, k); + std::cout << "\tperm3a_layout.toIndices(21, i, j, k) --> (i, j, k) = " + << "(" << i << ", " << j << ", " << k << ")\n" << std::endl; + +//----------------------------------------------------------------------------// + + std::cout << "\nperm3b_layout...\n" << std::endl; + + lin = perm3b_layout(1, 2, 0); + std::cout << "\tperm3b_layout(1, 2, 0) = " << lin << std::endl; + std::cout << "\t Should be 13 = 1 + 0 * Nx + 2 * Nx * Nz " + << "(since perm is {1, 2, 0})" << std::endl; + + perm3b_layout.toIndices(13, i, j, k); + std::cout << "\tperm3b_layout.toIndices(13, i, j, k) --> (i, j, k) = " + << "(" << i << ", " << j << ", " << k << ")\n" << std::endl; + + + lin = perm3b_layout(2, 3, 1); + std::cout << "\tperm3b_layout(2, 3, 1) = " << lin << std::endl; + std::cout << "\t Should be 23 = 2 + 1 * Nx + 3 * Nx * Nz " + << "(since perm is {1, 2, 0})" << std::endl; + + perm3b_layout.toIndices(23, i, j, k); + std::cout << "\tperm3b_layout.toIndices(23, i, j, k) --> (i, j, k) = " + << "(" << i << ", " << j << ", " << k << ")\n" << std::endl; + + + lin = perm3b_layout(0, 2, 1); + std::cout << "\tperm3b_layout(0, 2, 1) = " << lin << std::endl; + std::cout << "\t Should be 15 = 0 + 1 * Nx + 2 * Nx * Nz " + << "(since perm is {1, 2, 0})" << std::endl; + + perm3b_layout.toIndices(15, i, j, k); + std::cout << "\tperm3b_layout.toIndices(15, i, j, k) --> (i, j, k) = " + << "(" << i << ", " << j << ", " << k << ")\n" << std::endl; + +//----------------------------------------------------------------------------// +// +// Offset layouts apply offsets to indices +// +//----------------------------------------------------------------------------// + + std::cout << "\n Running offset layout cases...\n"; + + // + // Define some dimensions, and allocate arrays + // + constexpr int Ntot_ao = 40; + int* ao = new int[ Ntot_ao ]; + int* ao_ref = new int[ Ntot_ao ]; + +//----------------------------------------// + + std::cout << "\n\t Running 1D offset layout case...\n"; + + // + // Set reference solution to compare with + // + + std::memset(ao_ref, 0, Ntot_ao * 
sizeof(int)); + + // _cstyle_offlayout1D_start + int imin = -5; + int imax = 6; + + for (int i = imin; i < imax; ++i) { + ao_ref[ i-imin ] = i; + } + // _cstyle_offlayout1D_end + +//printValues(ao_ref, imax-imin); + +//----------------------------------------// + + std::memset(ao, 0, Ntot_ao * sizeof(int)); + + // _raja_offlayout1D_start + RAJA::OffsetLayout<1, int> offlayout_1D = + RAJA::make_offset_layout<1, int>( {{imin}}, {{imax}} ); + + RAJA::View< int, RAJA::OffsetLayout<1, int> > aoview_1Doff(ao, + offlayout_1D); + + for (int i = imin; i < imax; ++i) { + aoview_1Doff(i) = i; + } + // _raja_offlayout1D_end + + checkResult(ao, ao_ref, imax-imin); +//printValues(ao, 11); + +//----------------------------------------// + + std::cout << "\n\t Running 2D offset layout case...\n"; + + // + // Set reference solution to compare with + // + + std::memset(ao_ref, 0, Ntot_ao * sizeof(int)); + + // _cstyle_offlayout2D_start + imin = -1; + imax = 2; + int jmin = -5; + int jmax = 5; + + iter = 0; + for (int i = imin; i < imax; ++i) { + for (int j = jmin; j < jmax; ++j) { + ao_ref[ (j-jmin) + (i-imin) * (jmax-jmin) ] = iter; + iter++; + } + } + // _cstyle_offlayout2D_end + +//printValues(ao_ref, (imax-imin)*(jmax-jmin)); + +//----------------------------------------// + + std::memset(ao, 0, Ntot_ao * sizeof(int)); + + // _raja_offlayout2D_start + RAJA::OffsetLayout<2, int> offlayout_2D = + RAJA::make_offset_layout<2, int>( {{imin, jmin}}, {{imax, jmax}} ); + + RAJA::View< int, RAJA::OffsetLayout<2, int> > aoview_2Doff(ao, + offlayout_2D); + iter = 0; + for (int i = imin; i < imax; ++i) { + for (int j = jmin; j < jmax; ++j) { + aoview_2Doff(i, j) = iter; + iter++; + } + } + // _raja_offlayout2D_end + + checkResult(ao, ao_ref, (imax-imin)*(jmax-jmin)); +//printValues(ao, (imax-imin)*(jmax-jmin)); + +//----------------------------------------// + + std::cout << "\n\t Running 2D permuted offset layout case...\n"; + + // + // Set reference solution to compare with + // + + std::memset(ao, 0, Ntot_ao * sizeof(int)); + + // _cstyle_permofflayout2D_start + iter = 0; + for (int j = jmin; j < jmax; ++j) { + for (int i = imin; i < imax; ++i) { + ao_ref[ (i-imin) + (j-jmin) * (imax-imin) ] = iter; + iter++; + } + } + // _cstyle_permofflayout2D_end + +//printValues(ao_ref, (imax-imin)*(jmax-jmin)); + +//----------------------------------------// + + std::memset(ao, 0, Ntot_ao * sizeof(int)); + + // _raja_permofflayout2D_start + std::array perm1D {{1, 0}}; + RAJA::OffsetLayout<2> permofflayout_2D = + RAJA::make_permuted_offset_layout<2>( {{imin, jmin}}, + {{imax, jmax}}, + perm1D ); + + RAJA::View< int, RAJA::OffsetLayout<2> > aoview_2Dpermoff(ao, + permofflayout_2D); + + iter = 0; + for (int j = jmin; j < jmax; ++j) { + for (int i = imin; i < imax; ++i) { + aoview_2Dpermoff(i, j) = iter; + iter++; + } + } + // _raja_permofflayout2D_end + + checkResult(ao, ao_ref, (imax-imin)*(jmax-jmin)); +//printValues(ao, (imax-imin)*(jmax-jmin)); + +// +// Clean up. +// + delete [] ao; + delete [] ao_ref; + +//----------------------------------------------------------------------------// +//----------------------------------------------------------------------------// + + std::cout << "\n DONE!...\n"; + + return 0; +} + +// +// Function to check result and report P/F. 
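// Note: the tolerance used below, 10e-12, is 1.0e-11; for integer
// instantiations of checkResult the comparison is effectively exact.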
+// +template +void checkResult(T* C, T* Cref, int N) +{ + bool match = true; + for (int i = 0; i < N; ++i) { + if ( std::abs( C[i] - Cref[i] ) > 10e-12 ) { + match = false; + } + } + if ( match ) { + std::cout << "\n\t result -- PASS\n"; + } else { + std::cout << "\n\t result -- FAIL\n"; + } +}; + +template +void printValues(T* C, int N) +{ + for (int i = 0; i < N; ++i) { + std::cout << "array[" << i << "] = " << C[i] << std::endl; + } +}; diff --git a/host-configs/alcf-builds/sycl.cmake b/host-configs/alcf-builds/sycl.cmake index 35d7557bb8..f3efb32477 100755 --- a/host-configs/alcf-builds/sycl.cmake +++ b/host-configs/alcf-builds/sycl.cmake @@ -14,15 +14,16 @@ set(RAJA_COMPILER "RAJA_COMPILER_CLANG" CACHE STRING "") -set(CMAKE_CXX_COMPILER "clang++" CACHE PATH "") -#set(CMAKE_CXX_COMPILER "dpcpp" CACHE PATH "") +#set(CMAKE_CXX_COMPILER "clang++" CACHE PATH "") +#set(CMAKE_CXX_COMPILER "g++" CACHE PATH "") +set(CMAKE_CXX_COMPILER "dpcpp" CACHE PATH "") #set(CMAKE_CXX_FLAGS_RELEASE "-O3 -fsycl -fsycl-unnamed-lambda -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend '-device skl' " CACHE STRING "") #set(CMAKE_CXX_FLAGS_RELWITHDEBINFO " -O3 -g -fsycl -fsycl-unnamed-lambda -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend '-device skl'" CACHE STRING "") #set(CMAKE_CXX_FLAGS_DEBUG " -O0 -g -fsycl -fsycl-unnamed-lambda -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend '-device skl'" CACHE STRING "") -set(CMAKE_CXX_FLAGS_RELEASE "-O3 -std=c++17 -fsycl -fsycl-unnamed-lambda --gcc-toolchain=/usr/tce/packages/gcc/gcc-7.1.0" CACHE STRING "") -set(CMAKE_CXX_FLAGS_RELWITHDEBINFO " -O3 -g -std=c++17 -fsycl -fsycl-unnamed-lambda --gcc-toolchain=/usr/tce/packages/gcc/gcc-7.1.0" CACHE STRING "") -set(CMAKE_CXX_FLAGS_DEBUG " -O0 -g -std=c++17 -fsycl -fsycl-unnamed-lambda --gcc-toolchain=/usr/tce/packages/gcc/gcc-7.1.0" CACHE STRING "") +set(CMAKE_CXX_FLAGS_RELEASE "-O3 -std=c++17 -fsycl -fsycl-unnamed-lambda" CACHE STRING "") +set(CMAKE_CXX_FLAGS_RELWITHDEBINFO " -O3 -g -std=c++17 -fsycl -fsycl-unnamed-lambda" CACHE STRING "") +set(CMAKE_CXX_FLAGS_DEBUG " -O0 -g -std=c++17 -fsycl -fsycl-unnamed-lambda" CACHE STRING "") set(CMAKE_CXX_LINK_FLAGS "-fsycl -Wl,-rpath,/usr/tce/packages/oneapi/oneapi-2021.2/compiler/2021.2.0/linux/compiler/lib/intel64_lin/" CACHE STRING "") set(RAJA_RANGE_ALIGN 4 CACHE INT "") diff --git a/host-configs/lc-builds/toss3/oneapi_X.cmake b/host-configs/lc-builds/toss3/oneapi_X.cmake new file mode 100644 index 0000000000..680cc0e25d --- /dev/null +++ b/host-configs/lc-builds/toss3/oneapi_X.cmake @@ -0,0 +1,16 @@ +############################################################################### +# Copyright (c) 2016-22, Lawrence Livermore National Security, LLC +# and RAJA project contributors. See the RAJA/LICENSE file for details. 
+# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################### + +set(CMAKE_CXX_FLAGS_RELEASE "${COMMON_FLAGS} -O3 -march=native -finline-functions" CACHE STRING "") +set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${COMMON_FLAGS} -O3 -g -march=native -finline-functions" CACHE STRING "") +set(CMAKE_CXX_FLAGS_DEBUG "${COMMON_FLAGS} -O0 -g" CACHE STRING "") + +set(RAJA_RANGE_ALIGN 4 CACHE INT "") +set(RAJA_RANGE_MIN_LENGTH 32 CACHE INT "") +set(RAJA_DATA_ALIGN 64 CACHE INT "") + +set(RAJA_HOST_CONFIG_LOADED On CACHE BOOL "") diff --git a/host-configs/lc-builds/toss4/corona_sycl.cmake b/host-configs/lc-builds/toss4/corona_sycl.cmake new file mode 100755 index 0000000000..ea240f745f --- /dev/null +++ b/host-configs/lc-builds/toss4/corona_sycl.cmake @@ -0,0 +1,25 @@ +############################################################################### +# Copyright (c) 2016-22, Lawrence Livermore National Security, LLC +# and RAJA project contributors. See the RAJA/LICENSE file for details. +# +# SPDX-License-Identifier: (BSD-3-Clause) +############################################################################### + +set(RAJA_COMPILER "RAJA_COMPILER_CLANG" CACHE STRING "") + +set(CMAKE_CXX_COMPILER "clang++" CACHE PATH "") +#set(CMAKE_CXX_COMPILER "dpcpp" CACHE PATH "") + +set(CMAKE_CXX_FLAGS_RELEASE "-O3 -L${SYCL_LIB_PATH} -fsycl -fsycl-unnamed-lambda -fsycl-targets=amdgcn-amd-amdhsa -Xsycl-target-backend --offload-arch=gfx90a" CACHE STRING "") +set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -L${SYCL_LIB_PATH} -fsycl -fsycl-unnamed-lambda -fsycl-targets=amdgcn-amd-amdhsa -Xsycl-target-backend --offload-arch=gfx90a" CACHE STRING "") +set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g -L${SYCL_LIB_PATH} -fsycl -fsycl-unnamed-lambda -fsycl-targets=amdgcn-amd-amdhsa -Xsycl-target-backend --offload-arch=gfx90a" CACHE STRING "") +#set(CMAKE_CXX_FLAGS_RELEASE "-O3 -std=c++17 -fsycl -fsycl-unnamed-lambda --gcc-toolchain=/usr/tce/packages/gcc/gcc-7.1.0" CACHE STRING "") +#set(CMAKE_CXX_FLAGS_RELWITHDEBINFO " -O3 -g -std=c++17 -fsycl -fsycl-unnamed-lambda --gcc-toolchain=/usr/tce/packages/gcc/gcc-7.1.0" CACHE STRING "") +#set(CMAKE_CXX_FLAGS_DEBUG " -O0 -g -std=c++17 -fsycl -fsycl-unnamed-lambda --gcc-toolchain=/usr/tce/packages/gcc/gcc-7.1.0" CACHE STRING "") +#set(CMAKE_CXX_LINK_FLAGS "-fsycl -Wl,-rpath,/usr/tce/packages/oneapi/oneapi-2021.2/compiler/2021.2.0/linux/compiler/lib/intel64_lin/" CACHE STRING "") + +set(RAJA_RANGE_ALIGN 4 CACHE STRING "") +set(RAJA_RANGE_MIN_LENGTH 32 CACHE STRING "") +set(RAJA_DATA_ALIGN 64 CACHE STRING "") + +set(RAJA_HOST_CONFIG_LOADED On CACHE BOOL "") diff --git a/include/RAJA/RAJA.hpp b/include/RAJA/RAJA.hpp index 7a2df45ffb..c01f9167bc 100644 --- a/include/RAJA/RAJA.hpp +++ b/include/RAJA/RAJA.hpp @@ -43,12 +43,14 @@ // #include "RAJA/pattern/forall.hpp" #include "RAJA/pattern/kernel.hpp" -#include "RAJA/pattern/teams.hpp" +#include "RAJA/pattern/launch.hpp" // // Generic templates to describe SIMD/SIMT registers and vectors // +#if defined(RAJA_ENABLE_VECTORIZATION) #include "RAJA/pattern/tensor.hpp" +#endif // // All platforms must support sequential execution. @@ -64,7 +66,9 @@ // All platforms should support simd and vector execution. 
// #include "RAJA/policy/simd.hpp" +#if defined(RAJA_ENABLE_VECTORIZATION) #include "RAJA/policy/tensor.hpp" +#endif #if defined(RAJA_ENABLE_TBB) #include "RAJA/policy/tbb.hpp" @@ -195,9 +199,9 @@ namespace RAJA { namespace expt{} - // provide a RAJA::expt namespace for experimental work, but bring alias - // it into RAJA so it doesn't affect user code - using namespace expt; +// // provide a RAJA::expt namespace for experimental work, but bring alias +// // it into RAJA so it doesn't affect user code +// using namespace expt; } #endif // closing endif for header file include guard diff --git a/include/RAJA/config.hpp.in b/include/RAJA/config.hpp.in index 550fdc4198..404d8beebf 100644 --- a/include/RAJA/config.hpp.in +++ b/include/RAJA/config.hpp.in @@ -100,6 +100,12 @@ static_assert(RAJA_HAS_SOME_CXX14, "compiler and/or standard library does not claim support for " "C++14 features we need"); +#if defined(__cpp_lib_is_invocable) && (__cpp_lib_is_invocable >= 201703L) +#define RAJA_HAS_CXX17_IS_INVOCABLE 1 +#else +#define RAJA_HAS_CXX17_IS_INVOCABLE 0 +#endif + /*! ****************************************************************************** * @@ -169,6 +175,7 @@ static_assert(RAJA_HAS_SOME_CXX14, #cmakedefine RAJA_ENABLE_CLANG_CUDA #cmakedefine RAJA_ENABLE_HIP #cmakedefine RAJA_ENABLE_SYCL +#cmakedefine RAJA_ENABLE_VECTORIZATION #cmakedefine RAJA_ENABLE_NV_TOOLS_EXT #cmakedefine RAJA_ENABLE_ROCTX @@ -234,15 +241,15 @@ static_assert(RAJA_HAS_SOME_CXX14, namespace RAJA { -#if defined(RAJA_ENABLE_OPENMP) +#if defined(RAJA_ENABLE_OPENMP) && !defined(__HIP_DEVICE_COMPILE__) #if defined(_OPENMP) -#if _OPENMP >= 200805 +#if (_OPENMP >= 200805) #define RAJA_ENABLE_OPENMP_TASK #endif #else -#error RAJA configured with RAJA_ENABLE_OPENMP, but OpenMP not supported by current compiler +#error RAJA configured with RAJA_ENABLE_OPENMP, but _OPENMP is not defined in this code section #endif // _OPENMP -#endif // RAJA_ENABLE_OPENMP +#endif // RAJA_ENABLE_OPENMP && __HIP_DEVICE_COMPILE__ #if defined(RAJA_ENABLE_CUDA) && defined(__CUDACC__) #define RAJA_CUDA_ACTIVE @@ -252,10 +259,12 @@ namespace RAJA { #define RAJA_HIP_ACTIVE #include -#if (HIP_VERSION_MAJOR > 4) || \ - (HIP_VERSION_MAJOR == 4 && HIP_VERSION_MINOR >= 3) -// enable device function pointers with rocm version >= 4.3 +#if (HIP_VERSION_MAJOR > 5) || \ + (HIP_VERSION_MAJOR == 5 && HIP_VERSION_MINOR >= 1) +// enable device function pointers with rocm version >= 5.1 +// this used to be set to 4.3, but tests start passing with 5.1 #define RAJA_ENABLE_HIP_INDIRECT_FUNCTION_CALL +#define RAJA_ENABLE_HIP_DOUBLE_ATOMICADD #endif #if (HIP_VERSION_MAJOR > 4) || \ (HIP_VERSION_MAJOR == 4 && HIP_VERSION_MINOR >= 2) @@ -265,8 +274,15 @@ namespace RAJA { #endif #endif // RAJA_ENABLE_HIP && __HIPCC__ +#if defined(RAJA_ENABLE_SYCL) +#if defined(SYCL_LANGUAGE_VERSION) +#define RAJA_SYCL_ACTIVE +#endif +#endif + #if defined(RAJA_CUDA_ACTIVE) || \ - defined(RAJA_HIP_ACTIVE) + defined(RAJA_HIP_ACTIVE) || \ + defined(RAJA_SYCL_ACTIVE) #define RAJA_DEVICE_ACTIVE #endif @@ -372,6 +388,7 @@ const int DATA_ALIGN = @RAJA_DATA_ALIGN@; #endif #define RAJA_UNROLL RAJA_PRAGMA(unroll) +#define RAJA_UNROLL_COUNT(N) RAJA_PRAGMA(unroll(N)) #if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) #define RAJA_ALIGN_DATA(d) d @@ -400,8 +417,10 @@ const int DATA_ALIGN = @RAJA_DATA_ALIGN@; #if !defined(__NVCC__) #define RAJA_UNROLL RAJA_PRAGMA(GCC unroll 10000) +#define RAJA_UNROLL_COUNT(N) RAJA_PRAGMA(GCC unroll N) #else #define RAJA_UNROLL RAJA_PRAGMA(unroll) +#define 
RAJA_UNROLL_COUNT(N) RAJA_PRAGMA(unroll N) #endif #if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) @@ -429,7 +448,9 @@ const int DATA_ALIGN = @RAJA_DATA_ALIGN@; // #define RAJA_FORCEINLINE_RECURSIVE #define RAJA_INLINE inline __attribute__((always_inline)) -#define RAJA_UNROLL +#define RAJA_UNROLL +#define RAJA_UNROLL_COUNT(N) + // FIXME: alignx is breaking CUDA+xlc #if defined(RAJA_ENABLE_CUDA) #define RAJA_ALIGN_DATA(d) d @@ -458,6 +479,7 @@ const int DATA_ALIGN = @RAJA_DATA_ALIGN@; #define RAJA_FORCEINLINE_RECURSIVE #define RAJA_INLINE inline __attribute__((always_inline)) #define RAJA_UNROLL RAJA_PRAGMA(clang loop unroll(enable)) +#define RAJA_UNROLL_COUNT(N) RAJA_PRAGMA(clang loop unroll_count(N)) // note that neither nvcc nor Apple Clang compiler currently doesn't support @@ -499,6 +521,7 @@ const int DATA_ALIGN = @RAJA_DATA_ALIGN@; #define RAJA_SIMD #define RAJA_NO_SIMD #define RAJA_UNROLL +#define RAJA_UNROLL_COUNT(N) #else @@ -509,6 +532,7 @@ const int DATA_ALIGN = @RAJA_DATA_ALIGN@; #define RAJA_SIMD #define RAJA_NO_SIMD #define RAJA_UNROLL +#define RAJA_UNROLL_COUNT(N) #endif @@ -546,10 +570,16 @@ T * align_hint(T * x) #define RAJA_UNROLL #endif -// If we're in CUDA device code, we can use the nvcc unroll pragma -#if defined(__CUDA_ARCH__) && defined(RAJA_CUDA_ACTIVE) +#ifndef RAJA_UNROLL_COUNT +#define RAJA_UNROLL_COUNT(N) +#endif + +// If we're in CUDA or HIP device code, we can use the unroll pragma +#if (defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)) && defined(RAJA_DEVICE_ACTIVE) #undef RAJA_UNROLL +#undef RAJA_UNROLL_COUNT #define RAJA_UNROLL RAJA_PRAGMA(unroll) +#define RAJA_UNROLL_COUNT(N) RAJA_PRAGMA(unroll N) #endif #endif // closing endif for header file include guard diff --git a/include/RAJA/index/IndexSet.hpp b/include/RAJA/index/IndexSet.hpp index d9d1de4010..52a1161577 100644 --- a/include/RAJA/index/IndexSet.hpp +++ b/include/RAJA/index/IndexSet.hpp @@ -299,16 +299,16 @@ class TypedIndexSet : public TypedIndexSet //! Add copy of segment to back end of index set. template - RAJA_INLINE void push_back(Tnew const &val) + RAJA_INLINE void push_back(Tnew &&val) { - push_internal(new Tnew(val), PUSH_BACK, PUSH_COPY); + push_internal(new typename std::decay::type(std::forward(val)), PUSH_BACK, PUSH_COPY); } //! Add copy of segment to front end of index set. template - RAJA_INLINE void push_front(Tnew const &val) + RAJA_INLINE void push_front(Tnew &&val) { - push_internal(new Tnew(val), PUSH_FRONT, PUSH_COPY); + push_internal(new typename std::decay::type(std::forward(val)), PUSH_FRONT, PUSH_COPY); } //! Return total length -- sum of lengths of all segments diff --git a/include/RAJA/index/ListSegment.hpp b/include/RAJA/index/ListSegment.hpp index 105fa6650b..de998abbbe 100644 --- a/include/RAJA/index/ListSegment.hpp +++ b/include/RAJA/index/ListSegment.hpp @@ -49,12 +49,12 @@ namespace RAJA * end() -- returns a StorageT* * size() -- returns size of the Segment iteration space (RAJA::Index_type) * - * NOTE: TypedListSegment supports the option for the segment to own the + * NOTE: TypedListSegment supports the option for the segment to own the * its index data or simply use the index array passed to the constructor. - * Owning the index data is the default; an array is created in the + * Owning the index data is the default; an array is created in the * memory space specified by the camp resource object and the values are - * copied from the input array to that. 
Ownership of the indices is - * determined by an optional ownership enum value passed to the + * copied from the input array to that. Ownership of the indices is + * determined by an optional ownership enum value passed to the * constructor. * * Usage: @@ -62,14 +62,14 @@ namespace RAJA * A common C-style loop traversal pattern using an indirection array would be: * * \verbatim - * const T* indices = ...; + * const T* indices = ...; * for (T i = begin; i < end; ++i) { * // loop body -- use indices[i] as index value * } * \endverbatim * * A TypedListSegment would be used with a RAJA forall execution template as: - * + * * \verbatim * camp::resources::Resource resource{ camp resource type }; * TypedListSegment listseg(indices, length, resource); @@ -88,7 +88,7 @@ class TypedListSegment //@{ //! @name Types used in implementation based on template parameter. - + //! The underlying value type for index storage using value_type = StorageT; @@ -107,11 +107,12 @@ class TypedListSegment * \brief Construct a list segment from given array with specified length * and use given camp resource to allocate list segment index data * if owned by this list segment. - * + * * \param values array of indices defining iteration space of segment * \param length number of indices * \param resource camp resource defining memory space where index data live - * \param owned optional enum value indicating whether segment owns indices (Owned or Unowned). Default is Owned. + * \param owned optional enum value indicating whether segment owns indices + * (Owned or Unowned). Default is Owned. * * If 'Unowned' is passed as last argument, the segment will not own its * index data. In this case, caller must manage array lifetime properly. @@ -120,9 +121,9 @@ class TypedListSegment Index_type length, camp::resources::Resource resource, IndexOwnership owned = Owned) - : m_resource(resource) + : m_resource(nullptr), m_owned(Unowned), m_data(nullptr), m_size(0) { - initIndexData(values, length, owned); + initIndexData(values, length, resource, owned); } /*! @@ -140,8 +141,7 @@ class TypedListSegment template TypedListSegment(const Container& container, camp::resources::Resource resource) - : m_resource(resource), - m_owned(Unowned), m_data(nullptr), m_size(container.size()) + : m_resource(nullptr), m_owned(Unowned), m_data(nullptr), m_size(container.size()) { if (m_size > 0) { @@ -158,8 +158,9 @@ class TypedListSegment ++src; } - m_data = m_resource.allocate(m_size); - m_resource.memcpy(m_data, tmp, sizeof(value_type) * m_size); + m_resource = new camp::resources::Resource(resource); + m_data = m_resource->allocate(m_size); + m_resource->memcpy(m_data, tmp, sizeof(value_type) * m_size); m_owned = Owned; host_res.deallocate(tmp); @@ -171,29 +172,74 @@ class TypedListSegment TypedListSegment() = delete; //! Copy constructor for list segment - TypedListSegment(const TypedListSegment& other) - : m_resource(other.m_resource), - m_owned(Unowned), m_data(nullptr), m_size(0) + // As this may be called from a lambda in a + // RAJA method we perform a shallow copy + RAJA_HOST_DEVICE TypedListSegment(const TypedListSegment& other) + : m_resource(nullptr), + m_owned(Unowned), m_data(other.m_data), m_size(other.m_size) + { + } + + //! 
Copy assignment for list segment + // As this may be called from a lambda in a + // RAJA method we perform a shallow copy + RAJA_HOST_DEVICE TypedListSegment& operator=(const TypedListSegment& other) { - bool from_copy_ctor = true; - initIndexData(other.m_data, other.m_size, other.m_owned, from_copy_ctor); + clear(); + m_resource = nullptr; + m_owned = Unowned; + m_data = other.m_data; + m_size = other.m_size; + } + + //! move assignment for list segment + // As this may be called from a lambda in a + // RAJA method we perform a shallow copy + RAJA_HOST_DEVICE TypedListSegment& operator=(TypedListSegment&& rhs) + { + clear(); + m_resource = rhs.m_resource; + m_owned = rhs.m_owned; + m_data = rhs.m_data; + m_size = rhs.m_size; + + rhs.m_resource = nullptr; + rhs.m_owned = Unowned; + rhs.m_data = nullptr; + rhs.m_size = 0; } //! Move constructor for list segment - TypedListSegment(TypedListSegment&& rhs) + RAJA_HOST_DEVICE TypedListSegment(TypedListSegment&& rhs) : m_resource(rhs.m_resource), m_owned(rhs.m_owned), m_data(rhs.m_data), m_size(rhs.m_size) { - // make the rhs non-owning so it's destructor won't have any side effects rhs.m_owned = Unowned; + rhs.m_resource = nullptr; + rhs.m_size = 0; + rhs.m_data = nullptr; } //! List segment destructor - ~TypedListSegment() + RAJA_HOST_DEVICE ~TypedListSegment() { + clear(); + } + + //! Clear method to be called + RAJA_HOST_DEVICE void clear() + { + +#if !defined(RAJA_DEVICE_CODE) if (m_data != nullptr && m_owned == Owned) { - m_resource.deallocate(m_data); + m_resource->deallocate(m_data); + delete m_resource; } +#endif + m_data = nullptr; + m_resource = nullptr; + m_owned = Unowned; + m_size = 0; } //@} @@ -235,7 +281,7 @@ class TypedListSegment * \return true if segment size is same as given length value and values in * given array match segment index values, else false * - * Method assumes values in given array and segment indices both live in host + * Method assumes values in given array and segment indices both live in host * memory space. */ RAJA_HOST_DEVICE bool indicesEqual(const value_type* container, @@ -252,9 +298,9 @@ class TypedListSegment /*! * \brief Compare this segment to another for equality * - * \return true if both segments are the same size and indices match, + * \return true if both segments are the same size and indices match, * else false - * + * * Method assumes indices in both segments live in host memory space. */ RAJA_HOST_DEVICE bool operator==(const TypedListSegment& other) const @@ -265,9 +311,9 @@ class TypedListSegment /*! * \brief Compare this segment to another for inequality * - * \return true if segments are not the same size or indices do not match, + * \return true if segments are not the same size or indices do not match, * else false - * + * * Method assumes indices in both segments live in host memory space. 
*/ RAJA_HOST_DEVICE bool operator!=(const TypedListSegment& other) const @@ -294,8 +340,8 @@ class TypedListSegment // void initIndexData(const value_type* container, Index_type len, - IndexOwnership container_own, - bool from_copy_ctor = false) + camp::resources::Resource resource_, + IndexOwnership container_own) { // empty list segment @@ -311,12 +357,7 @@ class TypedListSegment m_owned = container_own; if (m_owned == Owned) { - if ( from_copy_ctor ) { - - m_data = m_resource.allocate(m_size); - m_resource.memcpy(m_data, container, sizeof(value_type) * m_size); - - } else { + m_resource = new camp::resources::Resource(resource_); camp::resources::Resource host_res{camp::resources::Host()}; @@ -326,16 +367,14 @@ class TypedListSegment tmp[i] = container[i]; } - m_data = m_resource.allocate(m_size); - m_resource.memcpy(m_data, tmp, sizeof(value_type) * m_size); + m_data = m_resource->allocate(m_size); + m_resource->memcpy(m_data, tmp, sizeof(value_type) * m_size); host_res.deallocate(tmp); - } - return; } - + // list segment accesses container data directly. // Uh-oh. Using evil const_cast.... m_data = const_cast(container); @@ -343,7 +382,7 @@ class TypedListSegment // Copy of camp resource passed to ctor - camp::resources::Resource m_resource; + camp::resources::Resource *m_resource; // Ownership flag to guide data copying/management IndexOwnership m_owned; diff --git a/include/RAJA/internal/RAJAVec.hpp b/include/RAJA/internal/RAJAVec.hpp index b64508fd11..9775f4771a 100644 --- a/include/RAJA/internal/RAJAVec.hpp +++ b/include/RAJA/internal/RAJAVec.hpp @@ -21,6 +21,7 @@ #include "RAJA/config.hpp" +#include #include #include #include diff --git a/include/RAJA/internal/foldl.hpp b/include/RAJA/internal/foldl.hpp index cc0d7af4e3..78f18ee0d3 100644 --- a/include/RAJA/internal/foldl.hpp +++ b/include/RAJA/internal/foldl.hpp @@ -48,6 +48,28 @@ struct foldl_impl { using Ret = Arg1; }; +#if RAJA_HAS_CXX17_IS_INVOCABLE + +template +struct foldl_impl { + using Ret = typename std::invoke_result::type; +}; + +template +struct foldl_impl { + using Ret = typename foldl_impl< + Op, + typename std::invoke_result::type, + Arg3>::type, + Rest...>::Ret; +}; + +#else + template struct foldl_impl { using Ret = typename std::result_of::type; @@ -66,6 +88,8 @@ struct foldl_impl { Rest...>::Ret; }; +#endif + } // namespace detail template diff --git a/include/RAJA/pattern/WorkGroup.hpp b/include/RAJA/pattern/WorkGroup.hpp index 15b4b94e77..65b8cd53a7 100644 --- a/include/RAJA/pattern/WorkGroup.hpp +++ b/include/RAJA/pattern/WorkGroup.hpp @@ -193,12 +193,14 @@ struct WorkSite { template struct WorkPool, + STORAGE_POLICY_T, + DISPATCH_POLICY_T>, INDEX_T, xargs, ALLOCATOR_T> @@ -206,7 +208,8 @@ struct WorkPool; + using dispatch_policy = DISPATCH_POLICY_T; + using policy = WorkGroupPolicy; using index_type = INDEX_T; using xarg_type = xargs; using Allocator = ALLOCATOR_T; @@ -216,9 +219,9 @@ struct WorkPool; + exec_policy, order_policy, dispatch_policy, Allocator, index_type, Args...>; using storage_type = detail::WorkStorage< - storage_policy, Allocator, typename workrunner_type::vtable_type>; + storage_policy, Allocator, typename workrunner_type::dispatcher_type>; friend workgroup_type; friend worksite_type; @@ -302,12 +305,14 @@ struct WorkPool struct WorkGroup, + STORAGE_POLICY_T, + DISPATCH_POLICY_T>, INDEX_T, xargs, ALLOCATOR_T> @@ -315,7 +320,8 @@ struct WorkGroup; + using dispatch_policy = DISPATCH_POLICY_T; + using policy = WorkGroupPolicy; using index_type = INDEX_T; using xarg_type = xargs; using 
Allocator = ALLOCATOR_T; @@ -372,12 +378,14 @@ struct WorkGroup struct WorkSite, + STORAGE_POLICY_T, + DISPATCH_POLICY_T>, INDEX_T, xargs, ALLOCATOR_T> @@ -385,7 +393,8 @@ struct WorkSite; + using dispatch_policy = DISPATCH_POLICY_T; + using policy = WorkGroupPolicy; using index_type = INDEX_T; using xarg_type = xargs; using Allocator = ALLOCATOR_T; @@ -439,17 +448,18 @@ struct WorkSite inline typename WorkPool< - WorkGroupPolicy, + WorkGroupPolicy, INDEX_T, xargs, ALLOCATOR_T>::workgroup_type WorkPool< - WorkGroupPolicy, + WorkGroupPolicy, INDEX_T, xargs, ALLOCATOR_T>::instantiate() @@ -465,21 +475,22 @@ WorkPool< template inline typename WorkGroup< - WorkGroupPolicy, + WorkGroupPolicy, INDEX_T, xargs, ALLOCATOR_T>::worksite_type WorkGroup< - WorkGroupPolicy, + WorkGroupPolicy, INDEX_T, xargs, ALLOCATOR_T>::run(typename WorkGroup< - WorkGroupPolicy, + WorkGroupPolicy, INDEX_T, xargs, ALLOCATOR_T>::resource_type r, diff --git a/include/RAJA/pattern/WorkGroup/Dispatcher.hpp b/include/RAJA/pattern/WorkGroup/Dispatcher.hpp new file mode 100644 index 0000000000..221f900b98 --- /dev/null +++ b/include/RAJA/pattern/WorkGroup/Dispatcher.hpp @@ -0,0 +1,725 @@ +/*! + ****************************************************************************** + * + * \file + * + * \brief Header file providing RAJA Dispatcher for workgroup. + * + ****************************************************************************** + */ + +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-22, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#ifndef RAJA_PATTERN_WORKGROUP_Dispatcher_HPP +#define RAJA_PATTERN_WORKGROUP_Dispatcher_HPP + + +#include "RAJA/config.hpp" + +#include "RAJA/policy/WorkGroup.hpp" + +#include "camp/number.hpp" +#include "camp/list.hpp" +#include "camp/helpers.hpp" + +#include + + +namespace RAJA +{ + +namespace detail +{ + +template < typename > +struct DispatcherVoidPtrWrapper +{ + void* ptr; + DispatcherVoidPtrWrapper() = default; + // implicit constructor from void* + RAJA_HOST_DEVICE DispatcherVoidPtrWrapper(void* p) : ptr(p) { } +}; + +template < typename > +struct DispatcherVoidConstPtrWrapper +{ + const void* ptr; + DispatcherVoidConstPtrWrapper() = default; + // implicit constructor from const void* + RAJA_HOST_DEVICE DispatcherVoidConstPtrWrapper(const void* p) : ptr(p) { } +}; + + +constexpr bool dispatcher_use_host_invoke(Platform platform) { + return !(platform == Platform::cuda || platform == Platform::hip); +} + +// Transforms one dispatch policy into another by creating a dispatch policy +// of holder_type objects. See usage in WorkRunner for more explanation. +template < typename dispatch_policy, typename holder_type > +struct dispatcher_transform_types; +/// +template < typename dispatch_policy, typename holder_type > +using dispatcher_transform_types_t = + typename dispatcher_transform_types::type; + +/*! + * A dispatcher abstraction that provides an interface to some basic + * functionality that is implemented differently based on the dispatch_policy. + * + * DispatcherID is used to differentiate function pointers based on their + * function signature. + */ +template < Platform platform, typename dispatch_policy, typename DispatcherID, typename ... 
CallArgs > +struct Dispatcher; + + +template < typename holder_type > +struct dispatcher_transform_types<::RAJA::indirect_function_call_dispatch, holder_type> { + using type = ::RAJA::indirect_function_call_dispatch; +}; + +/*! + * Version of Dispatcher that acts essentially like a vtable. It implements + * the interface with function pointers. + * + * DispatcherID can be helpful to avoid function signature collisions + * with functions that will not be used through this class. This is useful + * during device linking when functions with high register counts may cause + * device linking to fail. + */ +template < Platform platform, typename DispatcherID, typename ... CallArgs > +struct Dispatcher { + static constexpr bool use_host_invoke = dispatcher_use_host_invoke(platform); + using dispatch_policy = ::RAJA::indirect_function_call_dispatch; + using void_ptr_wrapper = DispatcherVoidPtrWrapper; + using void_cptr_wrapper = DispatcherVoidConstPtrWrapper; + + /// + /// move construct an object of type T in dest as a copy of a T from src and + /// destroy the T obj in src + /// + template < typename T > + static void s_move_construct_destroy(void_ptr_wrapper dest, void_ptr_wrapper src) + { + T* dest_as_T = static_cast(dest.ptr); + T* src_as_T = static_cast(src.ptr); + new(dest_as_T) T(std::move(*src_as_T)); + (*src_as_T).~T(); + } + + /// + /// invoke the call operator of the object of type T in obj with args + /// + template < typename T > + static void s_host_invoke(void_cptr_wrapper obj, CallArgs... args) + { + const T* obj_as_T = static_cast(obj.ptr); + (*obj_as_T)(std::forward(args)...); + } + /// + template < typename T > + static RAJA_DEVICE void s_device_invoke(void_cptr_wrapper obj, CallArgs... args) + { + const T* obj_as_T = static_cast(obj.ptr); + (*obj_as_T)(std::forward(args)...); + } + + /// + /// destroy the object of type T in obj + /// + template < typename T > + static void s_destroy(void_ptr_wrapper obj) + { + T* obj_as_T = static_cast(obj.ptr); + (*obj_as_T).~T(); + } + + using mover_type = void(*)(void_ptr_wrapper /*dest*/, void_ptr_wrapper /*src*/); + using invoker_type = void(*)(void_cptr_wrapper /*obj*/, CallArgs... /*args*/); + using destroyer_type = void(*)(void_ptr_wrapper /*obj*/); + + // This can't be a cuda device lambda due to compiler limitations + template < typename T > + struct DeviceInvokerFactory { + using value_type = invoker_type; + RAJA_DEVICE value_type operator()() { +#if defined(RAJA_ENABLE_HIP) && !defined(RAJA_ENABLE_HIP_INDIRECT_FUNCTION_CALL) + return nullptr; +#else + return &s_device_invoke; +#endif + } + }; + + /// + /// create a Dispatcher that can be used on the host for objects of type T + /// + template< typename T, + bool uhi = use_host_invoke, std::enable_if_t* = nullptr > + static inline Dispatcher makeDispatcher() { + return { mover_type{&s_move_construct_destroy}, + invoker_type{&s_host_invoke}, + destroyer_type{&s_destroy}, + sizeof(T) + }; + } + /// + /// create a Dispatcher that can be used on the device for objects of type T + /// + /// To do this the invoker_type must be created on the device to get the + /// device function pointer. The createOnDevice parameter is responsible for + /// providing the device context and returning the invoker object created. + /// The createOnDevice object uses an invoker factory provided as an argument + /// to create the invoker object. This allows for a separation between + /// object creation and the device context (cuda, hip, etc) and copying. 
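  /// (For orientation: which Dispatcher specialization is used is selected by
  ///  the dispatch policy carried in the WorkGroup policy, e.g. something like
  ///    RAJA::WorkGroupPolicy< RAJA::seq_work, RAJA::ordered,
  ///                           RAJA::constant_stride_array_of_objects,
  ///                           RAJA::indirect_function_call_dispatch >;
  ///  the first three policy names are assumed from RAJA's WorkGroup
  ///  documentation, while the dispatch policies are the ones handled in this
  ///  header.)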
+ /// + template< typename T, typename CreateOnDevice, + bool uhi = use_host_invoke, std::enable_if_t* = nullptr > + static inline Dispatcher makeDispatcher(CreateOnDevice&& createOnDevice) { + return { mover_type{&s_move_construct_destroy}, + invoker_type{std::forward(createOnDevice)(DeviceInvokerFactory{})}, + destroyer_type{&s_destroy}, + sizeof(T) + }; + } + + mover_type move_construct_destroy; + invoker_type invoke; + destroyer_type destroy; + size_t size; +}; + + +template < typename holder_type > +struct dispatcher_transform_types<::RAJA::indirect_virtual_function_dispatch, holder_type> { + using type = ::RAJA::indirect_virtual_function_dispatch; +}; + +/*! + * Version of Dispatcher that uses a class hierarchy and virtual functions to + * implement the interface. + * + * DispatcherID can be helpful to avoid function signature collisions + * with functions that will not be used through this class. This is useful + * during device linking when functions with high register counts may cause + * device linking to fail. + */ +template < Platform platform, typename DispatcherID, typename ... CallArgs > +struct Dispatcher { + static constexpr bool use_host_invoke = dispatcher_use_host_invoke(platform); + using dispatch_policy = ::RAJA::indirect_virtual_function_dispatch; + using void_ptr_wrapper = DispatcherVoidPtrWrapper; + using void_cptr_wrapper = DispatcherVoidConstPtrWrapper; + + struct impl_base { + virtual void move_destroy(void_ptr_wrapper dest, void_ptr_wrapper src) const = 0; + virtual void destroy(void_ptr_wrapper obj) const = 0; + }; + + struct host_impl_base { + virtual void invoke(void_cptr_wrapper obj, CallArgs... args) const = 0; + }; + + struct device_impl_base { + virtual RAJA_DEVICE void invoke(void_cptr_wrapper obj, CallArgs... args) const = 0; + }; + + template < typename T > + struct base_impl_type : impl_base + { + /// + /// move construct an object of type T in dest as a copy of a T from src and + /// destroy the T obj in src + /// + virtual void move_destroy(void_ptr_wrapper dest, void_ptr_wrapper src) const override + { + T* dest_as_T = static_cast(dest.ptr); + T* src_as_T = static_cast(src.ptr); + new(dest_as_T) T(std::move(*src_as_T)); + (*src_as_T).~T(); + } + + /// + /// destroy the object of type T in obj + /// + virtual void destroy(void_ptr_wrapper obj) const override + { + T* obj_as_T = static_cast(obj.ptr); + (*obj_as_T).~T(); + } + }; + + template < typename T > + struct host_impl_type : host_impl_base + { + /// + /// invoke the call operator of the object of type T in obj with args + /// + virtual void invoke(void_cptr_wrapper obj, CallArgs... args) const override + { + const T* obj_as_T = static_cast(obj.ptr); + (*obj_as_T)(std::forward(args)...); + } + }; + + template < typename T > + struct device_impl_type : device_impl_base + { + /// + /// invoke the call operator of the object of type T in obj with args + /// + virtual RAJA_DEVICE void invoke(void_cptr_wrapper obj, CallArgs... args) const override + { + const T* obj_as_T = static_cast(obj.ptr); + (*obj_as_T)(std::forward(args)...); + } + }; + + struct mover_type { + impl_base* m_impl; + void operator()(void_ptr_wrapper dest, void_ptr_wrapper src) const + { + m_impl->move_destroy(dest, src); + } + }; + + struct host_invoker_type { + host_impl_base* m_impl; + void operator()(void_cptr_wrapper obj, CallArgs... 
args) const + { + m_impl->invoke(obj, std::forward(args)...); + } + }; + /// + struct device_invoker_type { + device_impl_base* m_impl; + RAJA_DEVICE void operator()(void_cptr_wrapper obj, CallArgs... args) const + { + m_impl->invoke(obj, std::forward(args)...); + } + }; + using invoker_type = std::conditional_t; + + struct destroyer_type { + impl_base* m_impl; + void operator()(void_ptr_wrapper obj) const + { + m_impl->destroy(obj); + } + }; + + // This can't be a cuda device lambda due to compiler limitations + template < typename T > + struct DeviceImplTypeFactory { + using value_type = device_impl_type*; + RAJA_DEVICE value_type operator()() { +#if defined(RAJA_ENABLE_HIP) && !defined(RAJA_ENABLE_HIP_INDIRECT_FUNCTION_CALL) + return nullptr; +#else + static device_impl_type s_device_impl; + return &s_device_impl; +#endif + } + }; + + /// + /// create a Dispatcher that can be used on the host for objects of type T + /// + template< typename T, + bool uhi = use_host_invoke, std::enable_if_t* = nullptr > + static inline Dispatcher makeDispatcher() { + static base_impl_type s_base_impl; + static host_impl_type s_host_impl; + return { mover_type{&s_base_impl}, + host_invoker_type{&s_host_impl}, + destroyer_type{&s_base_impl}, + sizeof(T) + }; + } + /// + /// create a Dispatcher that can be used on the device for objects of type T + /// + /// To do this the invoker_type must be created on the device to get the + /// device function pointer. The createOnDevice parameter is responsible for + /// providing the device context and returning the invoker object created. + /// The createOnDevice object uses an invoker factory provided as an argument + /// to create the invoker object. This allows for a separation between + /// object creation and the device context (cuda, hip, etc) and copying. + /// + template< typename T, typename CreateOnDevice, + bool uhi = use_host_invoke, std::enable_if_t* = nullptr> + static inline Dispatcher makeDispatcher(CreateOnDevice&& createOnDevice) { + static base_impl_type s_base_impl; + static device_impl_type* s_device_impl_ptr{ + std::forward(createOnDevice)(DeviceImplTypeFactory{}) }; + return { mover_type{&s_base_impl}, + device_invoker_type{s_device_impl_ptr}, + destroyer_type{&s_base_impl}, + sizeof(T) + }; + } + + mover_type move_construct_destroy; + invoker_type invoke; + destroyer_type destroy; + size_t size; +}; + + +// direct_dispatch expects a list of types +template < typename ... Ts, typename holder_type > +struct dispatcher_transform_types<::RAJA::direct_dispatch, holder_type> { + using type = ::RAJA::direct_dispatch...>; +}; + +/*! + * Version of Dispatcher that does direct dispatch to zero callable types. + * It implements the interface with callable objects. + */ +template < Platform platform, typename DispatcherID, typename ... CallArgs > +struct Dispatcher, DispatcherID, CallArgs...> { + static constexpr bool use_host_invoke = dispatcher_use_host_invoke(platform); + using dispatch_policy = ::RAJA::direct_dispatch<>; + using void_ptr_wrapper = DispatcherVoidPtrWrapper; + using void_cptr_wrapper = DispatcherVoidConstPtrWrapper; + + /// + /// move construct an object of type T in dest as a copy of a T from src and + /// destroy the T obj in src + /// + struct mover_type { + void operator()(void_ptr_wrapper, void_ptr_wrapper) const + { } + }; + + /// + /// invoke the call operator of the object of type T in obj with args + /// + struct host_invoker_type { + void operator()(void_cptr_wrapper, CallArgs...) 
const + { } + }; + struct device_invoker_type { + RAJA_DEVICE void operator()(void_cptr_wrapper, CallArgs...) const + { } + }; + using invoker_type = std::conditional_t; + + /// + /// destroy the object of type T in obj + /// + struct destroyer_type { + void operator()(void_ptr_wrapper) const + { } + }; + + /// + /// create a Dispatcher that can be used on the host for objects of type T + /// + template< typename T, + bool uhi = use_host_invoke, std::enable_if_t* = nullptr > + static inline Dispatcher makeDispatcher() { + return {mover_type{}, host_invoker_type{}, destroyer_type{}, sizeof(T)}; + } + /// + /// create a Dispatcher that can be used on the device for objects of type T + /// + /// Ignore the CreateOnDevice object as the same invoker object can be used + /// on the host and device. + /// + template< typename T, typename CreateOnDevice, + bool uhi = use_host_invoke, std::enable_if_t* = nullptr > + static inline Dispatcher makeDispatcher(CreateOnDevice&&) { + return {mover_type{}, device_invoker_type{}, destroyer_type{}, sizeof(T)}; + } + + mover_type move_construct_destroy; + invoker_type invoke; + destroyer_type destroy; + size_t size; +}; + +/*! + * Version of Dispatcher that does direct dispatch to a single callable type. + * It implements the interface with callable objects. + */ +template < Platform platform, typename T, typename DispatcherID, typename ... CallArgs > +struct Dispatcher, DispatcherID, CallArgs...> { + static constexpr bool use_host_invoke = dispatcher_use_host_invoke(platform); + using dispatch_policy = ::RAJA::direct_dispatch; + using void_ptr_wrapper = DispatcherVoidPtrWrapper; + using void_cptr_wrapper = DispatcherVoidConstPtrWrapper; + + /// + /// move construct an object of type T in dest as a copy of a T from src and + /// destroy the T obj in src + /// + struct mover_type { + void operator()(void_ptr_wrapper dest, void_ptr_wrapper src) const + { + T* dest_as_T = static_cast(dest.ptr); + T* src_as_T = static_cast(src.ptr); + new(dest_as_T) T(std::move(*src_as_T)); + (*src_as_T).~T(); + } + }; + + /// + /// invoke the call operator of the object of type T in obj with args + /// + struct host_invoker_type { + void operator()(void_cptr_wrapper obj, CallArgs... args) const + { + const T* obj_as_T = static_cast(obj.ptr); + (*obj_as_T)(std::forward(args)...); + } + }; + struct device_invoker_type { + RAJA_DEVICE void operator()(void_cptr_wrapper obj, CallArgs... args) const + { + const T* obj_as_T = static_cast(obj.ptr); + (*obj_as_T)(std::forward(args)...); + } + }; + using invoker_type = std::conditional_t; + + /// + /// destroy the object of type T in obj + /// + struct destroyer_type { + void operator()(void_ptr_wrapper obj) const + { + T* obj_as_T = static_cast(obj.ptr); + (*obj_as_T).~T(); + } + }; + + /// + /// create a Dispatcher that can be used on the host for objects of type T + /// + template< typename U, + bool uhi = use_host_invoke, std::enable_if_t* = nullptr > + static inline Dispatcher makeDispatcher() { + static_assert(std::is_same::value, "U must be in direct_dispatch types"); + return {mover_type{}, host_invoker_type{}, destroyer_type{}, sizeof(T)}; + } + /// + /// create a Dispatcher that can be used on the device for objects of type T + /// + /// Ignore the CreateOnDevice object as the same invoker object can be used + /// on the host and device. 
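The direct_dispatch specializations avoid function pointers and virtual calls entirely: the set of callable types is fixed at compile time, and (in the multi-type specialization that follows) an integer id recorded at enqueue time selects which branch actually runs, with a pack expansion guarding every candidate call behind an id comparison. A standalone sketch of that selection trick, using std::index_sequence and an array-expansion idiom in place of camp::sink (illustrative only, assuming C++14):

#include <cstdio>
#include <utility>

// Invoke the Is-th type in Ts... on the erased object, selected by a runtime
// id. Every candidate call is guarded by an id comparison; only the matching
// branch fires. The int array forces the pack expansion, the same idea as
// camp::sink in the Dispatcher above.
template <typename... Ts, std::size_t... Is>
void direct_invoke(int id, const void* obj, int arg,
                   std::index_sequence<Is...>) {
  int expand[] = {(Is == static_cast<std::size_t>(id)
                       ? ((*static_cast<const Ts*>(obj))(arg), 0)
                       : 0)...};
  (void)expand; // quiet unused-variable warning
}

struct Doubler {
  void operator()(int i) const { std::printf("2*i = %d\n", 2 * i); }
};
struct Squarer {
  void operator()(int i) const { std::printf("i*i = %d\n", i * i); }
};

int main() {
  Squarer s;
  // id 1 is Squarer's position in the list <Doubler, Squarer>
  direct_invoke<Doubler, Squarer>(1, &s, 7, std::make_index_sequence<2>{});
  return 0;
}

Because the callable types are known, the compiler can inline the selected call; the trade-off is that every enqueued callable must come from the listed types, which the static_asserts in these specializations enforce.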
+ /// + template< typename U, typename CreateOnDevice, + bool uhi = use_host_invoke, std::enable_if_t* = nullptr > + static inline Dispatcher makeDispatcher(CreateOnDevice&&) { + static_assert(std::is_same::value, "U must be in direct_dispatch types"); + return {mover_type{}, device_invoker_type{}, destroyer_type{}, sizeof(T)}; + } + + mover_type move_construct_destroy; + invoker_type invoke; + destroyer_type destroy; + size_t size; +}; + +/*! + * Version of Dispatcher that does direct dispatch to multiple callable types. + * It implements the interface with callable objects. + */ +template < typename T0, typename T1, typename ... TNs, + Platform platform, typename DispatcherID, typename ... CallArgs > +struct Dispatcher, + DispatcherID, CallArgs...> { + static constexpr bool use_host_invoke = dispatcher_use_host_invoke(platform); + using dispatch_policy = ::RAJA::direct_dispatch; + using void_ptr_wrapper = DispatcherVoidPtrWrapper; + using void_cptr_wrapper = DispatcherVoidConstPtrWrapper; + + using id_type = int; + using callable_indices = camp::make_int_seq_t; + using callable_types = camp::list; + + /// + /// move construct an object of type T in dest as a copy of a T from src and + /// destroy the T obj in src + /// + struct mover_type { + id_type id; + + void operator()(void_ptr_wrapper dest, void_ptr_wrapper src) const + { + impl_helper(callable_indices{}, callable_types{}, + dest, src); + } + + private: + template < int ... id_types, typename ... Ts > + void impl_helper(camp::int_seq, camp::list, + void_ptr_wrapper dest, void_ptr_wrapper src) const + { + camp::sink(((id_types == id) ? (impl(dest, src), 0) : 0)...); + } + + template < typename T > + void impl(void_ptr_wrapper dest, void_ptr_wrapper src) const + { + T* dest_as_T = static_cast(dest.ptr); + T* src_as_T = static_cast(src.ptr); + new(dest_as_T) T(std::move(*src_as_T)); + (*src_as_T).~T(); + } + }; + + /// + /// invoke the call operator of the object of type T in obj with args + /// + struct host_invoker_type { + id_type id; + + void operator()(void_cptr_wrapper obj, CallArgs... args) const + { + impl_helper(callable_indices{}, callable_types{}, + obj, std::forward(args)...); + } + + private: + template < int ... id_types, typename ... Ts > + void impl_helper(camp::int_seq, camp::list, + void_cptr_wrapper obj, CallArgs... args) const + { + camp::sink(((id_types == id) ? (impl(obj, std::forward(args)...), 0) : 0)...); + } + + template < typename T > + void impl(void_cptr_wrapper obj, CallArgs... args) const + { + const T* obj_as_T = static_cast(obj.ptr); + (*obj_as_T)(std::forward(args)...); + } + }; + struct device_invoker_type { + id_type id; + + RAJA_DEVICE void operator()(void_cptr_wrapper obj, CallArgs... args) const + { + impl_helper(callable_indices{}, callable_types{}, + obj, std::forward(args)...); + } + + private: + template < int ... id_types, typename ... Ts > + RAJA_DEVICE void impl_helper(camp::int_seq, camp::list, + void_cptr_wrapper obj, CallArgs... args) const + { + camp::sink(((id_types == id) ? (impl(obj, std::forward(args)...), 0) : 0)...); + } + + template < typename T > + RAJA_DEVICE void impl(void_cptr_wrapper obj, CallArgs... 
args) const + { + const T* obj_as_T = static_cast(obj.ptr); + (*obj_as_T)(std::forward(args)...); + } + }; + using invoker_type = std::conditional_t; + + /// + /// destroy the object of type T in obj + /// + struct destroyer_type { + id_type id; + + void operator()(void_ptr_wrapper obj) const + { + impl_helper(callable_indices{}, callable_types{}, + obj); + } + + private: + template < int ... id_types, typename ... Ts > + void impl_helper(camp::int_seq, camp::list, + void_ptr_wrapper obj) const + { + camp::sink(((id_types == id) ? (impl(obj), 0) : 0)...); + } + + template < typename T > + void impl(void_ptr_wrapper obj) const + { + T* obj_as_T = static_cast(obj.ptr); + (*obj_as_T).~T(); + } + }; + + /// + /// get the id of type T + /// + /// The id is just the index of T in the list of callable_types. + /// If T is not in Ts return -1. + /// + template < typename T, int ... id_types, typename ... Ts > + static constexpr id_type get_id(camp::int_seq, camp::list) + { + id_type id{-1}; + // quiet UB warning by sequencing assignment to id with list initialization + int unused[] {0, (std::is_same::value ? ((id = id_types), 0) : 0)...}; + camp::sink(unused); // quiet unused var warning + return id; + } + + /// + /// create a Dispatcher that can be used on the host for objects of type T + /// + template< typename T, + bool uhi = use_host_invoke, std::enable_if_t* = nullptr > + static inline Dispatcher makeDispatcher() { + static constexpr id_type id = get_id(callable_indices{}, callable_types{}); + static_assert(id != id_type(-1), "T must be in direct_dispatch types"); + return {mover_type{id}, host_invoker_type{id}, destroyer_type{id}, sizeof(T)}; + } + /// + /// create a Dispatcher that can be used on the device for objects of type T + /// + /// Ignore the CreateOnDevice object as the same invoker object can be used + /// on the host and device. + /// + template< typename T, typename CreateOnDevice, + bool uhi = use_host_invoke, std::enable_if_t* = nullptr > + static inline Dispatcher makeDispatcher(CreateOnDevice&&) { + static constexpr id_type id = get_id(callable_indices{}, callable_types{}); + static_assert(id != id_type(-1), "T must be in direct_dispatch types"); + return {mover_type{id}, device_invoker_type{id}, destroyer_type{id}, sizeof(T)}; + } + + mover_type move_construct_destroy; + invoker_type invoke; + destroyer_type destroy; + size_t size; +}; + +/*! + * Populate and return a pointer to a Dispatcher object for the given policy. + * NOTE: there is a function overload is in each policy/WorkGroup/Dispatcher.hpp + */ +// template < typename T, typename Dispatcher_T > +// inline const Dispatcher_T* get_Dispatcher(work_policy const&); + +} // namespace detail + +} // namespace RAJA + +#endif // closing endif for header file include guard diff --git a/include/RAJA/pattern/WorkGroup/Vtable.hpp b/include/RAJA/pattern/WorkGroup/Vtable.hpp deleted file mode 100644 index 4dfd9c6718..0000000000 --- a/include/RAJA/pattern/WorkGroup/Vtable.hpp +++ /dev/null @@ -1,127 +0,0 @@ -/*! - ****************************************************************************** - * - * \file - * - * \brief Header file providing RAJA Vtable for workgroup. - * - ****************************************************************************** - */ - -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2016-22, Lawrence Livermore National Security, LLC -// and RAJA project contributors. See the RAJA/LICENSE file for details. 
-// -// SPDX-License-Identifier: (BSD-3-Clause) -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - -#ifndef RAJA_PATTERN_WORKGROUP_Vtable_HPP -#define RAJA_PATTERN_WORKGROUP_Vtable_HPP - - -#include "RAJA/config.hpp" - -#include - - -namespace RAJA -{ - -namespace detail -{ - -template < typename > -struct VtableVoidPtrWrapper -{ - void* ptr; - VtableVoidPtrWrapper() = default; - // implicit constructor from void* - RAJA_HOST_DEVICE VtableVoidPtrWrapper(void* p) : ptr(p) { } -}; - -template < typename > -struct VtableVoidConstPtrWrapper -{ - const void* ptr; - VtableVoidConstPtrWrapper() = default; - // implicit constructor from const void* - RAJA_HOST_DEVICE VtableVoidConstPtrWrapper(const void* p) : ptr(p) { } -}; - -/*! - * A vtable abstraction - * - * Provides function pointers for basic functions. - * - * VtableID is used to differentiate function pointers based on their - * function signature. This is helpful to avoid function signature collisions - * with functions that will not be used through this class. This is useful - * during device linking when functions with high register counts may cause - * device linking to fail. - */ -template < typename VtableID, typename ... CallArgs > -struct Vtable { - using void_ptr_wrapper = VtableVoidPtrWrapper; - using void_cptr_wrapper = VtableVoidConstPtrWrapper; - using move_sig = void(*)(void_ptr_wrapper /*dest*/, void_ptr_wrapper /*src*/); - using call_sig = void(*)(void_cptr_wrapper /*obj*/, CallArgs... /*args*/); - using destroy_sig = void(*)(void_ptr_wrapper /*obj*/); - - /// - /// move construct an object of type T in dest as a copy of a T from src and - /// destroy the T obj in src - /// - template < typename T > - static void move_construct_destroy(void_ptr_wrapper dest, void_ptr_wrapper src) - { - T* dest_as_T = static_cast(dest.ptr); - T* src_as_T = static_cast(src.ptr); - new(dest_as_T) T(std::move(*src_as_T)); - (*src_as_T).~T(); - } - - /// - /// call the call operator of the object of type T in obj with args - /// - template < typename T > - static void host_call(void_cptr_wrapper obj, CallArgs... args) - { - const T* obj_as_T = static_cast(obj.ptr); - (*obj_as_T)(std::forward(args)...); - } - /// - template < typename T > - static RAJA_DEVICE void device_call(void_cptr_wrapper obj, CallArgs... args) - { - const T* obj_as_T = static_cast(obj.ptr); - (*obj_as_T)(std::forward(args)...); - } - - /// - /// destoy the object of type T in obj - /// - template < typename T > - static void destroy(void_ptr_wrapper obj) - { - T* obj_as_T = static_cast(obj.ptr); - (*obj_as_T).~T(); - } - - move_sig move_construct_destroy_function_ptr; - call_sig call_function_ptr; - destroy_sig destroy_function_ptr; - size_t size; -}; - -/*! - * Populate and return a pointer to a Vtable object for the given policy. 
- * NOTE: there is a function overload is in each policy/WorkGroup/Vtable.hpp - */ -// template < typename T, typename Vtable_T > -// inline const Vtable_T* get_Vtable(work_policy const&); - -} // namespace detail - -} // namespace RAJA - -#endif // closing endif for header file include guard diff --git a/include/RAJA/pattern/WorkGroup/WorkRunner.hpp b/include/RAJA/pattern/WorkGroup/WorkRunner.hpp index 3168b22ca0..b2775b3226 100644 --- a/include/RAJA/pattern/WorkGroup/WorkRunner.hpp +++ b/include/RAJA/pattern/WorkGroup/WorkRunner.hpp @@ -27,7 +27,7 @@ #include "RAJA/pattern/forall.hpp" -#include "RAJA/pattern/WorkGroup/Vtable.hpp" +#include "RAJA/pattern/WorkGroup/Dispatcher.hpp" #include "RAJA/policy/WorkGroup.hpp" @@ -140,6 +140,7 @@ struct HoldForall */ template @@ -152,6 +153,7 @@ struct WorkRunner; template @@ -159,12 +161,34 @@ struct WorkRunnerForallOrdered_base { using exec_policy = EXEC_POLICY_T; using order_policy = ORDER_POLICY_T; + using dispatch_policy = DISPATCH_POLICY_T; using Allocator = ALLOCATOR_T; using index_type = INDEX_T; using resource_type = typename resources::get_resource::type; using forall_exec_policy = FORALL_EXEC_POLICY; - using vtable_type = Vtable; + + // The type that will hold the segment and loop body in work storage + struct holder_type { + template < typename T > + using type = HoldForall>::type, // segment_type + typename camp::at>::type, // loop_type + index_type, Args...>; + }; + /// + template < typename T > + using holder_type_t = typename holder_type::template type; + + // The policy indicating where the call function is invoked + // in this case the values are called on the host in a loop + using dispatcher_exec_policy = RAJA::loop_work; + + // The Dispatcher policy with holder_types used internally to handle the + // ranges and callables passed in by the user. 
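A minimal sketch, not the RAJA implementation, of the holder_type idea described above: work storage keeps one object per enqueued loop that owns both the segment and the user's loop body, and running the work group simply invokes that object with the extra xargs. MiniHolder below stands in for HoldForall:

#include <cstdio>
#include <vector>

// Owns a copy of the segment and the loop body; its call operator replays
// the loop. In this diff the equivalent role is played by HoldForall, and
// the dispatcher is built over these holder types rather than the raw
// callables the user passed in.
template <typename Segment, typename LoopBody>
struct MiniHolder {
  Segment segment;
  LoopBody body;

  template <typename... Args>
  void operator()(Args&&... args) const {
    for (auto idx : segment) {
      body(idx, args...); // pass the extra xargs through to every iterate
    }
  }
};

int main() {
  std::vector<int> seg{0, 1, 2, 3};
  auto loop = [](int i, int offset) { std::printf("%d\n", i + offset); };

  MiniHolder<std::vector<int>, decltype(loop)> h{seg, loop};
  h(10); // replays the stored loop over the stored segment with xarg 10
  return 0;
}

The dispatcher_holder_policy transform above exists for exactly this reason: for direct_dispatch the dispatch policy must list these holder types, not the raw segment and loop types the user supplied.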
+ using dispatcher_holder_policy = dispatcher_transform_types_t; + + using dispatcher_type = Dispatcher; WorkRunnerForallOrdered_base() = default; @@ -174,24 +198,15 @@ struct WorkRunnerForallOrdered_base WorkRunnerForallOrdered_base(WorkRunnerForallOrdered_base &&) = default; WorkRunnerForallOrdered_base& operator=(WorkRunnerForallOrdered_base &&) = default; - // The type that will hold the segment and loop body in work storage - template < typename segment_type, typename loop_type > - using holder_type = HoldForall; - - // The policy indicating where the call function is invoked - // in this case the values are called on the host in a loop - using vtable_exec_policy = RAJA::loop_work; - // runner interfaces with storage to enqueue so the runner can get // information from the segment and loop at enqueue time template < typename WorkContainer, typename segment_T, typename loop_T > inline void enqueue(WorkContainer& storage, segment_T&& seg, loop_T&& loop) { - using holder = holder_type, camp::decay>; + using holder = holder_type_t, camp::decay>>; storage.template emplace( - get_Vtable(vtable_exec_policy{}), + get_Dispatcher(dispatcher_exec_policy{}), std::forward(seg), std::forward(loop)); } @@ -209,6 +224,7 @@ struct WorkRunnerForallOrdered_base template @@ -217,6 +233,7 @@ struct WorkRunnerForallOrdered FORALL_EXEC_POLICY, EXEC_POLICY_T, ORDER_POLICY_T, + DISPATCH_POLICY_T, ALLOCATOR_T, INDEX_T, Args...> @@ -225,6 +242,7 @@ struct WorkRunnerForallOrdered FORALL_EXEC_POLICY, EXEC_POLICY_T, ORDER_POLICY_T, + DISPATCH_POLICY_T, ALLOCATOR_T, INDEX_T, Args...>; @@ -242,7 +260,7 @@ struct WorkRunnerForallOrdered auto end = storage.end(); for (auto iter = storage.begin(); iter != end; ++iter) { - value_type::call(&*iter, r, args...); + value_type::host_call(&*iter, r, args...); } return run_storage; @@ -255,6 +273,7 @@ struct WorkRunnerForallOrdered template @@ -263,6 +282,7 @@ struct WorkRunnerForallReverse FORALL_EXEC_POLICY, EXEC_POLICY_T, ORDER_POLICY_T, + DISPATCH_POLICY_T, ALLOCATOR_T, INDEX_T, Args...> @@ -271,6 +291,7 @@ struct WorkRunnerForallReverse FORALL_EXEC_POLICY, EXEC_POLICY_T, ORDER_POLICY_T, + DISPATCH_POLICY_T, ALLOCATOR_T, INDEX_T, Args...>; @@ -288,7 +309,7 @@ struct WorkRunnerForallReverse auto begin = storage.begin(); for (auto iter = storage.end(); iter != begin; --iter) { - value_type::call(&*(iter-1), r, args...); + value_type::host_call(&*(iter-1), r, args...); } return run_storage; diff --git a/include/RAJA/pattern/WorkGroup/WorkStorage.hpp b/include/RAJA/pattern/WorkGroup/WorkStorage.hpp index b4e0bb4632..8cc442c01e 100644 --- a/include/RAJA/pattern/WorkGroup/WorkStorage.hpp +++ b/include/RAJA/pattern/WorkGroup/WorkStorage.hpp @@ -191,11 +191,11 @@ struct random_access_iterator : iterator_base /*! 
* A storage container for work groups */ -template < typename STORAGE_POLICY_T, typename ALLOCATOR_T, typename Vtable_T > +template < typename STORAGE_POLICY_T, typename ALLOCATOR_T, typename Dispatcher_T > class WorkStorage; -template < typename ALLOCATOR_T, typename Vtable_T > -class WorkStorage +template < typename ALLOCATOR_T, typename Dispatcher_T > +class WorkStorage { using allocator_traits_type = std::allocator_traits; using propagate_on_container_copy_assignment = @@ -208,12 +208,12 @@ class WorkStorage "WorkStorage expects an allocator for 'char's."); public: using storage_policy = RAJA::array_of_pointers; - using vtable_type = Vtable_T; + using dispatcher_type = Dispatcher_T; template < typename holder > - using true_value_type = WorkStruct; + using true_value_type = WorkStruct; - using value_type = GenericWorkStruct; + using value_type = GenericWorkStruct; using allocator_type = ALLOCATOR_T; using size_type = std::size_t; using difference_type = std::ptrdiff_t; @@ -338,10 +338,10 @@ class WorkStorage } template < typename holder, typename ... holder_ctor_args > - void emplace(const vtable_type* vtable, holder_ctor_args&&... ctor_args) + void emplace(const dispatcher_type* dispatcher, holder_ctor_args&&... ctor_args) { m_vec.emplace_back(create_value( - vtable, std::forward(ctor_args)...)); + dispatcher, std::forward(ctor_args)...)); } // destroy all stored loops, deallocates all storage @@ -390,7 +390,7 @@ class WorkStorage // allocate and construct value in storage template < typename holder, typename ... holder_ctor_args > - pointer_and_size create_value(const vtable_type* vtable, + pointer_and_size create_value(const dispatcher_type* dispatcher, holder_ctor_args&&... ctor_args) { const size_type value_size = sizeof(true_value_type); @@ -399,7 +399,7 @@ class WorkStorage allocator_traits_type::allocate(m_aloc, value_size)); value_type::template construct( - value_ptr, vtable, std::forward(ctor_args)...); + value_ptr, dispatcher, std::forward(ctor_args)...); return pointer_and_size{value_ptr, value_size}; } @@ -429,8 +429,8 @@ class WorkStorage } }; -template < typename ALLOCATOR_T, typename Vtable_T > -class WorkStorage +template < typename ALLOCATOR_T, typename Dispatcher_T > +class WorkStorage { using allocator_traits_type = std::allocator_traits; using propagate_on_container_copy_assignment = @@ -443,12 +443,12 @@ class WorkStorage "WorkStorage expects an allocator for 'char's."); public: using storage_policy = RAJA::ragged_array_of_objects; - using vtable_type = Vtable_T; + using dispatcher_type = Dispatcher_T; template < typename holder > - using true_value_type = WorkStruct; + using true_value_type = WorkStruct; - using value_type = GenericWorkStruct; + using value_type = GenericWorkStruct; using allocator_type = ALLOCATOR_T; using size_type = std::size_t; using difference_type = std::ptrdiff_t; @@ -568,11 +568,11 @@ class WorkStorage } template < typename holder, typename ... holder_ctor_args > - void emplace(const vtable_type* vtable, holder_ctor_args&&... ctor_args) + void emplace(const dispatcher_type* dispatcher, holder_ctor_args&&... ctor_args) { size_type value_offset = storage_size(); size_type value_size = create_value(value_offset, - vtable, std::forward(ctor_args)...); + dispatcher, std::forward(ctor_args)...); m_offsets.emplace_back(value_offset); m_array_end += value_size; } @@ -698,7 +698,7 @@ class WorkStorage // and store the loop body template < typename holder, typename ... 
holder_ctor_args > size_type create_value(size_type value_offset, - const vtable_type* vtable, + const dispatcher_type* dispatcher, holder_ctor_args&&... ctor_args) { const size_type value_size = sizeof(true_value_type); @@ -710,7 +710,7 @@ class WorkStorage pointer value_ptr = reinterpret_cast(m_array_begin + value_offset); value_type::template construct( - value_ptr, vtable, std::forward(ctor_args)...); + value_ptr, dispatcher, std::forward(ctor_args)...); return value_size; } @@ -732,10 +732,10 @@ class WorkStorage } }; -template < typename ALLOCATOR_T, typename Vtable_T > +template < typename ALLOCATOR_T, typename Dispatcher_T > class WorkStorage + Dispatcher_T> { using allocator_traits_type = std::allocator_traits; using propagate_on_container_copy_assignment = @@ -748,12 +748,12 @@ class WorkStorage - using true_value_type = WorkStruct; + using true_value_type = WorkStruct; - using value_type = GenericWorkStruct; + using value_type = GenericWorkStruct; using allocator_type = ALLOCATOR_T; using size_type = std::size_t; using difference_type = std::ptrdiff_t; @@ -873,9 +873,9 @@ class WorkStorage - void emplace(const vtable_type* vtable, holder_ctor_args&&... ctor_args) + void emplace(const dispatcher_type* dispatcher, holder_ctor_args&&... ctor_args) { - create_value(vtable, std::forward(ctor_args)...); + create_value(dispatcher, std::forward(ctor_args)...); m_array_end += m_stride; } @@ -1003,7 +1003,7 @@ class WorkStorage - void create_value(const vtable_type* vtable, + void create_value(const dispatcher_type* dispatcher, holder_ctor_args&&... ctor_args) { const size_type value_size = sizeof(true_value_type); @@ -1020,7 +1020,7 @@ class WorkStorage(m_array_begin + value_offset); value_type::template construct( - value_ptr, vtable, std::forward(ctor_args)...); + value_ptr, dispatcher, std::forward(ctor_args)...); } // move construct the loop body in value from other and diff --git a/include/RAJA/pattern/WorkGroup/WorkStruct.hpp b/include/RAJA/pattern/WorkGroup/WorkStruct.hpp index 09399d43c6..6bdd56a3c3 100644 --- a/include/RAJA/pattern/WorkGroup/WorkStruct.hpp +++ b/include/RAJA/pattern/WorkGroup/WorkStruct.hpp @@ -23,7 +23,7 @@ #include #include -#include "RAJA/pattern/WorkGroup/Vtable.hpp" +#include "RAJA/pattern/WorkGroup/Dispatcher.hpp" namespace RAJA @@ -35,7 +35,7 @@ namespace detail /*! * A struct that gives a generic way to layout memory for different loops */ -template < size_t size, typename Vtable_T > +template < size_t size, typename Dispatcher_T > struct WorkStruct; /*! @@ -44,22 +44,22 @@ struct WorkStruct; * offsetof(GenericWorkStruct<>, obj) == offsetof(WorkStruct, obj) * sizeof(GenericWorkStruct) <= sizeof(WorkStruct) */ -template < typename Vtable_T > -using GenericWorkStruct = WorkStruct; +template < typename Dispatcher_T > +using GenericWorkStruct = WorkStruct; -template < size_t size, typename VtableID, typename ... CallArgs > -struct WorkStruct> +template < size_t size, Platform platform, typename dispatch_policy, typename DispatcherID, typename ... CallArgs > +struct WorkStruct> { - using vtable_type = Vtable; + using dispatcher_type = Dispatcher; // construct a WorkStruct with a value of type holder from the args and // check a variety of constraints at compile time template < typename holder, typename ... holder_ctor_args > static RAJA_INLINE - void construct(void* ptr, const vtable_type* vtable, holder_ctor_args&&... ctor_args) + void construct(void* ptr, const dispatcher_type* dispatcher, holder_ctor_args&&... 
ctor_args) { - using true_value_type = WorkStruct; - using value_type = GenericWorkStruct; + using true_value_type = WorkStruct; + using value_type = GenericWorkStruct; static_assert(sizeof(holder) <= sizeof(true_value_type::obj), "holder must fit in WorkStruct::obj"); @@ -74,8 +74,8 @@ struct WorkStruct> true_value_type* value_ptr = static_cast(ptr); - value_ptr->vtable = vtable; - value_ptr->call_function_ptr = vtable->call_function_ptr; + value_ptr->dispatcher = dispatcher; + value_ptr->invoke = dispatcher->invoke; new(&value_ptr->obj) holder(std::forward(ctor_args)...); } @@ -84,27 +84,34 @@ struct WorkStruct> void move_destroy(WorkStruct* value_dst, WorkStruct* value_src) { - value_dst->vtable = value_src->vtable; - value_dst->call_function_ptr = value_src->call_function_ptr; - value_dst->vtable->move_construct_destroy_function_ptr(&value_dst->obj, &value_src->obj); + value_dst->dispatcher = value_src->dispatcher; + value_dst->invoke = value_src->invoke; + value_dst->dispatcher->move_construct_destroy(&value_dst->obj, &value_src->obj); } // destroy the value ptr static RAJA_INLINE void destroy(WorkStruct* value_ptr) { - value_ptr->vtable->destroy_function_ptr(&value_ptr->obj); + value_ptr->dispatcher->destroy(&value_ptr->obj); } - // call the call operator of the value ptr with args - static RAJA_HOST_DEVICE RAJA_INLINE - void call(const WorkStruct* value_ptr, CallArgs... args) + // invoke the call operator of the value ptr with args + static RAJA_INLINE + void host_call(const WorkStruct* value_ptr, CallArgs... args) + { + value_ptr->invoke(&value_ptr->obj, std::forward(args)...); + } + /// + // invoke the call operator of the value ptr with args + static RAJA_DEVICE RAJA_INLINE + void device_call(const WorkStruct* value_ptr, CallArgs... args) { - value_ptr->call_function_ptr(&value_ptr->obj, std::forward(args)...); + value_ptr->invoke(&value_ptr->obj, std::forward(args)...); } - const vtable_type* vtable; - typename vtable_type::call_sig call_function_ptr; + const dispatcher_type* dispatcher; + typename dispatcher_type::invoker_type invoke; typename std::aligned_storage::type obj; }; diff --git a/include/RAJA/pattern/forall.hpp b/include/RAJA/pattern/forall.hpp index 0daf58ab1e..f0a2d85bbe 100644 --- a/include/RAJA/pattern/forall.hpp +++ b/include/RAJA/pattern/forall.hpp @@ -120,15 +120,15 @@ struct icount_adapter { }; struct CallForall { - template - RAJA_INLINE camp::resources::EventProxy operator()(T const&, ExecPol, Body, Res) const; + template + RAJA_INLINE camp::resources::EventProxy operator()(T const&, ExecPol, Body, Res, ForallParams) const; }; struct CallForallIcount { constexpr CallForallIcount(int s); - template - RAJA_INLINE camp::resources::EventProxy operator()(T const&, ExecPol, Body, Res) const; + template + RAJA_INLINE camp::resources::EventProxy operator()(T const&, ExecPol, Body, Res, ForallParams) const; const int start; }; @@ -152,6 +152,21 @@ namespace wrap * ****************************************************************************** */ +template +RAJA_INLINE concepts::enable_if_t< + RAJA::resources::EventProxy, + concepts::negate>, + type_traits::is_range> +forall(Res r, ExecutionPolicy&& p, Container&& c, LoopBody&& loop_body, ForallParams&& f_params) +{ + RAJA_FORCEINLINE_RECURSIVE + return forall_impl(r, + std::forward(p), + std::forward(c), + std::forward(loop_body), + std::forward(f_params)); +} + template RAJA_INLINE concepts::enable_if_t< RAJA::resources::EventProxy, @@ -163,7 +178,8 @@ forall(Res r, ExecutionPolicy&& p, Container&& c, LoopBody&& 
loop_body) return forall_impl(r, std::forward(p), std::forward(c), - std::forward(loop_body)); + std::forward(loop_body), + expt::get_empty_forall_param_pack()); } @@ -178,12 +194,14 @@ template + typename LoopBody, + typename ForallParams> RAJA_INLINE resources::EventProxy forall_Icount(Res r, ExecutionPolicy&& p, Container&& c, IndexType&& icount, - LoopBody&& loop_body) + LoopBody&& loop_body, + ForallParams&& f_params) { using std::begin; using std::distance; @@ -194,7 +212,7 @@ RAJA_INLINE resources::EventProxy forall_Icount(Res r, icount); using policy::sequential::forall_impl; RAJA_FORCEINLINE_RECURSIVE - return forall_impl(r, std::forward(p), range, adapted); + return forall_impl(r, std::forward(p), range, adapted, std::forward(f_params)); } /*! @@ -210,12 +228,14 @@ template + typename LoopBody, + typename ForallParams> RAJA_INLINE resources::EventProxy forall_Icount(Res r, ExecPolicy, const TypedIndexSet& iset, - LoopBody loop_body) + LoopBody loop_body, + ForallParams f_params) { // no need for icount variant here auto segIterRes = resources::get_resource::type::get_default(); @@ -224,7 +244,8 @@ RAJA_INLINE resources::EventProxy forall_Icount(Res r, detail::CallForallIcount(iset.getStartingIcount(segID)), SegmentExecPolicy(), loop_body, - r); + r, + f_params); }); return RAJA::resources::EventProxy(r); } @@ -233,16 +254,18 @@ template + typename... SegmentTypes, + typename ForallParams> RAJA_INLINE resources::EventProxy forall(Res r, ExecPolicy, const TypedIndexSet& iset, - LoopBody loop_body) + LoopBody loop_body, + ForallParams f_params) { auto segIterRes = resources::get_resource::type::get_default(); wrap::forall(segIterRes, SegmentIterPolicy(), iset, [=, &r](int segID) { - iset.segmentCall(segID, detail::CallForall{}, SegmentExecPolicy(), loop_body, r); + iset.segmentCall(segID, detail::CallForall{}, SegmentExecPolicy(), loop_body, r, f_params); }); return RAJA::resources::EventProxy(r); } @@ -271,16 +294,20 @@ inline namespace policy_by_value_interface * ****************************************************************************** */ -template +template RAJA_INLINE resources::EventProxy forall_Icount(ExecutionPolicy&& p, Res r, IdxSet&& c, - LoopBody&& loop_body) + Params&&... params) { static_assert(type_traits::is_index_set::value, "Expected a TypedIndexSet but did not get one. Are you using " "a TypedIndexSet policy by mistake?"); + auto f_params = expt::make_forall_param_pack(std::forward(params)...); + auto&& loop_body = expt::get_lambda(std::forward(params)...); + //expt::check_forall_optional_args(loop_body, f_params); + util::PluginContext context{util::make_context>()}; util::callPreCapturePlugins(context); @@ -295,7 +322,8 @@ RAJA_INLINE resources::EventProxy forall_Icount(ExecutionPolicy&& p, r, std::forward(p), std::forward(c), - std::move(body)); + std::move(body), + f_params); util::callPostLaunchPlugins(context); return e; @@ -321,16 +349,20 @@ RAJA_INLINE resources::EventProxy forall_Icount(ExecutionPolicy&& p, * ****************************************************************************** */ -template +template RAJA_INLINE concepts::enable_if_t< resources::EventProxy, type_traits::is_indexset_policy> -forall(ExecutionPolicy&& p, Res r, IdxSet&& c, LoopBody&& loop_body) +forall(ExecutionPolicy&& p, Res r, IdxSet&& c, Params&&... params) { static_assert(type_traits::is_index_set::value, "Expected a TypedIndexSet but did not get one. 
Are you using " "a TypedIndexSet policy by mistake?"); + auto f_params = expt::make_forall_param_pack(std::forward(params)...); + auto&& loop_body = expt::get_lambda(std::forward(params)...); + expt::check_forall_optional_args(loop_body, f_params); + util::PluginContext context{util::make_context>()}; util::callPreCapturePlugins(context); @@ -345,7 +377,8 @@ forall(ExecutionPolicy&& p, Res r, IdxSet&& c, LoopBody&& loop_body) r, std::forward(p), std::forward(c), - std::move(body)); + std::move(body), + f_params); util::callPostLaunchPlugins(context); return e; @@ -374,7 +407,8 @@ forall(ExecutionPolicy&& p, IdxSet&& c, LoopBody&& loop_body) */ template ::type > -RAJA_INLINE concepts::enable_if< +RAJA_INLINE concepts::enable_if_t< + resources::EventProxy, type_traits::is_multi_policy, type_traits::is_range> forall(ExecutionPolicy&& p, Container&& c, LoopBody&& loop_body) @@ -385,7 +419,7 @@ forall(ExecutionPolicy&& p, Container&& c, LoopBody&& loop_body) auto r = Res::get_default(); // plugins handled in multipolicy policy_invoker - forall_impl(r, + return forall_impl(r, std::forward(p), std::forward(c), std::forward(loop_body)); @@ -402,7 +436,8 @@ template + typename FirstParam, + typename... Params> RAJA_INLINE concepts::enable_if_t< resources::EventProxy, type_traits::is_range, @@ -411,11 +446,16 @@ forall_Icount(ExecutionPolicy&& p, Res r, Container&& c, IndexType icount, - LoopBody&& loop_body) + FirstParam&& first, + Params&&... params) { static_assert(type_traits::is_random_access_range::value, "Container does not model RandomAccessIterator"); + auto f_params = expt::make_forall_param_pack(std::forward(first), std::forward(params)...); + auto&& loop_body = expt::get_lambda(std::forward(first), std::forward(params)...); + //expt::check_forall_optional_args(loop_body, f_params); + util::PluginContext context{util::make_context>()}; util::callPreCapturePlugins(context); @@ -431,7 +471,8 @@ forall_Icount(ExecutionPolicy&& p, std::forward(p), std::forward(c), icount, - std::move(body)); + std::move(body), + f_params); util::callPostLaunchPlugins(context); return e; @@ -467,17 +508,22 @@ forall_Icount(ExecutionPolicy&& p, * ****************************************************************************** */ -template + +template RAJA_INLINE concepts::enable_if_t< resources::EventProxy, concepts::negate>, concepts::negate>, type_traits::is_range> -forall(ExecutionPolicy&& p, Res r, Container&& c, LoopBody&& loop_body) +forall(ExecutionPolicy&& p, Res r, Container&& c, Params&&... params) { static_assert(type_traits::is_random_access_range::value, "Container does not model RandomAccessIterator"); + auto f_params = expt::make_forall_param_pack(std::forward(params)...); + auto&& loop_body = expt::get_lambda(std::forward(params)...); + expt::check_forall_optional_args(loop_body, f_params); + util::PluginContext context{util::make_context>()}; util::callPreCapturePlugins(context); @@ -492,11 +538,13 @@ forall(ExecutionPolicy&& p, Res r, Container&& c, LoopBody&& loop_body) r, std::forward(p), std::forward(c), - std::move(body)); + std::move(body), + f_params); util::callPostLaunchPlugins(context); return e; } + template ::type > RAJA_INLINE concepts::enable_if_t< @@ -563,32 +611,142 @@ forall_Icount(Res r, Args&&... 
args) namespace detail { -template +template RAJA_INLINE camp::resources::EventProxy CallForall::operator()(T const& segment, ExecutionPolicy, LoopBody body, - Res r) const + Res r, + ForallParams f_params) const { // this is only called inside a region, use impl using policy::sequential::forall_impl; RAJA_FORCEINLINE_RECURSIVE - return forall_impl(r, ExecutionPolicy(), segment, body); + return forall_impl(r, ExecutionPolicy(), segment, body, f_params); } constexpr CallForallIcount::CallForallIcount(int s) : start(s) {} -template +template RAJA_INLINE camp::resources::EventProxy CallForallIcount::operator()(T const& segment, ExecutionPolicy, LoopBody body, - Res r) const + Res r, + ForallParams f_params) const { // go through wrap to unwrap icount - return wrap::forall_Icount(r, ExecutionPolicy(), segment, start, body); + return wrap::forall_Icount(r, ExecutionPolicy(), segment, start, body, f_params); } } // namespace detail +// +// Experimental support for dynamic policy selection +// +// Future directions: +// - Tuple of resources one for each platform +// - Returns a generic event proxy only if a resource is provided +// avoids overhead of constructing a typed erased resource +// +namespace expt +{ + + template + struct dynamic_helper + { + template + static void invoke_forall(const int pol, SEGMENT const &seg, BODY const &body) + { + if(IDX==pol){ + using t_pol = typename camp::at>::type; + RAJA::forall(seg, body); + return; + } + dynamic_helper::invoke_forall(pol, seg, body); + } + + template + static resources::EventProxy + invoke_forall(RAJA::resources::Resource r, const int pol, SEGMENT const &seg, BODY const &body) + { + + using t_pol = typename camp::at>::type; + using resource_type = typename resources::get_resource::type; + + if(IDX==pol){ + RAJA::forall(r.get(), seg, body); + + //Return a generic event proxy from r, + //because forall returns a typed event proxy + return {r}; + } + + return dynamic_helper::invoke_forall(r, pol, seg, body); + } + + }; + + template + struct dynamic_helper<0, POLICY_LIST> + { + template + static void + invoke_forall(const int pol, SEGMENT const &seg, BODY const &body) + { + if(0==pol){ + using t_pol = typename camp::at>::type; + RAJA::forall(seg, body); + return; + } + RAJA_ABORT_OR_THROW("Policy enum not supported "); + } + + template + static resources::EventProxy + invoke_forall(RAJA::resources::Resource r, const int pol, SEGMENT const &seg, BODY const &body) + { + if(pol != 0) RAJA_ABORT_OR_THROW("Policy value out of range "); + + using t_pol = typename camp::at>::type; + using resource_type = typename resources::get_resource::type; + + RAJA::forall(r.get(), seg, body); + + //Return a generic event proxy from r, + //because forall returns a typed event proxy + return {r}; + } + + }; + + template + void dynamic_forall(const int pol, SEGMENT const &seg, BODY const &body) + { + constexpr int N = camp::size::value; + static_assert(N > 0, "RAJA policy list must not be empty"); + + if(pol > N-1) { + RAJA_ABORT_OR_THROW("Policy enum not supported"); + } + dynamic_helper::invoke_forall(pol, seg, body); + } + + template + resources::EventProxy + dynamic_forall(RAJA::resources::Resource r, const int pol, SEGMENT const &seg, BODY const &body) + { + constexpr int N = camp::size::value; + static_assert(N > 0, "RAJA policy list must not be empty"); + + if(pol > N-1) { + RAJA_ABORT_OR_THROW("Policy value out of range"); + } + + return dynamic_helper::invoke_forall(r, pol, seg, body); + } + +} // namespace expt + + } // namespace RAJA diff --git 
a/include/RAJA/pattern/kernel/For.hpp b/include/RAJA/pattern/kernel/For.hpp index 36b72f9fe8..1f5165a9e3 100644 --- a/include/RAJA/pattern/kernel/For.hpp +++ b/include/RAJA/pattern/kernel/For.hpp @@ -103,7 +103,7 @@ struct StatementExecutor< auto r = data.res; - forall_impl(r, ExecPolicy{}, TypedRangeSegment(0, len), for_wrapper); + forall_impl(r, ExecPolicy{}, TypedRangeSegment(0, len), for_wrapper, RAJA::expt::get_empty_forall_param_pack()); } }; diff --git a/include/RAJA/pattern/kernel/ForICount.hpp b/include/RAJA/pattern/kernel/ForICount.hpp index 1f28fb4740..efb4b6fb10 100644 --- a/include/RAJA/pattern/kernel/ForICount.hpp +++ b/include/RAJA/pattern/kernel/ForICount.hpp @@ -112,7 +112,7 @@ struct StatementExecutor< auto r = resources::get_resource::type::get_default(); - forall_impl(r, ExecPolicy{}, TypedRangeSegment(0, len), for_wrapper); + forall_impl(r, ExecPolicy{}, TypedRangeSegment(0, len), for_wrapper, RAJA::expt::get_empty_forall_param_pack()); } }; diff --git a/include/RAJA/pattern/kernel/Hyperplane.hpp b/include/RAJA/pattern/kernel/Hyperplane.hpp index 9501876bdc..403ae905ff 100644 --- a/include/RAJA/pattern/kernel/Hyperplane.hpp +++ b/include/RAJA/pattern/kernel/Hyperplane.hpp @@ -148,7 +148,8 @@ struct StatementExecutor::type::get_default(); forall_impl(r, HpExecPolicy{}, TypedRangeSegment(0, hp_len), - outer_wrapper); + outer_wrapper, + RAJA::expt::get_empty_forall_param_pack()); } }; diff --git a/include/RAJA/pattern/kernel/Tile.hpp b/include/RAJA/pattern/kernel/Tile.hpp index db3e7fb8e3..013babbaef 100644 --- a/include/RAJA/pattern/kernel/Tile.hpp +++ b/include/RAJA/pattern/kernel/Tile.hpp @@ -243,7 +243,7 @@ struct StatementExecutor< // Loop over tiles, executing enclosed statement list auto r = resources::get_resource::type::get_default(); - forall_impl(r, EPol{}, tiled_iterable, tile_wrapper); + forall_impl(r, EPol{}, tiled_iterable, tile_wrapper, RAJA::expt::get_empty_forall_param_pack()); // Set range back to original values camp::get(data.segment_tuple) = tiled_iterable.it; @@ -277,7 +277,7 @@ struct StatementExecutor< // Loop over tiles, executing enclosed statement list auto r = resources::get_resource::type::get_default(); - forall_impl(r, EPol{}, tiled_iterable, tile_wrapper); + forall_impl(r, EPol{}, tiled_iterable, tile_wrapper, RAJA::expt::get_empty_forall_param_pack()); // Set range back to original values camp::get(data.segment_tuple) = tiled_iterable.it; diff --git a/include/RAJA/pattern/kernel/TileTCount.hpp b/include/RAJA/pattern/kernel/TileTCount.hpp index 4068e36904..f8bc431cfe 100644 --- a/include/RAJA/pattern/kernel/TileTCount.hpp +++ b/include/RAJA/pattern/kernel/TileTCount.hpp @@ -124,7 +124,7 @@ struct StatementExecutor< // Loop over tiles, executing enclosed statement list auto r = resources::get_resource::type::get_default(); - forall_impl(r, EPol{}, tiled_iterable, tile_wrapper); + forall_impl(r, EPol{}, tiled_iterable, tile_wrapper, RAJA::expt::get_empty_forall_param_pack()); // Set range back to original values camp::get(data.segment_tuple) = tiled_iterable.it; diff --git a/include/RAJA/pattern/teams.hpp b/include/RAJA/pattern/launch.hpp similarity index 56% rename from include/RAJA/pattern/teams.hpp rename to include/RAJA/pattern/launch.hpp index a61b1fde8c..e590bf33b2 100644 --- a/include/RAJA/pattern/teams.hpp +++ b/include/RAJA/pattern/launch.hpp @@ -3,7 +3,7 @@ * * \file * - * \brief RAJA header file containing headers for RAJA::Teams backends + * \brief RAJA header file containing headers for RAJA::Launch backends * 
****************************************************************************** */ @@ -15,28 +15,32 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -#ifndef RAJA_pattern_teams_HPP -#define RAJA_pattern_teams_HPP +#ifndef RAJA_pattern_launch_HPP +#define RAJA_pattern_launch_HPP -#include "RAJA/pattern/teams/teams_core.hpp" +#include "RAJA/pattern/launch/launch_core.hpp" // // All platforms must support host execution. // -#include "RAJA/policy/sequential/teams.hpp" -#include "RAJA/policy/loop/teams.hpp" -#include "RAJA/policy/simd/teams.hpp" +#include "RAJA/policy/sequential/launch.hpp" +#include "RAJA/policy/loop/launch.hpp" +#include "RAJA/policy/simd/launch.hpp" #if defined(RAJA_CUDA_ACTIVE) -#include "RAJA/policy/cuda/teams.hpp" +#include "RAJA/policy/cuda/launch.hpp" #endif #if defined(RAJA_HIP_ACTIVE) -#include "RAJA/policy/hip/teams.hpp" +#include "RAJA/policy/hip/launch.hpp" #endif #if defined(RAJA_ENABLE_OPENMP) -#include "RAJA/policy/openmp/teams.hpp" +#include "RAJA/policy/openmp/launch.hpp" #endif -#endif /* RAJA_pattern_teams_HPP */ +#if defined(RAJA_ENABLE_SYCL) +#include "RAJA/policy/sycl/launch.hpp" +#endif + +#endif /* RAJA_pattern_launch_HPP */ diff --git a/include/RAJA/pattern/teams/teams_core.hpp b/include/RAJA/pattern/launch/launch_core.hpp similarity index 74% rename from include/RAJA/pattern/teams/teams_core.hpp rename to include/RAJA/pattern/launch/launch_core.hpp index 0c178df3b7..12d6f16f6f 100644 --- a/include/RAJA/pattern/teams/teams_core.hpp +++ b/include/RAJA/pattern/launch/launch_core.hpp @@ -15,8 +15,8 @@ // SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -#ifndef RAJA_pattern_teams_core_HPP -#define RAJA_pattern_teams_core_HPP +#ifndef RAJA_pattern_launch_core_HPP +#define RAJA_pattern_launch_core_HPP #include "RAJA/config.hpp" #include "RAJA/internal/get_platform.hpp" @@ -28,7 +28,10 @@ #include "camp/concepts.hpp" #include "camp/tuple.hpp" -#if defined(RAJA_DEVICE_CODE) +//Odd dependecy with atomics is breaking CI builds +//#include "RAJA/util/View.hpp" + +#if defined(RAJA_DEVICE_CODE) && !defined(RAJA_ENABLE_SYCL) #define RAJA_TEAM_SHARED __shared__ #else #define RAJA_TEAM_SHARED @@ -37,11 +40,9 @@ namespace RAJA { -namespace expt -{ - // GPU or CPU threads available -enum ExecPlace { HOST, DEVICE, NUM_PLACES }; +//strongly type the ExecPlace (guards agaist errors) +enum struct ExecPlace : int { HOST, DEVICE, NUM_PLACES }; struct null_launch_t { }; @@ -128,18 +129,17 @@ struct Lanes { constexpr Lanes(int i) : value(i) {} }; -struct Grid { +struct LaunchParams { public: Teams teams; Threads threads; - Lanes lanes; - const char *kernel_name{nullptr}; + size_t shared_mem_size; RAJA_INLINE - Grid() = default; + LaunchParams() = default; - Grid(Teams in_teams, Threads in_threads, const char *in_kernel_name = nullptr) - : teams(in_teams), threads(in_threads), kernel_name(in_kernel_name){}; + LaunchParams(Teams in_teams, Threads in_threads, size_t in_shared_mem_size = 0) + : teams(in_teams), threads(in_threads), shared_mem_size(in_shared_mem_size) {}; private: RAJA_HOST_DEVICE @@ -149,26 +149,63 @@ struct Grid { RAJA_HOST_DEVICE RAJA_INLINE Threads apply(Threads const &a) { return (threads = a); } - - RAJA_HOST_DEVICE - RAJA_INLINE - Lanes apply(Lanes const &a) { return (lanes = a); } }; - -class LaunchContext : public Grid +class LaunchContext { public: - LaunchContext(Grid const &base) - : Grid(base) + //Bump 
style allocator used to + //get memory from the pool + size_t shared_mem_offset; + + void *shared_mem_ptr; + +#if defined(RAJA_ENABLE_SYCL) + mutable cl::sycl::nd_item<3> *itm; +#endif + + RAJA_HOST_DEVICE LaunchContext() + : shared_mem_offset(0), shared_mem_ptr(nullptr) { } + //TODO handle alignment + template + RAJA_HOST_DEVICE T* getSharedMemory(size_t bytes) + { + T * mem_ptr = &((T*) shared_mem_ptr)[shared_mem_offset]; + + shared_mem_offset += bytes*sizeof(T); + return mem_ptr; + } + + /* + //Odd dependecy with atomics is breaking CI builds + template + RAJA_HOST_DEVICE auto getSharedMemoryView(size_t bytes, arg idx, args... idxs) + { + T * mem_ptr = &((T*) shared_mem_ptr)[shared_mem_offset]; + + shared_mem_offset += bytes*sizeof(T); + return RAJA::View>(mem_ptr, idx, idxs...); + } + */ + + RAJA_HOST_DEVICE void releaseSharedMemory() + { + //On the cpu/gpu we want to restart the count + shared_mem_offset = 0; + } + RAJA_HOST_DEVICE void teamSync() { -#if defined(RAJA_DEVICE_CODE) +#if defined(RAJA_DEVICE_CODE) && defined(RAJA_ENABLE_SYCL) + itm->barrier(sycl::access::fence_space::local_space); +#endif + +#if defined(RAJA_DEVICE_CODE) && !defined(RAJA_ENABLE_SYCL) __syncthreads(); #endif } @@ -177,31 +214,44 @@ class LaunchContext : public Grid template struct LaunchExecute; +//Policy based launch without name argument +template +void launch(LaunchParams const ¶ms, BODY const &body) +{ + launch(params, nullptr, body); +} + //Policy based launch template -void launch(Grid const &grid, BODY const &body) +void launch(LaunchParams const ¶ms, const char *kernel_name, BODY const &body) { //Take the first policy as we assume the second policy is not user defined. //We rely on the user to pair launch and loop policies correctly. using launch_t = LaunchExecute; - launch_t::exec(LaunchContext(grid), body); + launch_t::exec(params, kernel_name, body); } //Run time based policy launch template -void launch(ExecPlace place, Grid const &grid, BODY const &body) +void launch(ExecPlace place, LaunchParams const ¶ms, BODY const &body) +{ + launch(place, params, nullptr, body); +} + +template +void launch(ExecPlace place, const LaunchParams ¶ms, const char *kernel_name, BODY const &body) { switch (place) { - case HOST: { + case ExecPlace::HOST: { using launch_t = LaunchExecute; - launch_t::exec(LaunchContext(grid), body); + launch_t::exec(params, kernel_name, body); break; } #ifdef RAJA_DEVICE_ACTIVE - case DEVICE: { + case ExecPlace::DEVICE: { using launch_t = LaunchExecute; - launch_t::exec(LaunchContext(grid), body); + launch_t::exec(params, kernel_name, body); break; } #endif @@ -211,16 +261,16 @@ void launch(ExecPlace place, Grid const &grid, BODY const &body) } // Helper function to retrieve a resource based on the run-time policy - if a device is active -#if defined(RAJA_DEVICE_ACTIVE) +#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) template -RAJA::resources::Resource Get_Runtime_Resource(T host_res, U device_res, RAJA::expt::ExecPlace device){ - if(device == RAJA::expt::DEVICE) {return RAJA::resources::Resource(device_res);} +RAJA::resources::Resource Get_Runtime_Resource(T host_res, U device_res, RAJA::ExecPlace device){ + if(device == RAJA::ExecPlace::DEVICE) {return RAJA::resources::Resource(device_res);} else { return RAJA::resources::Resource(host_res); } } #else template -RAJA::resources::Resource Get_Host_Resource(T host_res, RAJA::expt::ExecPlace device){ - if(device == RAJA::expt::DEVICE) {RAJA_ABORT_OR_THROW("Device is not enabled");} +RAJA::resources::Resource 
Get_Host_Resource(T host_res, RAJA::ExecPlace device){ + if(device == RAJA::ExecPlace::DEVICE) {RAJA_ABORT_OR_THROW("Device is not enabled");} return RAJA::resources::Resource(host_res); } @@ -230,25 +280,32 @@ RAJA::resources::Resource Get_Host_Resource(T host_res, RAJA::expt::ExecPlace de //Launch API which takes team resource struct template resources::EventProxy -launch(RAJA::resources::Resource res, Grid const &grid, BODY const &body) +launch(RAJA::resources::Resource res, LaunchParams const ¶ms, BODY const &body) +{ + return launch(res, params, nullptr, body); +} + +template +resources::EventProxy +launch(RAJA::resources::Resource res, LaunchParams const ¶ms, const char *kernel_name, BODY const &body) { ExecPlace place; if(res.get_platform() == camp::resources::v1::Platform::host) { - place = RAJA::expt::HOST; + place = RAJA::ExecPlace::HOST; }else{ - place = RAJA::expt::DEVICE; + place = RAJA::ExecPlace::DEVICE; } switch (place) { - case HOST: { + case ExecPlace::HOST: { using launch_t = LaunchExecute; - return launch_t::exec(res, LaunchContext(grid), body); break; + return launch_t::exec(res, params, kernel_name, body); break; } #ifdef RAJA_DEVICE_ACTIVE - case DEVICE: { + case ExecPlace::DEVICE: { using launch_t = LaunchExecute; - return launch_t::exec(res, LaunchContext(grid), body); break; + return launch_t::exec(res, params, kernel_name, body); break; } #endif default: { @@ -301,6 +358,9 @@ RAJA_HOST_DEVICE RAJA_INLINE void loop_icount(CONTEXT const &ctx, body); } +namespace expt +{ + RAJA_SUPPRESS_HD_WARN template struct TileExecute; @@ -376,6 +435,9 @@ RAJA_HOST_DEVICE RAJA_INLINE void tile_icount(CONTEXT const &ctx, body); } +namespace expt +{ + template + struct ForallParamPack { + + friend struct ParamMultiplexer; + + using Base = camp::tuple; + Base param_tup; + + static constexpr size_t param_tup_sz = camp::tuple_size::value; + using params_seq = camp::make_idx_seq_t< param_tup_sz >; + + private: + + // Init + template + static constexpr void detail_init(EXEC_POL, camp::idx_seq, ForallParamPack& f_params, Args&& ...args) { + CAMP_EXPAND(expt::detail::init( camp::get(f_params.param_tup), std::forward(args)... )); + } + + // Combine + template + RAJA_HOST_DEVICE + static constexpr void detail_combine(EXEC_POL, camp::idx_seq, ForallParamPack& out, const ForallParamPack& in ) { + CAMP_EXPAND(detail::combine( camp::get(out.param_tup), camp::get(in.param_tup))); + } + + template + RAJA_HOST_DEVICE + static constexpr void detail_combine(EXEC_POL, camp::idx_seq, ForallParamPack& f_params ) { + CAMP_EXPAND(detail::combine( camp::get(f_params.param_tup) )); + } + + // Resolve + template + static constexpr void detail_resolve(EXEC_POL, camp::idx_seq, ForallParamPack& f_params ) { + CAMP_EXPAND(detail::resolve( camp::get(f_params.param_tup) )); + } + + // Used to construct the argument TYPES that will be invoked with the lambda. + template + static constexpr auto LAMBDA_ARG_TUP_T() { return camp::tuple<>{}; }; + template + static constexpr auto LAMBDA_ARG_TUP_T() { return typename First::ARG_TUP_T(); }; + template + static constexpr auto LAMBDA_ARG_TUP_T() { return camp::tuple_cat_pair(typename First::ARG_TUP_T(), LAMBDA_ARG_TUP_T()); }; + + using lambda_arg_tuple_t = decltype(LAMBDA_ARG_TUP_T()); + + //Use the size of param_tup to generate the argument list. 
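The ForallParamPack above, together with the make_forall_param_pack and get_lambda helpers that follow, is what lets forall accept optional parameter objects ahead of the loop body: by convention the lambda is the last argument, so the pack is built from everything before it and the lambda is extracted separately. A standalone sketch of that split using std::tuple in place of camp (illustrative names, not the RAJA API, assuming C++14):

#include <cstdio>
#include <tuple>
#include <utility>

// Keep references to everything except the last argument (the loop body).
template <typename Tuple, std::size_t... Is>
auto take_front(Tuple&& t, std::index_sequence<Is...>) {
  return std::forward_as_tuple(std::get<Is>(std::forward<Tuple>(t))...);
}

template <typename... Args>
auto split_params(Args&&... args) {
  auto all = std::forward_as_tuple(std::forward<Args>(args)...);
  return take_front(std::move(all),
                    std::make_index_sequence<sizeof...(Args) - 1>{});
}

// By convention the loop body is the last argument; pull it back out.
template <typename... Args>
auto&& get_body(Args&&... args) {
  return std::get<sizeof...(Args) - 1>(
      std::forward_as_tuple(std::forward<Args>(args)...));
}

int main() {
  int reducer_stand_in = 0; // stands in for a Reducer-style optional param
  auto body = [](int i) { std::printf("i = %d\n", i); };

  auto params = split_params(reducer_stand_in, body); // std::tuple<int&>
  auto&& l = get_body(reducer_stand_in, body);        // the lambda itself
  l(3);
  (void)params;
  return 0;
}

check_forall_optional_args then verifies that the extracted lambda is actually invocable with the argument types the parameter objects advertise.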
+ RAJA_HOST_DEVICE constexpr auto LAMBDA_ARG_TUP_V(camp::num<0>) { return camp::make_tuple(); } + RAJA_HOST_DEVICE constexpr auto LAMBDA_ARG_TUP_V(camp::num<1>) { return camp::get(param_tup).get_lambda_arg_tup(); } + template + RAJA_HOST_DEVICE constexpr auto LAMBDA_ARG_TUP_V(camp::num) { + return camp::tuple_cat_pair( camp::get(param_tup).get_lambda_arg_tup(), LAMBDA_ARG_TUP_V(camp::num()) ); + } + + public: + ForallParamPack(){} + + RAJA_HOST_DEVICE constexpr lambda_arg_tuple_t lambda_args() {return LAMBDA_ARG_TUP_V(camp::num());} + + using lambda_arg_seq = camp::make_idx_seq_t::value>; + + template + ForallParamPack(camp::tuple&& t) : param_tup(std::move(t)) {}; + }; // struct ForallParamPack + + + + //=========================================================================== + // + // + // ParamMultiplexer is how we hook into the individual calls within forall_impl. + // + // + struct ParamMultiplexer { + template> + static void constexpr init( ForallParamPack& f_params, Args&& ...args) { + FP::detail_init(EXEC_POL(),typename FP::params_seq(), f_params, std::forward(args)... ); + } + template> + static void constexpr combine(ForallParamPack& f_params, Args&& ...args){ + FP::detail_combine(EXEC_POL(), typename FP::params_seq(), f_params, std::forward(args)... ); + } + template> + static void constexpr resolve( ForallParamPack& f_params, Args&& ...args){ + FP::detail_resolve(EXEC_POL(), typename FP::params_seq(), f_params, std::forward(args)... ); + } + }; + //=========================================================================== + + + + //=========================================================================== + // + // + // ForallParamPack generators. + // + // + RAJA_INLINE static auto get_empty_forall_param_pack(){ + static ForallParamPack<> p; + return p; + } + + namespace detail { + // all_true trick to perform variadic expansion in static asserts. + // https://stackoverflow.com/questions/36933176/how-do-you-static-assert-the-values-in-a-parameter-pack-of-a-variadic-template + template struct bool_pack; + template + using all_true = std::is_same, bool_pack>; + + template + using check_types_derive_base = all_true::value...>; + } // namespace detail + + + template + constexpr auto make_forall_param_pack_from_tuple(camp::tuple&& tuple) { + static_assert(detail::check_types_derive_base...>::value, + "Forall optional arguments do not derive ForallParamBase. Please see Reducer, ReducerLoc and KernelName for examples.") ; + return ForallParamPack...>(std::move(tuple)); + } + + + + namespace detail { + // Maybe we should do a lot of these with structs... + template + constexpr auto tuple_from_seq (const camp::idx_seq&, TupleType&& tuple){ + return camp::forward_as_tuple( camp::get< Seq >(std::forward(tuple))... ); + }; + + template + constexpr auto strip_last_elem(camp::tuple&& tuple){ + return tuple_from_seq(camp::make_idx_seq_t{},std::move(tuple)); + }; + } // namespace detail + + + // Make a tuple of the param pack except the final element... + template + constexpr auto make_forall_param_pack(Args&&... args){ + // We assume the last element of the pack is the lambda so we need to strip it from the list. + auto stripped_arg_tuple = detail::strip_last_elem( camp::forward_as_tuple(std::forward(args)...) 
); + return make_forall_param_pack_from_tuple(std::move(stripped_arg_tuple)); + } + //=========================================================================== + + + + //=========================================================================== + // + // + // Callable should be the last argument in the param pack, just extract it... + // + // + template + constexpr auto&& get_lambda(Args&&... args){ + return camp::get( camp::forward_as_tuple(std::forward(args)...) ); + } + //=========================================================================== + + + + //=========================================================================== + // + // + // Checking expected argument list against the assumed lambda. + // + // + namespace detail { + + // + // + // Lambda traits Utilities + // + // + template + struct lambda_traits; + + template + struct lambda_traits + { // non-const specialization + using arg_type = First; + }; + template + struct lambda_traits + { // const specialization + using arg_type = First; + }; + + template + typename lambda_traits::arg_type* lambda_arg_helper(T); + + + // + // + // List manipulation Utilities + // + // + template + constexpr auto list_remove_pointer(const camp::list&){ + return camp::list::type>...>{}; + } + + template + constexpr auto list_add_lvalue_ref(const camp::list&){ + return camp::list::type...>{}; + } + + template + constexpr auto tuple_to_list(const camp::tuple&) { + return camp::list{}; + } + + // TODO : Change to std::is_invocable at c++17 + template + struct is_invocable : + std::is_constructible< + std::function, + std::reference_wrapper::type> + >{}; + + template + using void_t = void; + + template + struct has_empty_op : std::false_type{}; + + template + struct has_empty_op)>> : std::true_type{}; + + template + struct get_lambda_index_type { + typedef typename std::remove_pointer< + decltype(lambda_arg_helper( + &camp::decay::operator()) + ) + >::type type; + }; + + // If LAMBDA::operator() is not available this probably isn't a generic lambda and we can't extract and check args. + template + constexpr concepts::enable_if>> check_invocable(LAMBDA&&, const camp::list&) {} + + template + constexpr concepts::enable_if> check_invocable(LAMBDA&&, const camp::list&) { +#if !defined(RAJA_ENABLE_HIP) + static_assert(is_invocable::type, EXPECTED_ARGS...>::value, "LAMBDA Not invocable w/ EXPECTED_ARGS."); +#endif + } + + } // namespace detail + + + template + constexpr + void + check_forall_optional_args(Lambda&& l, ForallParams& fpp) { + + using expected_arg_type_list = decltype( detail::list_add_lvalue_ref( + detail::list_remove_pointer( + detail::tuple_to_list( + fpp.lambda_args() + ) + ) + )); + + detail::check_invocable(std::forward(l), expected_arg_type_list{}); + } + //=========================================================================== + + + + //=========================================================================== + // + // + // Type trailts for SFINAE work. + // + // + namespace type_traits + { + template struct is_ForallParamPack : std::false_type {}; + template struct is_ForallParamPack> : std::true_type {}; + + template struct is_ForallParamPack_empty : std::true_type {}; + template struct is_ForallParamPack_empty> : std::false_type {}; + template <> struct is_ForallParamPack_empty> : std::true_type {}; + } + //=========================================================================== + + + + //=========================================================================== + // + // + // Invoke Forall with Params. 
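ParamMultiplexer above is the hook the forall back-ends use to drive every parameter in the pack through the same three-phase lifecycle: init before the loop, combine across per-worker copies, and resolve after the loop. The following standalone C++17 mirror of that dispatch uses std::tuple and fold expressions in place of camp; all names are hypothetical.

#include <cstdio>
#include <tuple>
#include <utility>

// Two toy "forall parameters" exposing the init/combine/resolve interface.
struct SumParam {
  double* target = nullptr;
  double  val    = 0.0;
  void init()                       { val = 0.0; }
  void combine(const SumParam& rhs) { val += rhs.val; }
  void resolve()                    { *target = val; }
};

struct CountParam {
  long* target = nullptr;
  long  val    = 0;
  void init()                         { val = 0; }
  void combine(const CountParam& rhs) { val += rhs.val; }
  void resolve()                      { *target = val; }
};

// Pack of parameters; the multiplexer expands each phase over every element.
template <typename... Ps>
struct ParamPack { std::tuple<Ps...> tup; };

struct Multiplexer {
  template <typename... Ps>
  static void init(ParamPack<Ps...>& p) {
    std::apply([](auto&... ps) { (ps.init(), ...); }, p.tup);
  }
  template <typename... Ps>
  static void combine(ParamPack<Ps...>& out, const ParamPack<Ps...>& in) {
    combine_impl(out, in, std::index_sequence_for<Ps...>{});
  }
  template <typename... Ps>
  static void resolve(ParamPack<Ps...>& p) {
    std::apply([](auto&... ps) { (ps.resolve(), ...); }, p.tup);
  }

 private:
  template <typename... Ps, std::size_t... Is>
  static void combine_impl(ParamPack<Ps...>& out, const ParamPack<Ps...>& in,
                           std::index_sequence<Is...>) {
    (std::get<Is>(out.tup).combine(std::get<Is>(in.tup)), ...);
  }
};

int main() {
  double sum = 0.0;
  long   cnt = 0;

  // "Main" pack plus one per-worker copy, as a back-end would create.
  ParamPack<SumParam, CountParam> main_pack{{SumParam{&sum}, CountParam{&cnt}}};
  ParamPack<SumParam, CountParam> worker = main_pack;

  Multiplexer::init(main_pack);
  Multiplexer::init(worker);

  for (int i = 0; i < 10; ++i) {           // worker accumulates locally
    std::get<0>(worker.tup).val += i;
    std::get<1>(worker.tup).val += 1;
  }

  Multiplexer::combine(main_pack, worker); // fold worker results into main
  Multiplexer::resolve(main_pack);         // write back through the targets

  std::printf("sum=%g count=%ld\n", sum, cnt);
  return 0;
}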
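check_forall_optional_args above rewrites the pointer types advertised by each parameter into lvalue references and then static_asserts that the user lambda is callable with them; the hand-rolled is_invocable helper exists only because the code predates std::is_invocable. A small C++17 equivalent of the same check, with hypothetical names:

#include <type_traits>

// Rewrite a recorded pointer type (what get_lambda_arg_tup stores) into the
// lvalue reference the lambda is expected to accept.
template <typename Ptr>
using expected_ref_t = std::add_lvalue_reference_t<std::remove_pointer_t<Ptr>>;

template <typename Lambda, typename... ExpectedArgs>
constexpr void check_body() {
  static_assert(std::is_invocable<Lambda, ExpectedArgs...>::value,
                "forall body is not invocable with (index, param refs...)");
}

int main() {
  // A body taking the loop index plus one reducer reference.
  auto body = [](int /*i*/, double& sum) { sum += 1.0; };

  // double* recorded by the reducer becomes double& for the check.
  check_body<decltype(body), int, expected_ref_t<double*>>();
  return 0;
}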
+ // + // + namespace detail { + template + RAJA_HOST_DEVICE + constexpr + auto get_lambda_args(FP& fpp) + -> decltype( *camp::get( fpp.lambda_args() ) ) { + return ( *camp::get( fpp.lambda_args() ) ); + } + + CAMP_SUPPRESS_HD_WARN + template + RAJA_HOST_DEVICE constexpr auto invoke_with_order(Params&& params, + Fn&& f, + camp::idx_seq, + Ts&&... extra) + { + return f(std::forward(extra...), ( get_lambda_args(params) )...); + } + } // namespace detail + + //CAMP_SUPPRESS_HD_WARN + template + RAJA_HOST_DEVICE constexpr auto invoke_body(Params&& params, Fn&& f, Ts&&... extra) + { + return detail::invoke_with_order( + camp::forward(params), + camp::forward(f), + typename camp::decay::lambda_arg_seq(), + camp::forward(extra)...); + } + //=========================================================================== + +} // namespace expt +} // namespace RAJA + +#endif // FORALL_PARAM_HPP diff --git a/include/RAJA/pattern/params/kernel_name.hpp b/include/RAJA/pattern/params/kernel_name.hpp new file mode 100644 index 0000000000..a3cb18cf46 --- /dev/null +++ b/include/RAJA/pattern/params/kernel_name.hpp @@ -0,0 +1,32 @@ +#ifndef RAJA_KERNEL_NAME_HPP +#define RAJA_KERNEL_NAME_HPP + +#include "RAJA/pattern/params/params_base.hpp" + +namespace RAJA +{ +namespace expt +{ +namespace detail +{ + + struct KernelName : public ForallParamBase { + RAJA_HOST_DEVICE KernelName() {} + KernelName(const char* name_in) : name(name_in) {} + const char* name; + }; + +} // namespace detail + +auto KernelName(const char * n) +{ + return detail::KernelName(n); +} +} // namespace expt + + +} // namespace RAJA + + + +#endif // KERNEL_NAME_HPP diff --git a/include/RAJA/pattern/params/params_base.hpp b/include/RAJA/pattern/params/params_base.hpp new file mode 100644 index 0000000000..51e96260f8 --- /dev/null +++ b/include/RAJA/pattern/params/params_base.hpp @@ -0,0 +1,29 @@ +#ifndef RAJA_PARAMS_BASE +#define RAJA_PARAMS_BASE + + +namespace RAJA +{ +namespace expt +{ +namespace detail +{ + + struct ForallParamBase { + + // Some of this can be made virtual in c++20, for now must be defined in each child class + // if any arguments to the forall lambda are needed (e.g. KernelName is excluded.) 
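invoke_body and invoke_with_order above expand an index sequence over lambda_args(), dereference each stored pointer, and pass the results to the body after any extra arguments (typically the loop index). A standalone mirror of that expansion using std::tuple, with hypothetical names:

#include <cstdio>
#include <tuple>
#include <utility>

// Mirror of invoke_with_order: expand an index sequence over the stored
// pointers, dereference each one, and pass the results to the body after
// the loop index.
template <typename Body, typename Tuple, std::size_t... Is>
void invoke_with_order(Body&& body, int i, Tuple& ptrs, std::index_sequence<Is...>) {
  std::forward<Body>(body)(i, (*std::get<Is>(ptrs))...);
}

int main() {
  double sum = 0.0;
  long   cnt = 0;
  auto ptrs = std::make_tuple(&sum, &cnt);  // what lambda_args() would hold

  auto body = [](int i, double& s, long& c) { s += i; ++c; };
  for (int i = 0; i < 10; ++i) {
    invoke_with_order(body, i, ptrs, std::make_index_sequence<2>{});
  }

  std::printf("sum=%g count=%ld\n", sum, cnt);  // sum=45 count=10
  return 0;
}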
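expt::KernelName in the new header wraps a string as a forall parameter that contributes no lambda argument, so back-ends can label profiling ranges for a kernel. A hedged usage sketch, assuming a back-end that implements the new optional-parameter interface the static_assert message above refers to; it has not been verified against this branch.

#include "RAJA/RAJA.hpp"

int main()
{
  constexpr int N = 100;
  double* a = new double[N];

  RAJA::forall<RAJA::seq_exec>(
      RAJA::RangeSegment(0, N),
      RAJA::expt::KernelName("init_a"),   // named region, adds no lambda argument
      [=](int i) { a[i] = 0.5 * i; });

  delete[] a;
  return 0;
}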
+ using ARG_TUP_T = camp::tuple<>; + using ARG_LIST_T = typename ARG_TUP_T::TList; + RAJA_HOST_DEVICE ARG_TUP_T get_lambda_arg_tup() { return camp::make_tuple(); } + static constexpr size_t num_lambda_args = camp::tuple_size::value; + + }; + +} // namespace detail + +} // namespace expt + +} // namespace RAJA + +#endif // RAJA_PARAMS_BASE diff --git a/include/RAJA/pattern/params/reducer.hpp b/include/RAJA/pattern/params/reducer.hpp new file mode 100644 index 0000000000..5c4858a14a --- /dev/null +++ b/include/RAJA/pattern/params/reducer.hpp @@ -0,0 +1,145 @@ +#ifndef NEW_REDUCE_HPP +#define NEW_REDUCE_HPP + +#include "RAJA/pattern/params/params_base.hpp" +#include "RAJA/util/SoAPtr.hpp" + +#if defined(RAJA_CUDA_ACTIVE) +#define DEVICE cuda +#include "RAJA/policy/cuda/MemUtils_CUDA.hpp" +#elif defined(RAJA_HIP_ACTIVE) +#define DEVICE hip +#include "RAJA/policy/hip/MemUtils_HIP.hpp" +#endif + +namespace RAJA +{ + +namespace expt +{ + +template +struct ValLoc { + using index_type = RAJA::Index_type; + using value_type = T; + + RAJA_HOST_DEVICE ValLoc() {} + RAJA_HOST_DEVICE ValLoc(value_type v) : val(v) {} + RAJA_HOST_DEVICE ValLoc(value_type v, RAJA::Index_type l) : val(v), loc(l) {} + + RAJA_HOST_DEVICE void min(value_type v, index_type l) { if (v < val) { val = v; loc = l; } } + RAJA_HOST_DEVICE void max(value_type v, index_type l) { if (v > val) { val = v; loc = l; } } + + bool constexpr operator < (const ValLoc& rhs) const { return val <= rhs.val; } + bool constexpr operator <=(const ValLoc& rhs) const { return val < rhs.val; } + bool constexpr operator > (const ValLoc& rhs) const { return val >= rhs.val; } + bool constexpr operator >=(const ValLoc& rhs) const { return val > rhs.val; } + + value_type getVal() {return val;} + RAJA::Index_type getLoc() {return loc;} + +private: + value_type val; + index_type loc = -1; +}; + +} // namespace expt + +namespace operators +{ + +template +struct limits> { + RAJA_INLINE RAJA_HOST_DEVICE static constexpr RAJA::expt::ValLoc min() + { + return RAJA::expt::ValLoc(RAJA::operators::limits::min()); + } + RAJA_INLINE RAJA_HOST_DEVICE static constexpr RAJA::expt::ValLoc max() + { + return RAJA::expt::ValLoc(RAJA::operators::limits::max()); + } +}; + +} // namespace operators + +} // namespace RAJA + +namespace RAJA +{ + +namespace expt +{ +namespace detail +{ + + // + // + // Basic Reducer + // + // + template + struct Reducer : public ForallParamBase { + using op = Op; + using value_type = T; + + RAJA_HOST_DEVICE Reducer() {} + Reducer(value_type *target_in) : target(target_in), val(op::identity()) {} + + value_type *target = nullptr; + value_type val = op::identity(); + +#if defined(RAJA_CUDA_ACTIVE) || defined(RAJA_HIP_ACTIVE) + // Device related attributes. + value_type * devicetarget = nullptr; + RAJA::detail::SoAPtr device_mem; + unsigned int * device_count = nullptr; +#endif + + using ARG_TUP_T = camp::tuple; + RAJA_HOST_DEVICE ARG_TUP_T get_lambda_arg_tup() { return camp::make_tuple(&val); } + + using ARG_LIST_T = typename ARG_TUP_T::TList; + static constexpr size_t num_lambda_args = camp::tuple_size::value ; + }; + +} // namespace detail + +template
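ValLoc above pairs a value with the index where it occurred so that min/max reductions can also report a location; its min()/max() members update both fields whenever a better candidate is seen. A standalone mirror of the idea in plain C++ (illustrative only, not RAJA's type):

#include <cstdio>
#include <limits>

// Value-plus-location pair for "loc" reductions (stand-in for expt::ValLoc<T>).
template <typename T>
struct ValueLoc {
  T         val = std::numeric_limits<T>::max();  // identity for a min reduction
  long long loc = -1;

  void min(T v, long long l) { if (v < val) { val = v; loc = l; } }
  void max(T v, long long l) { if (v > val) { val = v; loc = l; } }
};

int main() {
  const double data[] = {3.0, -1.5, 2.0, -7.25, 4.0};

  ValueLoc<double> vmin;
  for (long long i = 0; i < 5; ++i) {
    vmin.min(data[i], i);
  }

  std::printf("min=%g at index %lld\n", vmin.val, vmin.loc);  // min=-7.25 at index 3
  return 0;
}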
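The Reducer parameter above carries a target pointer, a privately accumulated value starting at the operator identity, and (for device back-ends) staging storage; it is what turns a pointer-style reduction argument into an extra reference passed to the forall body. A hedged usage sketch of that interface, assuming the RAJA::expt::Reduce spelling this parameter work enables; illustrative, not verified against this branch.

#include "RAJA/RAJA.hpp"
#include <cstdio>

int main()
{
  constexpr int N = 1000;
  double* a = new double[N];
  for (int i = 0; i < N; ++i) { a[i] = 1.0; }

  double sum = 0.0;

  // Each Reduce<op>(&target) adds one reference argument to the lambda,
  // in the order the parameters are listed before the body.
  RAJA::forall<RAJA::seq_exec>(
      RAJA::RangeSegment(0, N),
      RAJA::expt::Reduce<RAJA::operators::plus>(&sum),
      [=](int i, double& s) { s += a[i]; });

  std::printf("sum = %g\n", sum);  // expected: 1000

  delete[] a;
  return 0;
}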