-
Notifications
You must be signed in to change notification settings - Fork 100
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
8a07b68
commit 7d0da79
Showing
1 changed file
with
135 additions
and
84 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,126 +1,177 @@ | ||
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// | ||
// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC | ||
// and RAJA project contributors. See the RAJA/LICENSE file for details. | ||
// | ||
// SPDX-License-Identifier: (BSD-3-Clause) | ||
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// | ||
|
||
// RAJA/RAJA.hpp cannot be included here because the include logic will | ||
// default all atomic implementations to a desul backend. | ||
#include "RAJA/policy/loop/policy.hpp" | ||
#include "RAJA/policy/openmp/atomic.hpp" | ||
#include "RAJA/RAJA.hpp" | ||
#include "RAJA/policy/openmp/policy.hpp" | ||
#if defined RAJA_ENABLE_CUDA | ||
#include "RAJA/policy/cuda.hpp" | ||
#include "RAJA/policy/cuda/atomic.hpp" | ||
#elif defined RAJA_ENABLE_HIP | ||
#include "RAJA/policy/hip.hpp" | ||
#include "RAJA/policy/hip/atomic.hpp" | ||
#endif | ||
|
||
#include "desul/atomics.hpp" | ||
#include "benchmark/benchmark.h" | ||
#include "RAJA/util/Timer.hpp" | ||
|
||
#include <type_traits> | ||
#include <iostream> | ||
|
||
// Number of iterations each benchmark kernel performs (one atomic op per
// iteration). NOTE(review): the diff showed N defined twice (100000000 and
// 1000000000); the commit's intent is the larger value, kept here once.
#define N 1000000000
// Default DESUL memory order/scope used by every DESUL atomic in this file.
using raja_default_desul_order = desul::MemoryOrderRelaxed;
using raja_default_desul_scope = desul::MemoryScopeDevice;
|
||
// Selects the GPU forall execution policy for whichever backend (CUDA or HIP)
// this build enables, parameterized on the thread-block size.
template<int BLOCK_SZ>
struct ExecPolicyGPU {
#if defined(RAJA_ENABLE_CUDA)
  using policy = RAJA::cuda_exec<BLOCK_SZ>;
#elif defined(RAJA_ENABLE_HIP)
  using policy = RAJA::hip_exec<BLOCK_SZ>;
#endif
};
|
||
// Selects the RAJA atomic policy matching the enabled GPU backend, so callers
// can name one policy type regardless of whether CUDA or HIP is active.
struct GPUAtomic {
#if defined(RAJA_ENABLE_CUDA)
  using policy = RAJA::policy::cuda::cuda_atomic;
#elif defined(RAJA_ENABLE_HIP)
  using policy = RAJA::policy::hip::hip_atomic;
#endif
};
|
||
// Desul atomics have a different signature than RAJA's built-in ops. The following code provides some
// helper function templates so that they can be called using the same signature in timing code.
|
||
// Struct holding Desul atomic signature typedef | ||
template<typename AtomicType> | ||
struct DesulAtomicSignature { | ||
using signature = int(*)(AtomicType*, const AtomicType, raja_default_desul_order, raja_default_desul_scope); | ||
using signature = AtomicType(*)(AtomicType*, const AtomicType, raja_default_desul_order, raja_default_desul_scope); | ||
}; | ||
|
||
// Struct holding RAJA atomic signature typedef | ||
// Struct holding the function-pointer signature of a RAJA built-in atomic op:
// value-returning, taking only the accumulator and the operand. (The stale
// int-returning typedef from the diff is dropped; the generalized one is kept.)
template<typename AtomicType>
struct RajaAtomicSignature {
    using signature = AtomicType(*)(AtomicType*, const AtomicType);
};
|
||
template<typename T, typename Policy> | ||
RAJA_HOST_DEVICE T atomicAddWrapperDesul(T * acc, T value) { | ||
return desul::atomic_fetch_add(acc, value, raja_default_desul_order{}, | ||
raja_default_desul_scope{}); | ||
/// RAJA::atomicAdd is overloaded and has an ambiguous type so it can't be passed as a template parameter. | ||
/// The following wrappers disambiguate the call and provide a signature comaptible with the DESUL | ||
/// wrapper. | ||
template<typename AtomicType> | ||
RAJA_HOST_DEVICE AtomicType GPUAtomicAdd(AtomicType* acc, const AtomicType val) { | ||
return RAJA::atomicAdd(GPUAtomic::policy {}, acc, val); | ||
} | ||
|
||
template<typename T, typename Policy> | ||
RAJA_HOST_DEVICE T atomicMaxWrappeDesul(T * acc, T value) { | ||
return desul::atomic_fetch_max(acc, value, raja_default_desul_order{}, | ||
raja_default_desul_scope{}); | ||
template<typename AtomicType> | ||
RAJA_HOST_DEVICE AtomicType GPUAtomicMax(AtomicType* acc, const AtomicType val) { | ||
return RAJA::atomicMax(GPUAtomic::policy {}, acc, val); | ||
} | ||
|
||
template<typename T, typename Policy> | ||
RAJA_HOST_DEVICE T atomicAddWrapper(T * acc, T value) { | ||
return RAJA::atomicAdd(Policy{}, acc, value); | ||
template<typename AtomicType> | ||
RAJA_HOST_DEVICE AtomicType OpenMPAtomicAdd(AtomicType* acc, const AtomicType val) { | ||
return RAJA::atomicAdd(RAJA::policy::omp::omp_atomic{}, acc, val); | ||
} | ||
|
||
template<typename T, typename Policy> | ||
RAJA_HOST_DEVICE T atomicMaxWrapper(T * acc, T value) { | ||
return RAJA::atomicMax(Policy{}, acc, value); | ||
/// Function template that allows invoking DESUL atomic with a (int*)(T*, T) signature | ||
template<typename T, typename Policy, typename DesulAtomicSignature<T>::signature AtomicImpl> | ||
RAJA_HOST_DEVICE T atomicWrapperDesul(T * acc, T value) { | ||
|
||
return AtomicImpl(acc, value, raja_default_desul_order{}, | ||
raja_default_desul_scope{}); | ||
} | ||
|
||
// Primary template: an execution policy is treated as host (CPU) unless a
// specialization below marks it as a GPU policy. (The stale IsCuda line left
// over from the rename in the diff is removed.)
template<typename>
struct IsGPU : public std::false_type {};
|
||
/// These helper templates are used to deduce if device memory allocations are necessary
/// inside the body of the benchmark, using the type of the execution policy.
/// (The deleted google-benchmark DesulAtomicOpLoopInt function that the diff
/// interleaved here is removed; only the specializations remain.)
#if defined(RAJA_ENABLE_CUDA)
template<int M>
struct IsGPU<RAJA::cuda_exec<M>> : public std::true_type {};
#elif defined(RAJA_ENABLE_HIP)
template<int M>
struct IsGPU<RAJA::hip_exec<M>> : public std::true_type {};
#endif
|
||
template <class ExecPolicy, typename AtomicType, typename RajaAtomicSignature<AtomicType>::signature AtomicImpl, bool test_array = false> | ||
void TimeAtomicOp(const std::string& test_name, int num_iterations = 2, int array_size = 100) { | ||
std::cout << "EXECUTING " << test_name << "; "; | ||
RAJA::Timer timer; | ||
|
||
template <class ExecPolicy, typename RajaAtomicSignature<int>::signature AtomicImpl> | ||
void AtomicOpLoopInt(benchmark::State& state) { | ||
for (auto _ : state) { | ||
int* value; | ||
int zero = 0; | ||
for (int i = 0; i < num_iterations; ++i) { | ||
AtomicType* device_value = nullptr; | ||
AtomicType zero = 0; | ||
int len_array = test_array ? array_size : 1; | ||
|
||
if (IsGPU<ExecPolicy>::value) { | ||
#if defined(RAJA_ENABLE_CUDA) | ||
if (IsCuda<ExecPolicy>::value) { | ||
cudaErrchk( | ||
cudaMallocManaged((void **)&value, sizeof(int), cudaMemAttachGlobal)); | ||
cudaMemset(value, 0, sizeof(int)); | ||
} | ||
#else | ||
value = &zero; | ||
cudaErrchk(cudaMalloc((void **)&device_value, len_array * sizeof(AtomicType))); | ||
cudaMemset(device_value, 0, len_array * sizeof(AtomicType)); | ||
#elif defined(RAJA_ENABLE_HIP) | ||
hipMalloc((void **)&device_value, len_array * sizeof(AtomicType)); | ||
hipMemset(device_value, 0, len_array * sizeof(AtomicType)); | ||
#endif | ||
} | ||
std::cout << "here 1\n"; | ||
timer.start(); | ||
RAJA::forall<ExecPolicy>(RAJA::RangeSegment(0, N), | ||
[=]RAJA_HOST_DEVICE(int) { | ||
AtomicImpl(value, 1); | ||
[=] RAJA_HOST_DEVICE(int tid) { | ||
printf("tid %d\n", tid); | ||
if (test_array) { | ||
AtomicImpl(&(device_value[tid % array_size]), 1); | ||
} else { | ||
AtomicImpl(device_value, 1); | ||
} | ||
}); | ||
assert(*value == N); | ||
std::cout << "here 2\n"; | ||
timer.stop(); | ||
if (IsGPU<ExecPolicy>::value) { | ||
#if defined(RAJA_ENABLE_CUDA) | ||
cudaErrchk(cudaFree((void *)device_value)); | ||
#elif defined(RAJA_ENABLE_HIP) | ||
hipFree((void *)device_value); | ||
#endif | ||
} | ||
std::cout << "here 3\n"; | ||
} | ||
|
||
double t = timer.elapsed(); | ||
std::cout << "ELAPSED TIME = " << t << std::endl; | ||
} | ||
|
||
BENCHMARK(DesulAtomicOpLoopInt<RAJA::omp_for_exec, desul::atomic_fetch_add>); | ||
BENCHMARK(AtomicOpLoopInt<RAJA::omp_for_exec, atomicAddWrapper<int, RAJA::policy::omp::omp_atomic>>); | ||
// CUDA addition | ||
BENCHMARK(DesulAtomicOpLoopInt<RAJA::cuda_exec<1024>, desul::atomic_fetch_add>); | ||
BENCHMARK(AtomicOpLoopInt<RAJA::cuda_exec<1024>, atomicAddWrapper<int, RAJA::policy::cuda::cuda_atomic>>); | ||
BENCHMARK(DesulAtomicOpLoopInt<RAJA::cuda_exec<512>, desul::atomic_fetch_add>); | ||
BENCHMARK(AtomicOpLoopInt<RAJA::cuda_exec<512>, atomicAddWrapper<int, RAJA::policy::cuda::cuda_atomic>>); | ||
BENCHMARK(DesulAtomicOpLoopInt<RAJA::cuda_exec<256>, desul::atomic_fetch_add>); | ||
BENCHMARK(AtomicOpLoopInt<RAJA::cuda_exec<256>, atomicAddWrapper<int, RAJA::policy::cuda::cuda_atomic>>); | ||
BENCHMARK(DesulAtomicOpLoopInt<RAJA::cuda_exec<128>, desul::atomic_fetch_add>); | ||
BENCHMARK(AtomicOpLoopInt<RAJA::cuda_exec<128>, atomicAddWrapper<int, RAJA::policy::cuda::cuda_atomic>>); | ||
BENCHMARK(DesulAtomicOpLoopInt<RAJA::cuda_exec<64>, desul::atomic_fetch_add>); | ||
BENCHMARK(AtomicOpLoopInt<RAJA::cuda_exec<64>, atomicAddWrapper<int, RAJA::policy::cuda::cuda_atomic>>); | ||
// CUDA max | ||
//BENCHMARK(DesulAtomicOpLoopInt<RAJA::cuda_exec<1024>, desul::atomic_fetch_max>); | ||
//BENCHMARK(AtomicOpLoopInt<RAJA::cuda_exec<1024>, atomicMaxWrapper<int, RAJA::policy::cuda::cuda_atomic>>); | ||
//BENCHMARK(DesulAtomicOpLoopInt<RAJA::cuda_exec<512>, desul::atomic_fetch_max>); | ||
//BENCHMARK(AtomicOpLoopInt<RAJA::cuda_exec<512>, atomicMaxWrapper<int, RAJA::policy::cuda::cuda_atomic>>); | ||
//BENCHMARK(DesulAtomicOpLoopInt<RAJA::cuda_exec<256>, desul::atomic_fetch_max>); | ||
//BENCHMARK(AtomicOpLoopInt<RAJA::cuda_exec<256>, atomicMaxWrapper<int, RAJA::policy::cuda::cuda_atomic>>); | ||
//BENCHMARK(DesulAtomicOpLoopInt<RAJA::cuda_exec<128>, desul::atomic_fetch_max>); | ||
//BENCHMARK(AtomicOpLoopInt<RAJA::cuda_exec<128>, atomicMaxWrapper<int, RAJA::policy::cuda::cuda_atomic>>); | ||
//BENCHMARK(DesulAtomicOpLoopInt<RAJA::cuda_exec<64>, desul::atomic_fetch_max>); | ||
//BENCHMARK(AtomicOpLoopInt<RAJA::cuda_exec<64>, atomicMaxWrapper<int, RAJA::policy::cuda::cuda_atomic>>); | ||
|
||
BENCHMARK_MAIN(); | ||
int main () { | ||
// GPU benchmarks | ||
TimeAtomicOp<ExecPolicyGPU<32>::policy, int, GPUAtomicAdd<int>, true>("Benchmark array contention. CUDA Block size 32, RAJA builtin atomic", 4); | ||
TimeAtomicOp<ExecPolicyGPU<32>::policy, int, atomicWrapperDesul<int, typename GPUAtomic::policy, desul::atomic_fetch_add>, true>("Benchmark array contention. CUDA Block size 32, DESUL atomic", 4); | ||
TimeAtomicOp<ExecPolicyGPU<64>::policy, int, GPUAtomicAdd<int>, true>("Benchmark array contention. CUDA Block size 64, RAJA builtin atomic", 4); | ||
TimeAtomicOp<ExecPolicyGPU<64>::policy, int, atomicWrapperDesul<int, typename GPUAtomic::policy, desul::atomic_fetch_add>, true>("Benchmark array contention. CUDA Block size 64, DESUL atomic", 4); | ||
TimeAtomicOp<ExecPolicyGPU<128>::policy, int, GPUAtomicAdd<int>, true>("Benchmark array contention. CUDA Block size 128, RAJA builtin atomic", 4); | ||
TimeAtomicOp<ExecPolicyGPU<128>::policy, int, atomicWrapperDesul<int, typename GPUAtomic::policy, desul::atomic_fetch_add>, true>("Benchmark array contention. CUDA Block size 128, DESUL atomic", 4); | ||
TimeAtomicOp<ExecPolicyGPU<256>::policy, int, GPUAtomicAdd<int>, true>("Benchmark array contention. CUDA Block size 256, RAJA builtin atomic", 4); | ||
TimeAtomicOp<ExecPolicyGPU<256>::policy, int, atomicWrapperDesul<int, typename GPUAtomic::policy, desul::atomic_fetch_add>, true>("Benchmark array contention. CUDA Block size 256, DESUL atomic", 4); | ||
|
||
TimeAtomicOp<ExecPolicyGPU<128>::policy, int, GPUAtomicAdd<int>, true>("Benchmark array contention. CUDA Block size 128, RAJA builtin atomic", 2, 10); | ||
TimeAtomicOp<ExecPolicyGPU<128>::policy, int, atomicWrapperDesul<int, typename GPUAtomic::policy, desul::atomic_fetch_add>, true>("Benchmark array contention. CUDA Block size 128, DESUL atomic", 2, 10); | ||
TimeAtomicOp<ExecPolicyGPU<256>::policy, int, GPUAtomicAdd<int>, true>("Benchmark array contention. CUDA Block size 256, RAJA builtin atomic", 2, 10); | ||
TimeAtomicOp<ExecPolicyGPU<256>::policy, int, atomicWrapperDesul<int, typename GPUAtomic::policy, desul::atomic_fetch_add>, true>("Benchmark array contention. CUDA Block size 256, DESUL atomic", 2, 10); | ||
|
||
TimeAtomicOp<ExecPolicyGPU<128>::policy, double, GPUAtomicAdd<double>>("CUDA Block size 128, RAJA builtin atomic"); | ||
TimeAtomicOp<ExecPolicyGPU<128>::policy, double, atomicWrapperDesul<double, typename GPUAtomic::policy, desul::atomic_fetch_add>>("CUDA Block size 128, DESUL atomic"); | ||
TimeAtomicOp<ExecPolicyGPU<256>::policy, double, GPUAtomicAdd<double>>("CUDA Block size 256, RAJA builtin atomic"); | ||
TimeAtomicOp<ExecPolicyGPU<256>::policy, double, atomicWrapperDesul<double, typename GPUAtomic::policy, desul::atomic_fetch_add>>("CUDA Block size 256, DESUL atomic"); | ||
|
||
TimeAtomicOp<ExecPolicyGPU<128>::policy, int, GPUAtomicMax<int>>("CUDA Block size 128, RAJA builtin atomic max"); | ||
TimeAtomicOp<ExecPolicyGPU<128>::policy, int, atomicWrapperDesul<int, typename GPUAtomic::policy, desul::atomic_fetch_max>>("CUDA Block size 128, DESUL atomic max"); | ||
TimeAtomicOp<ExecPolicyGPU<256>::policy, int, GPUAtomicMax<int>>("CUDA Block size 256, RAJA builtin atomic max"); | ||
TimeAtomicOp<ExecPolicyGPU<256>::policy, int, atomicWrapperDesul<int, typename GPUAtomic::policy, desul::atomic_fetch_max>>("CUDA Block size 256, DESUL atomic max"); | ||
// OpenMP benchmarks | ||
TimeAtomicOp<RAJA::omp_for_exec, int, OpenMPAtomicAdd<int>>("OpenMP, int, RAJA builtin atomic"); | ||
TimeAtomicOp<RAJA::omp_for_exec, int, atomicWrapperDesul<int, RAJA::policy::omp::omp_atomic, desul::atomic_fetch_add>>("OpenMP, desul atomic"); | ||
} |