Move away from google benchmark
johnbowen42 committed Apr 15, 2024
Parent 8a07b68 · Commit 7d0da79
Showing 1 changed file with 135 additions and 84 deletions: benchmark/benchmark-atomic.cpp
@@ -1,126 +1,177 @@
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC
// and RAJA project contributors. See the RAJA/LICENSE file for details.
//
// SPDX-License-Identifier: (BSD-3-Clause)
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//

// RAJA/RAJA.hpp cannot be included here because the include logic will
// default all atomic implementations to a desul backend.
#include "RAJA/policy/loop/policy.hpp"
#include "RAJA/policy/openmp/atomic.hpp"
#include "RAJA/RAJA.hpp"
#include "RAJA/policy/openmp/policy.hpp"
#if defined RAJA_ENABLE_CUDA
#include "RAJA/policy/cuda.hpp"
#include "RAJA/policy/cuda/atomic.hpp"
#elif defined RAJA_ENABLE_HIP
#include "RAJA/policy/hip.hpp"
#include "RAJA/policy/hip/atomic.hpp"
#endif

#include "desul/atomics.hpp"
#include "benchmark/benchmark.h"
#include "RAJA/util/Timer.hpp"

#include <type_traits>
#include <iostream>

-#define N 100000000
+#define N 1000000000
using raja_default_desul_order = desul::MemoryOrderRelaxed;
using raja_default_desul_scope = desul::MemoryScopeDevice;

+template<int BLOCK_SZ>
+struct ExecPolicyGPU {
+#if defined RAJA_ENABLE_CUDA
+using policy = RAJA::cuda_exec<BLOCK_SZ>;
+#elif defined RAJA_ENABLE_HIP
+using policy = RAJA::hip_exec<BLOCK_SZ>;
+#endif
+};
+
+struct GPUAtomic {
+#if defined RAJA_ENABLE_CUDA
+using policy = RAJA::policy::cuda::cuda_atomic;
+#elif defined RAJA_ENABLE_HIP
+using policy = RAJA::policy::hip::hip_atomic;
+#endif
+};

// Desul atomics have a different signature than RAJA's built-in ops. The following code provides
// helper function templates so that both can be called through the same signature in timing code.

// Struct holding Desul atomic signature typedef
template<typename AtomicType>
struct DesulAtomicSignature {
-using signature = int(*)(AtomicType*, const AtomicType, raja_default_desul_order, raja_default_desul_scope);
+using signature = AtomicType(*)(AtomicType*, const AtomicType, raja_default_desul_order, raja_default_desul_scope);
};

// Struct holding RAJA atomic signature typedef
template<typename AtomicType>
struct RajaAtomicSignature {
-using signature = int(*)(AtomicType*, const AtomicType);
+using signature = AtomicType(*)(AtomicType*, const AtomicType);
};
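
(Aside: the two typedefs above capture the mismatch in one place: a desul atomic takes explicit ordering and scope tags, while a RAJA builtin op takes only a pointer and a value. A minimal standalone sketch of the same normalization, using toy stand-ins rather than the real RAJA/desul headers:

    #include <cstdio>

    struct OrderRelaxed {};  // stand-in for desul::MemoryOrderRelaxed
    struct ScopeDevice {};   // stand-in for desul::MemoryScopeDevice

    // desul-style op: pointer, value, plus ordering and scope tag arguments
    int toy_fetch_add(int* acc, const int v, OrderRelaxed, ScopeDevice) {
        int old = *acc; *acc += v; return old;
    }

    // RAJA-style op: just pointer and value
    int toy_atomic_add(int* acc, const int v) {
        int old = *acc; *acc += v; return old;
    }

    using DesulSig = int (*)(int*, const int, OrderRelaxed, ScopeDevice);
    using RajaSig  = int (*)(int*, const int);

    int main() {
        DesulSig d = toy_fetch_add;   // four-argument shape
        RajaSig  r = toy_atomic_add;  // two-argument shape
        int x = 0;
        d(&x, 1, OrderRelaxed{}, ScopeDevice{});
        r(&x, 1);
        std::printf("x = %d\n", x);   // prints x = 2
    }

The benchmark reduces both shapes to the two-argument one so a single timing routine can drive either backend.)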

-template<typename T, typename Policy>
-RAJA_HOST_DEVICE T atomicAddWrapperDesul(T * acc, T value) {
-return desul::atomic_fetch_add(acc, value, raja_default_desul_order{},
-raja_default_desul_scope{});
+/// RAJA::atomicAdd is overloaded and has an ambiguous type, so it can't be passed as a template parameter.
+/// The following wrappers disambiguate the call and provide a signature compatible with the DESUL
+/// wrapper.
+template<typename AtomicType>
+RAJA_HOST_DEVICE AtomicType GPUAtomicAdd(AtomicType* acc, const AtomicType val) {
+return RAJA::atomicAdd(GPUAtomic::policy {}, acc, val);
}
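
(Aside: the wrapper pattern is easy to reproduce outside RAJA. In the sketch below, with toy names rather than RAJA's real overload set, the policy argument is baked into a plain function whose address has exactly the RajaAtomicSignature shape, so it can be handed to a timing routine as a template parameter:

    struct toy_policy {};  // hypothetical policy tag

    // policy-tagged op, analogous to RAJA::atomicAdd(policy, acc, val)
    template <typename T>
    T toy_atomic_add(toy_policy, T* acc, const T v) { *acc += v; return *acc; }

    // fixed-signature wrapper, analogous to GPUAtomicAdd above
    template <typename T>
    T toy_add_wrapper(T* acc, const T v) { return toy_atomic_add(toy_policy{}, acc, v); }

    // a timing shell can accept any backend through one pointer type
    template <typename T, T (*AtomicImpl)(T*, const T)>
    T apply_once(T* acc, T v) { return AtomicImpl(acc, v); }

    int demo() {
        int x = 0;
        apply_once<int, toy_add_wrapper<int>>(&x, 1);
        return x;  // 1
    }

Each wrapper names a unique function, sidestepping the overload ambiguity the comment above describes.)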

-template<typename T, typename Policy>
-RAJA_HOST_DEVICE T atomicMaxWrappeDesul(T * acc, T value) {
-return desul::atomic_fetch_max(acc, value, raja_default_desul_order{},
-raja_default_desul_scope{});
+template<typename AtomicType>
+RAJA_HOST_DEVICE AtomicType GPUAtomicMax(AtomicType* acc, const AtomicType val) {
+return RAJA::atomicMax(GPUAtomic::policy {}, acc, val);
}

-template<typename T, typename Policy>
-RAJA_HOST_DEVICE T atomicAddWrapper(T * acc, T value) {
-return RAJA::atomicAdd(Policy{}, acc, value);
+template<typename AtomicType>
+RAJA_HOST_DEVICE AtomicType OpenMPAtomicAdd(AtomicType* acc, const AtomicType val) {
+return RAJA::atomicAdd(RAJA::policy::omp::omp_atomic{}, acc, val);
}

-template<typename T, typename Policy>
-RAJA_HOST_DEVICE T atomicMaxWrapper(T * acc, T value) {
-return RAJA::atomicMax(Policy{}, acc, value);
+/// Function template that allows invoking a DESUL atomic through a plain T(*)(T*, T) signature
+template<typename T, typename Policy, typename DesulAtomicSignature<T>::signature AtomicImpl>
+RAJA_HOST_DEVICE T atomicWrapperDesul(T * acc, T value) {
+return AtomicImpl(acc, value, raja_default_desul_order{},
+raja_default_desul_scope{});
}
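
(Aside: atomicWrapperDesul relies on a function-pointer non-type template parameter whose type comes from DesulAtomicSignature<T>, so one wrapper adapts any desul-style operation — fetch_add, fetch_max, and so on — to the two-argument shape. A self-contained sketch of that trick, again with toy types in place of desul's:

    struct Order {};  // stand-ins for the desul order/scope tags
    struct Scope {};

    template <typename T>
    struct ToySig { using signature = T (*)(T*, const T, Order, Scope); };

    template <typename T>
    T toy_fetch_add(T* a, const T v, Order, Scope) { T old = *a; *a += v; return old; }

    template <typename T>
    T toy_fetch_max(T* a, const T v, Order, Scope) { T old = *a; if (v > *a) *a = v; return old; }

    // one wrapper serves every desul-style op; the op is a template parameter
    template <typename T, typename ToySig<T>::signature AtomicImpl>
    T wrap(T* a, T v) { return AtomicImpl(a, v, Order{}, Scope{}); }

    int demo() {
        int x = 3;
        wrap<int, toy_fetch_add<int>>(&x, 2);  // x == 5
        wrap<int, toy_fetch_max<int>>(&x, 9);  // x == 9
        return x;
    }

Because the operation is a template argument rather than a runtime pointer, the compiler can inline it into the timed loop.)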

template<typename>
-struct IsCuda : public std::false_type {};
+struct IsGPU : public std::false_type {};

+/// These helper templates are used to deduce if device memory allocations are necessary
+/// inside the body of the benchmark, using the type of the execution policy.
#if defined RAJA_ENABLE_CUDA
template<int M>
-struct IsCuda<RAJA::cuda_exec<M>> : public std::true_type {};

-template <class ExecPolicy, typename DesulAtomicSignature<int>::signature AtomicImpl>
-void DesulAtomicOpLoopInt(benchmark::State& state) {
-for (auto _ : state) {
-int* value;
-int zero = 0;
-if (IsCuda<ExecPolicy>::value) {
-#if defined(RAJA_ENABLE_CUDA)
-cudaErrchk(
-cudaMallocManaged((void **)&value, sizeof(int), cudaMemAttachGlobal));
-cudaMemset(value, 0, sizeof(int));
-#endif
-} else {
-value = &zero;
-}
-RAJA::forall<ExecPolicy>(RAJA::RangeSegment(0, N),
-[=]RAJA_HOST_DEVICE(int) {
-AtomicImpl(value, 1, raja_default_desul_order{},
-raja_default_desul_scope{});
-});
-assert(*value == N);
-}
-}
+struct IsGPU<RAJA::cuda_exec<M>> : public std::true_type {};
+#elif defined RAJA_ENABLE_HIP
+template<int M>
+struct IsGPU<RAJA::hip_exec<M>> : public std::true_type {};
#endif
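
(Aside: IsGPU is the usual partial-specialization dispatch: the primary template answers false, and each GPU execution policy gets a true specialization. A compile-time check of the pattern with toy policies, no CUDA/HIP toolchain required:

    #include <type_traits>

    template <int BLOCK> struct toy_gpu_exec {};
    struct toy_omp_exec {};

    template <typename> struct ToyIsGPU : std::false_type {};
    template <int B> struct ToyIsGPU<toy_gpu_exec<B>> : std::true_type {};

    static_assert(ToyIsGPU<toy_gpu_exec<128>>::value, "GPU policy selects device allocation");
    static_assert(!ToyIsGPU<toy_omp_exec>::value, "host policy keeps host allocation");

Since the value is known at compile time, `if constexpr` could also drop the untaken allocation branch entirely; the runtime `if` used below works because both branches compile under the preprocessor guards.)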

+template <class ExecPolicy, typename AtomicType, typename RajaAtomicSignature<AtomicType>::signature AtomicImpl, bool test_array = false>
+void TimeAtomicOp(const std::string& test_name, int num_iterations = 2, int array_size = 100) {
+std::cout << "EXECUTING " << test_name << "; ";
+RAJA::Timer timer;

-template <class ExecPolicy, typename RajaAtomicSignature<int>::signature AtomicImpl>
-void AtomicOpLoopInt(benchmark::State& state) {
-for (auto _ : state) {
-int* value;
-int zero = 0;
+for (int i = 0; i < num_iterations; ++i) {
+AtomicType* device_value = nullptr;
+AtomicType zero = 0;
+int len_array = test_array ? array_size : 1;
+
+if (IsGPU<ExecPolicy>::value) {
#if defined(RAJA_ENABLE_CUDA)
-if (IsCuda<ExecPolicy>::value) {
-cudaErrchk(
-cudaMallocManaged((void **)&value, sizeof(int), cudaMemAttachGlobal));
-cudaMemset(value, 0, sizeof(int));
-}
-#else
-value = &zero;
+cudaErrchk(cudaMalloc((void **)&device_value, len_array * sizeof(AtomicType)));
+cudaMemset(device_value, 0, len_array * sizeof(AtomicType));
+#elif defined(RAJA_ENABLE_HIP)
+hipMalloc((void **)&device_value, len_array * sizeof(AtomicType));
+hipMemset(device_value, 0, len_array * sizeof(AtomicType));
#endif
+}
+std::cout << "here 1\n";
+timer.start();
RAJA::forall<ExecPolicy>(RAJA::RangeSegment(0, N),
-[=]RAJA_HOST_DEVICE(int) {
-AtomicImpl(value, 1);
+[=] RAJA_HOST_DEVICE(int tid) {
+printf("tid %d\n", tid);
+if (test_array) {
+AtomicImpl(&(device_value[tid % array_size]), 1);
+} else {
+AtomicImpl(device_value, 1);
+}
});
-assert(*value == N);
+std::cout << "here 2\n";
+timer.stop();
+if (IsGPU<ExecPolicy>::value) {
+#if defined(RAJA_ENABLE_CUDA)
+cudaErrchk(cudaFree((void *)device_value));
+#elif defined(RAJA_ENABLE_HIP)
+hipFree((void *)device_value);
+#endif
+}
+std::cout << "here 3\n";
}

+double t = timer.elapsed();
+std::cout << "ELAPSED TIME = " << t << std::endl;
}
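
(Aside, for scale: with N = 1000000000 and the default array_size of 100, a test_array run spreads the updates over 100 slots, so roughly N/100 = 10,000,000 increments contend on each slot, while a scalar run lands all N updates on a single word. Hypothetical invocations in the same shape as those in main below:

    // contended-array case: four timed repetitions over the default 100-slot array
    TimeAtomicOp<ExecPolicyGPU<256>::policy, int, GPUAtomicAdd<int>, true>(
        "array contention, block 256, RAJA builtin atomic", /*num_iterations=*/4);

    // single-counter worst case: every update targets the same word
    TimeAtomicOp<ExecPolicyGPU<256>::policy, int, GPUAtomicAdd<int>>(
        "single counter, block 256, RAJA builtin atomic");

)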

-BENCHMARK(DesulAtomicOpLoopInt<RAJA::omp_for_exec, desul::atomic_fetch_add>);
-BENCHMARK(AtomicOpLoopInt<RAJA::omp_for_exec, atomicAddWrapper<int, RAJA::policy::omp::omp_atomic>>);
-// CUDA addition
-BENCHMARK(DesulAtomicOpLoopInt<RAJA::cuda_exec<1024>, desul::atomic_fetch_add>);
-BENCHMARK(AtomicOpLoopInt<RAJA::cuda_exec<1024>, atomicAddWrapper<int, RAJA::policy::cuda::cuda_atomic>>);
-BENCHMARK(DesulAtomicOpLoopInt<RAJA::cuda_exec<512>, desul::atomic_fetch_add>);
-BENCHMARK(AtomicOpLoopInt<RAJA::cuda_exec<512>, atomicAddWrapper<int, RAJA::policy::cuda::cuda_atomic>>);
-BENCHMARK(DesulAtomicOpLoopInt<RAJA::cuda_exec<256>, desul::atomic_fetch_add>);
-BENCHMARK(AtomicOpLoopInt<RAJA::cuda_exec<256>, atomicAddWrapper<int, RAJA::policy::cuda::cuda_atomic>>);
-BENCHMARK(DesulAtomicOpLoopInt<RAJA::cuda_exec<128>, desul::atomic_fetch_add>);
-BENCHMARK(AtomicOpLoopInt<RAJA::cuda_exec<128>, atomicAddWrapper<int, RAJA::policy::cuda::cuda_atomic>>);
-BENCHMARK(DesulAtomicOpLoopInt<RAJA::cuda_exec<64>, desul::atomic_fetch_add>);
-BENCHMARK(AtomicOpLoopInt<RAJA::cuda_exec<64>, atomicAddWrapper<int, RAJA::policy::cuda::cuda_atomic>>);
-// CUDA max
-//BENCHMARK(DesulAtomicOpLoopInt<RAJA::cuda_exec<1024>, desul::atomic_fetch_max>);
-//BENCHMARK(AtomicOpLoopInt<RAJA::cuda_exec<1024>, atomicMaxWrapper<int, RAJA::policy::cuda::cuda_atomic>>);
-//BENCHMARK(DesulAtomicOpLoopInt<RAJA::cuda_exec<512>, desul::atomic_fetch_max>);
-//BENCHMARK(AtomicOpLoopInt<RAJA::cuda_exec<512>, atomicMaxWrapper<int, RAJA::policy::cuda::cuda_atomic>>);
-//BENCHMARK(DesulAtomicOpLoopInt<RAJA::cuda_exec<256>, desul::atomic_fetch_max>);
-//BENCHMARK(AtomicOpLoopInt<RAJA::cuda_exec<256>, atomicMaxWrapper<int, RAJA::policy::cuda::cuda_atomic>>);
-//BENCHMARK(DesulAtomicOpLoopInt<RAJA::cuda_exec<128>, desul::atomic_fetch_max>);
-//BENCHMARK(AtomicOpLoopInt<RAJA::cuda_exec<128>, atomicMaxWrapper<int, RAJA::policy::cuda::cuda_atomic>>);
-//BENCHMARK(DesulAtomicOpLoopInt<RAJA::cuda_exec<64>, desul::atomic_fetch_max>);
-//BENCHMARK(AtomicOpLoopInt<RAJA::cuda_exec<64>, atomicMaxWrapper<int, RAJA::policy::cuda::cuda_atomic>>);

-BENCHMARK_MAIN();
+int main () {
+// GPU benchmarks
+TimeAtomicOp<ExecPolicyGPU<32>::policy, int, GPUAtomicAdd<int>, true>("Benchmark array contention. CUDA Block size 32, RAJA builtin atomic", 4);
+TimeAtomicOp<ExecPolicyGPU<32>::policy, int, atomicWrapperDesul<int, typename GPUAtomic::policy, desul::atomic_fetch_add>, true>("Benchmark array contention. CUDA Block size 32, DESUL atomic", 4);
+TimeAtomicOp<ExecPolicyGPU<64>::policy, int, GPUAtomicAdd<int>, true>("Benchmark array contention. CUDA Block size 64, RAJA builtin atomic", 4);
+TimeAtomicOp<ExecPolicyGPU<64>::policy, int, atomicWrapperDesul<int, typename GPUAtomic::policy, desul::atomic_fetch_add>, true>("Benchmark array contention. CUDA Block size 64, DESUL atomic", 4);
+TimeAtomicOp<ExecPolicyGPU<128>::policy, int, GPUAtomicAdd<int>, true>("Benchmark array contention. CUDA Block size 128, RAJA builtin atomic", 4);
+TimeAtomicOp<ExecPolicyGPU<128>::policy, int, atomicWrapperDesul<int, typename GPUAtomic::policy, desul::atomic_fetch_add>, true>("Benchmark array contention. CUDA Block size 128, DESUL atomic", 4);
+TimeAtomicOp<ExecPolicyGPU<256>::policy, int, GPUAtomicAdd<int>, true>("Benchmark array contention. CUDA Block size 256, RAJA builtin atomic", 4);
+TimeAtomicOp<ExecPolicyGPU<256>::policy, int, atomicWrapperDesul<int, typename GPUAtomic::policy, desul::atomic_fetch_add>, true>("Benchmark array contention. CUDA Block size 256, DESUL atomic", 4);
+
+TimeAtomicOp<ExecPolicyGPU<128>::policy, int, GPUAtomicAdd<int>, true>("Benchmark array contention. CUDA Block size 128, RAJA builtin atomic", 2, 10);
+TimeAtomicOp<ExecPolicyGPU<128>::policy, int, atomicWrapperDesul<int, typename GPUAtomic::policy, desul::atomic_fetch_add>, true>("Benchmark array contention. CUDA Block size 128, DESUL atomic", 2, 10);
+TimeAtomicOp<ExecPolicyGPU<256>::policy, int, GPUAtomicAdd<int>, true>("Benchmark array contention. CUDA Block size 256, RAJA builtin atomic", 2, 10);
+TimeAtomicOp<ExecPolicyGPU<256>::policy, int, atomicWrapperDesul<int, typename GPUAtomic::policy, desul::atomic_fetch_add>, true>("Benchmark array contention. CUDA Block size 256, DESUL atomic", 2, 10);
+
+TimeAtomicOp<ExecPolicyGPU<128>::policy, double, GPUAtomicAdd<double>>("CUDA Block size 128, RAJA builtin atomic");
+TimeAtomicOp<ExecPolicyGPU<128>::policy, double, atomicWrapperDesul<double, typename GPUAtomic::policy, desul::atomic_fetch_add>>("CUDA Block size 128, DESUL atomic");
+TimeAtomicOp<ExecPolicyGPU<256>::policy, double, GPUAtomicAdd<double>>("CUDA Block size 256, RAJA builtin atomic");
+TimeAtomicOp<ExecPolicyGPU<256>::policy, double, atomicWrapperDesul<double, typename GPUAtomic::policy, desul::atomic_fetch_add>>("CUDA Block size 256, DESUL atomic");
+
+TimeAtomicOp<ExecPolicyGPU<128>::policy, int, GPUAtomicMax<int>>("CUDA Block size 128, RAJA builtin atomic max");
+TimeAtomicOp<ExecPolicyGPU<128>::policy, int, atomicWrapperDesul<int, typename GPUAtomic::policy, desul::atomic_fetch_max>>("CUDA Block size 128, DESUL atomic max");
+TimeAtomicOp<ExecPolicyGPU<256>::policy, int, GPUAtomicMax<int>>("CUDA Block size 256, RAJA builtin atomic max");
+TimeAtomicOp<ExecPolicyGPU<256>::policy, int, atomicWrapperDesul<int, typename GPUAtomic::policy, desul::atomic_fetch_max>>("CUDA Block size 256, DESUL atomic max");
+// OpenMP benchmarks
+TimeAtomicOp<RAJA::omp_for_exec, int, OpenMPAtomicAdd<int>>("OpenMP, int, RAJA builtin atomic");
+TimeAtomicOp<RAJA::omp_for_exec, int, atomicWrapperDesul<int, RAJA::policy::omp::omp_atomic, desul::atomic_fetch_add>>("OpenMP, desul atomic");
+}
