-
Notifications
You must be signed in to change notification settings - Fork 100
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
8a07b68
commit 7d0da79
Showing
1 changed file
with
135 additions
and
84 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,126 +1,177 @@ | ||
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// | ||
// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC | ||
// and RAJA project contributors. See the RAJA/LICENSE file for details. | ||
// | ||
// SPDX-License-Identifier: (BSD-3-Clause) | ||
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// | ||
|
||
// RAJA/RAJA.hpp cannot be included here because the include logic will | ||
// default all atomic implementations to a desul backend. | ||
#include "RAJA/policy/loop/policy.hpp" | ||
#include "RAJA/policy/openmp/atomic.hpp" | ||
#include "RAJA/RAJA.hpp" | ||
#include "RAJA/policy/openmp/policy.hpp" | ||
#if defined RAJA_ENABLE_CUDA | ||
#include "RAJA/policy/cuda.hpp" | ||
#include "RAJA/policy/cuda/atomic.hpp" | ||
#elif defined RAJA_ENABLE_HIP | ||
#include "RAJA/policy/hip.hpp" | ||
#include "RAJA/policy/hip/atomic.hpp" | ||
#endif | ||
|
||
#include "desul/atomics.hpp" | ||
#include "benchmark/benchmark.h" | ||
#include "RAJA/util/Timer.hpp" | ||
|
||
#include <type_traits> | ||
#include <iostream> | ||
|
||
// Number of iterations each benchmark kernel performs (one atomic op per
// iteration). NOTE(review): the diff showed N defined twice (100000000 and
// 1000000000); the commit's intent is the larger value, kept here once.
#define N 1000000000
// Default DESUL memory order/scope used by every DESUL atomic in this file.
using raja_default_desul_order = desul::MemoryOrderRelaxed;
using raja_default_desul_scope = desul::MemoryScopeDevice;
|
||
// Selects the GPU forall execution policy for whichever backend (CUDA or HIP)
// this build enables, parameterized on the thread-block size.
template<int BLOCK_SZ>
struct ExecPolicyGPU {
#if defined(RAJA_ENABLE_CUDA)
  using policy = RAJA::cuda_exec<BLOCK_SZ>;
#elif defined(RAJA_ENABLE_HIP)
  using policy = RAJA::hip_exec<BLOCK_SZ>;
#endif
};
|
||
// Selects the RAJA atomic policy matching the enabled GPU backend, so callers
// can name one policy type regardless of whether CUDA or HIP is active.
struct GPUAtomic {
#if defined(RAJA_ENABLE_CUDA)
  using policy = RAJA::policy::cuda::cuda_atomic;
#elif defined(RAJA_ENABLE_HIP)
  using policy = RAJA::policy::hip::hip_atomic;
#endif
};
|
||
// Desul atomics have a different signature than RAJA's built-in ops. The following code provides some
// helper function templates so that they can be called using the same signature in timing code.
|
||
// Struct holding Desul atomic signature typedef | ||
template<typename AtomicType> | ||
struct DesulAtomicSignature { | ||
using signature = int(*)(AtomicType*, const AtomicType, raja_default_desul_order, raja_default_desul_scope); | ||
using signature = AtomicType(*)(AtomicType*, const AtomicType, raja_default_desul_order, raja_default_desul_scope); | ||
}; | ||
|
||
// Struct holding RAJA atomic signature typedef | ||
// Struct holding the function-pointer signature of a RAJA built-in atomic op:
// value-returning, taking only the accumulator and the operand. (The stale
// int-returning typedef from the diff is dropped; the generalized one is kept.)
template<typename AtomicType>
struct RajaAtomicSignature {
    using signature = AtomicType(*)(AtomicType*, const AtomicType);
};
|
||
template<typename T, typename Policy> | ||
RAJA_HOST_DEVICE T atomicAddWrapperDesul(T * acc, T value) { | ||
return desul::atomic_fetch_add(acc, value, raja_default_desul_order{}, | ||
raja_default_desul_scope{}); | ||
/// RAJA::atomicAdd is overloaded and has an ambiguous type so it can't be passed as a template parameter. | ||
/// The following wrappers disambiguate the call and provide a signature comaptible with the DESUL | ||
/// wrapper. | ||
template<typename AtomicType> | ||
RAJA_HOST_DEVICE AtomicType GPUAtomicAdd(AtomicType* acc, const AtomicType val) { | ||
return RAJA::atomicAdd(GPUAtomic::policy {}, acc, val); | ||
} | ||
|
||
template<typename T, typename Policy> | ||
RAJA_HOST_DEVICE T atomicMaxWrappeDesul(T * acc, T value) { | ||
return desul::atomic_fetch_max(acc, value, raja_default_desul_order{}, | ||
raja_default_desul_scope{}); | ||
template<typename AtomicType> | ||
RAJA_HOST_DEVICE AtomicType GPUAtomicMax(AtomicType* acc, const AtomicType val) { | ||
return RAJA::atomicMax(GPUAtomic::policy {}, acc, val); | ||
} | ||
|
||
template<typename T, typename Policy> | ||
RAJA_HOST_DEVICE T atomicAddWrapper(T * acc, T value) { | ||
return RAJA::atomicAdd(Policy{}, acc, value); | ||
template<typename AtomicType> | ||
RAJA_HOST_DEVICE AtomicType OpenMPAtomicAdd(AtomicType* acc, const AtomicType val) { | ||
return RAJA::atomicAdd(RAJA::policy::omp::omp_atomic{}, acc, val); | ||
} | ||
|
||
template<typename T, typename Policy> | ||
RAJA_HOST_DEVICE T atomicMaxWrapper(T * acc, T value) { | ||
return RAJA::atomicMax(Policy{}, acc, value); | ||
/// Function template that allows invoking DESUL atomic with a (int*)(T*, T) signature | ||
template<typename T, typename Policy, typename DesulAtomicSignature<T>::signature AtomicImpl> | ||
RAJA_HOST_DEVICE T atomicWrapperDesul(T * acc, T value) { | ||
|
||
return AtomicImpl(acc, value, raja_default_desul_order{}, | ||
raja_default_desul_scope{}); | ||
} | ||
|
||
// Primary template: an execution policy is treated as host (CPU) unless a
// specialization below marks it as a GPU policy. (The stale IsCuda line left
// over from the rename in the diff is removed.)
template<typename>
struct IsGPU : public std::false_type {};
|
||
/// These helper templates are used to deduce if device memory allocations are necessary
/// inside the body of the benchmark, using the type of the execution policy.
/// (The deleted google-benchmark DesulAtomicOpLoopInt function that the diff
/// interleaved here is removed; only the specializations remain.)
#if defined(RAJA_ENABLE_CUDA)
template<int M>
struct IsGPU<RAJA::cuda_exec<M>> : public std::true_type {};
#elif defined(RAJA_ENABLE_HIP)
template<int M>
struct IsGPU<RAJA::hip_exec<M>> : public std::true_type {};
#endif
|
||
template <class ExecPolicy, typename AtomicType, typename RajaAtomicSignature<AtomicType>::signature AtomicImpl, bool test_array = false> | ||
void TimeAtomicOp(const std::string& test_name, int num_iterations = 2, int array_size = 100) { | ||
std::cout << "EXECUTING " << test_name << "; "; | ||
RAJA::Timer timer; | ||
|
||
template <class ExecPolicy, typename RajaAtomicSignature<int>::signature AtomicImpl> | ||
void AtomicOpLoopInt(benchmark::State& state) { | ||
for (auto _ : state) { | ||
int* value; | ||
int zero = 0; | ||
for (int i = 0; i < num_iterations; ++i) { | ||
AtomicType* device_value = nullptr; | ||
AtomicType zero = 0; | ||
int len_array = test_array ? array_size : 1; | ||
|
||
if (IsGPU<ExecPolicy>::value) { | ||
#if defined(RAJA_ENABLE_CUDA) | ||
if (IsCuda<ExecPolicy>::value) { | ||
cudaErrchk( | ||
cudaMallocManaged((void **)&value, sizeof(int), cudaMemAttachGlobal)); | ||
cudaMemset(value, 0, sizeof(int)); | ||
} | ||
#else | ||
value = &zero; | ||
cudaErrchk(cudaMalloc((void **)&device_value, len_array * sizeof(AtomicType))); | ||
cudaMemset(device_value, 0, len_array * sizeof(AtomicType)); | ||
#elif defined(RAJA_ENABLE_HIP) | ||
hipMalloc((void **)&device_value, len_array * sizeof(AtomicType)); | ||
hipMemset(device_value, 0, len_array * sizeof(AtomicType)); | ||
#endif | ||
} | ||
std::cout << "here 1\n"; | ||
timer.start(); | ||
RAJA::forall<ExecPolicy>(RAJA::RangeSegment(0, N), | ||
[=]RAJA_HOST_DEVICE(int) { | ||
AtomicImpl(value, 1); | ||
[=] RAJA_HOST_DEVICE(int tid) { | ||
printf("tid %d\n", tid); | ||
if (test_array) { | ||
AtomicImpl(&(device_value[tid % array_size]), 1); | ||
} else { | ||
AtomicImpl(device_value, 1); | ||
} | ||
}); | ||
assert(*value == N); | ||
std::cout << "here 2\n"; | ||
timer.stop(); | ||
if (IsGPU<ExecPolicy>::value) { | ||
#if defined(RAJA_ENABLE_CUDA) | ||
cudaErrchk(cudaFree((void *)device_value)); | ||
#elif defined(RAJA_ENABLE_HIP) | ||
hipFree((void *)device_value); | ||
#endif | ||
} | ||
std::cout << "here 3\n"; | ||
} | ||
|
||
double t = timer.elapsed(); | ||
std::cout << "ELAPSED TIME = " << t << std::endl; | ||
} | ||
|
||
BENCHMARK(DesulAtomicOpLoopInt<RAJA::omp_for_exec, desul::atomic_fetch_add>); | ||
BENCHMARK(AtomicOpLoopInt<RAJA::omp_for_exec, atomicAddWrapper<int, RAJA::policy::omp::omp_atomic>>); | ||
// CUDA addition | ||
BENCHMARK(DesulAtomicOpLoopInt<RAJA::cuda_exec<1024>, desul::atomic_fetch_add>); | ||
BENCHMARK(AtomicOpLoopInt<RAJA::cuda_exec<1024>, atomicAddWrapper<int, RAJA::policy::cuda::cuda_atomic>>); | ||
BENCHMARK(DesulAtomicOpLoopInt<RAJA::cuda_exec<512>, desul::atomic_fetch_add>); | ||
BENCHMARK(AtomicOpLoopInt<RAJA::cuda_exec<512>, atomicAddWrapper<int, RAJA::policy::cuda::cuda_atomic>>); | ||
BENCHMARK(DesulAtomicOpLoopInt<RAJA::cuda_exec<256>, desul::atomic_fetch_add>); | ||
BENCHMARK(AtomicOpLoopInt<RAJA::cuda_exec<256>, atomicAddWrapper<int, RAJA::policy::cuda::cuda_atomic>>); | ||
BENCHMARK(DesulAtomicOpLoopInt<RAJA::cuda_exec<128>, desul::atomic_fetch_add>); | ||
BENCHMARK(AtomicOpLoopInt<RAJA::cuda_exec<128>, atomicAddWrapper<int, RAJA::policy::cuda::cuda_atomic>>); | ||
BENCHMARK(DesulAtomicOpLoopInt<RAJA::cuda_exec<64>, desul::atomic_fetch_add>); | ||
BENCHMARK(AtomicOpLoopInt<RAJA::cuda_exec<64>, atomicAddWrapper<int, RAJA::policy::cuda::cuda_atomic>>); | ||
// CUDA max | ||
//BENCHMARK(DesulAtomicOpLoopInt<RAJA::cuda_exec<1024>, desul::atomic_fetch_max>); | ||
//BENCHMARK(AtomicOpLoopInt<RAJA::cuda_exec<1024>, atomicMaxWrapper<int, RAJA::policy::cuda::cuda_atomic>>); | ||
//BENCHMARK(DesulAtomicOpLoopInt<RAJA::cuda_exec<512>, desul::atomic_fetch_max>); | ||
//BENCHMARK(AtomicOpLoopInt<RAJA::cuda_exec<512>, atomicMaxWrapper<int, RAJA::policy::cuda::cuda_atomic>>); | ||
//BENCHMARK(DesulAtomicOpLoopInt<RAJA::cuda_exec<256>, desul::atomic_fetch_max>); | ||
//BENCHMARK(AtomicOpLoopInt<RAJA::cuda_exec<256>, atomicMaxWrapper<int, RAJA::policy::cuda::cuda_atomic>>); | ||
//BENCHMARK(DesulAtomicOpLoopInt<RAJA::cuda_exec<128>, desul::atomic_fetch_max>); | ||
//BENCHMARK(AtomicOpLoopInt<RAJA::cuda_exec<128>, atomicMaxWrapper<int, RAJA::policy::cuda::cuda_atomic>>); | ||
//BENCHMARK(DesulAtomicOpLoopInt<RAJA::cuda_exec<64>, desul::atomic_fetch_max>); | ||
//BENCHMARK(AtomicOpLoopInt<RAJA::cuda_exec<64>, atomicMaxWrapper<int, RAJA::policy::cuda::cuda_atomic>>); | ||
|
||
BENCHMARK_MAIN(); | ||
int main () { | ||
// GPU benchmarks | ||
TimeAtomicOp<ExecPolicyGPU<32>::policy, int, GPUAtomicAdd<int>, true>("Benchmark array contention. CUDA Block size 32, RAJA builtin atomic", 4); | ||
TimeAtomicOp<ExecPolicyGPU<32>::policy, int, atomicWrapperDesul<int, typename GPUAtomic::policy, desul::atomic_fetch_add>, true>("Benchmark array contention. CUDA Block size 32, DESUL atomic", 4); | ||
TimeAtomicOp<ExecPolicyGPU<64>::policy, int, GPUAtomicAdd<int>, true>("Benchmark array contention. CUDA Block size 64, RAJA builtin atomic", 4); | ||
TimeAtomicOp<ExecPolicyGPU<64>::policy, int, atomicWrapperDesul<int, typename GPUAtomic::policy, desul::atomic_fetch_add>, true>("Benchmark array contention. CUDA Block size 64, DESUL atomic", 4); | ||
TimeAtomicOp<ExecPolicyGPU<128>::policy, int, GPUAtomicAdd<int>, true>("Benchmark array contention. CUDA Block size 128, RAJA builtin atomic", 4); | ||
TimeAtomicOp<ExecPolicyGPU<128>::policy, int, atomicWrapperDesul<int, typename GPUAtomic::policy, desul::atomic_fetch_add>, true>("Benchmark array contention. CUDA Block size 128, DESUL atomic", 4); | ||
TimeAtomicOp<ExecPolicyGPU<256>::policy, int, GPUAtomicAdd<int>, true>("Benchmark array contention. CUDA Block size 256, RAJA builtin atomic", 4); | ||
TimeAtomicOp<ExecPolicyGPU<256>::policy, int, atomicWrapperDesul<int, typename GPUAtomic::policy, desul::atomic_fetch_add>, true>("Benchmark array contention. CUDA Block size 256, DESUL atomic", 4); | ||
|
||
TimeAtomicOp<ExecPolicyGPU<128>::policy, int, GPUAtomicAdd<int>, true>("Benchmark array contention. CUDA Block size 128, RAJA builtin atomic", 2, 10); | ||
TimeAtomicOp<ExecPolicyGPU<128>::policy, int, atomicWrapperDesul<int, typename GPUAtomic::policy, desul::atomic_fetch_add>, true>("Benchmark array contention. CUDA Block size 128, DESUL atomic", 2, 10); | ||
TimeAtomicOp<ExecPolicyGPU<256>::policy, int, GPUAtomicAdd<int>, true>("Benchmark array contention. CUDA Block size 256, RAJA builtin atomic", 2, 10); | ||
TimeAtomicOp<ExecPolicyGPU<256>::policy, int, atomicWrapperDesul<int, typename GPUAtomic::policy, desul::atomic_fetch_add>, true>("Benchmark array contention. CUDA Block size 256, DESUL atomic", 2, 10); | ||
|
||
TimeAtomicOp<ExecPolicyGPU<128>::policy, double, GPUAtomicAdd<double>>("CUDA Block size 128, RAJA builtin atomic"); | ||
TimeAtomicOp<ExecPolicyGPU<128>::policy, double, atomicWrapperDesul<double, typename GPUAtomic::policy, desul::atomic_fetch_add>>("CUDA Block size 128, DESUL atomic"); | ||
TimeAtomicOp<ExecPolicyGPU<256>::policy, double, GPUAtomicAdd<double>>("CUDA Block size 256, RAJA builtin atomic"); | ||
TimeAtomicOp<ExecPolicyGPU<256>::policy, double, atomicWrapperDesul<double, typename GPUAtomic::policy, desul::atomic_fetch_add>>("CUDA Block size 256, DESUL atomic"); | ||
|
||
TimeAtomicOp<ExecPolicyGPU<128>::policy, int, GPUAtomicMax<int>>("CUDA Block size 128, RAJA builtin atomic max"); | ||
TimeAtomicOp<ExecPolicyGPU<128>::policy, int, atomicWrapperDesul<int, typename GPUAtomic::policy, desul::atomic_fetch_max>>("CUDA Block size 128, DESUL atomic max"); | ||
TimeAtomicOp<ExecPolicyGPU<256>::policy, int, GPUAtomicMax<int>>("CUDA Block size 256, RAJA builtin atomic max"); | ||
TimeAtomicOp<ExecPolicyGPU<256>::policy, int, atomicWrapperDesul<int, typename GPUAtomic::policy, desul::atomic_fetch_max>>("CUDA Block size 256, DESUL atomic max"); | ||
// OpenMP benchmarks | ||
TimeAtomicOp<RAJA::omp_for_exec, int, OpenMPAtomicAdd<int>>("OpenMP, int, RAJA builtin atomic"); | ||
TimeAtomicOp<RAJA::omp_for_exec, int, atomicWrapperDesul<int, RAJA::policy::omp::omp_atomic, desul::atomic_fetch_add>>("OpenMP, desul atomic"); | ||
} |