reformat benchmark printing and cleanup conditional compilation
johnbowen42 committed Apr 23, 2024
1 parent 7bcbe68 commit c4a9582
Showing 1 changed file with 130 additions and 82 deletions.
benchmark/benchmark-atomic.cpp
@@ -11,41 +11,90 @@
#include "RAJA/policy/openmp/atomic.hpp"
#include "RAJA/RAJA.hpp"
#include "RAJA/policy/openmp/policy.hpp"
#include <type_traits>
#include <iostream>
#include <sstream>

/// This helper template is used to deduce if device memory allocations are necessary
/// inside the body of the benchmark, using the type of the execution policy.
template<typename>
struct IsGPU : public std::false_type {};
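// [Sketch, not part of this commit] The primary template above reports false
// for host policies; the CUDA/HIP blocks below specialize it to true for
// device execution policies. RAJA::seq_exec is assumed here purely as a
// representative host policy.
static_assert(!IsGPU<RAJA::seq_exec>::value,
"host policies take the new[]/delete[] path in TimeAtomicOp");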

#if defined RAJA_ENABLE_CUDA
#include "RAJA/policy/cuda.hpp"
#include "RAJA/policy/cuda/atomic.hpp"

template<int M>
struct IsGPU<RAJA::cuda_exec<M>> : public std::true_type {};

template<int BLOCK_SZ>
struct ExecPolicyGPU {
using policy = RAJA::cuda_exec<BLOCK_SZ>;
std::string PolicyName() {
std::stringstream ss;
ss << "CUDA execution with block size " << BLOCK_SZ;
return ss.str();
}
};

struct GPUAtomic {
using policy = RAJA::policy::cuda::cuda_atomic;
};

template<typename AtomicType>
void AllocateAtomicDevice(AtomicType** atomic, int array_length) {
cudaErrchk(cudaMalloc((void **)atomic, array_length * sizeof(AtomicType)));
cudaErrchk(cudaMemset(*atomic, 0, array_length * sizeof(AtomicType)));
}

template<typename AtomicType>
void DeallocateDeviceAtomic(AtomicType* atomic) {
cudaErrchk(cudaFree((void *)atomic));
}

#elif defined RAJA_ENABLE_HIP
#include "RAJA/policy/hip.hpp"
#include "RAJA/policy/hip/atomic.hpp"

template<int M>
struct IsGPU<RAJA::hip_exec<M>> : public std::true_type {};

template<int BLOCK_SZ>
struct ExecPolicyGPU {
using policy = RAJA::hip_exec<BLOCK_SZ>;
std::string PolicyName() {
std::stringstream ss;
ss << "HIP execution with block size " << BLOCK_SZ;
return ss.str();
}
};

struct GPUAtomic {
using policy = RAJA::policy::hip::hip_atomic;
};

template<typename AtomicType>
void AllocateAtomicDevice(AtomicType** atomic, int array_length) {
hipMalloc((void **)atomic, array_length * sizeof(AtomicType));
hipMemset(*atomic, 0, array_length * sizeof(AtomicType));
}

template<typename AtomicType>
void DeallocateDeviceAtomic(AtomicType* atomic) {
hipFree((void *)atomic);
}

#endif

#include "desul/atomics.hpp"
#include "RAJA/util/Timer.hpp"

#define N 1000000000
#define INDENT " "
using raja_default_desul_order = desul::MemoryOrderRelaxed;
using raja_default_desul_scope = desul::MemoryScopeDevice;
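// [Sketch, not part of this commit] desul atomics take the memory order and
// scope as explicit trailing arguments; the two aliases above pin them to
// relaxed/device, and atomicWrapperDesul below forwards them on every call.
// A direct desul call with these defaults looks like:
inline void desul_add_example(int* counter) {
desul::atomic_fetch_add(counter, 1,
raja_default_desul_order{}, // desul::MemoryOrderRelaxed
raja_default_desul_scope{}); // desul::MemoryScopeDevice
}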

// Desul atomics have a different signature than RAJA's built-in ops. The following code provides some
// helper function templates so that they can be called using the same signature in timing code.

@@ -64,58 +113,46 @@ struct RajaAtomicSignature {
/// RAJA::atomicAdd is overloaded and has an ambiguous type so it can't be passed as a template parameter.
/// The following wrappers disambiguate the call and provide a signature compatible with the DESUL
/// wrapper.
template<typename AtomicType, typename Policy>
RAJA_HOST_DEVICE AtomicType AtomicAdd(AtomicType* acc, const AtomicType val) {
return RAJA::atomicAdd(Policy {}, acc, val);
}

template<typename AtomicType, typename Policy>
RAJA_HOST_DEVICE AtomicType AtomicMax(AtomicType* acc, const AtomicType val) {
return RAJA::atomicMax(Policy {}, acc, val);
}
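// [Sketch, not part of this commit] Each wrapper instantiation names exactly
// one function, so it converts to a plain function pointer and can bind to
// the non-type template parameter of TimeAtomicOp below; the overloaded
// RAJA::atomicAdd itself cannot be passed that way.
inline void wrapper_signature_example() {
using AddFn = int (*)(int*, const int);
AddFn omp_add = AtomicAdd<int, RAJA::policy::omp::omp_atomic>;
(void)omp_add; // silence unused-variable warnings
}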

/// Function template that allows invoking a DESUL atomic through the plain T(T*, T) signature.
template<typename T, typename Policy, typename DesulAtomicSignature<T>::signature AtomicImpl>
RAJA_HOST_DEVICE T atomicWrapperDesul(T * acc, T value) {
return AtomicImpl(acc, value, raja_default_desul_order{},
raja_default_desul_scope{});
}
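// [Sketch, not part of this commit] After wrapping, a RAJA built-in op and a
// desul op expose the same RajaAtomicSignature<T>::signature (assumed from
// its use in TimeAtomicOp to be T (*)(T*, const T)), so the timing code can
// take either one as the same template argument:
inline void interchangeable_ops_example() {
RajaAtomicSignature<int>::signature builtin =
AtomicAdd<int, RAJA::policy::omp::omp_atomic>;
RajaAtomicSignature<int>::signature desul_op =
atomicWrapperDesul<int, RAJA::policy::omp::omp_atomic, desul::atomic_fetch_add>;
(void)builtin; (void)desul_op;
}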

//template<typename T, typename RajaAtomicSignature<T>::signature atomic>
//class IsDesul : public std::false_type {};
//
//template<typename T, typename Policy, typename DesulAtomicSignature<T>::signature AtomicImpl>
//class IsDesul<atomicWrapperDesul<T, Policy, AtomicImpl>> : public std::true_type {};

template<typename AtomicType, typename Policy>
std::string GetImplName (typename DesulAtomicSignature<AtomicType>::signature) {
return "Desul atomic";
}

template <class ExecPolicy, typename AtomicType, typename RajaAtomicSignature<AtomicType>::signature AtomicImpl, bool test_array = false>
void TimeAtomicOp(int num_iterations = 2, int array_size = 100) {
RAJA::Timer timer;

for (int i = 0; i < num_iterations; ++i) {
AtomicType* device_value = nullptr;
int len_array = test_array ? array_size : 1;
if (IsGPU<ExecPolicy>::value) {
AllocateAtomicDevice(&device_value, len_array);
} else {
device_value = new AtomicType [len_array];
}
timer.start();
RAJA::forall<ExecPolicy>(RAJA::RangeSegment(0, N),
@@ -129,47 +166,58 @@ void TimeAtomicOp(int num_iterations = 2, int array_size = 100) {

timer.stop();
if (IsGPU<ExecPolicy>::value) {
DeallocateDeviceAtomic(device_value);
} else {
delete[] device_value; // array form to match new AtomicType[len_array]
}

}

double t = timer.elapsed();
std::cout << "ELAPSED TIME = " << t << std::endl;
std::cout << INDENT << INDENT << t << "s" << INDENT;
//std::cout << GetImplName(AtomicImpl) << ", ";
std::cout << "Number of atomics under contention " << array_size << ", ";
std::cout << static_cast<long long>(num_iterations) * N << " atomic operations" << ", "; // widened: num_iterations * N overflows int
//std::cout << ExecPolicy::PolicyName();
std::cout << std::endl;
}
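// [Sketch, not part of this commit] An annotated form of the calls made in
// main below; names and argument values mirror the code in this file.
inline void example_invocation() {
TimeAtomicOp<ExecPolicyGPU<128>::policy, // ExecPolicy: where the forall runs
int, // AtomicType: contended value type
AtomicAdd<int, GPUAtomic::policy>, // AtomicImpl: wrapped built-in add
true> // test_array: contend on an array of slots
(4, // num_iterations
10); // array_size
}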

int main () {
// GPU benchmarks
std::cout << "Executing CUDA benchmarks" << std::endl;
std::cout << INDENT << "Executing atomic add benchmarks" << std::endl;
TimeAtomicOp<ExecPolicyGPU<64>::policy, int, AtomicAdd<int, typename GPUAtomic::policy>, true>(4);
TimeAtomicOp<ExecPolicyGPU<64>::policy, int, atomicWrapperDesul<int, typename GPUAtomic::policy, desul::atomic_fetch_add>, true>(4);
TimeAtomicOp<ExecPolicyGPU<128>::policy, int, AtomicAdd<int, typename GPUAtomic::policy>, true>(4);
TimeAtomicOp<ExecPolicyGPU<128>::policy, int, atomicWrapperDesul<int, typename GPUAtomic::policy, desul::atomic_fetch_add>, true>(4);
TimeAtomicOp<ExecPolicyGPU<256>::policy, int, AtomicAdd<int, typename GPUAtomic::policy>, true>(4);
TimeAtomicOp<ExecPolicyGPU<256>::policy, int, atomicWrapperDesul<int, typename GPUAtomic::policy, desul::atomic_fetch_add>, true>(4);

TimeAtomicOp<ExecPolicyGPU<128>::policy, int, AtomicAdd<int, typename GPUAtomic::policy>, true>(4, 10);
TimeAtomicOp<ExecPolicyGPU<128>::policy, int, atomicWrapperDesul<int, typename GPUAtomic::policy, desul::atomic_fetch_add>, true>(4, 10);
TimeAtomicOp<ExecPolicyGPU<256>::policy, int, AtomicAdd<int, typename GPUAtomic::policy>, true>(4, 10);
TimeAtomicOp<ExecPolicyGPU<256>::policy, int, atomicWrapperDesul<int, typename GPUAtomic::policy, desul::atomic_fetch_add>, true>(4, 10);

std::cout << INDENT << "Executing atomic add benchmarks" << std::endl;

TimeAtomicOp<ExecPolicyGPU<128>::policy, double, AtomicAdd<double, typename GPUAtomic::policy>>();
TimeAtomicOp<ExecPolicyGPU<128>::policy, double, atomicWrapperDesul<double, typename GPUAtomic::policy, desul::atomic_fetch_add>>();
TimeAtomicOp<ExecPolicyGPU<256>::policy, double, AtomicAdd<double, typename GPUAtomic::policy>>();
TimeAtomicOp<ExecPolicyGPU<256>::policy, double, atomicWrapperDesul<double, typename GPUAtomic::policy, desul::atomic_fetch_add>>();

std::cout << INDENT << "Executing atomic max benchmarks" << std::endl;

TimeAtomicOp<ExecPolicyGPU<128>::policy, int, AtomicMax<int, GPUAtomic::policy>>();
TimeAtomicOp<ExecPolicyGPU<128>::policy, int, atomicWrapperDesul<int, typename GPUAtomic::policy, desul::atomic_fetch_max>>();
TimeAtomicOp<ExecPolicyGPU<256>::policy, int, AtomicMax<int, GPUAtomic::policy>>();
TimeAtomicOp<ExecPolicyGPU<256>::policy, int, atomicWrapperDesul<int, typename GPUAtomic::policy, desul::atomic_fetch_max>>();
// OpenMP benchmarks
std::cout << "Executing OpenMP benchmarks" << std::endl;
std::cout << INDENT << "Executing atomic add benchmarks" << std::endl;
TimeAtomicOp<RAJA::omp_for_exec, int, AtomicAdd<int, RAJA::policy::omp::omp_atomic>>();
TimeAtomicOp<RAJA::omp_for_exec, int, atomicWrapperDesul<int, RAJA::policy::omp::omp_atomic, desul::atomic_fetch_add>>();

return 0;
}

