Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bench/bowen/desul raja atomics #1624

Open
wants to merge 8 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
3 changes: 2 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ if (RAJA_ENABLE_CUDA)
message(STATUS "CUDA compute architecture set to RAJA default 35 since it was not specified")
set(CMAKE_CUDA_ARCHITECTURES "35" CACHE STRING "Set CMAKE_CUDA_ARCHITECTURES to RAJA minimum supported" FORCE)
endif()
message(STATUS "CMAKE_CUDA_ARCHITECTURES set to ${CMAKE_CUDA_ARCHITECTURES}")
message(STATUS "CMAKE_CUDA_ARCHITECTURES set to ${CMAKE_CUDA_ARCHITECTURES}")
if ( (CMAKE_CXX_COMPILER_ID MATCHES GNU) AND (CMAKE_SYSTEM_PROCESSOR MATCHES ppc64le) )
if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 8.0)
set (CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler -mno-float128")
Expand Down Expand Up @@ -225,6 +225,7 @@ if (RAJA_ENABLE_SYCL)
sycl)
endif ()


message(STATUS "Desul Atomics support is ${RAJA_ENABLE_DESUL_ATOMICS}")
if (RAJA_ENABLE_DESUL_ATOMICS)
# NOTE: ENABLE_OPENMP in Desul implies OpenMP OFFLOAD
Expand Down
4 changes: 4 additions & 0 deletions benchmark/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,10 @@ if (RAJA_ENABLE_CUDA)
SOURCES host-device-lambda-benchmark.cpp)
endif()

# Micro-benchmark comparing RAJA builtin atomics against desul atomics
# (see benchmark-atomic.cpp); built unconditionally, unlike the CUDA-only
# benchmark above.
raja_add_benchmark(
NAME benchmark-atomic
SOURCES benchmark-atomic.cpp)

raja_add_benchmark(
NAME ltimes
SOURCES ltimes.cpp)
223 changes: 223 additions & 0 deletions benchmark/benchmark-atomic.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,223 @@
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC
// and RAJA project contributors. See the RAJA/LICENSE file for details.
//
// SPDX-License-Identifier: (BSD-3-Clause)
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//

// NOTE(review): this comment claims RAJA/RAJA.hpp cannot be included here
// (including it first would default all atomic implementations to a desul
// backend), yet RAJA/RAJA.hpp IS included a few lines below — confirm the
// intended include order and whether the non-desul atomics remain selected.
#include "RAJA/policy/loop/policy.hpp"
#include "RAJA/policy/openmp/atomic.hpp"
#include "RAJA/RAJA.hpp"
#include "RAJA/policy/openmp/policy.hpp"
// Conditional compilation for CUDA benchmarks.
#include <type_traits>
#include <iostream>
#include <sstream>

/// This helper template is used to deduce if device memory allocations are necessary
/// inside the body of the benchmark, using the type of the execution policy.
/// Primary template: an execution policy is host-side unless a GPU
/// backend specializes this trait to std::true_type below.
template<typename ExecPol>
struct IsGPU : std::false_type {};

#if defined RAJA_ENABLE_CUDA
#include "RAJA/policy/cuda.hpp"
#include "RAJA/policy/cuda/atomic.hpp"

/// Specialization: any CUDA execution policy is a GPU policy, so the
/// benchmark allocates its atomic targets in device memory.
template<int BLOCK_SZ>
struct IsGPU<RAJA::cuda_exec<BLOCK_SZ>> : std::true_type {};

template<int BLOCK_SZ>
struct ExecPolicyGPU {
using policy = RAJA::cuda_exec<BLOCK_SZ>;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a synchronous policy.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
using policy = RAJA::cuda_exec<BLOCK_SZ>;
using policy = RAJA::cuda_exec<BLOCK_SZ, true /*asynchronous*/>;

By default, RAJA exec policies are synchronous. To make them asynchronous, you add a template parameter with a bool true value.

/// Human-readable description of this execution policy, intended for
/// labeling benchmark output.
std::string PolicyName() {
std::stringstream ss;
ss << "CUDA execution with block size " << BLOCK_SZ;
return ss.str();
}
};

/// Tag selecting RAJA's builtin CUDA atomic implementation, paired with
/// the CUDA execution policies above.
struct GPUAtomic {
using policy = RAJA::policy::cuda::cuda_atomic;
};

/// Allocate and zero-initialize `array_length` atomic counters in CUDA
/// device memory.
/// @param atomic       out-param receiving the device pointer
/// @param array_length number of AtomicType elements to allocate
template<typename AtomicType>
void AllocateAtomicDevice(AtomicType** atomic, int array_length) {
cudaErrchk(cudaMalloc((void **)atomic, array_length * sizeof(AtomicType)));
// Check the memset as well: previously only the malloc was wrapped in
// cudaErrchk, so a failed zero-fill would be silently ignored.
cudaErrchk(cudaMemset(*atomic, 0, array_length * sizeof(AtomicType)));
}

/// Release a device allocation produced by AllocateAtomicDevice.
template<typename AtomicType>
void DeallocateDeviceAtomic(AtomicType* atomic) {
cudaErrchk(cudaFree((void *)atomic));
}

#elif defined RAJA_ENABLE_HIP
#include "RAJA/policy/hip.hpp"
#include "RAJA/policy/hip/atomic.hpp"

/// Specialization: any HIP execution policy is a GPU policy, so the
/// benchmark allocates its atomic targets in device memory.
template<int BLOCK_SZ>
struct IsGPU<RAJA::hip_exec<BLOCK_SZ>> : std::true_type {};

/// Maps a compile-time block size to a HIP execution policy and provides a
/// human-readable description for benchmark output.
template<int BLOCK_SZ>
struct ExecPolicyGPU {
using policy = RAJA::hip_exec<BLOCK_SZ>;
std::string PolicyName() {
std::stringstream ss;
// Bug fix: this is the HIP branch — the message previously said
// "CUDA execution" (copy-paste from the CUDA branch above).
ss << "HIP execution with block size " << BLOCK_SZ;
return ss.str();
}
};

/// Tag selecting RAJA's builtin HIP atomic implementation, paired with
/// the HIP execution policies above.
struct GPUAtomic {
using policy = RAJA::policy::hip::hip_atomic;
};

/// Allocate and zero-initialize `array_length` atomic counters in HIP
/// device memory.
/// @param atomic       out-param receiving the device pointer
/// @param array_length number of AtomicType elements to allocate
template<typename AtomicType>
void AllocateAtomicDevice(AtomicType** atomic, int array_length) {
// Bug fix: the original referenced `len_array`, which is not declared in
// this scope (the parameter is `array_length`) and fails to compile.
hipMalloc((void **)atomic, array_length * sizeof(AtomicType));
hipMemset(*atomic, 0, array_length * sizeof(AtomicType));
}

/// Release a device allocation produced by AllocateAtomicDevice.
template<typename AtomicType>
void DeallocateDeviceAtomic(AtomicType* atomic) {
hipFree((void *)atomic);
}

#endif

#include "desul/atomics.hpp"
#include "RAJA/util/Timer.hpp"

#define N 1000000000
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think N should be a command line arg to select at run time.

#define INDENT " "
using raja_default_desul_order = desul::MemoryOrderRelaxed;
using raja_default_desul_scope = desul::MemoryScopeDevice;

// Desul atomics have a different signature than RAJA's built in ops. The following code provides some
// helper function templates so that they can be called using the same signature in timing code.

// Function-pointer type of a desul atomic free function operating on
// AtomicType, with the memory-order and memory-scope arguments fixed to the
// raja_default_desul_* aliases above.
template<typename AtomicType>
struct DesulAtomicSignature {
using signature = AtomicType(*)(AtomicType*, const AtomicType, raja_default_desul_order, raja_default_desul_scope);
};

// Function-pointer type matching the RAJA-style atomic wrappers used by the
// timing code: AtomicType (*)(AtomicType*, const AtomicType).
template<typename AtomicType>
struct RajaAtomicSignature {
using signature = AtomicType(*)(AtomicType*, const AtomicType);
};

/// RAJA::atomicAdd is overloaded and has an ambiguous type so it can't be passed as a template parameter.
/// The following wrappers disambiguate the call and provide a signature compatible with the DESUL
/// wrapper.
/// Pins overloaded RAJA::atomicAdd to the (T*, T) signature expected by the
/// timing code; the Policy tag chooses the backend implementation.
template<typename AtomicType, typename Policy>
RAJA_HOST_DEVICE AtomicType AtomicAdd(AtomicType* target, const AtomicType increment) {
Policy atomic_policy {};
return RAJA::atomicAdd(atomic_policy, target, increment);
}

/// Pins overloaded RAJA::atomicMax to the (T*, T) signature expected by the
/// timing code; the Policy tag chooses the backend implementation.
template<typename AtomicType, typename Policy>
RAJA_HOST_DEVICE AtomicType AtomicMax(AtomicType* target, const AtomicType candidate) {
Policy atomic_policy {};
return RAJA::atomicMax(atomic_policy, target, candidate);
}

/// Adapter giving a desul atomic free function the same (T*, T) call shape
/// as the RAJA wrappers above, by supplying the default memory-order and
/// memory-scope arguments.
template<typename T, typename Policy, typename DesulAtomicSignature<T>::signature AtomicImpl>
RAJA_HOST_DEVICE T atomicWrapperDesul(T * acc, T value) {
const raja_default_desul_order order {};
const raja_default_desul_scope scope {};
return AtomicImpl(acc, value, order, scope);
}

//template<typename T, typename RajaAtomicSignature<T>::signature atomic>
//class IsDesul : public std::false_type {};
//
//template<typename T, typename Policy, typename DesulAtomicSignature<T>::signature AtomicImpl>
//class IsDesul<atomicWrapperDesul<T, Policy, AtomicImpl>> : public std::true_type {};


/// Label for an implementation matching the desul function-pointer
/// signature; currently only referenced from commented-out output code.
template<typename AtomicType, typename Policy>
std::string GetImplName (typename DesulAtomicSignature<AtomicType>::signature) {
return std::string{"Desul atomic"};
}

template <class ExecPolicy, typename AtomicType, typename RajaAtomicSignature<AtomicType>::signature AtomicImpl, bool test_array = false>
void TimeAtomicOp(int num_iterations = 2, int array_size = 100) {
RAJA::Timer timer;

for (int i = 0; i < num_iterations; ++i) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It might be good to not time the first run of the kernel. Then time multiple iterations of the loop, running asynchronously for gpus, instead of timing each loop individually.

// Per-iteration setup: one counter, or `array_size` contended counters.
AtomicType* device_value = nullptr;
int len_array = test_array ? array_size : 1;
if (IsGPU<ExecPolicy>::value) {
AllocateAtomicDevice(&device_value, len_array);
} else {
// NOTE(review): the host path allocates with new[] but never zeroes the
// memory, unlike the GPU path (memset to 0) — counters start with
// indeterminate values. Confirm whether results are read anywhere.
device_value = new AtomicType [len_array];
}
// Timed region: N atomic operations spread across the index range; when
// test_array is set, contention is spread over array_size slots.
timer.start();
RAJA::forall<ExecPolicy>(RAJA::RangeSegment(0, N),
[=] RAJA_HOST_DEVICE(int tid) {
if (test_array) {
AtomicImpl(&(device_value[tid % array_size]), 1);
} else {
AtomicImpl(device_value, 1);
}
});

timer.stop();
if (IsGPU<ExecPolicy>::value) {
DeallocateDeviceAtomic(device_value);
} else {
// NOTE(review): allocated with `new AtomicType[...]` — this must be
// `delete[]`; plain `delete` on an array allocation is undefined behavior.
delete device_value;
}

}

// The timer is never reset inside the loop, so `t` is the cumulative time
// over all num_iterations runs, not a per-iteration time.
double t = timer.elapsed();
std::cout << INDENT << INDENT << t << "s" << INDENT;
//std::cout << GetImplName(AtomicImpl) << ", ";
std::cout << "Number of atomics under contention " << array_size << ", ";
std::cout << num_iterations * N << " many atomic operations" << ", ";
//std::cout << ExecPolicy::PolicyName();
std::cout << std::endl;
}

int main () {
// GPU benchmarks
std::cout << "Executing CUDA benchmarks" << std::endl;
std::cout << INDENT << "Executing atomic add benchmarks" << std::endl;
TimeAtomicOp<ExecPolicyGPU<64>::policy, int, AtomicAdd<int, typename GPUAtomic::policy>, true>(4);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It may be cleaner and easier to work with to set a constexpr thread block size variable to a default value, such as 256, at the top of the file. I think we agreed in a recent group meeting that thread block size doesn't have a significant performance impact for the simple kernels in this file. @MrBurmark what do you think?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That sounds good to me, then we could vary it fairly easily if we wanted to.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

And we could follow the pattern in RAJA Perf if we wanted to try block size sweeps.

Copy link
Member

@MrBurmark MrBurmark May 1, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I added a for_each_type function into RAJA for just these kind of use cases https://github.com/LLNL/RAJA/blob/develop/include/RAJA/util/for_each.hpp#L88

// Desul counterpart of the builtin-add benchmark above (block size 64,
// contended int array).
TimeAtomicOp<ExecPolicyGPU<64>::policy, int, atomicWrapperDesul<int, typename GPUAtomic::policy, desul::atomic_fetch_add>, true>(4);
// Builtin vs desul int adds at larger block sizes, default 100 contended slots.
TimeAtomicOp<ExecPolicyGPU<128>::policy, int, AtomicAdd<int, typename GPUAtomic::policy>, true>(4);
TimeAtomicOp<ExecPolicyGPU<128>::policy, int, atomicWrapperDesul<int, typename GPUAtomic::policy, desul::atomic_fetch_add>, true>(4);
TimeAtomicOp<ExecPolicyGPU<256>::policy, int, AtomicAdd<int, typename GPUAtomic::policy>, true>(4);
TimeAtomicOp<ExecPolicyGPU<256>::policy, int, atomicWrapperDesul<int, typename GPUAtomic::policy, desul::atomic_fetch_add>, true>(4);

// Same comparison under heavier contention: only 10 array slots.
TimeAtomicOp<ExecPolicyGPU<128>::policy, int, AtomicAdd<int, typename GPUAtomic::policy>, true>(4, 10);
TimeAtomicOp<ExecPolicyGPU<128>::policy, int, atomicWrapperDesul<int, typename GPUAtomic::policy, desul::atomic_fetch_add>, true>(4, 10);
TimeAtomicOp<ExecPolicyGPU<256>::policy, int, AtomicAdd<int, typename GPUAtomic::policy>, true>(4, 10);
TimeAtomicOp<ExecPolicyGPU<256>::policy, int, atomicWrapperDesul<int, typename GPUAtomic::policy, desul::atomic_fetch_add>, true>(4, 10);

// Double-precision adds on a single counter (maximum contention).
// NOTE(review): this header repeats "atomic add benchmarks" — consider
// mentioning "double" to distinguish it from the int section above.
std::cout << INDENT << "Executing atomic add benchmarks" << std::endl;

TimeAtomicOp<ExecPolicyGPU<128>::policy, double, AtomicAdd<double, typename GPUAtomic::policy>>();
TimeAtomicOp<ExecPolicyGPU<128>::policy, double, atomicWrapperDesul<double, typename GPUAtomic::policy, desul::atomic_fetch_add>>();
TimeAtomicOp<ExecPolicyGPU<256>::policy, double, AtomicAdd<double, typename GPUAtomic::policy>>();
TimeAtomicOp<ExecPolicyGPU<256>::policy, double, atomicWrapperDesul<double, typename GPUAtomic::policy, desul::atomic_fetch_add>>();

// Int max on a single counter: builtin vs desul.
std::cout << INDENT << "Executing atomic max benchmarks" << std::endl;

TimeAtomicOp<ExecPolicyGPU<128>::policy, int, AtomicMax<int, GPUAtomic::policy>>();
TimeAtomicOp<ExecPolicyGPU<128>::policy, int, atomicWrapperDesul<int, typename GPUAtomic::policy, desul::atomic_fetch_max>>();
TimeAtomicOp<ExecPolicyGPU<256>::policy, int, AtomicMax<int, GPUAtomic::policy>>();
TimeAtomicOp<ExecPolicyGPU<256>::policy, int, atomicWrapperDesul<int, typename GPUAtomic::policy, desul::atomic_fetch_max>>();
// OpenMP benchmarks
std::cout << "Executing OpenMP benchmarks" << std::endl;
std::cout << INDENT << "Executing atomic add benchmarks" << std::endl;
TimeAtomicOp<RAJA::omp_for_exec, int, AtomicAdd<int, RAJA::policy::omp::omp_atomic>>();
TimeAtomicOp<RAJA::omp_for_exec, int, atomicWrapperDesul<int, RAJA::policy::omp::omp_atomic, desul::atomic_fetch_add>>();

return 0;
}