Skip to content

Commit

Permalink
Converting some runtime benchmarks to use our C API. (#17336)
Browse files Browse the repository at this point in the history
The API is incomplete and only works from C++ today, but it allows us to
hide the Google Benchmark dependency behind the iree::testing::benchmark
library. Some Bazel/CMake configuration could be used to switch off the
Google Benchmark library and turn it into a no-op (until we have an
embedded-friendly implementation), but that's left as future work.

This should make it much easier to solve what #16110 was trying to do.
  • Loading branch information
benvanik committed May 10, 2024
1 parent 0568bd2 commit 0b8b13c
Show file tree
Hide file tree
Showing 15 changed files with 363 additions and 167 deletions.
2 changes: 1 addition & 1 deletion runtime/src/iree/base/internal/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -199,8 +199,8 @@ cc_binary_benchmark(
deps = [
":fpu_state",
"//runtime/src/iree/base",
"//runtime/src/iree/testing:benchmark",
"//runtime/src/iree/testing:benchmark_main",
"@com_google_benchmark//:benchmark",
],
)

Expand Down
2 changes: 1 addition & 1 deletion runtime/src/iree/base/internal/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -206,8 +206,8 @@ iree_cc_binary_benchmark(
"fpu_state_benchmark.cc"
DEPS
::fpu_state
benchmark
iree::base
iree::testing::benchmark
iree::testing::benchmark_main
TESTONLY
)
Expand Down
96 changes: 50 additions & 46 deletions runtime/src/iree/base/internal/fpu_state_benchmark.cc
Original file line number Diff line number Diff line change
Expand Up @@ -4,30 +4,28 @@
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#include <cstddef>
#include <stddef.h>

#include "benchmark/benchmark.h"
#include "iree/base/api.h"
#include "iree/base/internal/fpu_state.h"
#include "iree/testing/benchmark.h"

namespace {

constexpr size_t kElementBufferSize = 2048;
#define ELEMENT_BUFFER_SIZE 2048

// Scales a buffer of floats by |scale| and disables autovectorization.
// Will generally be normal scalar floating point math and indicate whether the
// FPU has issues with denormals.
static float UnvectorizedScaleBufferByValue(float scale) {
float buffer[kElementBufferSize];
float buffer[ELEMENT_BUFFER_SIZE];
for (size_t i = 0; i < IREE_ARRAYSIZE(buffer); ++i) {
buffer[i] = 1.0f;
}
benchmark::DoNotOptimize(*buffer);
iree_optimization_barrier(*buffer);
for (size_t i = 0; i < IREE_ARRAYSIZE(buffer); ++i) {
buffer[i] *= scale;
benchmark::DoNotOptimize(buffer[i]);
iree_optimization_barrier(buffer[i]);
}
benchmark::DoNotOptimize(*buffer);
iree_optimization_barrier(*buffer);
float sum = 0.0f;
for (size_t i = 0; i < IREE_ARRAYSIZE(buffer); ++i) {
sum += buffer[i];
Expand All @@ -39,86 +37,92 @@ static float UnvectorizedScaleBufferByValue(float scale) {
// Will generally be SIMD floating point math and indicate whether the vector
// units (NEON, AVX, etc) have issues with denormals.
static float VectorizedScaleBufferByValue(float scale) {
float buffer[kElementBufferSize];
float buffer[ELEMENT_BUFFER_SIZE];
for (size_t i = 0; i < IREE_ARRAYSIZE(buffer); ++i) {
buffer[i] = 1.0f;
}
benchmark::DoNotOptimize(*buffer);
iree_optimization_barrier(*buffer);
for (size_t i = 0; i < IREE_ARRAYSIZE(buffer); ++i) {
buffer[i] *= scale;
}
benchmark::DoNotOptimize(*buffer);
iree_optimization_barrier(*buffer);
float sum = 0.0f;
for (size_t i = 0; i < IREE_ARRAYSIZE(buffer); ++i) {
sum += buffer[i];
}
return sum;
}

void BM_UnvectorizedNormals(benchmark::State& state) {
for (auto _ : state) {
benchmark::DoNotOptimize(UnvectorizedScaleBufferByValue(1.0f));
IREE_BENCHMARK_FN(BM_UnvectorizedNormals) {
while (iree_benchmark_keep_running(benchmark_state, 1)) {
iree_optimization_barrier(UnvectorizedScaleBufferByValue(1.0f));
}
return iree_ok_status();
}
BENCHMARK(BM_UnvectorizedNormals);
IREE_BENCHMARK_REGISTER(BM_UnvectorizedNormals);

void BM_UnvectorizedDenormals(benchmark::State& state) {
for (auto _ : state) {
benchmark::DoNotOptimize(UnvectorizedScaleBufferByValue(1e-39f));
IREE_BENCHMARK_FN(BM_UnvectorizedDenormals) {
while (iree_benchmark_keep_running(benchmark_state, 1)) {
iree_optimization_barrier(UnvectorizedScaleBufferByValue(1e-39f));
}
return iree_ok_status();
}
BENCHMARK(BM_UnvectorizedDenormals);
IREE_BENCHMARK_REGISTER(BM_UnvectorizedDenormals);

void BM_UnvectorizedDenormalsFlushedToZero(benchmark::State& state) {
IREE_BENCHMARK_FN(BM_UnvectorizedDenormalsFlushedToZero) {
iree_fpu_state_t fpu_state =
iree_fpu_state_push(IREE_FPU_STATE_FLAG_FLUSH_DENORMALS_TO_ZERO);
for (auto _ : state) {
benchmark::DoNotOptimize(UnvectorizedScaleBufferByValue(1e-39f));
while (iree_benchmark_keep_running(benchmark_state, 1)) {
iree_optimization_barrier(UnvectorizedScaleBufferByValue(1e-39f));
}
iree_fpu_state_pop(fpu_state);
return iree_ok_status();
}
BENCHMARK(BM_UnvectorizedDenormalsFlushedToZero);
IREE_BENCHMARK_REGISTER(BM_UnvectorizedDenormalsFlushedToZero);

void BM_UnvectorizedDenormalsNotFlushedToZero(benchmark::State& state) {
IREE_BENCHMARK_FN(BM_UnvectorizedDenormalsNotFlushedToZero) {
iree_fpu_state_t fpu_state = iree_fpu_state_push(IREE_FPU_STATE_DEFAULT);
for (auto _ : state) {
benchmark::DoNotOptimize(UnvectorizedScaleBufferByValue(1e-39f));
while (iree_benchmark_keep_running(benchmark_state, 1)) {
iree_optimization_barrier(UnvectorizedScaleBufferByValue(1e-39f));
}
iree_fpu_state_pop(fpu_state);
return iree_ok_status();
}
BENCHMARK(BM_UnvectorizedDenormalsNotFlushedToZero);
IREE_BENCHMARK_REGISTER(BM_UnvectorizedDenormalsNotFlushedToZero);

void BM_VectorizedNormals(benchmark::State& state) {
for (auto _ : state) {
benchmark::DoNotOptimize(VectorizedScaleBufferByValue(1.0f));
IREE_BENCHMARK_FN(BM_VectorizedNormals) {
while (iree_benchmark_keep_running(benchmark_state, 1)) {
iree_optimization_barrier(VectorizedScaleBufferByValue(1.0f));
}
return iree_ok_status();
}
BENCHMARK(BM_VectorizedNormals);
IREE_BENCHMARK_REGISTER(BM_VectorizedNormals);

void BM_VectorizedDenormals(benchmark::State& state) {
for (auto _ : state) {
benchmark::DoNotOptimize(VectorizedScaleBufferByValue(1e-39f));
IREE_BENCHMARK_FN(BM_VectorizedDenormals) {
while (iree_benchmark_keep_running(benchmark_state, 1)) {
iree_optimization_barrier(VectorizedScaleBufferByValue(1e-39f));
}
return iree_ok_status();
}
BENCHMARK(BM_VectorizedDenormals);
IREE_BENCHMARK_REGISTER(BM_VectorizedDenormals);

void BM_VectorizedDenormalsFlushedToZero(benchmark::State& state) {
IREE_BENCHMARK_FN(BM_VectorizedDenormalsFlushedToZero) {
iree_fpu_state_t fpu_state =
iree_fpu_state_push(IREE_FPU_STATE_FLAG_FLUSH_DENORMALS_TO_ZERO);
for (auto _ : state) {
benchmark::DoNotOptimize(VectorizedScaleBufferByValue(1e-39f));
while (iree_benchmark_keep_running(benchmark_state, 1)) {
iree_optimization_barrier(VectorizedScaleBufferByValue(1e-39f));
}
iree_fpu_state_pop(fpu_state);
return iree_ok_status();
}
BENCHMARK(BM_VectorizedDenormalsFlushedToZero);
IREE_BENCHMARK_REGISTER(BM_VectorizedDenormalsFlushedToZero);

void BM_VectorizedDenormalsNotFlushedToZero(benchmark::State& state) {
IREE_BENCHMARK_FN(BM_VectorizedDenormalsNotFlushedToZero) {
iree_fpu_state_t fpu_state = iree_fpu_state_push(IREE_FPU_STATE_DEFAULT);
for (auto _ : state) {
benchmark::DoNotOptimize(VectorizedScaleBufferByValue(1e-39f));
while (iree_benchmark_keep_running(benchmark_state, 1)) {
iree_optimization_barrier(VectorizedScaleBufferByValue(1e-39f));
}
iree_fpu_state_pop(fpu_state);
return iree_ok_status();
}
BENCHMARK(BM_VectorizedDenormalsNotFlushedToZero);

} // namespace
IREE_BENCHMARK_REGISTER(BM_VectorizedDenormalsNotFlushedToZero);
6 changes: 2 additions & 4 deletions runtime/src/iree/builtins/device/tools/libdevice_benchmark.c
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,7 @@ static iree_status_t iree_h2f_ieee_benchmark(
while (iree_benchmark_keep_running(benchmark_state,
/*batch_count=*/FLAG_batch_count)) {
for (int i = 0; i < FLAG_batch_count; ++i) {
// TODO(benvanik): iree_do_not_optimize barrier.
iree_h2f_ieee(0x3400 + i);
iree_optimization_barrier(iree_h2f_ieee(0x3400 + i));
}
}
return iree_ok_status();
Expand All @@ -31,8 +30,7 @@ static iree_status_t iree_f2h_ieee_benchmark(
while (iree_benchmark_keep_running(benchmark_state,
/*batch_count=*/FLAG_batch_count)) {
for (int i = 0; i < FLAG_batch_count; ++i) {
// TODO(benvanik): iree_do_not_optimize barrier.
iree_f2h_ieee(0.25f + i);
iree_optimization_barrier(iree_f2h_ieee(0.25f + i));
}
}
return iree_ok_status();
Expand Down
1 change: 1 addition & 0 deletions runtime/src/iree/testing/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ iree_runtime_cc_library(
],
deps = [
"//runtime/src/iree/base",
"//runtime/src/iree/base/internal",
"@com_google_benchmark//:benchmark",
],
)
Expand Down
149 changes: 145 additions & 4 deletions runtime/src/iree/testing/benchmark.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,111 @@
// systems and use some simple tooling while also allowing them to run on
// the full benchmark library with all its useful reporting and statistics.

#include <math.h>

#ifdef __cplusplus
#include <type_traits>
#endif  // __cplusplus

#include "iree/base/api.h"

#ifdef __cplusplus
extern "C" {
#endif // __cplusplus

//===----------------------------------------------------------------------===//
// Benchmarking tools
//===----------------------------------------------------------------------===//

// Sink that receives the address of a value that must appear "used".
// Called by the C++ iree_optimization_barrier fallback that is selected when
// inline assembly is unavailable (IREE_BENCHMARK_HAS_INLINE_ASSEMBLY == 0).
// NOTE(review): the definition is not in this header; presumably it lives
// out-of-line so the compiler cannot see through the call - confirm.
void iree_benchmark_use_ptr(char const volatile* x);

// Decides whether iree_optimization_barrier may be built on inline assembly.
// A value defined before this point (e.g. by the build) takes precedence over
// the detection below. Inline assembly is enabled only for Clang/GCC, and
// never for MSVC or Emscripten; every other toolchain gets 0.
#if !defined(IREE_BENCHMARK_HAS_INLINE_ASSEMBLY)
#if !defined(IREE_COMPILER_MSVC) && !defined(IREE_PLATFORM_EMSCRIPTEN) && \
    (defined(IREE_COMPILER_CLANG) || defined(IREE_COMPILER_GCC))
#define IREE_BENCHMARK_HAS_INLINE_ASSEMBLY 1
#else
#define IREE_BENCHMARK_HAS_INLINE_ASSEMBLY 0
#endif  // inline-asm-capable toolchain
#endif  // !IREE_BENCHMARK_HAS_INLINE_ASSEMBLY

// iree_optimization_barrier(value):
//   Forces |value| to be materialized so that the computation producing it
//   cannot be removed as dead code by the optimizer.
// iree_benchmark_clobber():
//   Compiler-level memory barrier (where the toolchain offers one).
//
// The C++ variants are function templates and must live outside of the
// surrounding extern "C" block, which is why each branch closes and reopens
// it. The C variants are statement macros: they always evaluate their
// argument exactly once (it must be a non-void, non-array expression in the
// inline-assembly branches) and operate on a copy of the value.

#if IREE_BENCHMARK_HAS_INLINE_ASSEMBLY == 0

// Fallback for toolchains without usable inline assembly.

// Expands to nothing when no equivalent intrinsic is known for the toolchain.
#if defined(IREE_COMPILER_MSVC)
#define iree_benchmark_clobber() _ReadWriteBarrier()
#else
#define iree_benchmark_clobber()
#endif  // IREE_COMPILER_MSVC

#if defined(__cplusplus)
}  // extern "C"
// Passes the address of |value| to iree_benchmark_use_ptr and then issues
// iree_benchmark_clobber().
template <typename T>
inline IREE_ATTRIBUTE_ALWAYS_INLINE void iree_optimization_barrier(T&& value) {
  iree_benchmark_use_ptr(&reinterpret_cast<char const volatile&>(value));
  iree_benchmark_clobber();
}
extern "C" {
#else
// Without inline assembly (or a portable typeof) C cannot take the address of
// an arbitrary expression, so the best available option is to evaluate the
// argument - keeping any call in it from being dropped by the preprocessor -
// and follow it with the compiler barrier. This is weaker than the C++ form:
// a side-effect-free computation may still be optimized away.
#define iree_optimization_barrier(x) \
  do {                               \
    (void)(x);                       \
    iree_benchmark_clobber();        \
  } while (0)
#endif  // __cplusplus

#elif defined(IREE_COMPILER_CLANG)

#if defined(__cplusplus)
}  // extern "C"
inline IREE_ATTRIBUTE_ALWAYS_INLINE void iree_benchmark_clobber() {
  asm volatile("" : : : "memory");
}
// Empty asm that claims to read and write |value| (register or memory) and to
// clobber memory.
template <typename T>
inline IREE_ATTRIBUTE_ALWAYS_INLINE void iree_optimization_barrier(T&& value) {
  asm volatile("" : "+r,m"(value) : : "memory");
}
extern "C" {
#else
#define iree_benchmark_clobber() __asm__ volatile("" : : : "memory")
// Copies the argument into a local and feeds that local to an empty asm as an
// input operand so the value has to be computed. Input-only is used because
// the operand is a copy (and may be const-qualified).
#define iree_optimization_barrier(x)                                       \
  do {                                                                     \
    __typeof__(x) iree_optimization_barrier_value_ = (x);                  \
    __asm__ volatile("" : : "r,m"(iree_optimization_barrier_value_)        \
                     : "memory");                                          \
  } while (0)
#endif  // __cplusplus

#elif defined(IREE_COMPILER_GCC)

#if defined(__cplusplus)
}  // extern "C"
inline IREE_ATTRIBUTE_ALWAYS_INLINE void iree_benchmark_clobber() {
  asm volatile("" : : : "memory");
}
// GCC gets separate overloads: trivially copyable values no larger than a
// pointer may be held in a register or memory ("+m,r"), everything else is
// constrained to memory only ("+m"). Both lvalue and rvalue forms are
// provided.
template <typename T>
inline IREE_ATTRIBUTE_ALWAYS_INLINE
    typename std::enable_if<std::is_trivially_copyable<T>::value &&
                            (sizeof(T) <= sizeof(T*))>::type
    iree_optimization_barrier(T& value) {
  asm volatile("" : "+m,r"(value) : : "memory");
}
template <typename T>
inline IREE_ATTRIBUTE_ALWAYS_INLINE
    typename std::enable_if<!std::is_trivially_copyable<T>::value ||
                            (sizeof(T) > sizeof(T*))>::type
    iree_optimization_barrier(T& value) {
  asm volatile("" : "+m"(value) : : "memory");
}
template <typename T>
inline IREE_ATTRIBUTE_ALWAYS_INLINE
    typename std::enable_if<std::is_trivially_copyable<T>::value &&
                            (sizeof(T) <= sizeof(T*))>::type
    iree_optimization_barrier(T&& value) {
  asm volatile("" : "+m,r"(value) : : "memory");
}
template <typename T>
inline IREE_ATTRIBUTE_ALWAYS_INLINE
    typename std::enable_if<!std::is_trivially_copyable<T>::value ||
                            (sizeof(T) > sizeof(T*))>::type
    iree_optimization_barrier(T&& value) {
  asm volatile("" : "+m"(value) : : "memory");
}
extern "C" {
#else
#define iree_benchmark_clobber() __asm__ volatile("" : : : "memory")
// Copies the argument into a local and feeds that local to an empty asm as a
// memory input operand so the value has to be computed. A memory-only
// constraint is used because a macro cannot select a constraint by size the
// way the C++ overloads above do, and "m" is valid for any object type.
#define iree_optimization_barrier(x)                                       \
  do {                                                                     \
    __typeof__(x) iree_optimization_barrier_value_ = (x);                  \
    __asm__ volatile("" : : "m"(iree_optimization_barrier_value_)          \
                     : "memory");                                          \
  } while (0)
#endif  // __cplusplus

#endif  // IREE_BENCHMARK_HAS_INLINE_ASSEMBLY

//===----------------------------------------------------------------------===//
// iree_benchmark_state_t
//===----------------------------------------------------------------------===//
Expand Down Expand Up @@ -98,6 +197,10 @@ typedef enum iree_benchmark_unit_e {

typedef struct iree_benchmark_def_t iree_benchmark_def_t;

typedef iree_status_t(IREE_API_PTR* iree_benchmark_fn_t)(
const iree_benchmark_def_t* benchmark_def,
iree_benchmark_state_t* benchmark_state);

// A benchmark case definition.
struct iree_benchmark_def_t {
// IREE_BENCHMARK_FLAG_* bitmask controlling benchmark behavior and reporting.
Expand All @@ -116,16 +219,54 @@ struct iree_benchmark_def_t {
// Runs the benchmark to completion.
// Implementations must call iree_benchmark_keep_running in a loop until it
// returns false.
iree_status_t (*run)(const iree_benchmark_def_t* benchmark_def,
iree_benchmark_state_t* benchmark_state);
iree_benchmark_fn_t run;

// User-defined data accessible in the run function.
const void* user_data;
};

// Registers a benchmark with the given definition.
void iree_benchmark_register(iree_string_view_t name,
const iree_benchmark_def_t* benchmark_def);
const iree_benchmark_def_t* iree_benchmark_register(
iree_string_view_t name, const iree_benchmark_def_t* benchmark_def);

//===----------------------------------------------------------------------===//
// Benchmark registration utilities
//===----------------------------------------------------------------------===//

// Pastes the three tokens together exactly as given (arguments are not
// macro-expanded at this level).
#define IREE_BENCHMARK_IMPL_CONCAT2_(prefix, id, suffix) prefix##id##suffix
// Indirection that lets arguments such as __COUNTER__ expand before pasting.
#define IREE_BENCHMARK_IMPL_CONCAT_(prefix, id, suffix) \
  IREE_BENCHMARK_IMPL_CONCAT2_(prefix, id, suffix)
// Builds an identifier of the form iree_benchmark_<counter><name>; the
// __COUNTER__ value makes each expansion distinct within a translation unit.
#define IREE_BENCHMARK_IMPL_NAME_(name) \
  IREE_BENCHMARK_IMPL_CONCAT_(iree_benchmark_, __COUNTER__, name)

// Declares the signature of a file-local (static) benchmark function called
// |name|; follow the macro with the function body in braces.
// The body can refer to the parameters |benchmark_def| and |benchmark_state|
// and must return an iree_status_t.
#define IREE_BENCHMARK_FN(name) \
static iree_status_t name(const iree_benchmark_def_t* benchmark_def, \
iree_benchmark_state_t* benchmark_state)

// Allocates a benchmark definition for the given function and returns it.
// The returned pointer is safe to store in a static variable.
// TODO(benvanik): allow optionally passing flags with variadic macros.
iree_benchmark_def_t* iree_make_function_benchmark(iree_benchmark_fn_t fn);

// TODO(benvanik): find a way to make this C-compatible.
// Today this requires C++ because the static variable below is initialized
// with the result of a function call, which C disallows. We can probably use
// some tricky attributes to run functions instead.
//
// Defines a benchmark of a function with default parameters.
// Expands to a uniquely-named, unused static variable whose initializer
// registers |name| (stringified as the benchmark name) via
// iree_benchmark_register.
//
// Example:
//  IREE_BENCHMARK_FN(my_benchmark) {
//    while (iree_benchmark_keep_running(benchmark_state, 1000)) {
//      // process 1000 elements
//    }
//    return iree_ok_status();
//  }
//  IREE_BENCHMARK_REGISTER(my_benchmark);
#define IREE_BENCHMARK_REGISTER(name)                                \
  static const iree_benchmark_def_t* IREE_BENCHMARK_IMPL_NAME_(name) \
      IREE_ATTRIBUTE_UNUSED = iree_benchmark_register(               \
          iree_make_cstring_view(#name), iree_make_function_benchmark(name))

//===----------------------------------------------------------------------===//
// Benchmark infra management
Expand Down

0 comments on commit 0b8b13c

Please sign in to comment.