#136: Add BASE64_FORCE_INLINE macro
Add a `BASE64_FORCE_INLINE' macro that has the effect of ensuring that a
function is always inlined, even when the compiler would normally not
inline it (e.g. due to disabling optimizations or when doing certain
debug builds).

This macro is applied to a number of very hot inner loop functions that
were always intended to be fully inlined, such as the various
`enc_translate' and `enc_reshuffle' functions, but which were broken out
into separate functions to make the data flow easier to follow. Making
them separate functions had the side effect that the compiler would
sometimes choose not to inline them. Applying this macro respects the
author's intent, and ensures that the library is performant even when
building with few or no optimizations.

Tests show that this increases benchmark scores for 32-bit SSSE3
decoding; similar gains are likely on other platforms.

Resolves #136.
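
For illustration, here is a minimal standalone sketch (not part of the
repository; the file name and the helper `add_one' are hypothetical) of the
pattern this commit introduces. The macro resolves to `__forceinline' under
MSVC and to `inline __attribute__((always_inline))' under GCC/Clang, as in the
lib/env.h hunk below, so a helper annotated with it is folded into its caller
even when compiling without optimization.

// sketch.c - hypothetical standalone example, not library code.
#include <stdio.h>

// Same compiler dispatch as the new block in lib/env.h:
#ifdef _MSC_VER
# define BASE64_FORCE_INLINE __forceinline
#else
# define BASE64_FORCE_INLINE inline __attribute__((always_inline))
#endif

// Stand-in for a hot inner-loop helper: with the macro applied, the call
// below is inlined even at -O0.
static BASE64_FORCE_INLINE int
add_one (const int x)
{
	return x + 1;
}

int
main (void)
{
	printf("%d\n", add_one(41));	// prints 42
	return 0;
}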
aklomp committed Feb 29, 2024
2 parents 06f9989 + 32e5eb6 commit b20a31a
Showing 24 changed files with 37 additions and 27 deletions.
2 changes: 1 addition & 1 deletion lib/arch/avx2/dec_loop.c
@@ -1,4 +1,4 @@
-static inline int
+static BASE64_FORCE_INLINE int
dec_loop_avx2_inner (const uint8_t **s, uint8_t **o, size_t *rounds)
{
const __m256i lut_lo = _mm256_setr_epi8(
2 changes: 1 addition & 1 deletion lib/arch/avx2/dec_reshuffle.c
@@ -1,4 +1,4 @@
-static inline __m256i
+static BASE64_FORCE_INLINE __m256i
dec_reshuffle (const __m256i in)
{
// in, lower lane, bits, upper case are most significant bits, lower
4 changes: 2 additions & 2 deletions lib/arch/avx2/enc_loop.c
@@ -1,4 +1,4 @@
-static inline void
+static BASE64_FORCE_INLINE void
enc_loop_avx2_inner_first (const uint8_t **s, uint8_t **o)
{
// First load is done at s - 0 to not get a segfault:
@@ -17,7 +17,7 @@ enc_loop_avx2_inner_first (const uint8_t **s, uint8_t **o)
*o += 32;
}

-static inline void
+static BASE64_FORCE_INLINE void
enc_loop_avx2_inner (const uint8_t **s, uint8_t **o)
{
// Load input:
2 changes: 1 addition & 1 deletion lib/arch/avx2/enc_reshuffle.c
@@ -1,4 +1,4 @@
-static inline __m256i
+static BASE64_FORCE_INLINE __m256i
enc_reshuffle (const __m256i input)
{
// Translation of the SSSE3 reshuffling algorithm to AVX2. This one
2 changes: 1 addition & 1 deletion lib/arch/avx2/enc_translate.c
@@ -1,4 +1,4 @@
-static inline __m256i
+static BASE64_FORCE_INLINE __m256i
enc_translate (const __m256i in)
{
// A lookup table containing the absolute offsets for all ranges:
2 changes: 1 addition & 1 deletion lib/arch/avx512/enc_loop.c
@@ -1,4 +1,4 @@
-static inline void
+static BASE64_FORCE_INLINE void
enc_loop_avx512_inner (const uint8_t **s, uint8_t **o)
{
// Load input.
2 changes: 1 addition & 1 deletion lib/arch/avx512/enc_reshuffle_translate.c
@@ -1,7 +1,7 @@
// AVX512 algorithm is based on permutevar and multishift. The code is based on
// https://github.com/WojciechMula/base64simd which is under BSD-2 license.

-static inline __m512i
+static BASE64_FORCE_INLINE __m512i
enc_reshuffle_translate (const __m512i input)
{
// 32-bit input
2 changes: 1 addition & 1 deletion lib/arch/generic/32/dec_loop.c
@@ -1,4 +1,4 @@
-static inline int
+static BASE64_FORCE_INLINE int
dec_loop_generic_32_inner (const uint8_t **s, uint8_t **o, size_t *rounds)
{
const uint32_t str
2 changes: 1 addition & 1 deletion lib/arch/generic/32/enc_loop.c
@@ -1,4 +1,4 @@
-static inline void
+static BASE64_FORCE_INLINE void
enc_loop_generic_32_inner (const uint8_t **s, uint8_t **o)
{
uint32_t src;
2 changes: 1 addition & 1 deletion lib/arch/generic/64/enc_loop.c
@@ -1,4 +1,4 @@
-static inline void
+static BASE64_FORCE_INLINE void
enc_loop_generic_64_inner (const uint8_t **s, uint8_t **o)
{
uint64_t src;
2 changes: 1 addition & 1 deletion lib/arch/neon32/codec.c
@@ -22,7 +22,7 @@
#define BASE64_NEON32_USE_ASM
#endif

-static inline uint8x16_t
+static BASE64_FORCE_INLINE uint8x16_t
vqtbl1q_u8 (const uint8x16_t lut, const uint8x16_t indices)
{
// NEON32 only supports 64-bit wide lookups in 128-bit tables. Emulate
6 changes: 3 additions & 3 deletions lib/arch/neon32/dec_loop.c
@@ -1,4 +1,4 @@
-static inline int
+static BASE64_FORCE_INLINE int
is_nonzero (const uint8x16_t v)
{
uint64_t u64;
@@ -9,7 +9,7 @@ is_nonzero (const uint8x16_t v)
return u64 != 0;
}

-static inline uint8x16_t
+static BASE64_FORCE_INLINE uint8x16_t
delta_lookup (const uint8x16_t v)
{
const uint8x8_t lut = {
@@ -21,7 +21,7 @@ delta_lookup (const uint8x16_t v)
vtbl1_u8(lut, vget_high_u8(v)));
}

-static inline uint8x16_t
+static BASE64_FORCE_INLINE uint8x16_t
dec_loop_neon32_lane (uint8x16_t *lane)
{
// See the SSSE3 decoder for an explanation of the algorithm.
4 changes: 2 additions & 2 deletions lib/arch/neon32/enc_loop.c
@@ -1,5 +1,5 @@
#ifdef BASE64_NEON32_USE_ASM
-static inline void
+static BASE64_FORCE_INLINE void
enc_loop_neon32_inner_asm (const uint8_t **s, uint8_t **o)
{
// This function duplicates the functionality of enc_loop_neon32_inner,
@@ -106,7 +106,7 @@ enc_loop_neon32_inner_asm (const uint8_t **s, uint8_t **o)
}
#endif

-static inline void
+static BASE64_FORCE_INLINE void
enc_loop_neon32_inner (const uint8_t **s, uint8_t **o)
{
#ifdef BASE64_NEON32_USE_ASM
2 changes: 1 addition & 1 deletion lib/arch/neon32/enc_reshuffle.c
@@ -1,4 +1,4 @@
-static inline uint8x16x4_t
+static BASE64_FORCE_INLINE uint8x16x4_t
enc_reshuffle (uint8x16x3_t in)
{
uint8x16x4_t out;
2 changes: 1 addition & 1 deletion lib/arch/neon32/enc_translate.c
@@ -1,4 +1,4 @@
-static inline uint8x16x4_t
+static BASE64_FORCE_INLINE uint8x16x4_t
enc_translate (const uint8x16x4_t in)
{
// A lookup table containing the absolute offsets for all ranges:
2 changes: 1 addition & 1 deletion lib/arch/neon64/codec.c
@@ -22,7 +22,7 @@
#define BASE64_NEON64_USE_ASM
#endif

-static inline uint8x16x4_t
+static BASE64_FORCE_INLINE uint8x16x4_t
load_64byte_table (const uint8_t *p)
{
#ifdef BASE64_NEON64_USE_ASM
2 changes: 1 addition & 1 deletion lib/arch/neon64/enc_loop.c
@@ -1,4 +1,4 @@
-static inline void
+static BASE64_FORCE_INLINE void
enc_loop_neon64_inner (const uint8_t **s, uint8_t **o, const uint8x16x4_t tbl_enc)
{
// Load 48 bytes and deinterleave:
2 changes: 1 addition & 1 deletion lib/arch/neon64/enc_reshuffle.c
@@ -1,4 +1,4 @@
-static inline uint8x16x4_t
+static BASE64_FORCE_INLINE uint8x16x4_t
enc_reshuffle (const uint8x16x3_t in)
{
uint8x16x4_t out;
2 changes: 1 addition & 1 deletion lib/arch/ssse3/dec_loop.c
@@ -65,7 +65,7 @@
// 1110 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
// 1111 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10

-static inline int
+static BASE64_FORCE_INLINE int
dec_loop_ssse3_inner (const uint8_t **s, uint8_t **o, size_t *rounds)
{
const __m128i lut_lo = _mm_setr_epi8(
2 changes: 1 addition & 1 deletion lib/arch/ssse3/dec_reshuffle.c
@@ -1,4 +1,4 @@
-static inline __m128i
+static BASE64_FORCE_INLINE __m128i
dec_reshuffle (const __m128i in)
{
// in, bits, upper case are most significant bits, lower case are least significant bits
2 changes: 1 addition & 1 deletion lib/arch/ssse3/enc_loop.c
@@ -1,4 +1,4 @@
-static inline void
+static BASE64_FORCE_INLINE void
enc_loop_ssse3_inner (const uint8_t **s, uint8_t **o)
{
// Load input:
2 changes: 1 addition & 1 deletion lib/arch/ssse3/enc_reshuffle.c
@@ -1,4 +1,4 @@
-static inline __m128i
+static BASE64_FORCE_INLINE __m128i
enc_reshuffle (__m128i in)
{
// Input, bytes MSB to LSB:
2 changes: 1 addition & 1 deletion lib/arch/ssse3/enc_translate.c
@@ -1,4 +1,4 @@
-static inline __m128i
+static BASE64_FORCE_INLINE __m128i
enc_translate (const __m128i in)
{
// A lookup table containing the absolute offsets for all ranges:
10 changes: 10 additions & 0 deletions lib/env.h
@@ -71,4 +71,14 @@
# define BASE64_FALLTHROUGH
#endif

+// Declare macros to ensure that functions that are intended to be inlined, are
+// actually inlined, even when no optimization is applied. A lot of inner loop
+// code is factored into separate functions for reasons of readability, but
+// that code should always be inlined (and optimized) in the main loop.
+#ifdef _MSC_VER
+# define BASE64_FORCE_INLINE __forceinline
+#else
+# define BASE64_FORCE_INLINE inline __attribute__((always_inline))
+#endif

#endif // BASE64_ENV_H
