#136: Add BASE64_FORCE_INLINE macro
Add a `BASE64_FORCE_INLINE' macro that has the effect of ensuring that a
function is always inlined, even when the compiler would normally not
inline it (e.g. due to disabling optimizations or when doing certain
debug builds).

This macro is applied to a number of very hot inner loop functions that
were always intended to be fully inlined, such as the various
`enc_translate' and `enc_reshuffle' functions, but which were broken out
into separate functions to make the data flow easier to follow. Making
them separate functions had the side effect that the compiler would
sometimes choose not to inline them. Applying this macro respects the
author's intent, and ensures that the library is performant even when
building with few or no optimizations.

Tests show that this increases benchmark scores for 32-bit SSSE3
decoding; similar gains are likely on other platforms.

Resolves #136.
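
For illustration, here is a minimal standalone sketch (not part of the
repository; the file name and the helper `add_one' are hypothetical) of the
pattern this commit introduces. The macro resolves to `__forceinline' under
MSVC and to `inline __attribute__((always_inline))' under GCC/Clang, as in the
lib/env.h hunk below, so a helper annotated with it is folded into its caller
even when compiling without optimization.

// sketch.c - hypothetical standalone example, not library code.
#include <stdio.h>

// Same compiler dispatch as the new block in lib/env.h:
#ifdef _MSC_VER
# define BASE64_FORCE_INLINE __forceinline
#else
# define BASE64_FORCE_INLINE inline __attribute__((always_inline))
#endif

// Stand-in for a hot inner-loop helper: with the macro applied, the call
// below is inlined even at -O0.
static BASE64_FORCE_INLINE int
add_one (const int x)
{
	return x + 1;
}

int
main (void)
{
	printf("%d\n", add_one(41));	// prints 42
	return 0;
}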
aklomp committed Feb 29, 2024
2 parents 06f9989 + 32e5eb6 commit b20a31a
Showing 24 changed files with 37 additions and 27 deletions.
2 changes: 1 addition & 1 deletion lib/arch/avx2/dec_loop.c
@@ -1,4 +1,4 @@
-static inline int
+static BASE64_FORCE_INLINE int
dec_loop_avx2_inner (const uint8_t **s, uint8_t **o, size_t *rounds)
{
const __m256i lut_lo = _mm256_setr_epi8(
2 changes: 1 addition & 1 deletion lib/arch/avx2/dec_reshuffle.c
@@ -1,4 +1,4 @@
-static inline __m256i
+static BASE64_FORCE_INLINE __m256i
dec_reshuffle (const __m256i in)
{
// in, lower lane, bits, upper case are most significant bits, lower
4 changes: 2 additions & 2 deletions lib/arch/avx2/enc_loop.c
@@ -1,4 +1,4 @@
-static inline void
+static BASE64_FORCE_INLINE void
enc_loop_avx2_inner_first (const uint8_t **s, uint8_t **o)
{
// First load is done at s - 0 to not get a segfault:
@@ -17,7 +17,7 @@ enc_loop_avx2_inner_first (const uint8_t **s, uint8_t **o)
*o += 32;
}

-static inline void
+static BASE64_FORCE_INLINE void
enc_loop_avx2_inner (const uint8_t **s, uint8_t **o)
{
// Load input:
2 changes: 1 addition & 1 deletion lib/arch/avx2/enc_reshuffle.c
@@ -1,4 +1,4 @@
-static inline __m256i
+static BASE64_FORCE_INLINE __m256i
enc_reshuffle (const __m256i input)
{
// Translation of the SSSE3 reshuffling algorithm to AVX2. This one
2 changes: 1 addition & 1 deletion lib/arch/avx2/enc_translate.c
@@ -1,4 +1,4 @@
-static inline __m256i
+static BASE64_FORCE_INLINE __m256i
enc_translate (const __m256i in)
{
// A lookup table containing the absolute offsets for all ranges:
2 changes: 1 addition & 1 deletion lib/arch/avx512/enc_loop.c
@@ -1,4 +1,4 @@
-static inline void
+static BASE64_FORCE_INLINE void
enc_loop_avx512_inner (const uint8_t **s, uint8_t **o)
{
// Load input.
2 changes: 1 addition & 1 deletion lib/arch/avx512/enc_reshuffle_translate.c
@@ -1,7 +1,7 @@
// AVX512 algorithm is based on permutevar and multishift. The code is based on
// https://github.com/WojciechMula/base64simd which is under BSD-2 license.

-static inline __m512i
+static BASE64_FORCE_INLINE __m512i
enc_reshuffle_translate (const __m512i input)
{
// 32-bit input
2 changes: 1 addition & 1 deletion lib/arch/generic/32/dec_loop.c
@@ -1,4 +1,4 @@
-static inline int
+static BASE64_FORCE_INLINE int
dec_loop_generic_32_inner (const uint8_t **s, uint8_t **o, size_t *rounds)
{
const uint32_t str
2 changes: 1 addition & 1 deletion lib/arch/generic/32/enc_loop.c
@@ -1,4 +1,4 @@
-static inline void
+static BASE64_FORCE_INLINE void
enc_loop_generic_32_inner (const uint8_t **s, uint8_t **o)
{
uint32_t src;
2 changes: 1 addition & 1 deletion lib/arch/generic/64/enc_loop.c
@@ -1,4 +1,4 @@
-static inline void
+static BASE64_FORCE_INLINE void
enc_loop_generic_64_inner (const uint8_t **s, uint8_t **o)
{
uint64_t src;
2 changes: 1 addition & 1 deletion lib/arch/neon32/codec.c
@@ -22,7 +22,7 @@
#define BASE64_NEON32_USE_ASM
#endif

-static inline uint8x16_t
+static BASE64_FORCE_INLINE uint8x16_t
vqtbl1q_u8 (const uint8x16_t lut, const uint8x16_t indices)
{
// NEON32 only supports 64-bit wide lookups in 128-bit tables. Emulate
6 changes: 3 additions & 3 deletions lib/arch/neon32/dec_loop.c
@@ -1,4 +1,4 @@
-static inline int
+static BASE64_FORCE_INLINE int
is_nonzero (const uint8x16_t v)
{
uint64_t u64;
@@ -9,7 +9,7 @@ is_nonzero (const uint8x16_t v)
return u64 != 0;
}

-static inline uint8x16_t
+static BASE64_FORCE_INLINE uint8x16_t
delta_lookup (const uint8x16_t v)
{
const uint8x8_t lut = {
@@ -21,7 +21,7 @@ delta_lookup (const uint8x16_t v)
vtbl1_u8(lut, vget_high_u8(v)));
}

-static inline uint8x16_t
+static BASE64_FORCE_INLINE uint8x16_t
dec_loop_neon32_lane (uint8x16_t *lane)
{
// See the SSSE3 decoder for an explanation of the algorithm.
4 changes: 2 additions & 2 deletions lib/arch/neon32/enc_loop.c
@@ -1,5 +1,5 @@
#ifdef BASE64_NEON32_USE_ASM
-static inline void
+static BASE64_FORCE_INLINE void
enc_loop_neon32_inner_asm (const uint8_t **s, uint8_t **o)
{
// This function duplicates the functionality of enc_loop_neon32_inner,
@@ -106,7 +106,7 @@ enc_loop_neon32_inner_asm (const uint8_t **s, uint8_t **o)
}
#endif

-static inline void
+static BASE64_FORCE_INLINE void
enc_loop_neon32_inner (const uint8_t **s, uint8_t **o)
{
#ifdef BASE64_NEON32_USE_ASM
2 changes: 1 addition & 1 deletion lib/arch/neon32/enc_reshuffle.c
@@ -1,4 +1,4 @@
-static inline uint8x16x4_t
+static BASE64_FORCE_INLINE uint8x16x4_t
enc_reshuffle (uint8x16x3_t in)
{
uint8x16x4_t out;
2 changes: 1 addition & 1 deletion lib/arch/neon32/enc_translate.c
@@ -1,4 +1,4 @@
-static inline uint8x16x4_t
+static BASE64_FORCE_INLINE uint8x16x4_t
enc_translate (const uint8x16x4_t in)
{
// A lookup table containing the absolute offsets for all ranges:
2 changes: 1 addition & 1 deletion lib/arch/neon64/codec.c
@@ -22,7 +22,7 @@
#define BASE64_NEON64_USE_ASM
#endif

-static inline uint8x16x4_t
+static BASE64_FORCE_INLINE uint8x16x4_t
load_64byte_table (const uint8_t *p)
{
#ifdef BASE64_NEON64_USE_ASM
2 changes: 1 addition & 1 deletion lib/arch/neon64/enc_loop.c
@@ -1,4 +1,4 @@
-static inline void
+static BASE64_FORCE_INLINE void
enc_loop_neon64_inner (const uint8_t **s, uint8_t **o, const uint8x16x4_t tbl_enc)
{
// Load 48 bytes and deinterleave:
2 changes: 1 addition & 1 deletion lib/arch/neon64/enc_reshuffle.c
@@ -1,4 +1,4 @@
-static inline uint8x16x4_t
+static BASE64_FORCE_INLINE uint8x16x4_t
enc_reshuffle (const uint8x16x3_t in)
{
uint8x16x4_t out;
2 changes: 1 addition & 1 deletion lib/arch/ssse3/dec_loop.c
@@ -65,7 +65,7 @@
// 1110 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
// 1111 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10

-static inline int
+static BASE64_FORCE_INLINE int
dec_loop_ssse3_inner (const uint8_t **s, uint8_t **o, size_t *rounds)
{
const __m128i lut_lo = _mm_setr_epi8(
2 changes: 1 addition & 1 deletion lib/arch/ssse3/dec_reshuffle.c
@@ -1,4 +1,4 @@
-static inline __m128i
+static BASE64_FORCE_INLINE __m128i
dec_reshuffle (const __m128i in)
{
// in, bits, upper case are most significant bits, lower case are least significant bits
2 changes: 1 addition & 1 deletion lib/arch/ssse3/enc_loop.c
@@ -1,4 +1,4 @@
-static inline void
+static BASE64_FORCE_INLINE void
enc_loop_ssse3_inner (const uint8_t **s, uint8_t **o)
{
// Load input:
2 changes: 1 addition & 1 deletion lib/arch/ssse3/enc_reshuffle.c
@@ -1,4 +1,4 @@
-static inline __m128i
+static BASE64_FORCE_INLINE __m128i
enc_reshuffle (__m128i in)
{
// Input, bytes MSB to LSB:
2 changes: 1 addition & 1 deletion lib/arch/ssse3/enc_translate.c
@@ -1,4 +1,4 @@
-static inline __m128i
+static BASE64_FORCE_INLINE __m128i
enc_translate (const __m128i in)
{
// A lookup table containing the absolute offsets for all ranges:
10 changes: 10 additions & 0 deletions lib/env.h
@@ -71,4 +71,14 @@
# define BASE64_FALLTHROUGH
#endif

+// Declare macros to ensure that functions that are intended to be inlined, are
+// actually inlined, even when no optimization is applied. A lot of inner loop
+// code is factored into separate functions for reasons of readability, but
+// that code should always be inlined (and optimized) in the main loop.
+#ifdef _MSC_VER
+# define BASE64_FORCE_INLINE __forceinline
+#else
+# define BASE64_FORCE_INLINE inline __attribute__((always_inline))
+#endif

#endif // BASE64_ENV_H
