diff --git a/lib/arch/avx2/dec_loop.c b/lib/arch/avx2/dec_loop.c
index f959fc4b..b8a4ccaf 100644
--- a/lib/arch/avx2/dec_loop.c
+++ b/lib/arch/avx2/dec_loop.c
@@ -1,4 +1,4 @@
-static inline int
+static BASE64_FORCE_INLINE int
 dec_loop_avx2_inner (const uint8_t **s, uint8_t **o, size_t *rounds)
 {
 	const __m256i lut_lo = _mm256_setr_epi8(
diff --git a/lib/arch/avx2/dec_reshuffle.c b/lib/arch/avx2/dec_reshuffle.c
index f3518098..bc875ce9 100644
--- a/lib/arch/avx2/dec_reshuffle.c
+++ b/lib/arch/avx2/dec_reshuffle.c
@@ -1,4 +1,4 @@
-static inline __m256i
+static BASE64_FORCE_INLINE __m256i
 dec_reshuffle (const __m256i in)
 {
 	// in, lower lane, bits, upper case are most significant bits, lower
diff --git a/lib/arch/avx2/enc_loop.c b/lib/arch/avx2/enc_loop.c
index b9e2736f..6f4aa0ab 100644
--- a/lib/arch/avx2/enc_loop.c
+++ b/lib/arch/avx2/enc_loop.c
@@ -1,4 +1,4 @@
-static inline void
+static BASE64_FORCE_INLINE void
 enc_loop_avx2_inner_first (const uint8_t **s, uint8_t **o)
 {
 	// First load is done at s - 0 to not get a segfault:
@@ -17,7 +17,7 @@ enc_loop_avx2_inner_first (const uint8_t **s, uint8_t **o)
 	*o += 32;
 }
 
-static inline void
+static BASE64_FORCE_INLINE void
 enc_loop_avx2_inner (const uint8_t **s, uint8_t **o)
 {
 	// Load input:
diff --git a/lib/arch/avx2/enc_reshuffle.c b/lib/arch/avx2/enc_reshuffle.c
index ba166903..82c659b3 100644
--- a/lib/arch/avx2/enc_reshuffle.c
+++ b/lib/arch/avx2/enc_reshuffle.c
@@ -1,4 +1,4 @@
-static inline __m256i
+static BASE64_FORCE_INLINE __m256i
 enc_reshuffle (const __m256i input)
 {
 	// Translation of the SSSE3 reshuffling algorithm to AVX2. This one
diff --git a/lib/arch/avx2/enc_translate.c b/lib/arch/avx2/enc_translate.c
index 46173cd1..370da98f 100644
--- a/lib/arch/avx2/enc_translate.c
+++ b/lib/arch/avx2/enc_translate.c
@@ -1,4 +1,4 @@
-static inline __m256i
+static BASE64_FORCE_INLINE __m256i
 enc_translate (const __m256i in)
 {
 	// A lookup table containing the absolute offsets for all ranges:
diff --git a/lib/arch/avx512/enc_loop.c b/lib/arch/avx512/enc_loop.c
index 4c71e160..cb44696b 100644
--- a/lib/arch/avx512/enc_loop.c
+++ b/lib/arch/avx512/enc_loop.c
@@ -1,4 +1,4 @@
-static inline void
+static BASE64_FORCE_INLINE void
 enc_loop_avx512_inner (const uint8_t **s, uint8_t **o)
 {
 	// Load input.
diff --git a/lib/arch/avx512/enc_reshuffle_translate.c b/lib/arch/avx512/enc_reshuffle_translate.c
index 5c332bb2..c6634f0f 100644
--- a/lib/arch/avx512/enc_reshuffle_translate.c
+++ b/lib/arch/avx512/enc_reshuffle_translate.c
@@ -1,7 +1,7 @@
 // AVX512 algorithm is based on permutevar and multishift. The code is based on
 // https://github.com/WojciechMula/base64simd which is under BSD-2 license.
 
-static inline __m512i
+static BASE64_FORCE_INLINE __m512i
 enc_reshuffle_translate (const __m512i input)
 {
 	// 32-bit input
diff --git a/lib/arch/generic/32/dec_loop.c b/lib/arch/generic/32/dec_loop.c
index 8a8260f2..aa290d7e 100644
--- a/lib/arch/generic/32/dec_loop.c
+++ b/lib/arch/generic/32/dec_loop.c
@@ -1,4 +1,4 @@
-static inline int
+static BASE64_FORCE_INLINE int
 dec_loop_generic_32_inner (const uint8_t **s, uint8_t **o, size_t *rounds)
 {
 	const uint32_t str
diff --git a/lib/arch/generic/32/enc_loop.c b/lib/arch/generic/32/enc_loop.c
index f4870a75..b5e6eefd 100644
--- a/lib/arch/generic/32/enc_loop.c
+++ b/lib/arch/generic/32/enc_loop.c
@@ -1,4 +1,4 @@
-static inline void
+static BASE64_FORCE_INLINE void
 enc_loop_generic_32_inner (const uint8_t **s, uint8_t **o)
 {
 	uint32_t src;
diff --git a/lib/arch/generic/64/enc_loop.c b/lib/arch/generic/64/enc_loop.c
index 0840bc73..e6a29cd5 100644
--- a/lib/arch/generic/64/enc_loop.c
+++ b/lib/arch/generic/64/enc_loop.c
@@ -1,4 +1,4 @@
-static inline void
+static BASE64_FORCE_INLINE void
 enc_loop_generic_64_inner (const uint8_t **s, uint8_t **o)
 {
 	uint64_t src;
diff --git a/lib/arch/neon32/codec.c b/lib/arch/neon32/codec.c
index 70c80e48..d552344f 100644
--- a/lib/arch/neon32/codec.c
+++ b/lib/arch/neon32/codec.c
@@ -22,7 +22,7 @@
 #define BASE64_NEON32_USE_ASM
 #endif
 
-static inline uint8x16_t
+static BASE64_FORCE_INLINE uint8x16_t
 vqtbl1q_u8 (const uint8x16_t lut, const uint8x16_t indices)
 {
 	// NEON32 only supports 64-bit wide lookups in 128-bit tables. Emulate
diff --git a/lib/arch/neon32/dec_loop.c b/lib/arch/neon32/dec_loop.c
index 2216b395..e4caed7a 100644
--- a/lib/arch/neon32/dec_loop.c
+++ b/lib/arch/neon32/dec_loop.c
@@ -1,4 +1,4 @@
-static inline int
+static BASE64_FORCE_INLINE int
 is_nonzero (const uint8x16_t v)
 {
 	uint64_t u64;
@@ -9,7 +9,7 @@ is_nonzero (const uint8x16_t v)
 	return u64 != 0;
 }
 
-static inline uint8x16_t
+static BASE64_FORCE_INLINE uint8x16_t
 delta_lookup (const uint8x16_t v)
 {
 	const uint8x8_t lut = {
@@ -21,7 +21,7 @@ delta_lookup (const uint8x16_t v)
 	vtbl1_u8(lut, vget_high_u8(v)));
 }
 
-static inline uint8x16_t
+static BASE64_FORCE_INLINE uint8x16_t
 dec_loop_neon32_lane (uint8x16_t *lane)
 {
 	// See the SSSE3 decoder for an explanation of the algorithm.
diff --git a/lib/arch/neon32/enc_loop.c b/lib/arch/neon32/enc_loop.c
index d694b337..2adff48f 100644
--- a/lib/arch/neon32/enc_loop.c
+++ b/lib/arch/neon32/enc_loop.c
@@ -1,5 +1,5 @@
 #ifdef BASE64_NEON32_USE_ASM
-static inline void
+static BASE64_FORCE_INLINE void
 enc_loop_neon32_inner_asm (const uint8_t **s, uint8_t **o)
 {
 	// This function duplicates the functionality of enc_loop_neon32_inner,
@@ -106,7 +106,7 @@ enc_loop_neon32_inner_asm (const uint8_t **s, uint8_t **o)
 }
 #endif
 
-static inline void
+static BASE64_FORCE_INLINE void
 enc_loop_neon32_inner (const uint8_t **s, uint8_t **o)
 {
 #ifdef BASE64_NEON32_USE_ASM
diff --git a/lib/arch/neon32/enc_reshuffle.c b/lib/arch/neon32/enc_reshuffle.c
index d6e97cb5..fa94d279 100644
--- a/lib/arch/neon32/enc_reshuffle.c
+++ b/lib/arch/neon32/enc_reshuffle.c
@@ -1,4 +1,4 @@
-static inline uint8x16x4_t
+static BASE64_FORCE_INLINE uint8x16x4_t
 enc_reshuffle (uint8x16x3_t in)
 {
 	uint8x16x4_t out;
diff --git a/lib/arch/neon32/enc_translate.c b/lib/arch/neon32/enc_translate.c
index e616d54b..ff3d88dd 100644
--- a/lib/arch/neon32/enc_translate.c
+++ b/lib/arch/neon32/enc_translate.c
@@ -1,4 +1,4 @@
-static inline uint8x16x4_t
+static BASE64_FORCE_INLINE uint8x16x4_t
 enc_translate (const uint8x16x4_t in)
 {
 	// A lookup table containing the absolute offsets for all ranges:
diff --git a/lib/arch/neon64/codec.c b/lib/arch/neon64/codec.c
index f5cda63e..6b664b40 100644
--- a/lib/arch/neon64/codec.c
+++ b/lib/arch/neon64/codec.c
@@ -22,7 +22,7 @@
 #define BASE64_NEON64_USE_ASM
 #endif
 
-static inline uint8x16x4_t
+static BASE64_FORCE_INLINE uint8x16x4_t
 load_64byte_table (const uint8_t *p)
 {
 #ifdef BASE64_NEON64_USE_ASM
diff --git a/lib/arch/neon64/enc_loop.c b/lib/arch/neon64/enc_loop.c
index 59a1c597..8bdd0883 100644
--- a/lib/arch/neon64/enc_loop.c
+++ b/lib/arch/neon64/enc_loop.c
@@ -1,4 +1,4 @@
-static inline void
+static BASE64_FORCE_INLINE void
 enc_loop_neon64_inner (const uint8_t **s, uint8_t **o, const uint8x16x4_t tbl_enc)
 {
 	// Load 48 bytes and deinterleave:
diff --git a/lib/arch/neon64/enc_reshuffle.c b/lib/arch/neon64/enc_reshuffle.c
index ea543e04..2655df10 100644
--- a/lib/arch/neon64/enc_reshuffle.c
+++ b/lib/arch/neon64/enc_reshuffle.c
@@ -1,4 +1,4 @@
-static inline uint8x16x4_t
+static BASE64_FORCE_INLINE uint8x16x4_t
 enc_reshuffle (const uint8x16x3_t in)
 {
 	uint8x16x4_t out;
diff --git a/lib/arch/ssse3/dec_loop.c b/lib/arch/ssse3/dec_loop.c
index 9da71abe..7ddb73bf 100644
--- a/lib/arch/ssse3/dec_loop.c
+++ b/lib/arch/ssse3/dec_loop.c
@@ -65,7 +65,7 @@
 // 1110 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
 // 1111 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
 
-static inline int
+static BASE64_FORCE_INLINE int
 dec_loop_ssse3_inner (const uint8_t **s, uint8_t **o, size_t *rounds)
 {
 	const __m128i lut_lo = _mm_setr_epi8(
diff --git a/lib/arch/ssse3/dec_reshuffle.c b/lib/arch/ssse3/dec_reshuffle.c
index fdf587fe..d3dd3954 100644
--- a/lib/arch/ssse3/dec_reshuffle.c
+++ b/lib/arch/ssse3/dec_reshuffle.c
@@ -1,4 +1,4 @@
-static inline __m128i
+static BASE64_FORCE_INLINE __m128i
 dec_reshuffle (const __m128i in)
 {
 	// in, bits, upper case are most significant bits, lower case are least significant bits
diff --git a/lib/arch/ssse3/enc_loop.c b/lib/arch/ssse3/enc_loop.c
index 6de652e1..9b67b70d 100644
--- a/lib/arch/ssse3/enc_loop.c
+++ b/lib/arch/ssse3/enc_loop.c
@@ -1,4 +1,4 @@
-static inline void
+static BASE64_FORCE_INLINE void
 enc_loop_ssse3_inner (const uint8_t **s, uint8_t **o)
 {
 	// Load input:
diff --git a/lib/arch/ssse3/enc_reshuffle.c b/lib/arch/ssse3/enc_reshuffle.c
index b738591f..f9dc949f 100644
--- a/lib/arch/ssse3/enc_reshuffle.c
+++ b/lib/arch/ssse3/enc_reshuffle.c
@@ -1,4 +1,4 @@
-static inline __m128i
+static BASE64_FORCE_INLINE __m128i
 enc_reshuffle (__m128i in)
 {
 	// Input, bytes MSB to LSB:
diff --git a/lib/arch/ssse3/enc_translate.c b/lib/arch/ssse3/enc_translate.c
index 04f288fc..60d9a42b 100644
--- a/lib/arch/ssse3/enc_translate.c
+++ b/lib/arch/ssse3/enc_translate.c
@@ -1,4 +1,4 @@
-static inline __m128i
+static BASE64_FORCE_INLINE __m128i
 enc_translate (const __m128i in)
 {
 	// A lookup table containing the absolute offsets for all ranges:
diff --git a/lib/env.h b/lib/env.h
index d489ba54..08370650 100644
--- a/lib/env.h
+++ b/lib/env.h
@@ -71,4 +71,14 @@
 # define BASE64_FALLTHROUGH
 #endif
 
+// Declare a macro to ensure that functions that are intended to be inlined are
+// actually inlined, even when no optimization is applied. A lot of inner loop
+// code is factored into separate functions for reasons of readability, but
+// that code should always be inlined (and optimized) in the main loop.
+#ifdef _MSC_VER
+# define BASE64_FORCE_INLINE __forceinline
+#else
+# define BASE64_FORCE_INLINE inline __attribute__((always_inline))
+#endif
+
 #endif // BASE64_ENV_H
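
The new `BASE64_FORCE_INLINE` macro added in lib/env.h resolves to MSVC's `__forceinline` and to `inline __attribute__((always_inline))` on GCC/Clang, so the helper functions that were factored out of the encode/decode inner loops are still folded into their callers even in unoptimized builds. Below is a minimal, self-contained sketch of the same dispatch for anyone who wants to try it outside the library; the file name, the `FORCE_INLINE` macro name, and the `load_u32_be` helper are illustrative stand-ins, not part of this patch.

```c
// force_inline_demo.c -- illustrative sketch only, not part of the library.
// Shows the same compiler dispatch that lib/env.h uses for
// BASE64_FORCE_INLINE: __forceinline on MSVC, always_inline elsewhere.
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#ifdef _MSC_VER
# define FORCE_INLINE __forceinline
#else
# define FORCE_INLINE inline __attribute__((always_inline))
#endif

// A tiny helper in the style of the factored-out inner-loop functions.
// GCC and Clang honor always_inline even at -O0; MSVC treats
// __forceinline as a strong hint rather than a guarantee.
static FORCE_INLINE uint32_t
load_u32_be (const uint8_t *p)
{
	return ((uint32_t) p[0] << 24)
	     | ((uint32_t) p[1] << 16)
	     | ((uint32_t) p[2] <<  8)
	     | ((uint32_t) p[3]);
}

int
main (void)
{
	const uint8_t buf[4] = { 0xDE, 0xAD, 0xBE, 0xEF };

	// On GCC/Clang, even at -O0, the helper body is expanded here
	// rather than emitted as an out-of-line call.
	printf("0x%08" PRIX32 "\n", load_u32_be(buf));
	return 0;
}
```

Compiling with `gcc -O0 force_inline_demo.c` and inspecting the output of `objdump -d` is an easy way to confirm that the helper no longer appears as a separate call, which is the property the patch wants for the per-iteration codec helpers.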