diff --git a/lib/arch/avx2/dec_loop.c b/lib/arch/avx2/dec_loop.c
index f959fc4b..b8a4ccaf 100644
--- a/lib/arch/avx2/dec_loop.c
+++ b/lib/arch/avx2/dec_loop.c
@@ -1,4 +1,4 @@
-static inline int
+static BASE64_FORCE_INLINE int
 dec_loop_avx2_inner (const uint8_t **s, uint8_t **o, size_t *rounds)
 {
 	const __m256i lut_lo = _mm256_setr_epi8(
diff --git a/lib/arch/avx2/dec_reshuffle.c b/lib/arch/avx2/dec_reshuffle.c
index f3518098..bc875ce9 100644
--- a/lib/arch/avx2/dec_reshuffle.c
+++ b/lib/arch/avx2/dec_reshuffle.c
@@ -1,4 +1,4 @@
-static inline __m256i
+static BASE64_FORCE_INLINE __m256i
 dec_reshuffle (const __m256i in)
 {
 	// in, lower lane, bits, upper case are most significant bits, lower
diff --git a/lib/arch/avx2/enc_loop.c b/lib/arch/avx2/enc_loop.c
index b9e2736f..6f4aa0ab 100644
--- a/lib/arch/avx2/enc_loop.c
+++ b/lib/arch/avx2/enc_loop.c
@@ -1,4 +1,4 @@
-static inline void
+static BASE64_FORCE_INLINE void
 enc_loop_avx2_inner_first (const uint8_t **s, uint8_t **o)
 {
 	// First load is done at s - 0 to not get a segfault:
@@ -17,7 +17,7 @@ enc_loop_avx2_inner_first (const uint8_t **s, uint8_t **o)
 	*o += 32;
 }
 
-static inline void
+static BASE64_FORCE_INLINE void
 enc_loop_avx2_inner (const uint8_t **s, uint8_t **o)
 {
 	// Load input:
diff --git a/lib/arch/avx2/enc_reshuffle.c b/lib/arch/avx2/enc_reshuffle.c
index ba166903..82c659b3 100644
--- a/lib/arch/avx2/enc_reshuffle.c
+++ b/lib/arch/avx2/enc_reshuffle.c
@@ -1,4 +1,4 @@
-static inline __m256i
+static BASE64_FORCE_INLINE __m256i
 enc_reshuffle (const __m256i input)
 {
 	// Translation of the SSSE3 reshuffling algorithm to AVX2. This one
diff --git a/lib/arch/avx2/enc_translate.c b/lib/arch/avx2/enc_translate.c
index 46173cd1..370da98f 100644
--- a/lib/arch/avx2/enc_translate.c
+++ b/lib/arch/avx2/enc_translate.c
@@ -1,4 +1,4 @@
-static inline __m256i
+static BASE64_FORCE_INLINE __m256i
 enc_translate (const __m256i in)
 {
 	// A lookup table containing the absolute offsets for all ranges:
diff --git a/lib/arch/avx512/enc_loop.c b/lib/arch/avx512/enc_loop.c
index 4c71e160..cb44696b 100644
--- a/lib/arch/avx512/enc_loop.c
+++ b/lib/arch/avx512/enc_loop.c
@@ -1,4 +1,4 @@
-static inline void
+static BASE64_FORCE_INLINE void
 enc_loop_avx512_inner (const uint8_t **s, uint8_t **o)
 {
 	// Load input.
diff --git a/lib/arch/avx512/enc_reshuffle_translate.c b/lib/arch/avx512/enc_reshuffle_translate.c
index 5c332bb2..c6634f0f 100644
--- a/lib/arch/avx512/enc_reshuffle_translate.c
+++ b/lib/arch/avx512/enc_reshuffle_translate.c
@@ -1,7 +1,7 @@
 // AVX512 algorithm is based on permutevar and multishift. The code is based on
 // https://github.com/WojciechMula/base64simd which is under BSD-2 license.
 
-static inline __m512i
+static BASE64_FORCE_INLINE __m512i
 enc_reshuffle_translate (const __m512i input)
 {
 	// 32-bit input
diff --git a/lib/arch/generic/32/dec_loop.c b/lib/arch/generic/32/dec_loop.c
index 8a8260f2..aa290d7e 100644
--- a/lib/arch/generic/32/dec_loop.c
+++ b/lib/arch/generic/32/dec_loop.c
@@ -1,4 +1,4 @@
-static inline int
+static BASE64_FORCE_INLINE int
 dec_loop_generic_32_inner (const uint8_t **s, uint8_t **o, size_t *rounds)
 {
 	const uint32_t str
diff --git a/lib/arch/generic/32/enc_loop.c b/lib/arch/generic/32/enc_loop.c
index f4870a75..b5e6eefd 100644
--- a/lib/arch/generic/32/enc_loop.c
+++ b/lib/arch/generic/32/enc_loop.c
@@ -1,4 +1,4 @@
-static inline void
+static BASE64_FORCE_INLINE void
 enc_loop_generic_32_inner (const uint8_t **s, uint8_t **o)
 {
 	uint32_t src;
diff --git a/lib/arch/generic/64/enc_loop.c b/lib/arch/generic/64/enc_loop.c
index 0840bc73..e6a29cd5 100644
--- a/lib/arch/generic/64/enc_loop.c
+++ b/lib/arch/generic/64/enc_loop.c
@@ -1,4 +1,4 @@
-static inline void
+static BASE64_FORCE_INLINE void
 enc_loop_generic_64_inner (const uint8_t **s, uint8_t **o)
 {
 	uint64_t src;
diff --git a/lib/arch/neon32/codec.c b/lib/arch/neon32/codec.c
index 70c80e48..d552344f 100644
--- a/lib/arch/neon32/codec.c
+++ b/lib/arch/neon32/codec.c
@@ -22,7 +22,7 @@
 #define BASE64_NEON32_USE_ASM
 #endif
 
-static inline uint8x16_t
+static BASE64_FORCE_INLINE uint8x16_t
 vqtbl1q_u8 (const uint8x16_t lut, const uint8x16_t indices)
 {
 	// NEON32 only supports 64-bit wide lookups in 128-bit tables. Emulate
diff --git a/lib/arch/neon32/dec_loop.c b/lib/arch/neon32/dec_loop.c
index 2216b395..e4caed7a 100644
--- a/lib/arch/neon32/dec_loop.c
+++ b/lib/arch/neon32/dec_loop.c
@@ -1,4 +1,4 @@
-static inline int
+static BASE64_FORCE_INLINE int
 is_nonzero (const uint8x16_t v)
 {
 	uint64_t u64;
@@ -9,7 +9,7 @@ is_nonzero (const uint8x16_t v)
 	return u64 != 0;
 }
 
-static inline uint8x16_t
+static BASE64_FORCE_INLINE uint8x16_t
 delta_lookup (const uint8x16_t v)
 {
 	const uint8x8_t lut = {
@@ -21,7 +21,7 @@ delta_lookup (const uint8x16_t v)
 	vtbl1_u8(lut, vget_high_u8(v)));
 }
 
-static inline uint8x16_t
+static BASE64_FORCE_INLINE uint8x16_t
 dec_loop_neon32_lane (uint8x16_t *lane)
 {
 	// See the SSSE3 decoder for an explanation of the algorithm.
diff --git a/lib/arch/neon32/enc_loop.c b/lib/arch/neon32/enc_loop.c
index d694b337..2adff48f 100644
--- a/lib/arch/neon32/enc_loop.c
+++ b/lib/arch/neon32/enc_loop.c
@@ -1,5 +1,5 @@
 #ifdef BASE64_NEON32_USE_ASM
-static inline void
+static BASE64_FORCE_INLINE void
 enc_loop_neon32_inner_asm (const uint8_t **s, uint8_t **o)
 {
 	// This function duplicates the functionality of enc_loop_neon32_inner,
@@ -106,7 +106,7 @@ enc_loop_neon32_inner_asm (const uint8_t **s, uint8_t **o)
 }
 #endif
 
-static inline void
+static BASE64_FORCE_INLINE void
 enc_loop_neon32_inner (const uint8_t **s, uint8_t **o)
 {
 #ifdef BASE64_NEON32_USE_ASM
diff --git a/lib/arch/neon32/enc_reshuffle.c b/lib/arch/neon32/enc_reshuffle.c
index d6e97cb5..fa94d279 100644
--- a/lib/arch/neon32/enc_reshuffle.c
+++ b/lib/arch/neon32/enc_reshuffle.c
@@ -1,4 +1,4 @@
-static inline uint8x16x4_t
+static BASE64_FORCE_INLINE uint8x16x4_t
 enc_reshuffle (uint8x16x3_t in)
 {
 	uint8x16x4_t out;
diff --git a/lib/arch/neon32/enc_translate.c b/lib/arch/neon32/enc_translate.c
index e616d54b..ff3d88dd 100644
--- a/lib/arch/neon32/enc_translate.c
+++ b/lib/arch/neon32/enc_translate.c
@@ -1,4 +1,4 @@
-static inline uint8x16x4_t
+static BASE64_FORCE_INLINE uint8x16x4_t
 enc_translate (const uint8x16x4_t in)
 {
 	// A lookup table containing the absolute offsets for all ranges:
diff --git a/lib/arch/neon64/codec.c b/lib/arch/neon64/codec.c
index f5cda63e..6b664b40 100644
--- a/lib/arch/neon64/codec.c
+++ b/lib/arch/neon64/codec.c
@@ -22,7 +22,7 @@
 #define BASE64_NEON64_USE_ASM
 #endif
 
-static inline uint8x16x4_t
+static BASE64_FORCE_INLINE uint8x16x4_t
 load_64byte_table (const uint8_t *p)
 {
 #ifdef BASE64_NEON64_USE_ASM
diff --git a/lib/arch/neon64/enc_loop.c b/lib/arch/neon64/enc_loop.c
index 59a1c597..8bdd0883 100644
--- a/lib/arch/neon64/enc_loop.c
+++ b/lib/arch/neon64/enc_loop.c
@@ -1,4 +1,4 @@
-static inline void
+static BASE64_FORCE_INLINE void
 enc_loop_neon64_inner (const uint8_t **s, uint8_t **o, const uint8x16x4_t tbl_enc)
 {
 	// Load 48 bytes and deinterleave:
diff --git a/lib/arch/neon64/enc_reshuffle.c b/lib/arch/neon64/enc_reshuffle.c
index ea543e04..2655df10 100644
--- a/lib/arch/neon64/enc_reshuffle.c
+++ b/lib/arch/neon64/enc_reshuffle.c
@@ -1,4 +1,4 @@
-static inline uint8x16x4_t
+static BASE64_FORCE_INLINE uint8x16x4_t
 enc_reshuffle (const uint8x16x3_t in)
 {
 	uint8x16x4_t out;
diff --git a/lib/arch/ssse3/dec_loop.c b/lib/arch/ssse3/dec_loop.c
index 9da71abe..7ddb73bf 100644
--- a/lib/arch/ssse3/dec_loop.c
+++ b/lib/arch/ssse3/dec_loop.c
@@ -65,7 +65,7 @@
 // 1110 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
 // 1111 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
 
-static inline int
+static BASE64_FORCE_INLINE int
 dec_loop_ssse3_inner (const uint8_t **s, uint8_t **o, size_t *rounds)
 {
 	const __m128i lut_lo = _mm_setr_epi8(
diff --git a/lib/arch/ssse3/dec_reshuffle.c b/lib/arch/ssse3/dec_reshuffle.c
index fdf587fe..d3dd3954 100644
--- a/lib/arch/ssse3/dec_reshuffle.c
+++ b/lib/arch/ssse3/dec_reshuffle.c
@@ -1,4 +1,4 @@
-static inline __m128i
+static BASE64_FORCE_INLINE __m128i
 dec_reshuffle (const __m128i in)
 {
 	// in, bits, upper case are most significant bits, lower case are least significant bits
diff --git a/lib/arch/ssse3/enc_loop.c b/lib/arch/ssse3/enc_loop.c
index 6de652e1..9b67b70d 100644
--- a/lib/arch/ssse3/enc_loop.c
+++ b/lib/arch/ssse3/enc_loop.c
@@ -1,4 +1,4 @@
-static inline void
+static BASE64_FORCE_INLINE void
 enc_loop_ssse3_inner (const uint8_t **s, uint8_t **o)
 {
 	// Load input:
diff --git a/lib/arch/ssse3/enc_reshuffle.c b/lib/arch/ssse3/enc_reshuffle.c
index b738591f..f9dc949f 100644
--- a/lib/arch/ssse3/enc_reshuffle.c
+++ b/lib/arch/ssse3/enc_reshuffle.c
@@ -1,4 +1,4 @@
-static inline __m128i
+static BASE64_FORCE_INLINE __m128i
 enc_reshuffle (__m128i in)
 {
 	// Input, bytes MSB to LSB:
diff --git a/lib/arch/ssse3/enc_translate.c b/lib/arch/ssse3/enc_translate.c
index 04f288fc..60d9a42b 100644
--- a/lib/arch/ssse3/enc_translate.c
+++ b/lib/arch/ssse3/enc_translate.c
@@ -1,4 +1,4 @@
-static inline __m128i
+static BASE64_FORCE_INLINE __m128i
 enc_translate (const __m128i in)
 {
 	// A lookup table containing the absolute offsets for all ranges:
diff --git a/lib/env.h b/lib/env.h
index d489ba54..08370650 100644
--- a/lib/env.h
+++ b/lib/env.h
@@ -71,4 +71,14 @@
 # define BASE64_FALLTHROUGH
 #endif
 
+// Declare a macro to ensure that functions that are intended to be inlined are
+// actually inlined, even when no optimization is applied. A lot of inner loop
+// code is factored into separate functions for reasons of readability, but
+// that code should always be inlined (and optimized) in the main loop.
+#ifdef _MSC_VER
+# define BASE64_FORCE_INLINE __forceinline
+#else
+# define BASE64_FORCE_INLINE inline __attribute__((always_inline))
+#endif
+
 #endif // BASE64_ENV_H
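
The new `BASE64_FORCE_INLINE` macro added in lib/env.h resolves to MSVC's `__forceinline` and to `inline __attribute__((always_inline))` on GCC/Clang, so the helper functions that were factored out of the encode/decode inner loops are still folded into their callers even in unoptimized builds. Below is a minimal, self-contained sketch of the same dispatch for anyone who wants to try it outside the library; the file name, the `FORCE_INLINE` macro name, and the `load_u32_be` helper are illustrative stand-ins, not part of this patch.

```c
// force_inline_demo.c -- illustrative sketch only, not part of the library.
// Shows the same compiler dispatch that lib/env.h uses for
// BASE64_FORCE_INLINE: __forceinline on MSVC, always_inline elsewhere.
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#ifdef _MSC_VER
# define FORCE_INLINE __forceinline
#else
# define FORCE_INLINE inline __attribute__((always_inline))
#endif

// A tiny helper in the style of the factored-out inner-loop functions.
// GCC and Clang honor always_inline even at -O0; MSVC treats
// __forceinline as a strong hint rather than a guarantee.
static FORCE_INLINE uint32_t
load_u32_be (const uint8_t *p)
{
	return ((uint32_t) p[0] << 24)
	     | ((uint32_t) p[1] << 16)
	     | ((uint32_t) p[2] <<  8)
	     | ((uint32_t) p[3]);
}

int
main (void)
{
	const uint8_t buf[4] = { 0xDE, 0xAD, 0xBE, 0xEF };

	// On GCC/Clang, even at -O0, the helper body is expanded here
	// rather than emitted as an out-of-line call.
	printf("0x%08" PRIX32 "\n", load_u32_be(buf));
	return 0;
}
```

Compiling with `gcc -O0 force_inline_demo.c` and inspecting the output of `objdump -d` is an easy way to confirm that the helper no longer appears as a separate call, which is the property the patch wants for the per-iteration codec helpers.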