Merge pull request #24648 from charris/backport-24461
MAINT: Refactor partial load workaround for Clang
charris committed Sep 5, 2023
2 parents acde96a + 9e8a7a8 commit bbd7561
Showing 10 changed files with 332 additions and 302 deletions.
meson.build: 32 changes (27 additions, 5 deletions)
@@ -55,11 +55,33 @@ add_project_arguments(
#
# Clang defaults to a non-strict floating error point model, but we need strict
# behavior. `-ftrapping-math` is equivalent to `-ffp-exception-behavior=strict`.
# Note that this is only supported on macOS arm64 as of XCode 14.3
if cc.get_id() == 'clang'
add_project_arguments(
cc.get_supported_arguments('-ftrapping-math'), language: ['c', 'cpp'],
)
# This flag is also required to prevent the activation of SIMD partial load workarounds.
# For further clarification, refer to gh-24461.
cc_id = cc.get_id()
if cc_id.startswith('clang')
  # Determine the compiler flags for trapping math exceptions.
  trapping_math = {
    'clang-cl': '/clang:-ftrapping-math',
  }.get(cc_id, '-ftrapping-math')
  # Check if the compiler supports the trapping math flag.
  if cc.has_argument(trapping_math)
    # TODO: Consider upgrading the vendored Meson to 1.3.0 to support the parameter `werror`
    # Detect whether the compiler actually supports strict handling of floating-point exceptions
    # by treating warnings as errors.
    if cc.compiles('int main() { return 0; }', args: [trapping_math, '-Werror'])
      trapping_math = [trapping_math, '-DNPY_HAVE_CLANG_FPSTRICT']
    else
      # Suppress warnings about unsupported floating-point optimization.
      trapping_math = [trapping_math, '-Wno-unsupported-floating-point-opt']
      # Inform the user about the workaround.
      message(
        'NumPy is being built against a version of Clang that does not strictly enforce ' +
        'floating-point exception handling. Workarounds will be used, which may impact performance.\n' +
        'Consider upgrading Clang to the latest version.'
      )
    endif
    add_project_arguments(trapping_math, language: ['c', 'cpp'])
  endif
endif

subdir('meson_cpu')
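For context: the Meson logic above only decides whether -ftrapping-math can be used and, when strict FP exception handling is confirmed, defines NPY_HAVE_CLANG_FPSTRICT. The C-level switch that enables the partial-load guards used throughout the memory.h changes below, NPY_SIMD_GUARD_PARTIAL_LOAD, is defined elsewhere, presumably in one of the changed files not shown in this excerpt. The sketch below is only a plausible reading of how such a guard could be keyed off that define; it is not the literal NumPy source.

/* Hedged sketch, not NumPy source: assume the guard is enabled whenever the
 * compiler is Clang but strict FP exception handling was not confirmed by the
 * build (i.e. NPY_HAVE_CLANG_FPSTRICT is absent). */
#if defined(__clang__) && !defined(NPY_HAVE_CLANG_FPSTRICT)
    #define NPY_SIMD_GUARD_PARTIAL_LOAD 1
#else
    #define NPY_SIMD_GUARD_PARTIAL_LOAD 0
#endif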
numpy/core/meson.build: 8 changes (2 additions, 6 deletions)
@@ -838,9 +838,7 @@ foreach gen_mtargets : [
[
'loops_exponent_log.dispatch.h',
src_file.process('src/umath/loops_exponent_log.dispatch.c.src'),
# Enabling SIMD on clang-cl raises spurious FP exceptions
# TODO (seiko2plus): debug spurious FP exceptions for single-precision log/exp
compiler_id == 'clang-cl' ? [] : [
[
AVX512_SKX, AVX512F, [AVX2, FMA3]
]
],
@@ -884,9 +882,7 @@ foreach gen_mtargets : [
[
'loops_trigonometric.dispatch.h',
src_file.process('src/umath/loops_trigonometric.dispatch.c.src'),
# Enabling SIMD on clang-cl raises spurious FP exceptions
# TODO (seiko2plus): debug spurious FP exceptions for single-precision sin/cos
compiler_id == 'clang-cl' ? [] : [
[
AVX512F, [AVX2, FMA3],
VSX4, VSX3, VSX2,
NEON_VFPV4,
numpy/core/src/common/simd/avx2/memory.h: 97 changes (76 additions, 21 deletions)
@@ -196,7 +196,12 @@ NPY_FINLINE npyv_s32 npyv_load_till_s32(const npy_int32 *ptr, npy_uintp nlane, n
__m256i vnlane = _mm256_set1_epi32(nlane > 8 ? 8 : (int)nlane);
__m256i mask = _mm256_cmpgt_epi32(vnlane, steps);
__m256i payload = _mm256_maskload_epi32((const int*)ptr, mask);
return _mm256_blendv_epi8(vfill, payload, mask);
__m256i ret = _mm256_blendv_epi8(vfill, payload, mask);
#if NPY_SIMD_GUARD_PARTIAL_LOAD
volatile __m256i workaround = ret;
ret = _mm256_or_si256(workaround, ret);
#endif
return ret;
}
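The volatile round-trip added above is the heart of the refactored workaround: copying the partial-load result into a volatile variable and OR-ing it back into itself leaves the value unchanged, but it makes the result opaque to the optimizer, so Clang's default non-strict floating-point model cannot fold the masked load into surrounding code in a way that evaluates the masked-out lanes and raises spurious FP exceptions (see gh-24461). Below is a minimal, self-contained sketch of the same pattern for an AVX2 target (compile with -mavx2); it is an illustration, not code from this diff, and the guard is shown unconditionally rather than behind NPY_SIMD_GUARD_PARTIAL_LOAD.

#include <immintrin.h>

/* Load the first nlane int32 elements from ptr and zero the remaining lanes,
 * guarded the same way as the npyv_load_till/tillz helpers in this file. */
__m256i load_lower_n_epi32(const int *ptr, unsigned nlane)
{
    const __m256i steps = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
    __m256i vnlane = _mm256_set1_epi32(nlane > 8 ? 8 : (int)nlane);
    __m256i mask   = _mm256_cmpgt_epi32(vnlane, steps);
    __m256i ret    = _mm256_maskload_epi32(ptr, mask);
    /* Guard: the volatile copy forces the value to be materialized, and the
     * OR with itself is a no-op that re-reads it, blocking the unwanted
     * optimization of the partial load. */
    volatile __m256i workaround = ret;
    ret = _mm256_or_si256(workaround, ret);
    return ret;
}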
// fill zero to rest lanes
NPY_FINLINE npyv_s32 npyv_load_tillz_s32(const npy_int32 *ptr, npy_uintp nlane)
@@ -205,7 +210,12 @@ NPY_FINLINE npyv_s32 npyv_load_tillz_s32(const npy_int32 *ptr, npy_uintp nlane)
const __m256i steps = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
__m256i vnlane = _mm256_set1_epi32(nlane > 8 ? 8 : (int)nlane);
__m256i mask = _mm256_cmpgt_epi32(vnlane, steps);
return _mm256_maskload_epi32((const int*)ptr, mask);
__m256i ret = _mm256_maskload_epi32((const int*)ptr, mask);
#if NPY_SIMD_GUARD_PARTIAL_LOAD
volatile __m256i workaround = ret;
ret = _mm256_or_si256(workaround, ret);
#endif
return ret;
}
//// 64
NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, npy_int64 fill)
@@ -216,7 +226,12 @@ NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, n
__m256i vnlane = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane);
__m256i mask = _mm256_cmpgt_epi64(vnlane, steps);
__m256i payload = _mm256_maskload_epi64((const long long*)ptr, mask);
return _mm256_blendv_epi8(vfill, payload, mask);
__m256i ret = _mm256_blendv_epi8(vfill, payload, mask);
#if NPY_SIMD_GUARD_PARTIAL_LOAD
volatile __m256i workaround = ret;
ret = _mm256_or_si256(workaround, ret);
#endif
return ret;
}
// fill zero to rest lanes
NPY_FINLINE npyv_s64 npyv_load_tillz_s64(const npy_int64 *ptr, npy_uintp nlane)
@@ -225,7 +240,12 @@ NPY_FINLINE npyv_s64 npyv_load_tillz_s64(const npy_int64 *ptr, npy_uintp nlane)
const __m256i steps = npyv_set_s64(0, 1, 2, 3);
__m256i vnlane = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane);
__m256i mask = _mm256_cmpgt_epi64(vnlane, steps);
return _mm256_maskload_epi64((const long long*)ptr, mask);
__m256i ret = _mm256_maskload_epi64((const long long*)ptr, mask);
#if NPY_SIMD_GUARD_PARTIAL_LOAD
volatile __m256i workaround = ret;
ret = _mm256_or_si256(workaround, ret);
#endif
return ret;
}

//// 64-bit nlane
@@ -241,7 +261,12 @@ NPY_FINLINE npyv_s32 npyv_load2_till_s32(const npy_int32 *ptr, npy_uintp nlane,
__m256i vnlane = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane);
__m256i mask = _mm256_cmpgt_epi64(vnlane, steps);
__m256i payload = _mm256_maskload_epi64((const long long*)ptr, mask);
return _mm256_blendv_epi8(vfill, payload, mask);
__m256i ret = _mm256_blendv_epi8(vfill, payload, mask);
#if NPY_SIMD_GUARD_PARTIAL_LOAD
volatile __m256i workaround = ret;
ret = _mm256_or_si256(workaround, ret);
#endif
return ret;
}
// fill zero to rest lanes
NPY_FINLINE npyv_s32 npyv_load2_tillz_s32(const npy_int32 *ptr, npy_uintp nlane)
@@ -251,19 +276,29 @@ NPY_FINLINE npyv_s32 npyv_load2_tillz_s32(const npy_int32 *ptr, npy_uintp nlane)
NPY_FINLINE npyv_u64 npyv_load2_tillz_s64(const npy_int64 *ptr, npy_uintp nlane)
{
assert(nlane > 0);
    npy_int64 m = -((npy_int64)(nlane > 1));
__m256i mask = npyv_set_s64(-1, -1, m, m);
return _mm256_maskload_epi64((const long long*)ptr, mask);
__m256i ret = _mm256_maskload_epi64((const long long*)ptr, mask);
#if NPY_SIMD_GUARD_PARTIAL_LOAD
volatile __m256i workaround = ret;
ret = _mm256_or_si256(workaround, ret);
#endif
return ret;
}
// fill zero to rest lanes
NPY_FINLINE npyv_u64 npyv_load2_till_s64(const npy_int64 *ptr, npy_uintp nlane,
npy_int64 fill_lo, npy_int64 fill_hi)
{
const __m256i vfill = npyv_set_s64(0, 0, fill_lo, fill_hi);
    npy_int64 m = -((npy_int64)(nlane > 1));
    __m256i mask = npyv_set_s64(-1, -1, m, m);
__m256i payload = _mm256_maskload_epi64((const long long*)ptr, mask);
return _mm256_blendv_epi8(vfill, payload, mask);
__m256i ret = _mm256_blendv_epi8(vfill, payload, mask);
#if NPY_SIMD_GUARD_PARTIAL_LOAD
volatile __m256i workaround = ret;
ret = _mm256_or_si256(workaround, ret);
#endif
return ret;
}
/*********************************
* Non-contiguous partial load
@@ -277,9 +312,14 @@ npyv_loadn_till_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npy_
const __m256i vfill = _mm256_set1_epi32(fill);
const __m256i steps = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
const __m256i idx = _mm256_mullo_epi32(_mm256_set1_epi32((int)stride), steps);
__m256i vnlane = _mm256_set1_epi32(nlane > 8 ? 8 : (int)nlane);
__m256i mask = _mm256_cmpgt_epi32(vnlane, steps);
return _mm256_mask_i32gather_epi32(vfill, (const int*)ptr, idx, mask, 4);
__m256i ret = _mm256_mask_i32gather_epi32(vfill, (const int*)ptr, idx, mask, 4);
#if NPY_SIMD_GUARD_PARTIAL_LOAD
volatile __m256i workaround = ret;
ret = _mm256_or_si256(workaround, ret);
#endif
return ret;
}
// fill zero to rest lanes
NPY_FINLINE npyv_s32
@@ -293,9 +333,14 @@ npyv_loadn_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npy_
const __m256i vfill = npyv_setall_s64(fill);
const __m256i idx = npyv_set_s64(0, 1*stride, 2*stride, 3*stride);
const __m256i steps = npyv_set_s64(0, 1, 2, 3);
__m256i vnlane = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane);
__m256i mask = _mm256_cmpgt_epi64(vnlane, steps);
return _mm256_mask_i64gather_epi64(vfill, (const long long*)ptr, idx, mask, 8);
__m256i ret = _mm256_mask_i64gather_epi64(vfill, (const long long*)ptr, idx, mask, 8);
#if NPY_SIMD_GUARD_PARTIAL_LOAD
volatile __m256i workaround = ret;
ret = _mm256_or_si256(workaround, ret);
#endif
return ret;
}
// fill zero to rest lanes
NPY_FINLINE npyv_s64
@@ -313,17 +358,22 @@ NPY_FINLINE npyv_s64 npyv_loadn2_till_s32(const npy_int32 *ptr, npy_intp stride,
);
const __m256i idx = npyv_set_s64(0, 1*stride, 2*stride, 3*stride);
const __m256i steps = npyv_set_s64(0, 1, 2, 3);
__m256i vnlane = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane);
__m256i mask = _mm256_cmpgt_epi64(vnlane, steps);
return _mm256_mask_i64gather_epi64(vfill, (const long long*)ptr, idx, mask, 4);
__m256i ret = _mm256_mask_i64gather_epi64(vfill, (const long long*)ptr, idx, mask, 4);
#if NPY_SIMD_GUARD_PARTIAL_LOAD
volatile __m256i workaround = ret;
ret = _mm256_or_si256(workaround, ret);
#endif
return ret;
}
// fill zero to rest lanes
NPY_FINLINE npyv_s32 npyv_loadn2_tillz_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane)
{ return npyv_loadn2_till_s32(ptr, stride, nlane, 0, 0); }

//// 128-bit load over 64-bit stride
NPY_FINLINE npyv_s64 npyv_loadn2_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane,
                                            npy_int64 fill_lo, npy_int64 fill_hi)
{
assert(nlane > 0);
__m256i a = npyv_loadl_s64(ptr);
@@ -336,7 +386,12 @@ NPY_FINLINE npyv_s64 npyv_loadn2_till_s64(const npy_int64 *ptr, npy_intp stride,
__m128i fill = _mm_set_epi64x(fill_hi, fill_lo);
#endif
__m128i b = nlane > 1 ? _mm_loadu_si128((const __m128i*)(ptr + stride)) : fill;
return _mm256_inserti128_si256(a, b, 1);
__m256i ret = _mm256_inserti128_si256(a, b, 1);
#if NPY_SIMD_GUARD_PARTIAL_LOAD
volatile __m256i workaround = ret;
ret = _mm256_or_si256(workaround, ret);
#endif
return ret;
}
// fill zero to rest lanes
NPY_FINLINE npyv_s64 npyv_loadn2_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane)
numpy/core/src/common/simd/avx512/memory.h: 77 changes (66 additions, 11 deletions)
@@ -248,29 +248,49 @@ NPY_FINLINE npyv_s32 npyv_load_till_s32(const npy_int32 *ptr, npy_uintp nlane, n
assert(nlane > 0);
const __m512i vfill = _mm512_set1_epi32(fill);
const __mmask16 mask = nlane > 15 ? -1 : (1 << nlane) - 1;
return _mm512_mask_loadu_epi32(vfill, mask, (const __m512i*)ptr);
__m512i ret = _mm512_mask_loadu_epi32(vfill, mask, (const __m512i*)ptr);
#if NPY_SIMD_GUARD_PARTIAL_LOAD
volatile __m512i workaround = ret;
ret = _mm512_or_si512(workaround, ret);
#endif
return ret;
}
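The AVX-512 variants use the same volatile guard but express the partial-load predicate as a bit mask in a mask register (__mmask16 here, __mmask8 for the 64-bit lanes) rather than a vector compare: (1 << nlane) - 1 sets exactly the low nlane bits, and nlane > 15 saturates to all lanes enabled. A small, self-contained illustration of that mask arithmetic (plain C, no AVX-512 required; not code from this diff):

#include <stdio.h>

int main(void)
{
    /* Mirror the mask construction above: lanes 0..nlane-1 are loaded,
     * the remaining lanes come from the fill vector (or are zeroed). */
    for (unsigned nlane = 1; nlane <= 16; ++nlane) {
        unsigned short mask = nlane > 15 ? (unsigned short)-1
                                         : (unsigned short)((1u << nlane) - 1);
        printf("nlane=%2u -> mask=0x%04x\n", nlane, mask);
    }
    return 0;
}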
// fill zero to rest lanes
NPY_FINLINE npyv_s32 npyv_load_tillz_s32(const npy_int32 *ptr, npy_uintp nlane)
{
assert(nlane > 0);
const __mmask16 mask = nlane > 15 ? -1 : (1 << nlane) - 1;
return _mm512_maskz_loadu_epi32(mask, (const __m512i*)ptr);
__m512i ret = _mm512_maskz_loadu_epi32(mask, (const __m512i*)ptr);
#if NPY_SIMD_GUARD_PARTIAL_LOAD
volatile __m512i workaround = ret;
ret = _mm512_or_si512(workaround, ret);
#endif
return ret;
}
//// 64
NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, npy_int64 fill)
{
assert(nlane > 0);
const __m512i vfill = npyv_setall_s64(fill);
const __mmask8 mask = nlane > 7 ? -1 : (1 << nlane) - 1;
return _mm512_mask_loadu_epi64(vfill, mask, (const __m512i*)ptr);
__m512i ret = _mm512_mask_loadu_epi64(vfill, mask, (const __m512i*)ptr);
#if NPY_SIMD_GUARD_PARTIAL_LOAD
volatile __m512i workaround = ret;
ret = _mm512_or_si512(workaround, ret);
#endif
return ret;
}
// fill zero to rest lanes
NPY_FINLINE npyv_s64 npyv_load_tillz_s64(const npy_int64 *ptr, npy_uintp nlane)
{
assert(nlane > 0);
const __mmask8 mask = nlane > 7 ? -1 : (1 << nlane) - 1;
return _mm512_maskz_loadu_epi64(mask, (const __m512i*)ptr);
__m512i ret = _mm512_maskz_loadu_epi64(mask, (const __m512i*)ptr);
#if NPY_SIMD_GUARD_PARTIAL_LOAD
volatile __m512i workaround = ret;
ret = _mm512_or_si512(workaround, ret);
#endif
return ret;
}

//// 64-bit nlane
@@ -280,7 +300,12 @@ NPY_FINLINE npyv_s32 npyv_load2_till_s32(const npy_int32 *ptr, npy_uintp nlane,
assert(nlane > 0);
const __m512i vfill = _mm512_set4_epi32(fill_hi, fill_lo, fill_hi, fill_lo);
const __mmask8 mask = nlane > 7 ? -1 : (1 << nlane) - 1;
return _mm512_mask_loadu_epi64(vfill, mask, (const __m512i*)ptr);
__m512i ret = _mm512_mask_loadu_epi64(vfill, mask, (const __m512i*)ptr);
#if NPY_SIMD_GUARD_PARTIAL_LOAD
volatile __m512i workaround = ret;
ret = _mm512_or_si512(workaround, ret);
#endif
return ret;
}
// fill zero to rest lanes
NPY_FINLINE npyv_s32 npyv_load2_tillz_s32(const npy_int32 *ptr, npy_uintp nlane)
@@ -293,14 +318,24 @@ NPY_FINLINE npyv_u64 npyv_load2_till_s64(const npy_int64 *ptr, npy_uintp nlane,
assert(nlane > 0);
const __m512i vfill = _mm512_set4_epi64(fill_hi, fill_lo, fill_hi, fill_lo);
const __mmask8 mask = nlane > 3 ? -1 : (1 << (nlane*2)) - 1;
return _mm512_mask_loadu_epi64(vfill, mask, (const __m512i*)ptr);
__m512i ret = _mm512_mask_loadu_epi64(vfill, mask, (const __m512i*)ptr);
#if NPY_SIMD_GUARD_PARTIAL_LOAD
volatile __m512i workaround = ret;
ret = _mm512_or_si512(workaround, ret);
#endif
return ret;
}
// fill zero to rest lanes
NPY_FINLINE npyv_s64 npyv_load2_tillz_s64(const npy_int64 *ptr, npy_uintp nlane)
{
assert(nlane > 0);
const __mmask8 mask = nlane > 3 ? -1 : (1 << (nlane*2)) - 1;
return _mm512_maskz_loadu_epi64(mask, (const __m512i*)ptr);
__m512i ret = _mm512_maskz_loadu_epi64(mask, (const __m512i*)ptr);
#if NPY_SIMD_GUARD_PARTIAL_LOAD
volatile __m512i workaround = ret;
ret = _mm512_or_si512(workaround, ret);
#endif
return ret;
}
/*********************************
* Non-contiguous partial load
@@ -317,7 +352,12 @@ npyv_loadn_till_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npy_
const __m512i idx = _mm512_mullo_epi32(steps, _mm512_set1_epi32((int)stride));
const __m512i vfill = _mm512_set1_epi32(fill);
const __mmask16 mask = nlane > 15 ? -1 : (1 << nlane) - 1;
return _mm512_mask_i32gather_epi32(vfill, mask, idx, (const __m512i*)ptr, 4);
__m512i ret = _mm512_mask_i32gather_epi32(vfill, mask, idx, (const __m512i*)ptr, 4);
#if NPY_SIMD_GUARD_PARTIAL_LOAD
volatile __m512i workaround = ret;
ret = _mm512_or_si512(workaround, ret);
#endif
return ret;
}
// fill zero to rest lanes
NPY_FINLINE npyv_s32
@@ -334,7 +374,12 @@ npyv_loadn_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npy_
);
const __m512i vfill = npyv_setall_s64(fill);
const __mmask8 mask = nlane > 15 ? -1 : (1 << nlane) - 1;
return _mm512_mask_i64gather_epi64(vfill, mask, idx, (const __m512i*)ptr, 8);
__m512i ret = _mm512_mask_i64gather_epi64(vfill, mask, idx, (const __m512i*)ptr, 8);
#if NPY_SIMD_GUARD_PARTIAL_LOAD
volatile __m512i workaround = ret;
ret = _mm512_or_si512(workaround, ret);
#endif
return ret;
}
// fill zero to rest lanes
NPY_FINLINE npyv_s64
@@ -352,7 +397,12 @@ NPY_FINLINE npyv_s64 npyv_loadn2_till_s32(const npy_int32 *ptr, npy_intp stride,
);
const __m512i vfill = _mm512_set4_epi32(fill_hi, fill_lo, fill_hi, fill_lo);
const __mmask8 mask = nlane > 7 ? -1 : (1 << nlane) - 1;
return _mm512_mask_i64gather_epi64(vfill, mask, idx, (const __m512i*)ptr, 4);
__m512i ret = _mm512_mask_i64gather_epi64(vfill, mask, idx, (const __m512i*)ptr, 4);
#if NPY_SIMD_GUARD_PARTIAL_LOAD
volatile __m512i workaround = ret;
ret = _mm512_or_si512(workaround, ret);
#endif
return ret;
}
// fill zero to rest lanes
NPY_FINLINE npyv_s32 npyv_loadn2_tillz_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane)
@@ -369,7 +419,12 @@ NPY_FINLINE npyv_s64 npyv_loadn2_till_s64(const npy_int64 *ptr, npy_intp stride,
);
const __mmask8 mask = nlane > 3 ? -1 : (1 << (nlane*2)) - 1;
const __m512i vfill = _mm512_set4_epi64(fill_hi, fill_lo, fill_hi, fill_lo);
return _mm512_mask_i64gather_epi64(vfill, mask, idx, (const __m512i*)ptr, 8);
__m512i ret = _mm512_mask_i64gather_epi64(vfill, mask, idx, (const __m512i*)ptr, 8);
#if NPY_SIMD_GUARD_PARTIAL_LOAD
volatile __m512i workaround = ret;
ret = _mm512_or_si512(workaround, ret);
#endif
return ret;
}
// fill zero to rest lanes
NPY_FINLINE npyv_s64 npyv_loadn2_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane)
