Merge pull request #24648 from charris/backport-24461
MAINT: Refactor partial load workaround for Clang
charris committed Sep 5, 2023
2 parents acde96a + 9e8a7a8 commit bbd7561
Showing 10 changed files with 332 additions and 302 deletions.
meson.build: 32 changes (27 additions, 5 deletions)
@@ -55,11 +55,33 @@ add_project_arguments(
#
# Clang defaults to a non-strict floating error point model, but we need strict
# behavior. `-ftrapping-math` is equivalent to `-ffp-exception-behavior=strict`.
# Note that this is only supported on macOS arm64 as of XCode 14.3
if cc.get_id() == 'clang'
add_project_arguments(
cc.get_supported_arguments('-ftrapping-math'), language: ['c', 'cpp'],
)
# This flag is also required to prevent the activation of SIMD partial load workarounds.
# For further clarification, refer to gh-24461.
cc_id = cc.get_id()
if cc_id.startswith('clang')
  # Determine the compiler flags for trapping math exceptions.
  trapping_math = {
    'clang-cl': '/clang:-ftrapping-math',
  }.get(cc_id, '-ftrapping-math')
  # Check if the compiler supports the trapping math flag.
  if cc.has_argument(trapping_math)
    # TODO: Consider upgrading the vendored Meson to 1.3.0 to support the parameter `werror`
    # Detect whether the compiler actually supports strict handling of floating-point exceptions
    # by treating warnings as errors.
    if cc.compiles('int main() { return 0; }', args: [trapping_math, '-Werror'])
      trapping_math = [trapping_math, '-DNPY_HAVE_CLANG_FPSTRICT']
    else
      # Suppress warnings about unsupported floating-point optimization.
      trapping_math = [trapping_math, '-Wno-unsupported-floating-point-opt']
      # Inform the user about the workaround.
      message(
        'NumPy is being built against a version of Clang that does not strictly enforce ' +
        'floating-point exception handling. Workarounds will be used, which may impact performance.\n' +
        'Consider upgrading Clang to the latest version.'
      )
    endif
    add_project_arguments(trapping_math, language: ['c', 'cpp'])
  endif
endif

subdir('meson_cpu')
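For context: the Meson logic above only decides whether -ftrapping-math can be used and, when strict FP exception handling is confirmed, defines NPY_HAVE_CLANG_FPSTRICT. The C-level switch that enables the partial-load guards used throughout the memory.h changes below, NPY_SIMD_GUARD_PARTIAL_LOAD, is defined elsewhere, presumably in one of the changed files not shown in this excerpt. The sketch below is only a plausible reading of how such a guard could be keyed off that define; it is not the literal NumPy source.

/* Hedged sketch, not NumPy source: assume the guard is enabled whenever the
 * compiler is Clang but strict FP exception handling was not confirmed by the
 * build (i.e. NPY_HAVE_CLANG_FPSTRICT is absent). */
#if defined(__clang__) && !defined(NPY_HAVE_CLANG_FPSTRICT)
    #define NPY_SIMD_GUARD_PARTIAL_LOAD 1
#else
    #define NPY_SIMD_GUARD_PARTIAL_LOAD 0
#endif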
numpy/core/meson.build: 8 changes (2 additions, 6 deletions)
@@ -838,9 +838,7 @@ foreach gen_mtargets : [
[
'loops_exponent_log.dispatch.h',
src_file.process('src/umath/loops_exponent_log.dispatch.c.src'),
# Enabling SIMD on clang-cl raises spurious FP exceptions
# TODO (seiko2plus): debug spurious FP exceptions for single-precision log/exp
compiler_id == 'clang-cl' ? [] : [
[
AVX512_SKX, AVX512F, [AVX2, FMA3]
]
],
@@ -884,9 +882,7 @@ foreach gen_mtargets : [
[
'loops_trigonometric.dispatch.h',
src_file.process('src/umath/loops_trigonometric.dispatch.c.src'),
# Enabling SIMD on clang-cl raises spurious FP exceptions
# TODO (seiko2plus): debug spurious FP exceptions for single-precision sin/cos
compiler_id == 'clang-cl' ? [] : [
[
AVX512F, [AVX2, FMA3],
VSX4, VSX3, VSX2,
NEON_VFPV4,
numpy/core/src/common/simd/avx2/memory.h: 97 changes (76 additions, 21 deletions)
@@ -196,7 +196,12 @@ NPY_FINLINE npyv_s32 npyv_load_till_s32(const npy_int32 *ptr, npy_uintp nlane, n
__m256i vnlane = _mm256_set1_epi32(nlane > 8 ? 8 : (int)nlane);
__m256i mask = _mm256_cmpgt_epi32(vnlane, steps);
__m256i payload = _mm256_maskload_epi32((const int*)ptr, mask);
return _mm256_blendv_epi8(vfill, payload, mask);
__m256i ret = _mm256_blendv_epi8(vfill, payload, mask);
#if NPY_SIMD_GUARD_PARTIAL_LOAD
volatile __m256i workaround = ret;
ret = _mm256_or_si256(workaround, ret);
#endif
return ret;
}
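The volatile round-trip added above is the heart of the refactored workaround: copying the partial-load result into a volatile variable and OR-ing it back into itself leaves the value unchanged, but it makes the result opaque to the optimizer, so Clang's default non-strict floating-point model cannot fold the masked load into surrounding code in a way that evaluates the masked-out lanes and raises spurious FP exceptions (see gh-24461). Below is a minimal, self-contained sketch of the same pattern for an AVX2 target (compile with -mavx2); it is an illustration, not code from this diff, and the guard is shown unconditionally rather than behind NPY_SIMD_GUARD_PARTIAL_LOAD.

#include <immintrin.h>

/* Load the first nlane int32 elements from ptr and zero the remaining lanes,
 * guarded the same way as the npyv_load_till/tillz helpers in this file. */
__m256i load_lower_n_epi32(const int *ptr, unsigned nlane)
{
    const __m256i steps = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
    __m256i vnlane = _mm256_set1_epi32(nlane > 8 ? 8 : (int)nlane);
    __m256i mask   = _mm256_cmpgt_epi32(vnlane, steps);
    __m256i ret    = _mm256_maskload_epi32(ptr, mask);
    /* Guard: the volatile copy forces the value to be materialized, and the
     * OR with itself is a no-op that re-reads it, blocking the unwanted
     * optimization of the partial load. */
    volatile __m256i workaround = ret;
    ret = _mm256_or_si256(workaround, ret);
    return ret;
}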
// fill zero to rest lanes
NPY_FINLINE npyv_s32 npyv_load_tillz_s32(const npy_int32 *ptr, npy_uintp nlane)
@@ -205,7 +210,12 @@ NPY_FINLINE npyv_s32 npyv_load_tillz_s32(const npy_int32 *ptr, npy_uintp nlane)
const __m256i steps = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
__m256i vnlane = _mm256_set1_epi32(nlane > 8 ? 8 : (int)nlane);
__m256i mask = _mm256_cmpgt_epi32(vnlane, steps);
return _mm256_maskload_epi32((const int*)ptr, mask);
__m256i ret = _mm256_maskload_epi32((const int*)ptr, mask);
#if NPY_SIMD_GUARD_PARTIAL_LOAD
volatile __m256i workaround = ret;
ret = _mm256_or_si256(workaround, ret);
#endif
return ret;
}
//// 64
NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, npy_int64 fill)
@@ -216,7 +226,12 @@ NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, n
__m256i vnlane = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane);
__m256i mask = _mm256_cmpgt_epi64(vnlane, steps);
__m256i payload = _mm256_maskload_epi64((const long long*)ptr, mask);
return _mm256_blendv_epi8(vfill, payload, mask);
__m256i ret = _mm256_blendv_epi8(vfill, payload, mask);
#if NPY_SIMD_GUARD_PARTIAL_LOAD
volatile __m256i workaround = ret;
ret = _mm256_or_si256(workaround, ret);
#endif
return ret;
}
// fill zero to rest lanes
NPY_FINLINE npyv_s64 npyv_load_tillz_s64(const npy_int64 *ptr, npy_uintp nlane)
@@ -225,7 +240,12 @@ NPY_FINLINE npyv_s64 npyv_load_tillz_s64(const npy_int64 *ptr, npy_uintp nlane)
const __m256i steps = npyv_set_s64(0, 1, 2, 3);
__m256i vnlane = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane);
__m256i mask = _mm256_cmpgt_epi64(vnlane, steps);
return _mm256_maskload_epi64((const long long*)ptr, mask);
__m256i ret = _mm256_maskload_epi64((const long long*)ptr, mask);
#if NPY_SIMD_GUARD_PARTIAL_LOAD
volatile __m256i workaround = ret;
ret = _mm256_or_si256(workaround, ret);
#endif
return ret;
}

//// 64-bit nlane
@@ -241,7 +261,12 @@ NPY_FINLINE npyv_s32 npyv_load2_till_s32(const npy_int32 *ptr, npy_uintp nlane,
__m256i vnlane = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane);
__m256i mask = _mm256_cmpgt_epi64(vnlane, steps);
__m256i payload = _mm256_maskload_epi64((const long long*)ptr, mask);
return _mm256_blendv_epi8(vfill, payload, mask);
__m256i ret = _mm256_blendv_epi8(vfill, payload, mask);
#if NPY_SIMD_GUARD_PARTIAL_LOAD
volatile __m256i workaround = ret;
ret = _mm256_or_si256(workaround, ret);
#endif
return ret;
}
// fill zero to rest lanes
NPY_FINLINE npyv_s32 npyv_load2_tillz_s32(const npy_int32 *ptr, npy_uintp nlane)
@@ -251,19 +276,29 @@ NPY_FINLINE npyv_s32 npyv_load2_tillz_s32(const npy_int32 *ptr, npy_uintp nlane)
NPY_FINLINE npyv_u64 npyv_load2_tillz_s64(const npy_int64 *ptr, npy_uintp nlane)
{
assert(nlane > 0);
    npy_int64 m = -((npy_int64)(nlane > 1));
__m256i mask = npyv_set_s64(-1, -1, m, m);
return _mm256_maskload_epi64((const long long*)ptr, mask);
__m256i ret = _mm256_maskload_epi64((const long long*)ptr, mask);
#if NPY_SIMD_GUARD_PARTIAL_LOAD
volatile __m256i workaround = ret;
ret = _mm256_or_si256(workaround, ret);
#endif
return ret;
}
// fill zero to rest lanes
NPY_FINLINE npyv_u64 npyv_load2_till_s64(const npy_int64 *ptr, npy_uintp nlane,
npy_int64 fill_lo, npy_int64 fill_hi)
{
const __m256i vfill = npyv_set_s64(0, 0, fill_lo, fill_hi);
    npy_int64 m = -((npy_int64)(nlane > 1));
    __m256i mask = npyv_set_s64(-1, -1, m, m);
__m256i payload = _mm256_maskload_epi64((const long long*)ptr, mask);
return _mm256_blendv_epi8(vfill, payload, mask);
__m256i ret = _mm256_blendv_epi8(vfill, payload, mask);
#if NPY_SIMD_GUARD_PARTIAL_LOAD
volatile __m256i workaround = ret;
ret = _mm256_or_si256(workaround, ret);
#endif
return ret;
}
/*********************************
* Non-contiguous partial load
@@ -277,9 +312,14 @@ npyv_loadn_till_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npy_
const __m256i vfill = _mm256_set1_epi32(fill);
const __m256i steps = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
const __m256i idx = _mm256_mullo_epi32(_mm256_set1_epi32((int)stride), steps);
__m256i vnlane = _mm256_set1_epi32(nlane > 8 ? 8 : (int)nlane);
__m256i mask = _mm256_cmpgt_epi32(vnlane, steps);
return _mm256_mask_i32gather_epi32(vfill, (const int*)ptr, idx, mask, 4);
__m256i ret = _mm256_mask_i32gather_epi32(vfill, (const int*)ptr, idx, mask, 4);
#if NPY_SIMD_GUARD_PARTIAL_LOAD
volatile __m256i workaround = ret;
ret = _mm256_or_si256(workaround, ret);
#endif
return ret;
}
// fill zero to rest lanes
NPY_FINLINE npyv_s32
@@ -293,9 +333,14 @@ npyv_loadn_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npy_
const __m256i vfill = npyv_setall_s64(fill);
const __m256i idx = npyv_set_s64(0, 1*stride, 2*stride, 3*stride);
const __m256i steps = npyv_set_s64(0, 1, 2, 3);
__m256i vnlane = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane);
__m256i mask = _mm256_cmpgt_epi64(vnlane, steps);
return _mm256_mask_i64gather_epi64(vfill, (const long long*)ptr, idx, mask, 8);
__m256i ret = _mm256_mask_i64gather_epi64(vfill, (const long long*)ptr, idx, mask, 8);
#if NPY_SIMD_GUARD_PARTIAL_LOAD
volatile __m256i workaround = ret;
ret = _mm256_or_si256(workaround, ret);
#endif
return ret;
}
// fill zero to rest lanes
NPY_FINLINE npyv_s64
@@ -313,17 +358,22 @@ NPY_FINLINE npyv_s64 npyv_loadn2_till_s32(const npy_int32 *ptr, npy_intp stride,
);
const __m256i idx = npyv_set_s64(0, 1*stride, 2*stride, 3*stride);
const __m256i steps = npyv_set_s64(0, 1, 2, 3);
__m256i vnlane = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane);
__m256i mask = _mm256_cmpgt_epi64(vnlane, steps);
return _mm256_mask_i64gather_epi64(vfill, (const long long*)ptr, idx, mask, 4);
__m256i ret = _mm256_mask_i64gather_epi64(vfill, (const long long*)ptr, idx, mask, 4);
#if NPY_SIMD_GUARD_PARTIAL_LOAD
volatile __m256i workaround = ret;
ret = _mm256_or_si256(workaround, ret);
#endif
return ret;
}
// fill zero to rest lanes
NPY_FINLINE npyv_s32 npyv_loadn2_tillz_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane)
{ return npyv_loadn2_till_s32(ptr, stride, nlane, 0, 0); }

//// 128-bit load over 64-bit stride
NPY_FINLINE npyv_s64 npyv_loadn2_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane,
                                            npy_int64 fill_lo, npy_int64 fill_hi)
{
assert(nlane > 0);
__m256i a = npyv_loadl_s64(ptr);
@@ -336,7 +386,12 @@ NPY_FINLINE npyv_s64 npyv_loadn2_till_s64(const npy_int64 *ptr, npy_intp stride,
__m128i fill = _mm_set_epi64x(fill_hi, fill_lo);
#endif
__m128i b = nlane > 1 ? _mm_loadu_si128((const __m128i*)(ptr + stride)) : fill;
return _mm256_inserti128_si256(a, b, 1);
__m256i ret = _mm256_inserti128_si256(a, b, 1);
#if NPY_SIMD_GUARD_PARTIAL_LOAD
volatile __m256i workaround = ret;
ret = _mm256_or_si256(workaround, ret);
#endif
return ret;
}
// fill zero to rest lanes
NPY_FINLINE npyv_s64 npyv_loadn2_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane)
numpy/core/src/common/simd/avx512/memory.h: 77 changes (66 additions, 11 deletions)
@@ -248,29 +248,49 @@ NPY_FINLINE npyv_s32 npyv_load_till_s32(const npy_int32 *ptr, npy_uintp nlane, n
assert(nlane > 0);
const __m512i vfill = _mm512_set1_epi32(fill);
const __mmask16 mask = nlane > 15 ? -1 : (1 << nlane) - 1;
return _mm512_mask_loadu_epi32(vfill, mask, (const __m512i*)ptr);
__m512i ret = _mm512_mask_loadu_epi32(vfill, mask, (const __m512i*)ptr);
#if NPY_SIMD_GUARD_PARTIAL_LOAD
volatile __m512i workaround = ret;
ret = _mm512_or_si512(workaround, ret);
#endif
return ret;
}
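The AVX-512 variants use the same volatile guard but express the partial-load predicate as a bit mask in a mask register (__mmask16 here, __mmask8 for the 64-bit lanes) rather than a vector compare: (1 << nlane) - 1 sets exactly the low nlane bits, and nlane > 15 saturates to all lanes enabled. A small, self-contained illustration of that mask arithmetic (plain C, no AVX-512 required; not code from this diff):

#include <stdio.h>

int main(void)
{
    /* Mirror the mask construction above: lanes 0..nlane-1 are loaded,
     * the remaining lanes come from the fill vector (or are zeroed). */
    for (unsigned nlane = 1; nlane <= 16; ++nlane) {
        unsigned short mask = nlane > 15 ? (unsigned short)-1
                                         : (unsigned short)((1u << nlane) - 1);
        printf("nlane=%2u -> mask=0x%04x\n", nlane, mask);
    }
    return 0;
}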
// fill zero to rest lanes
NPY_FINLINE npyv_s32 npyv_load_tillz_s32(const npy_int32 *ptr, npy_uintp nlane)
{
assert(nlane > 0);
const __mmask16 mask = nlane > 15 ? -1 : (1 << nlane) - 1;
return _mm512_maskz_loadu_epi32(mask, (const __m512i*)ptr);
__m512i ret = _mm512_maskz_loadu_epi32(mask, (const __m512i*)ptr);
#if NPY_SIMD_GUARD_PARTIAL_LOAD
volatile __m512i workaround = ret;
ret = _mm512_or_si512(workaround, ret);
#endif
return ret;
}
//// 64
NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, npy_int64 fill)
{
assert(nlane > 0);
const __m512i vfill = npyv_setall_s64(fill);
const __mmask8 mask = nlane > 7 ? -1 : (1 << nlane) - 1;
return _mm512_mask_loadu_epi64(vfill, mask, (const __m512i*)ptr);
__m512i ret = _mm512_mask_loadu_epi64(vfill, mask, (const __m512i*)ptr);
#if NPY_SIMD_GUARD_PARTIAL_LOAD
volatile __m512i workaround = ret;
ret = _mm512_or_si512(workaround, ret);
#endif
return ret;
}
// fill zero to rest lanes
NPY_FINLINE npyv_s64 npyv_load_tillz_s64(const npy_int64 *ptr, npy_uintp nlane)
{
assert(nlane > 0);
const __mmask8 mask = nlane > 7 ? -1 : (1 << nlane) - 1;
return _mm512_maskz_loadu_epi64(mask, (const __m512i*)ptr);
__m512i ret = _mm512_maskz_loadu_epi64(mask, (const __m512i*)ptr);
#if NPY_SIMD_GUARD_PARTIAL_LOAD
volatile __m512i workaround = ret;
ret = _mm512_or_si512(workaround, ret);
#endif
return ret;
}

//// 64-bit nlane
@@ -280,7 +300,12 @@ NPY_FINLINE npyv_s32 npyv_load2_till_s32(const npy_int32 *ptr, npy_uintp nlane,
assert(nlane > 0);
const __m512i vfill = _mm512_set4_epi32(fill_hi, fill_lo, fill_hi, fill_lo);
const __mmask8 mask = nlane > 7 ? -1 : (1 << nlane) - 1;
return _mm512_mask_loadu_epi64(vfill, mask, (const __m512i*)ptr);
__m512i ret = _mm512_mask_loadu_epi64(vfill, mask, (const __m512i*)ptr);
#if NPY_SIMD_GUARD_PARTIAL_LOAD
volatile __m512i workaround = ret;
ret = _mm512_or_si512(workaround, ret);
#endif
return ret;
}
// fill zero to rest lanes
NPY_FINLINE npyv_s32 npyv_load2_tillz_s32(const npy_int32 *ptr, npy_uintp nlane)
@@ -293,14 +318,24 @@ NPY_FINLINE npyv_u64 npyv_load2_till_s64(const npy_int64 *ptr, npy_uintp nlane,
assert(nlane > 0);
const __m512i vfill = _mm512_set4_epi64(fill_hi, fill_lo, fill_hi, fill_lo);
const __mmask8 mask = nlane > 3 ? -1 : (1 << (nlane*2)) - 1;
return _mm512_mask_loadu_epi64(vfill, mask, (const __m512i*)ptr);
__m512i ret = _mm512_mask_loadu_epi64(vfill, mask, (const __m512i*)ptr);
#if NPY_SIMD_GUARD_PARTIAL_LOAD
volatile __m512i workaround = ret;
ret = _mm512_or_si512(workaround, ret);
#endif
return ret;
}
// fill zero to rest lanes
NPY_FINLINE npyv_s64 npyv_load2_tillz_s64(const npy_int64 *ptr, npy_uintp nlane)
{
assert(nlane > 0);
const __mmask8 mask = nlane > 3 ? -1 : (1 << (nlane*2)) - 1;
return _mm512_maskz_loadu_epi64(mask, (const __m512i*)ptr);
__m512i ret = _mm512_maskz_loadu_epi64(mask, (const __m512i*)ptr);
#if NPY_SIMD_GUARD_PARTIAL_LOAD
volatile __m512i workaround = ret;
ret = _mm512_or_si512(workaround, ret);
#endif
return ret;
}
/*********************************
* Non-contiguous partial load
@@ -317,7 +352,12 @@ npyv_loadn_till_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npy_
const __m512i idx = _mm512_mullo_epi32(steps, _mm512_set1_epi32((int)stride));
const __m512i vfill = _mm512_set1_epi32(fill);
const __mmask16 mask = nlane > 15 ? -1 : (1 << nlane) - 1;
return _mm512_mask_i32gather_epi32(vfill, mask, idx, (const __m512i*)ptr, 4);
__m512i ret = _mm512_mask_i32gather_epi32(vfill, mask, idx, (const __m512i*)ptr, 4);
#if NPY_SIMD_GUARD_PARTIAL_LOAD
volatile __m512i workaround = ret;
ret = _mm512_or_si512(workaround, ret);
#endif
return ret;
}
// fill zero to rest lanes
NPY_FINLINE npyv_s32
@@ -334,7 +374,12 @@ npyv_loadn_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npy_
);
const __m512i vfill = npyv_setall_s64(fill);
const __mmask8 mask = nlane > 15 ? -1 : (1 << nlane) - 1;
return _mm512_mask_i64gather_epi64(vfill, mask, idx, (const __m512i*)ptr, 8);
__m512i ret = _mm512_mask_i64gather_epi64(vfill, mask, idx, (const __m512i*)ptr, 8);
#if NPY_SIMD_GUARD_PARTIAL_LOAD
volatile __m512i workaround = ret;
ret = _mm512_or_si512(workaround, ret);
#endif
return ret;
}
// fill zero to rest lanes
NPY_FINLINE npyv_s64
@@ -352,7 +397,12 @@ NPY_FINLINE npyv_s64 npyv_loadn2_till_s32(const npy_int32 *ptr, npy_intp stride,
);
const __m512i vfill = _mm512_set4_epi32(fill_hi, fill_lo, fill_hi, fill_lo);
const __mmask8 mask = nlane > 7 ? -1 : (1 << nlane) - 1;
return _mm512_mask_i64gather_epi64(vfill, mask, idx, (const __m512i*)ptr, 4);
__m512i ret = _mm512_mask_i64gather_epi64(vfill, mask, idx, (const __m512i*)ptr, 4);
#if NPY_SIMD_GUARD_PARTIAL_LOAD
volatile __m512i workaround = ret;
ret = _mm512_or_si512(workaround, ret);
#endif
return ret;
}
// fill zero to rest lanes
NPY_FINLINE npyv_s32 npyv_loadn2_tillz_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane)
@@ -369,7 +419,12 @@ NPY_FINLINE npyv_s64 npyv_loadn2_till_s64(const npy_int64 *ptr, npy_intp stride,
);
const __mmask8 mask = nlane > 3 ? -1 : (1 << (nlane*2)) - 1;
const __m512i vfill = _mm512_set4_epi64(fill_hi, fill_lo, fill_hi, fill_lo);
return _mm512_mask_i64gather_epi64(vfill, mask, idx, (const __m512i*)ptr, 8);
__m512i ret = _mm512_mask_i64gather_epi64(vfill, mask, idx, (const __m512i*)ptr, 8);
#if NPY_SIMD_GUARD_PARTIAL_LOAD
volatile __m512i workaround = ret;
ret = _mm512_or_si512(workaround, ret);
#endif
return ret;
}
// fill zero to rest lanes
NPY_FINLINE npyv_s64 npyv_loadn2_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane)
