Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

MAINT: update x86-simd-sort to latest #26338

Merged
merged 7 commits into from May 6, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
8 changes: 4 additions & 4 deletions .github/workflows/linux_simd.yml
Expand Up @@ -178,19 +178,19 @@ jobs:
python -m pip install pytest pytest-xdist hypothesis typing_extensions

- name: Build
run: spin build -- -Dallow-noblas=true -Dcpu-baseline=avx512f -Dtest-simd='BASELINE,AVX512_KNL,AVX512_KNM,AVX512_SKX,AVX512_CLX,AVX512_CNL,AVX512_ICL,AVX512_SPR'
run: spin build -- -Dallow-noblas=true -Dcpu-baseline=avx512_skx -Dtest-simd='BASELINE,AVX512_KNL,AVX512_KNM,AVX512_SKX,AVX512_CLX,AVX512_CNL,AVX512_ICL,AVX512_SPR'

- name: Meson Log
if: always()
run: cat build/meson-logs/meson-log.txt

- name: SIMD tests (KNM)
- name: SIMD tests (SKX)
run: |
export NUMPY_SITE=$(realpath build-install/usr/lib/python*/site-packages/)
export PYTHONPATH="$PYTHONPATH:$NUMPY_SITE"
cd build-install &&
sde -knm -- python -c "import numpy; numpy.show_config()" &&
sde -knm -- python -m pytest $NUMPY_SITE/numpy/_core/tests/test_simd*
sde -skx -- python -c "import numpy; numpy.show_config()" &&
sde -skx -- python -m pytest $NUMPY_SITE/numpy/_core/tests/test_simd*

- name: linalg/ufunc/umath tests (TGL)
run: |
Expand Down
36 changes: 12 additions & 24 deletions numpy/_core/src/npysort/highway_qsort.dispatch.cpp
Expand Up @@ -2,31 +2,19 @@
#define VQSORT_ONLY_STATIC 1
#include "hwy/contrib/sort/vqsort-inl.h"

/*
 * DISPATCH_VQSORT(TYPE) expands to one explicit specialization of
 * NPY_CPU_DISPATCH_CURFX(QSort) for TYPE. The generated function forwards
 * `arr` and `size` unchanged to hwy::HWY_NAMESPACE::VQSortStatic together
 * with hwy::SortAscending().
 *
 * NOTE(review): NPY_CPU_DISPATCH_CURFX is defined elsewhere; presumably it
 * decorates the name with the current CPU-target suffix -- confirm.
 *
 * The final line of the definition still ends in a line continuation, so the
 * line directly after the macro is spliced into it and must remain blank.
 */
#define DISPATCH_VQSORT(TYPE) \
template<> void NPY_CPU_DISPATCH_CURFX(QSort)(TYPE *arr, intptr_t size) \
{ \
hwy::HWY_NAMESPACE::VQSortStatic(arr, size, hwy::SortAscending()); \
} \

namespace np { namespace highway { namespace qsort_simd {

template<> void NPY_CPU_DISPATCH_CURFX(QSort)(int32_t *arr, intptr_t size)
{
hwy::HWY_NAMESPACE::VQSortStatic(arr, size, hwy::SortAscending());
}
template<> void NPY_CPU_DISPATCH_CURFX(QSort)(uint32_t *arr, intptr_t size)
{
hwy::HWY_NAMESPACE::VQSortStatic(arr, size, hwy::SortAscending());
}
template<> void NPY_CPU_DISPATCH_CURFX(QSort)(int64_t *arr, intptr_t size)
{
hwy::HWY_NAMESPACE::VQSortStatic(arr, size, hwy::SortAscending());
}
template<> void NPY_CPU_DISPATCH_CURFX(QSort)(uint64_t *arr, intptr_t size)
{
hwy::HWY_NAMESPACE::VQSortStatic(arr, size, hwy::SortAscending());
}
template<> void NPY_CPU_DISPATCH_CURFX(QSort)(float *arr, intptr_t size)
{
hwy::HWY_NAMESPACE::VQSortStatic(arr, size, hwy::SortAscending());
}
template<> void NPY_CPU_DISPATCH_CURFX(QSort)(double *arr, intptr_t size)
{
hwy::HWY_NAMESPACE::VQSortStatic(arr, size, hwy::SortAscending());
}
DISPATCH_VQSORT(int32_t)
DISPATCH_VQSORT(uint32_t)
DISPATCH_VQSORT(int64_t)
DISPATCH_VQSORT(uint64_t)
DISPATCH_VQSORT(double)
DISPATCH_VQSORT(float)

} } } // np::highway::qsort_simd
2 changes: 1 addition & 1 deletion numpy/_core/src/npysort/x86-simd-sort
Submodule x86-simd-sort updated 65 files
+0 −0 .clang-format
+14 −10 .github/workflows/build-numpy.yml
+57 −8 .github/workflows/c-cpp.yml
+26 −0 .github/workflows/linting.yml
+74 −0 .github/workflows/scorecard.yml
+90 −16 README.md
+13 −0 SECURITY.md
+17 −0 benchmarks/bench-argsort.hpp
+6 −3 benchmarks/bench-ipp.cpp
+19 −20 benchmarks/bench-objsort.hpp
+39 −0 benchmarks/bench-qsort.hpp
+2 −1 benchmarks/bench-vqsort.cpp
+6 −9 benchmarks/bench.h
+12 −18 examples/Makefile
+0 −10 examples/avx2-32bit-qsort.cpp
+0 −10 examples/avx512-16bit-qsort.cpp
+0 −10 examples/avx512-32bit-qsort.cpp
+0 −10 examples/avx512-64bit-qsort.cpp
+0 −9 examples/avx512-argsort.cpp
+14 −13 examples/avx512-kv.cpp
+0 −10 examples/avx512fp-16bit-qsort.cpp
+11 −0 examples/icl-16bit.cpp
+19 −0 examples/skx-avx2.cpp
+11 −0 examples/spr-16bit.cpp
+14 −14 lib/x86simdsort-avx2.cpp
+29 −13 lib/x86simdsort-icl.cpp
+54 −24 lib/x86simdsort-internal.h
+62 −41 lib/x86simdsort-scalar.h
+26 −27 lib/x86simdsort-skx.cpp
+15 −7 lib/x86simdsort-spr.cpp
+39 −28 lib/x86simdsort.cpp
+21 −20 lib/x86simdsort.h
+1 −1 meson.build
+353 −0 misc/bench-simdobjsort.txt
+ misc/object_qsort-perf.jpg
+210 −0 misc/simd-objsort.ipynb
+10 −10 src/README.md
+0 −1 src/avx2-32bit-half.hpp
+42 −7 src/avx2-32bit-qsort.hpp
+45 −9 src/avx2-64bit-qsort.hpp
+0 −1 src/avx2-emu-funcs.hpp
+0 −2 src/avx512-16bit-common.h
+87 −19 src/avx512-16bit-qsort.hpp
+25 −3 src/avx512-32bit-qsort.hpp
+25 −2 src/avx512-64bit-common.h
+34 −35 src/avx512-64bit-keyvaluesort.hpp
+0 −1 src/avx512-64bit-qsort.hpp
+26 −40 src/avx512fp16-16bit-qsort.hpp
+151 −0 src/x86simdsort-static-incl.h
+41 −67 src/xss-common-argsort.h
+127 −0 src/xss-common-comparators.hpp
+7 −1 src/xss-common-includes.h
+233 −134 src/xss-common-qsort.h
+5 −6 src/xss-custom-float.h
+4 −8 src/xss-network-keyvaluesort.hpp
+57 −31 src/xss-network-qsort.hpp
+308 −300 src/xss-optimal-networks.hpp
+177 −0 src/xss-pivot-selection.hpp
+7 −10 tests/meson.build
+1 −0 tests/test-keyvalue.cpp
+76 −0 tests/test-objqsort.cpp
+15 −5 tests/test-qsort-common.h
+119 −12 tests/test-qsort.cpp
+4 −4 utils/custom-compare.h
+25 −29 utils/rand_array.h
93 changes: 16 additions & 77 deletions numpy/_core/src/npysort/x86_simd_argsort.dispatch.cpp
@@ -1,87 +1,26 @@
#include "x86_simd_qsort.hpp"
#ifndef __CYGWIN__

#if defined(NPY_HAVE_AVX512_SKX)
#include "x86-simd-sort/src/avx512-64bit-argsort.hpp"
#elif defined(NPY_HAVE_AVX2)
#include "x86-simd-sort/src/avx2-32bit-half.hpp"
#include "x86-simd-sort/src/avx2-32bit-qsort.hpp"
#include "x86-simd-sort/src/avx2-64bit-qsort.hpp"
#include "x86-simd-sort/src/xss-common-argsort.h"
#endif
#include "x86-simd-sort/src/x86simdsort-static-incl.h"

/*
 * File-local helpers that pick the argsort/argselect backend at preprocessing
 * time: the avx512_* functions when NPY_HAVE_AVX512_SKX is defined, otherwise
 * the avx2_* functions when NPY_HAVE_AVX2 is defined. When neither macro is
 * defined the helper bodies are empty and the calls do nothing.
 */
namespace {
/*
 * Forwards (arr, arg, num) to the selected *_argsort backend.
 * NOTE(review): the trailing `true` is presumably the backend's "input may
 * contain NaN" flag -- confirm against the x86-simd-sort headers.
 */
template<typename T>
void x86_argsort(T* arr, size_t* arg, npy_intp num)
{
#if defined(NPY_HAVE_AVX512_SKX)
avx512_argsort(arr, arg, num, true);
#elif defined(NPY_HAVE_AVX2)
avx2_argsort(arr, arg, num, true);
#endif
}

/*
 * Forwards (arr, arg, kth, num) to the selected *_argselect backend, keeping
 * `kth` ahead of `num` exactly as received.
 * NOTE(review): the trailing `true` is presumably the backend's "input may
 * contain NaN" flag -- confirm against the x86-simd-sort headers.
 */
template<typename T>
void x86_argselect(T* arr, size_t* arg, npy_intp kth, npy_intp num)
{
#if defined(NPY_HAVE_AVX512_SKX)
avx512_argselect(arr, arg, kth, num, true);
#elif defined(NPY_HAVE_AVX2)
avx2_argselect(arr, arg, kth, num, true);
#endif
}
} // anonymous
/*
 * DISPATCH_ARG_METHODS(TYPE) expands to two explicit specializations for TYPE:
 *
 *   - NPY_CPU_DISPATCH_CURFX(ArgQSelect)(arr, arg, num, kth), which calls
 *     x86simdsortStatic::argselect(arr, arg, kth, num, true). Note that the
 *     wrapper receives `num` before `kth` but passes `kth` before `num`.
 *   - NPY_CPU_DISPATCH_CURFX(ArgQSort)(arr, arg, size), which calls
 *     x86simdsortStatic::argsort(arr, arg, size, true).
 *
 * In both cases the npy_intp* index buffer is reinterpret_cast to size_t*
 * before it is handed to the library.
 * NOTE(review): this assumes npy_intp and size_t have the same size and that
 * the stored indices are non-negative -- confirm.
 * NOTE(review): the trailing `true` is presumably the library's "input may
 * contain NaN" flag -- confirm against x86simdsort-static-incl.h.
 *
 * The final line of the definition still ends in a line continuation, so the
 * line directly after the macro is spliced into it and must remain blank.
 */
#define DISPATCH_ARG_METHODS(TYPE) \
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSelect)(TYPE* arr, npy_intp* arg, npy_intp num, npy_intp kth) \
{ \
x86simdsortStatic::argselect(arr, reinterpret_cast<size_t*>(arg), kth, num, true); \
} \
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSort)(TYPE* arr, npy_intp *arg, npy_intp size) \
{ \
x86simdsortStatic::argsort(arr, reinterpret_cast<size_t*>(arg), size, true); \
} \

namespace np { namespace qsort_simd {

template<> void NPY_CPU_DISPATCH_CURFX(ArgQSelect)(int32_t *arr, npy_intp* arg, npy_intp num, npy_intp kth)
{
x86_argselect(arr, reinterpret_cast<size_t*>(arg), kth, num);
}
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSelect)(uint32_t *arr, npy_intp* arg, npy_intp num, npy_intp kth)
{
x86_argselect(arr, reinterpret_cast<size_t*>(arg), kth, num);
}
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSelect)(int64_t*arr, npy_intp* arg, npy_intp num, npy_intp kth)
{
x86_argselect(arr, reinterpret_cast<size_t*>(arg), kth, num);
}
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSelect)(uint64_t*arr, npy_intp* arg, npy_intp num, npy_intp kth)
{
x86_argselect(arr, reinterpret_cast<size_t*>(arg), kth, num);
}
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSelect)(float *arr, npy_intp* arg, npy_intp num, npy_intp kth)
{
x86_argselect(arr, reinterpret_cast<size_t*>(arg), kth, num);
}
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSelect)(double *arr, npy_intp* arg, npy_intp num, npy_intp kth)
{
x86_argselect(arr, reinterpret_cast<size_t*>(arg), kth, num);
}
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSort)(int32_t *arr, npy_intp *arg, npy_intp size)
{
x86_argsort(arr, reinterpret_cast<size_t*>(arg), size);
}
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSort)(uint32_t *arr, npy_intp *arg, npy_intp size)
{
x86_argsort(arr, reinterpret_cast<size_t*>(arg), size);
}
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSort)(int64_t *arr, npy_intp *arg, npy_intp size)
{
x86_argsort(arr, reinterpret_cast<size_t*>(arg), size);
}
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSort)(uint64_t *arr, npy_intp *arg, npy_intp size)
{
x86_argsort(arr, reinterpret_cast<size_t*>(arg), size);
}
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSort)(float *arr, npy_intp *arg, npy_intp size)
{
x86_argsort(arr, reinterpret_cast<size_t*>(arg), size);
}
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSort)(double *arr, npy_intp *arg, npy_intp size)
{
x86_argsort(arr, reinterpret_cast<size_t*>(arg), size);
}
DISPATCH_ARG_METHODS(uint32_t)
DISPATCH_ARG_METHODS(int32_t)
DISPATCH_ARG_METHODS(float)
DISPATCH_ARG_METHODS(uint64_t)
DISPATCH_ARG_METHODS(int64_t)
DISPATCH_ARG_METHODS(double)

}} // namespace np::qsort_simd

Expand Down
96 changes: 16 additions & 80 deletions numpy/_core/src/npysort/x86_simd_qsort.dispatch.cpp
@@ -1,89 +1,25 @@
#include "x86_simd_qsort.hpp"
#ifndef __CYGWIN__

#if defined(NPY_HAVE_AVX512_SKX)
#include "x86-simd-sort/src/avx512-32bit-qsort.hpp"
#include "x86-simd-sort/src/avx512-64bit-qsort.hpp"
#include "x86-simd-sort/src/avx512-64bit-argsort.hpp"
#elif defined(NPY_HAVE_AVX2)
#include "x86-simd-sort/src/avx2-32bit-qsort.hpp"
#include "x86-simd-sort/src/avx2-64bit-qsort.hpp"
#endif
#include "x86-simd-sort/src/x86simdsort-static-incl.h"

/*
 * File-local helpers that pick the qsort/qselect backend at preprocessing
 * time: the avx512_* functions when NPY_HAVE_AVX512_SKX is defined, otherwise
 * the avx2_* functions when NPY_HAVE_AVX2 is defined. When neither macro is
 * defined the helper bodies are empty and the calls do nothing.
 */
namespace {
/*
 * Forwards (arr, num) to the selected *_qsort backend.
 * NOTE(review): the trailing `true` is presumably the backend's "input may
 * contain NaN" flag -- confirm against the x86-simd-sort headers.
 */
template<typename T>
void x86_qsort(T* arr, npy_intp num)
{
#if defined(NPY_HAVE_AVX512_SKX)
avx512_qsort(arr, num, true);
#elif defined(NPY_HAVE_AVX2)
avx2_qsort(arr, num, true);
#endif
}

/*
 * Forwards to the selected *_qselect backend. The helper receives
 * (arr, num, kth) but the backend is called with `kth` before `num`.
 * NOTE(review): the trailing `true` is presumably the backend's "input may
 * contain NaN" flag -- confirm against the x86-simd-sort headers.
 */
template<typename T>
void x86_qselect(T* arr, npy_intp num, npy_intp kth)
{
#if defined(NPY_HAVE_AVX512_SKX)
avx512_qselect(arr, kth, num, true);
#elif defined(NPY_HAVE_AVX2)
avx2_qselect(arr, kth, num, true);
#endif
}
} // anonymous
/*
 * DISPATCH_SORT_METHODS(TYPE) expands to two explicit specializations for
 * TYPE:
 *
 *   - NPY_CPU_DISPATCH_CURFX(QSelect)(arr, num, kth), which calls
 *     x86simdsortStatic::qselect(arr, kth, num, true). Note that the wrapper
 *     receives `num` before `kth` but passes `kth` before `num`.
 *   - NPY_CPU_DISPATCH_CURFX(QSort)(arr, num), which calls
 *     x86simdsortStatic::qsort(arr, num, true).
 *
 * NOTE(review): the trailing `true` is presumably the library's "input may
 * contain NaN" flag -- confirm against x86simdsort-static-incl.h.
 *
 * The final line of the definition still ends in a line continuation, so the
 * line directly after the macro is spliced into it and must remain blank.
 */
#define DISPATCH_SORT_METHODS(TYPE) \
template<> void NPY_CPU_DISPATCH_CURFX(QSelect)(TYPE *arr, npy_intp num, npy_intp kth) \
{ \
x86simdsortStatic::qselect(arr, kth, num, true); \
} \
template<> void NPY_CPU_DISPATCH_CURFX(QSort)(TYPE *arr, npy_intp num) \
{ \
x86simdsortStatic::qsort(arr, num, true); \
} \

namespace np { namespace qsort_simd {
#if defined(NPY_HAVE_AVX512_SKX) || defined(NPY_HAVE_AVX2)
template<> void NPY_CPU_DISPATCH_CURFX(QSelect)(int32_t *arr, npy_intp num, npy_intp kth)
{
x86_qselect(arr, num, kth);
}
template<> void NPY_CPU_DISPATCH_CURFX(QSelect)(uint32_t *arr, npy_intp num, npy_intp kth)
{
x86_qselect(arr, num, kth);
}
template<> void NPY_CPU_DISPATCH_CURFX(QSelect)(int64_t*arr, npy_intp num, npy_intp kth)
{
x86_qselect(arr, num, kth);
}
template<> void NPY_CPU_DISPATCH_CURFX(QSelect)(uint64_t*arr, npy_intp num, npy_intp kth)
{
x86_qselect(arr, num, kth);
}
template<> void NPY_CPU_DISPATCH_CURFX(QSelect)(float *arr, npy_intp num, npy_intp kth)
{
x86_qselect(arr, num, kth);
}
template<> void NPY_CPU_DISPATCH_CURFX(QSelect)(double *arr, npy_intp num, npy_intp kth)
{
x86_qselect(arr, num, kth);
}
template<> void NPY_CPU_DISPATCH_CURFX(QSort)(int32_t *arr, npy_intp num)
{
x86_qsort(arr, num);
}
template<> void NPY_CPU_DISPATCH_CURFX(QSort)(uint32_t *arr, npy_intp num)
{
x86_qsort(arr, num);
}
template<> void NPY_CPU_DISPATCH_CURFX(QSort)(int64_t *arr, npy_intp num)
{
x86_qsort(arr, num);
}
template<> void NPY_CPU_DISPATCH_CURFX(QSort)(uint64_t *arr, npy_intp num)
{
x86_qsort(arr, num);
}
template<> void NPY_CPU_DISPATCH_CURFX(QSort)(float *arr, npy_intp num)
{
x86_qsort(arr, num);
}
template<> void NPY_CPU_DISPATCH_CURFX(QSort)(double *arr, npy_intp num)
{
x86_qsort(arr, num);
}
#endif // NPY_HAVE_AVX512_SKX || NPY_HAVE_AVX2

DISPATCH_SORT_METHODS(uint32_t)
DISPATCH_SORT_METHODS(int32_t)
DISPATCH_SORT_METHODS(float)
DISPATCH_SORT_METHODS(uint64_t)
DISPATCH_SORT_METHODS(int64_t)
DISPATCH_SORT_METHODS(double)
}} // namespace np::qsort_simd

#endif // __CYGWIN__
30 changes: 15 additions & 15 deletions numpy/_core/src/npysort/x86_simd_qsort_16bit.dispatch.cpp
@@ -1,36 +1,37 @@
#include "x86_simd_qsort.hpp"
#ifndef __CYGWIN__

#if defined(NPY_HAVE_AVX512_SPR)
#include "x86-simd-sort/src/avx512fp16-16bit-qsort.hpp"
#include "x86-simd-sort/src/avx512-16bit-qsort.hpp"
#elif defined(NPY_HAVE_AVX512_ICL)
#include "x86-simd-sort/src/avx512-16bit-qsort.hpp"
#include "x86-simd-sort/src/x86simdsort-static-incl.h"
/*
* MSVC doesn't set the macro __AVX512VBMI2__ which is required for the 16-bit
* functions and therefore we need to manually include this file here
*/
#ifdef _MSC_VER
#include "x86-simd-sort/src/avx512-16bit-qsort.hpp"
#endif

namespace np { namespace qsort_simd {

/*
* QSelect dispatch functions:
*/
#if defined(NPY_HAVE_AVX512_ICL) || defined(NPY_HAVE_AVX512_SPR)
template<> void NPY_CPU_DISPATCH_CURFX(QSelect)(Half *arr, npy_intp num, npy_intp kth)
{
#if defined(NPY_HAVE_AVX512_SPR)
avx512_qselect(reinterpret_cast<_Float16*>(arr), kth, num, true);
x86simdsortStatic::qselect(reinterpret_cast<_Float16*>(arr), kth, num, true);
#else
avx512_qselect_fp16(reinterpret_cast<uint16_t*>(arr), kth, num, true);
avx512_qselect_fp16(reinterpret_cast<uint16_t*>(arr), kth, num, true, false);
#endif
}

template<> void NPY_CPU_DISPATCH_CURFX(QSelect)(uint16_t *arr, npy_intp num, npy_intp kth)
{
avx512_qselect(arr, kth, num);
x86simdsortStatic::qselect(arr, kth, num);
}

template<> void NPY_CPU_DISPATCH_CURFX(QSelect)(int16_t *arr, npy_intp num, npy_intp kth)
{
avx512_qselect(arr, kth, num);
x86simdsortStatic::qselect(arr, kth, num);
}

/*
Expand All @@ -39,20 +40,19 @@ template<> void NPY_CPU_DISPATCH_CURFX(QSelect)(int16_t *arr, npy_intp num, npy_
template<> void NPY_CPU_DISPATCH_CURFX(QSort)(Half *arr, npy_intp size)
{
#if defined(NPY_HAVE_AVX512_SPR)
avx512_qsort(reinterpret_cast<_Float16*>(arr), size, true);
x86simdsortStatic::qsort(reinterpret_cast<_Float16*>(arr), size, true);
#else
avx512_qsort_fp16(reinterpret_cast<uint16_t*>(arr), size, true);
avx512_qsort_fp16(reinterpret_cast<uint16_t*>(arr), size, true, false);
#endif
}
template<> void NPY_CPU_DISPATCH_CURFX(QSort)(uint16_t *arr, npy_intp size)
{
avx512_qsort(arr, size);
x86simdsortStatic::qsort(arr, size);
}
template<> void NPY_CPU_DISPATCH_CURFX(QSort)(int16_t *arr, npy_intp size)
{
avx512_qsort(arr, size);
x86simdsortStatic::qsort(arr, size);
}
#endif // NPY_HAVE_AVX512_ICL || SPR

}} // namespace np::qsort_simd

Expand Down