Skip to content

Commit

Permalink
Merge pull request #26338 from r-devulap/xss-static-api
Browse files Browse the repository at this point in the history
MAINT: update x86-simd-sort to latest
  • Loading branch information
Mousius committed May 6, 2024
2 parents 2270786 + 4045b72 commit 2e354ee
Show file tree
Hide file tree
Showing 6 changed files with 64 additions and 201 deletions.
8 changes: 4 additions & 4 deletions .github/workflows/linux_simd.yml
Expand Up @@ -178,19 +178,19 @@ jobs:
python -m pip install pytest pytest-xdist hypothesis typing_extensions
- name: Build
run: spin build -- -Dallow-noblas=true -Dcpu-baseline=avx512f -Dtest-simd='BASELINE,AVX512_KNL,AVX512_KNM,AVX512_SKX,AVX512_CLX,AVX512_CNL,AVX512_ICL,AVX512_SPR'
run: spin build -- -Dallow-noblas=true -Dcpu-baseline=avx512_skx -Dtest-simd='BASELINE,AVX512_KNL,AVX512_KNM,AVX512_SKX,AVX512_CLX,AVX512_CNL,AVX512_ICL,AVX512_SPR'

- name: Meson Log
if: always()
run: cat build/meson-logs/meson-log.txt

- name: SIMD tests (KNM)
- name: SIMD tests (SKX)
run: |
export NUMPY_SITE=$(realpath build-install/usr/lib/python*/site-packages/)
export PYTHONPATH="$PYTHONPATH:$NUMPY_SITE"
cd build-install &&
sde -knm -- python -c "import numpy; numpy.show_config()" &&
sde -knm -- python -m pytest $NUMPY_SITE/numpy/_core/tests/test_simd*
sde -skx -- python -c "import numpy; numpy.show_config()" &&
sde -skx -- python -m pytest $NUMPY_SITE/numpy/_core/tests/test_simd*
- name: linalg/ufunc/umath tests (TGL)
run: |
Expand Down
36 changes: 12 additions & 24 deletions numpy/_core/src/npysort/highway_qsort.dispatch.cpp
Expand Up @@ -2,31 +2,19 @@
#define VQSORT_ONLY_STATIC 1
#include "hwy/contrib/sort/vqsort-inl.h"

/*
 * Defines the explicit specialization of the dispatched QSort entry point
 * (the symbol named by NPY_CPU_DISPATCH_CURFX(QSort)) for element type TYPE.
 * The specialization forwards the array pointer and its length unchanged to
 * hwy::HWY_NAMESPACE::VQSortStatic, passing hwy::SortAscending() as the
 * order argument.
 *
 * The macro is expanded once per supported element type further down in
 * this file.
 *
 * NOTE: the last line of the definition ends with a line-continuation
 * backslash, so the macro body extends onto the line that follows it; keep
 * that line blank.
 */
#define DISPATCH_VQSORT(TYPE) \
template<> void NPY_CPU_DISPATCH_CURFX(QSort)(TYPE *arr, intptr_t size) \
{ \
hwy::HWY_NAMESPACE::VQSortStatic(arr, size, hwy::SortAscending()); \
} \

namespace np { namespace highway { namespace qsort_simd {

template<> void NPY_CPU_DISPATCH_CURFX(QSort)(int32_t *arr, intptr_t size)
{
hwy::HWY_NAMESPACE::VQSortStatic(arr, size, hwy::SortAscending());
}
template<> void NPY_CPU_DISPATCH_CURFX(QSort)(uint32_t *arr, intptr_t size)
{
hwy::HWY_NAMESPACE::VQSortStatic(arr, size, hwy::SortAscending());
}
template<> void NPY_CPU_DISPATCH_CURFX(QSort)(int64_t *arr, intptr_t size)
{
hwy::HWY_NAMESPACE::VQSortStatic(arr, size, hwy::SortAscending());
}
template<> void NPY_CPU_DISPATCH_CURFX(QSort)(uint64_t *arr, intptr_t size)
{
hwy::HWY_NAMESPACE::VQSortStatic(arr, size, hwy::SortAscending());
}
template<> void NPY_CPU_DISPATCH_CURFX(QSort)(float *arr, intptr_t size)
{
hwy::HWY_NAMESPACE::VQSortStatic(arr, size, hwy::SortAscending());
}
template<> void NPY_CPU_DISPATCH_CURFX(QSort)(double *arr, intptr_t size)
{
hwy::HWY_NAMESPACE::VQSortStatic(arr, size, hwy::SortAscending());
}
DISPATCH_VQSORT(int32_t)
DISPATCH_VQSORT(uint32_t)
DISPATCH_VQSORT(int64_t)
DISPATCH_VQSORT(uint64_t)
DISPATCH_VQSORT(double)
DISPATCH_VQSORT(float)

} } } // np::highway::qsort_simd
2 changes: 1 addition & 1 deletion numpy/_core/src/npysort/x86-simd-sort
Submodule x86-simd-sort updated 65 files
+0 −0 .clang-format
+14 −10 .github/workflows/build-numpy.yml
+57 −8 .github/workflows/c-cpp.yml
+26 −0 .github/workflows/linting.yml
+74 −0 .github/workflows/scorecard.yml
+90 −16 README.md
+13 −0 SECURITY.md
+17 −0 benchmarks/bench-argsort.hpp
+6 −3 benchmarks/bench-ipp.cpp
+19 −20 benchmarks/bench-objsort.hpp
+39 −0 benchmarks/bench-qsort.hpp
+2 −1 benchmarks/bench-vqsort.cpp
+6 −9 benchmarks/bench.h
+12 −18 examples/Makefile
+0 −10 examples/avx2-32bit-qsort.cpp
+0 −10 examples/avx512-16bit-qsort.cpp
+0 −10 examples/avx512-32bit-qsort.cpp
+0 −10 examples/avx512-64bit-qsort.cpp
+0 −9 examples/avx512-argsort.cpp
+14 −13 examples/avx512-kv.cpp
+0 −10 examples/avx512fp-16bit-qsort.cpp
+11 −0 examples/icl-16bit.cpp
+19 −0 examples/skx-avx2.cpp
+11 −0 examples/spr-16bit.cpp
+14 −14 lib/x86simdsort-avx2.cpp
+29 −13 lib/x86simdsort-icl.cpp
+54 −24 lib/x86simdsort-internal.h
+62 −41 lib/x86simdsort-scalar.h
+26 −27 lib/x86simdsort-skx.cpp
+15 −7 lib/x86simdsort-spr.cpp
+39 −28 lib/x86simdsort.cpp
+21 −20 lib/x86simdsort.h
+1 −1 meson.build
+353 −0 misc/bench-simdobjsort.txt
+ misc/object_qsort-perf.jpg
+210 −0 misc/simd-objsort.ipynb
+10 −10 src/README.md
+0 −1 src/avx2-32bit-half.hpp
+42 −7 src/avx2-32bit-qsort.hpp
+45 −9 src/avx2-64bit-qsort.hpp
+0 −1 src/avx2-emu-funcs.hpp
+0 −2 src/avx512-16bit-common.h
+87 −19 src/avx512-16bit-qsort.hpp
+25 −3 src/avx512-32bit-qsort.hpp
+25 −2 src/avx512-64bit-common.h
+34 −35 src/avx512-64bit-keyvaluesort.hpp
+0 −1 src/avx512-64bit-qsort.hpp
+26 −40 src/avx512fp16-16bit-qsort.hpp
+151 −0 src/x86simdsort-static-incl.h
+41 −67 src/xss-common-argsort.h
+127 −0 src/xss-common-comparators.hpp
+7 −1 src/xss-common-includes.h
+233 −134 src/xss-common-qsort.h
+5 −6 src/xss-custom-float.h
+4 −8 src/xss-network-keyvaluesort.hpp
+57 −31 src/xss-network-qsort.hpp
+308 −300 src/xss-optimal-networks.hpp
+177 −0 src/xss-pivot-selection.hpp
+7 −10 tests/meson.build
+1 −0 tests/test-keyvalue.cpp
+76 −0 tests/test-objqsort.cpp
+15 −5 tests/test-qsort-common.h
+119 −12 tests/test-qsort.cpp
+4 −4 utils/custom-compare.h
+25 −29 utils/rand_array.h
93 changes: 16 additions & 77 deletions numpy/_core/src/npysort/x86_simd_argsort.dispatch.cpp
@@ -1,87 +1,26 @@
#include "x86_simd_qsort.hpp"
#ifndef __CYGWIN__

#if defined(NPY_HAVE_AVX512_SKX)
#include "x86-simd-sort/src/avx512-64bit-argsort.hpp"
#elif defined(NPY_HAVE_AVX2)
#include "x86-simd-sort/src/avx2-32bit-half.hpp"
#include "x86-simd-sort/src/avx2-32bit-qsort.hpp"
#include "x86-simd-sort/src/avx2-64bit-qsort.hpp"
#include "x86-simd-sort/src/xss-common-argsort.h"
#endif
#include "x86-simd-sort/src/x86simdsort-static-incl.h"

namespace {
template<typename T>
void x86_argsort(T* arr, size_t* arg, npy_intp num)
{
#if defined(NPY_HAVE_AVX512_SKX)
avx512_argsort(arr, arg, num, true);
#elif defined(NPY_HAVE_AVX2)
avx2_argsort(arr, arg, num, true);
#endif
}

template<typename T>
void x86_argselect(T* arr, size_t* arg, npy_intp kth, npy_intp num)
{
#if defined(NPY_HAVE_AVX512_SKX)
avx512_argselect(arr, arg, kth, num, true);
#elif defined(NPY_HAVE_AVX2)
avx2_argselect(arr, arg, kth, num, true);
#endif
}
} // anonymous
/*
 * Defines, for element type TYPE, the explicit specializations of the two
 * dispatched index-based entry points:
 *   - ArgQSelect(arr, arg, num, kth) -> x86simdsortStatic::argselect
 *   - ArgQSort(arr, arg, size)       -> x86simdsortStatic::argsort
 *
 * Things to keep in mind when editing:
 *   - Argument order differs between the two layers: ArgQSelect receives
 *     (num, kth) but argselect is called with (kth, num).
 *   - The index buffer arrives as npy_intp* and is reinterpret_cast to
 *     size_t* before being handed to the library.
 *     NOTE(review): this assumes npy_intp and size_t have the same size
 *     and that all index values are non-negative -- confirm.
 *   - The trailing `true` is presumably the library's "input may contain
 *     NaN" flag -- confirm against the x86-simd-sort static API.
 *   - The last line of the definition ends with a line-continuation
 *     backslash, so the macro body extends onto the line that follows it;
 *     keep that line blank.
 */
#define DISPATCH_ARG_METHODS(TYPE) \
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSelect)(TYPE* arr, npy_intp* arg, npy_intp num, npy_intp kth) \
{ \
x86simdsortStatic::argselect(arr, reinterpret_cast<size_t*>(arg), kth, num, true); \
} \
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSort)(TYPE* arr, npy_intp *arg, npy_intp size) \
{ \
x86simdsortStatic::argsort(arr, reinterpret_cast<size_t*>(arg), size, true); \
} \

namespace np { namespace qsort_simd {

template<> void NPY_CPU_DISPATCH_CURFX(ArgQSelect)(int32_t *arr, npy_intp* arg, npy_intp num, npy_intp kth)
{
x86_argselect(arr, reinterpret_cast<size_t*>(arg), kth, num);
}
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSelect)(uint32_t *arr, npy_intp* arg, npy_intp num, npy_intp kth)
{
x86_argselect(arr, reinterpret_cast<size_t*>(arg), kth, num);
}
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSelect)(int64_t*arr, npy_intp* arg, npy_intp num, npy_intp kth)
{
x86_argselect(arr, reinterpret_cast<size_t*>(arg), kth, num);
}
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSelect)(uint64_t*arr, npy_intp* arg, npy_intp num, npy_intp kth)
{
x86_argselect(arr, reinterpret_cast<size_t*>(arg), kth, num);
}
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSelect)(float *arr, npy_intp* arg, npy_intp num, npy_intp kth)
{
x86_argselect(arr, reinterpret_cast<size_t*>(arg), kth, num);
}
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSelect)(double *arr, npy_intp* arg, npy_intp num, npy_intp kth)
{
x86_argselect(arr, reinterpret_cast<size_t*>(arg), kth, num);
}
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSort)(int32_t *arr, npy_intp *arg, npy_intp size)
{
x86_argsort(arr, reinterpret_cast<size_t*>(arg), size);
}
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSort)(uint32_t *arr, npy_intp *arg, npy_intp size)
{
x86_argsort(arr, reinterpret_cast<size_t*>(arg), size);
}
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSort)(int64_t *arr, npy_intp *arg, npy_intp size)
{
x86_argsort(arr, reinterpret_cast<size_t*>(arg), size);
}
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSort)(uint64_t *arr, npy_intp *arg, npy_intp size)
{
x86_argsort(arr, reinterpret_cast<size_t*>(arg), size);
}
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSort)(float *arr, npy_intp *arg, npy_intp size)
{
x86_argsort(arr, reinterpret_cast<size_t*>(arg), size);
}
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSort)(double *arr, npy_intp *arg, npy_intp size)
{
x86_argsort(arr, reinterpret_cast<size_t*>(arg), size);
}
DISPATCH_ARG_METHODS(uint32_t)
DISPATCH_ARG_METHODS(int32_t)
DISPATCH_ARG_METHODS(float)
DISPATCH_ARG_METHODS(uint64_t)
DISPATCH_ARG_METHODS(int64_t)
DISPATCH_ARG_METHODS(double)

}} // namespace np::qsort_simd

Expand Down
96 changes: 16 additions & 80 deletions numpy/_core/src/npysort/x86_simd_qsort.dispatch.cpp
@@ -1,89 +1,25 @@
#include "x86_simd_qsort.hpp"
#ifndef __CYGWIN__

#if defined(NPY_HAVE_AVX512_SKX)
#include "x86-simd-sort/src/avx512-32bit-qsort.hpp"
#include "x86-simd-sort/src/avx512-64bit-qsort.hpp"
#include "x86-simd-sort/src/avx512-64bit-argsort.hpp"
#elif defined(NPY_HAVE_AVX2)
#include "x86-simd-sort/src/avx2-32bit-qsort.hpp"
#include "x86-simd-sort/src/avx2-64bit-qsort.hpp"
#endif
#include "x86-simd-sort/src/x86simdsort-static-incl.h"

namespace {
template<typename T>
void x86_qsort(T* arr, npy_intp num)
{
#if defined(NPY_HAVE_AVX512_SKX)
avx512_qsort(arr, num, true);
#elif defined(NPY_HAVE_AVX2)
avx2_qsort(arr, num, true);
#endif
}

template<typename T>
void x86_qselect(T* arr, npy_intp num, npy_intp kth)
{
#if defined(NPY_HAVE_AVX512_SKX)
avx512_qselect(arr, kth, num, true);
#elif defined(NPY_HAVE_AVX2)
avx2_qselect(arr, kth, num, true);
#endif
}
} // anonymous
/*
 * Defines, for element type TYPE, the explicit specializations of the two
 * dispatched in-place entry points:
 *   - QSelect(arr, num, kth) -> x86simdsortStatic::qselect
 *   - QSort(arr, num)        -> x86simdsortStatic::qsort
 *
 * Things to keep in mind when editing:
 *   - Argument order differs between the two layers: QSelect receives
 *     (num, kth) but qselect is called with (kth, num).
 *   - The trailing `true` is presumably the library's "input may contain
 *     NaN" flag -- confirm against the x86-simd-sort static API.
 *   - The last line of the definition ends with a line-continuation
 *     backslash, so the macro body extends onto the line that follows it;
 *     keep that line blank.
 */
#define DISPATCH_SORT_METHODS(TYPE) \
template<> void NPY_CPU_DISPATCH_CURFX(QSelect)(TYPE *arr, npy_intp num, npy_intp kth) \
{ \
x86simdsortStatic::qselect(arr, kth, num, true); \
} \
template<> void NPY_CPU_DISPATCH_CURFX(QSort)(TYPE *arr, npy_intp num) \
{ \
x86simdsortStatic::qsort(arr, num, true); \
} \

namespace np { namespace qsort_simd {
#if defined(NPY_HAVE_AVX512_SKX) || defined(NPY_HAVE_AVX2)
template<> void NPY_CPU_DISPATCH_CURFX(QSelect)(int32_t *arr, npy_intp num, npy_intp kth)
{
x86_qselect(arr, num, kth);
}
template<> void NPY_CPU_DISPATCH_CURFX(QSelect)(uint32_t *arr, npy_intp num, npy_intp kth)
{
x86_qselect(arr, num, kth);
}
template<> void NPY_CPU_DISPATCH_CURFX(QSelect)(int64_t*arr, npy_intp num, npy_intp kth)
{
x86_qselect(arr, num, kth);
}
template<> void NPY_CPU_DISPATCH_CURFX(QSelect)(uint64_t*arr, npy_intp num, npy_intp kth)
{
x86_qselect(arr, num, kth);
}
template<> void NPY_CPU_DISPATCH_CURFX(QSelect)(float *arr, npy_intp num, npy_intp kth)
{
x86_qselect(arr, num, kth);
}
template<> void NPY_CPU_DISPATCH_CURFX(QSelect)(double *arr, npy_intp num, npy_intp kth)
{
x86_qselect(arr, num, kth);
}
template<> void NPY_CPU_DISPATCH_CURFX(QSort)(int32_t *arr, npy_intp num)
{
x86_qsort(arr, num);
}
template<> void NPY_CPU_DISPATCH_CURFX(QSort)(uint32_t *arr, npy_intp num)
{
x86_qsort(arr, num);
}
template<> void NPY_CPU_DISPATCH_CURFX(QSort)(int64_t *arr, npy_intp num)
{
x86_qsort(arr, num);
}
template<> void NPY_CPU_DISPATCH_CURFX(QSort)(uint64_t *arr, npy_intp num)
{
x86_qsort(arr, num);
}
template<> void NPY_CPU_DISPATCH_CURFX(QSort)(float *arr, npy_intp num)
{
x86_qsort(arr, num);
}
template<> void NPY_CPU_DISPATCH_CURFX(QSort)(double *arr, npy_intp num)
{
x86_qsort(arr, num);
}
#endif // NPY_HAVE_AVX512_SKX || NPY_HAVE_AVX2

DISPATCH_SORT_METHODS(uint32_t)
DISPATCH_SORT_METHODS(int32_t)
DISPATCH_SORT_METHODS(float)
DISPATCH_SORT_METHODS(uint64_t)
DISPATCH_SORT_METHODS(int64_t)
DISPATCH_SORT_METHODS(double)
}} // namespace np::qsort_simd

#endif // __CYGWIN__
30 changes: 15 additions & 15 deletions numpy/_core/src/npysort/x86_simd_qsort_16bit.dispatch.cpp
@@ -1,36 +1,37 @@
#include "x86_simd_qsort.hpp"
#ifndef __CYGWIN__

#if defined(NPY_HAVE_AVX512_SPR)
#include "x86-simd-sort/src/avx512fp16-16bit-qsort.hpp"
#include "x86-simd-sort/src/avx512-16bit-qsort.hpp"
#elif defined(NPY_HAVE_AVX512_ICL)
#include "x86-simd-sort/src/avx512-16bit-qsort.hpp"
#include "x86-simd-sort/src/x86simdsort-static-incl.h"
/*
* MSVC doesn't set the macro __AVX512VBMI2__ which is required for the 16-bit
* functions and therefore we need to manually include this file here
*/
#ifdef _MSC_VER
#include "x86-simd-sort/src/avx512-16bit-qsort.hpp"
#endif

namespace np { namespace qsort_simd {

/*
* QSelect dispatch functions:
*/
#if defined(NPY_HAVE_AVX512_ICL) || defined(NPY_HAVE_AVX512_SPR)
template<> void NPY_CPU_DISPATCH_CURFX(QSelect)(Half *arr, npy_intp num, npy_intp kth)
{
#if defined(NPY_HAVE_AVX512_SPR)
avx512_qselect(reinterpret_cast<_Float16*>(arr), kth, num, true);
x86simdsortStatic::qselect(reinterpret_cast<_Float16*>(arr), kth, num, true);
#else
avx512_qselect_fp16(reinterpret_cast<uint16_t*>(arr), kth, num, true);
avx512_qselect_fp16(reinterpret_cast<uint16_t*>(arr), kth, num, true, false);
#endif
}

template<> void NPY_CPU_DISPATCH_CURFX(QSelect)(uint16_t *arr, npy_intp num, npy_intp kth)
{
avx512_qselect(arr, kth, num);
x86simdsortStatic::qselect(arr, kth, num);
}

template<> void NPY_CPU_DISPATCH_CURFX(QSelect)(int16_t *arr, npy_intp num, npy_intp kth)
{
avx512_qselect(arr, kth, num);
x86simdsortStatic::qselect(arr, kth, num);
}

/*
Expand All @@ -39,20 +40,19 @@ template<> void NPY_CPU_DISPATCH_CURFX(QSelect)(int16_t *arr, npy_intp num, npy_
template<> void NPY_CPU_DISPATCH_CURFX(QSort)(Half *arr, npy_intp size)
{
#if defined(NPY_HAVE_AVX512_SPR)
avx512_qsort(reinterpret_cast<_Float16*>(arr), size, true);
x86simdsortStatic::qsort(reinterpret_cast<_Float16*>(arr), size, true);
#else
avx512_qsort_fp16(reinterpret_cast<uint16_t*>(arr), size, true);
avx512_qsort_fp16(reinterpret_cast<uint16_t*>(arr), size, true, false);
#endif
}
template<> void NPY_CPU_DISPATCH_CURFX(QSort)(uint16_t *arr, npy_intp size)
{
avx512_qsort(arr, size);
x86simdsortStatic::qsort(arr, size);
}
template<> void NPY_CPU_DISPATCH_CURFX(QSort)(int16_t *arr, npy_intp size)
{
avx512_qsort(arr, size);
x86simdsortStatic::qsort(arr, size);
}
#endif // NPY_HAVE_AVX512_ICL || SPR

}} // namespace np::qsort_simd

Expand Down

0 comments on commit 2e354ee

Please sign in to comment.