numpy · mattip · May 3, 2020 · Mar 17, 2020 · Mar 17, 2020 · Mar 17, 2020
diff --git a/doc/release/upcoming_changes/15769.improvement.rst b/doc/release/upcoming_changes/15769.improvement.rst
@@ -0,0 +1,15 @@
+Ability to disable madvise hugepages
+------------------------------------
+
+On Linux NumPy has previously added support for madavise
+hugepages which can improve performance for very large arrays.
+Unfortunately, on older Kernel versions this led to peformance
+regressions, thus by default the support has been disabled on
+kernels before version 4.6. To override the default, you can
+use the environment variable::
+
+    NUMPY_MADVISE_HUGEPAGE=0
+
+or set it to 1 to force enabling support. Note that this only makes
+a difference if the operating system is set up to use madvise
+transparent hugepage.
diff --git a/doc/source/reference/global_state.rst b/doc/source/reference/global_state.rst
@@ -0,0 +1,85 @@
+.. _global_state:
+
+************
+Global State
+************
+
+NumPy has a few import-time, compile-time, or runtime options
+which change the global behaviour.
+Most of these are related to performance or for debugging
+purposes and will not be interesting to the vast majority
+of users.
+
+
+Performance-Related Options
+===========================
+
+Number of Threads used for Linear Algebra
+-----------------------------------------
+
+NumPy itself is normally intentionally limited to a single thread
+during function calls, however it does support multiple Python
+threads running at the same time.
+Note that for performant linear algebra NumPy uses a BLAS backend
+such as OpenBLAS or MKL, which may use multiple threads that may
+be controlled by environment variables such as ``OMP_NUM_THREADS``
+depending on what is used.
+One way to control the number of threads is the package
+`threadpoolctl <https://pypi.org/project/threadpoolctl/>`_
+
+
+Madvise Hugepage on Linux
+-------------------------
+
+When working with very large arrays on modern Linux kernels,
+you can experience a significant speedup when
+`transparent hugepage <https://www.kernel.org/doc/html/latest/admin-guide/mm/transhuge.html>`_
+is used.
+The current system policy for transparent hugepages can be seen by::
+
+    cat /sys/kernel/mm/transparent_hugepage/enabled
+
+When set to ``madvise`` NumPy will typically use hugepages for a performance
+boost. This behaviour can be modified by setting the environment variable::
+
+    NUMPY_MADVISE_HUGEPAGE=0
+
+or setting it to ``1`` to always enable it. When not set, the default
+is to use madvise on Kernels 4.6 and newer. These kernels presumably
+experience a large speedup with hugepage support.
+This flag is checked at import time.
+
+
+Interoperability-Related Options
+================================
+
+The array function protocol which allows array-like objects to
+hook into the NumPy API is currently enabled by default.
+This option exists since NumPy 1.16 and is enabled by default since
+NumPy 1.17. It can be disabled using::
+
+    NUMPY_EXPERIMENTAL_ARRAY_FUNCTION=0
+
+See also :py:meth:`numpy.class.__array_function__` for more information.
+This flag is checked at import time.
+
+
+Debugging-Related Options
+=========================
+
+Relaxed Strides Checking
+------------------------
+
+The *compile-time* environment variables::
+
+    NPY_RELAXED_STRIDES_DEBUG=0
+    NPY_RELAXED_STRIDES_CHECKING=1
+
+control how NumPy reports contiguity for arrays.
+The default that it is enabled and the debug mode is disabled.
+This setting should always be enabled. Setting the
+debug option can be interesting for testing code written
+in C which iterates through arrays that may or may not be
+contiguous in memory.
+Most users will have no reason to change these, for details
+please see the `memory layout <memory-layout>`_ documentation.
diff --git a/doc/source/reference/index.rst b/doc/source/reference/index.rst
@@ -22,6 +22,7 @@ For learning how to use NumPy, see also :ref:`user`.
    constants
    ufuncs
    routines
+   global_state
    distutils
    distutils_guide
    c-api/index

diff --git a/numpy/__init__.py b/numpy/__init__.py
@@ -286,3 +286,24 @@ def _mac_os_check():
                         error_message))
                 raise RuntimeError(msg)
     del _mac_os_check
+
+    # We usually use madvise hugepages support, but on some old kernels it
+    # is slow and thus better avoided.
+    # Specifically kernel version 4.6 had a bug fix which probably fixed this:
+    # https://github.com/torvalds/linux/commit/7cf91a98e607c2f935dbcc177d70011e95b8faff
+    import os
+    use_hugepage = os.environ.get("NUMPY_MADVISE_HUGEPAGE", None)
+    if sys.platform == "linux" and use_hugepage is None:
+        use_hugepage = 1
+        kernel_version = os.uname().release.split(".")[:2]
+        kernel_version = tuple(int(v) for v in kernel_version)
+        if kernel_version < (4, 6):
+            use_hugepage = 0
+    elif use_hugepage is None:
+        # This is not Linux, so it should not matter, just enable anyway
+        use_hugepage = 1
+    else:
+        use_hugepage = int(use_hugepage)
+
+    # Note that this will currently only make a difference on Linux
+    core.multiarray._multiarray_umath._set_madvise_hugepage(use_hugepage)
diff --git a/numpy/core/src/multiarray/alloc.c b/numpy/core/src/multiarray/alloc.c
@@ -47,6 +47,25 @@ typedef struct {
 static cache_bucket datacache[NBUCKETS];
 static cache_bucket dimcache[NBUCKETS_DIM];
 
+static int _madvise_hugepage = 1;
+
+
+NPY_NO_EXPORT PyObject *
+_set_madvise_hugepage(PyObject *NPY_UNUSED(self), PyObject *enabled_obj)
+{
+    int was_enabled = _madvise_hugepage;
+    int enabled = PyObject_IsTrue(enabled_obj);
+    if (enabled < 0) {
+        return NULL;
+    }
+    _madvise_hugepage = enabled;
+    if (was_enabled) {
+        Py_RETURN_TRUE;
+    }
+    Py_RETURN_FALSE;
+}
+
+
 /* as the cache is managed in global variables verify the GIL is held */
 
 /*
@@ -75,7 +94,7 @@ _npy_alloc_cache(npy_uintp nelem, npy_uintp esz, npy_uint msz,
 #endif
 #ifdef NPY_OS_LINUX
         /* allow kernel allocating huge pages for large arrays */
-        if (NPY_UNLIKELY(nelem * esz >= ((1u<<22u)))) {
+        if (NPY_UNLIKELY(nelem * esz >= ((1u<<22u))) && _madvise_hugepage) {
             npy_uintp offset = 4096u - (npy_uintp)p % (4096u);
             npy_uintp length = nelem * esz - offset;
             /**

diff --git a/numpy/core/src/multiarray/alloc.h b/numpy/core/src/multiarray/alloc.h
@@ -6,6 +6,9 @@
 
 #define NPY_TRACE_DOMAIN 389047
 
+NPY_NO_EXPORT PyObject *
+_set_madvise_hugepage(PyObject *NPY_UNUSED(self), PyObject *enabled_obj);
+
 NPY_NO_EXPORT void *
 npy_alloc_cache(npy_uintp sz);
 

diff --git a/numpy/core/src/multiarray/multiarraymodule.c b/numpy/core/src/multiarray/multiarraymodule.c
@@ -34,6 +34,7 @@
 NPY_NO_EXPORT int NPY_NUMUSERTYPES = 0;
 
 /* Internal APIs */
+#include "alloc.h"
 #include "arrayfunction_override.h"
 #include "arraytypes.h"
 #include "arrayobject.h"
@@ -3971,6 +3972,7 @@ normalize_axis_index(PyObject *NPY_UNUSED(self), PyObject *args, PyObject *kwds)
     return PyInt_FromLong(axis);
 }
 
+
 static struct PyMethodDef array_module_methods[] = {
     {"_get_implementing_args",
         (PyCFunction)array__get_implementing_args,
@@ -4159,6 +4161,8 @@ static struct PyMethodDef array_module_methods[] = {
         METH_VARARGS, NULL},
     {"_add_newdoc_ufunc", (PyCFunction)add_newdoc_ufunc,
         METH_VARARGS, NULL},
+    {"_set_madvise_hugepage", (PyCFunction)_set_madvise_hugepage,
+        METH_O, "Toggle and return madvise hugepage (no OS support check)."},
     {NULL, NULL, 0, NULL}                /* sentinel */
 };