[MRG] Fix euclidean_distances numerical instabilities #13410

Closed
wants to merge 35 commits into from
Changes from 17 commits

Commits (35)
005a311
vect vect euclidean
jeremiedbb Mar 4, 2019
a7e8e5e
safe euclidean distances small n_features
jeremiedbb Mar 7, 2019
419d90e
fix arry order
jeremiedbb Mar 8, 2019
ecf8c3c
fix array order
jeremiedbb Mar 8, 2019
cd5eec2
tst debug windows
jeremiedbb Mar 8, 2019
81ef489
tst debug windows
jeremiedbb Mar 8, 2019
25d90f1
tst debug windows
jeremiedbb Mar 8, 2019
0d10f03
clean
jeremiedbb Mar 10, 2019
d6849f7
implement chunking upcasting
jeremiedbb Mar 11, 2019
f376632
lint
jeremiedbb Mar 11, 2019
8cf87df
comment on norms
jeremiedbb Mar 11, 2019
9adcc24
Merge branch 'master' into euclidean-dist
jeremiedbb Mar 11, 2019
bb750fc
remove unnecessary condition
jeremiedbb Mar 11, 2019
9f02593
consistent names
jeremiedbb Mar 12, 2019
9bd0f4d
move to Notes in docstrings and typos
jeremiedbb Mar 12, 2019
2c2098a
revert unrelated changes in docstring
jeremiedbb Mar 12, 2019
cecdb5f
document ignored norms
jeremiedbb Mar 12, 2019
7e4acdf
what's new
jeremiedbb Mar 12, 2019
dfc82b8
typo
jeremiedbb Mar 13, 2019
ab78675
what's new
jeremiedbb Mar 13, 2019
17fa839
clean and comments safe euclidean sparse dense
jeremiedbb Mar 13, 2019
f66dc97
same
jeremiedbb Mar 13, 2019
d49fe8b
sym -> symmetric
jeremiedbb Mar 13, 2019
aab5f48
add norms numpy style
jeremiedbb Mar 13, 2019
9e954b2
accept precomputed norms
jeremiedbb Mar 13, 2019
809591e
fix sym
jeremiedbb Mar 13, 2019
9b4a5a4
symmetric case in upcast euclidean
jeremiedbb Mar 13, 2019
79faace
lint
jeremiedbb Mar 13, 2019
bdbb5c3
switch 32 -> 16
jeremiedbb Mar 13, 2019
4e9e4c2
remove symmetric with syrk & don't force c order
jeremiedbb Mar 14, 2019
390cba4
fix upcast symmetric
jeremiedbb Mar 14, 2019
5d7721b
special case c contiguous
jeremiedbb Mar 14, 2019
f4fca49
clean
jeremiedbb Mar 14, 2019
5fe7644
fix docstring
jeremiedbb Mar 14, 2019
6694ee1
typo
jeremiedbb Mar 14, 2019
69 changes: 69 additions & 0 deletions sklearn/metrics/_safe_euclidean_sparse.pyx
@@ -0,0 +1,69 @@
#cython: language_level=3
#cython: boundscheck=False, cdivision=True, wraparound=False


import numpy as np
cimport numpy as np
from cython cimport floating
from libc.math cimport fmax


np.import_array()


ctypedef fused INT:
np.int32_t
np.int64_t


def _euclidean_sparse_dense_exact(floating[::1] X_data,
INT[::1] X_indices,
INT[::1] X_indptr,
np.ndarray[floating, ndim=2, mode='c'] Y,
floating[::1] y_squared_norms):
cdef:
int n_samples_X = X_indptr.shape[0] - 1
int n_samples_Y = Y.shape[0]
int n_features = Y.shape[1]

int i, j

floating[:, ::1] D = np.empty((n_samples_X, n_samples_Y), Y.dtype)

for i in range(n_samples_X):
for j in range(n_samples_Y):
D[i, j] = _euclidean_sparse_dense_exact_1d(
&X_data[X_indptr[i]],
&X_indices[X_indptr[i]],
X_indptr[i + 1] - X_indptr[i],
&Y[j, 0],
y_squared_norms[j])

return np.asarray(D)


cdef floating _euclidean_sparse_dense_exact_1d(floating *y_data,
INT *y_indices,
int y_nnz,
floating *x,
floating x_squared_norm) nogil:
"""Euclidean distance between x dense and y sparse"""
cdef:
int i
floating xi
floating tmp = 0.0
floating result = 0.0
floating partial_x_squared_norm = 0.0

# Split the loop to avoid unsafe compiler auto optimizations
@Celelibi Mar 12, 2019
I might say something stupid, but I'm not sure what you're trying to avoid there.
If a compiler optimizes aggressively enough to rearrange your arithmetic operations in an unsafe way, then I don't think fusing two independent loops would be a problem for it.
Additionally, AFAIK, most (if not all) compilers nowadays know pretty well how floating point arithmetic works.
Moreover, I think I see a few opportunities to improve the numeric accuracy by fusing the loops.

Member

I was also wondering about this -- I imagine there should be no unsafe optimizations unless you are using non-standard flags (e.g. -Ofast)?

Member Author

I thought that too, but this is not what I observe. I don't really know what's going on. The loop should be:

for i in range(y_nnz):
    xi = x[y_indices[i]]
    tmp = y_data[i] - xi
    result += (tmp * tmp) - (xi * xi)

but even without the -Ofast flag, there is some kind of optimization happening because the result is not correct.
It looks like gcc expands the expression

result += (tmp * tmp) - (xi * xi) = (y_data[i] - xi)² - xi²
                                  = y_data[i]² - 2*y_data[i]*xi

which might be wrong in floating point arithmetic.
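
To make the discussion concrete, here is a minimal sketch (not part of the PR; plain Python/NumPy standing in for the Cython loops) that compares the fused accumulation result += (tmp * tmp) - (xi * xi) with the split accumulation used in this PR. Which order is more accurate depends on the data, so both are compared against a float64 reference:

import numpy as np

rng = np.random.RandomState(0)
x = rng.random_sample(1000).astype(np.float32)   # dense vector values
y = rng.random_sample(1000).astype(np.float32)   # sparse vector values
y[y < 0.8] = 0                                   # mimic sparsity

# Fused accumulation: one loop, mixed-sign terms.
fused = np.float32(0)
for xi, yi in zip(x, y):
    tmp = yi - xi
    fused += tmp * tmp - xi * xi

# Split accumulation (as in this PR): two separate sums, combined once.
sq_diff = np.float32(0)
partial_x_sq = np.float32(0)
for xi, yi in zip(x, y):
    partial_x_sq += xi * xi
    tmp = yi - xi
    sq_diff += tmp * tmp
split = sq_diff - partial_x_sq

# float64 reference of the same quantity
ref = np.sum((y.astype(np.float64) - x.astype(np.float64)) ** 2
             - x.astype(np.float64) ** 2)
print(fused, split, ref)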

Contributor

Make sure to add a unit test that detects bad numeric optimization!

Member Author

How would you do that?
I added tests for the euclidean distances which failed when I didn't split the loop or when I used the -Ofast flag. Now they pass. Is that what you mean?
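
For illustration, a test along these lines could compare the float32 result against a float64 reference computed with scipy. This is only a sketch, not the actual test added in the PR; the name, the data and the tolerance are made up:

import numpy as np
from scipy.spatial.distance import cdist
from sklearn.metrics.pairwise import euclidean_distances

def test_euclidean_distances_float32_accuracy():
    # Data with a large offset tends to expose cancellation in the
    # expanded formulation ||x||^2 - 2 x.y + ||y||^2.
    rng = np.random.RandomState(0)
    X = (rng.random_sample((100, 10)) + 1000).astype(np.float32)
    Y = (rng.random_sample((50, 10)) + 1000).astype(np.float32)

    # Reference computed in float64 with the direct formula.
    expected = cdist(X.astype(np.float64), Y.astype(np.float64))

    np.testing.assert_allclose(euclidean_distances(X, Y), expected, rtol=1e-6)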

Contributor

Parts (terms) of the last equation.

Contributor

A similar loss likely happens here: (tmp * tmp) - (xi * xi). When tmp and xi are of similar magnitude, this expression suffers from catastrophic cancellation (here, that happens when yi is small).
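
A tiny float32 example of that cancellation (illustrative only, not part of the PR):

import numpy as np

xi = np.float32(1.0)
yi = np.float32(1e-8)                         # yi much smaller than xi

tmp = yi - xi                                 # rounds to -1.0 in float32
naive = tmp * tmp - xi * xi                   # 1.0 - 1.0 = 0.0: all digits lost
expanded = yi * yi - np.float32(2) * yi * xi  # close to the true value

exact = float(yi) ** 2 - 2.0 * float(yi) * float(xi)  # float64 reference
print(naive, expanded, exact)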

@Celelibi Mar 14, 2019

I'm not an expert of floating point arithmetic

Neither am I, so take everything I say here with a grain of salt.

First, I'd like to make a note on the vocabulary around floating point arithmetic.

  • precision usually refers to the number of bits used to make the calculation.
  • accuracy usually refers to how close to the real value a result is.
  • error usually refers to the difference between the real value and what's actually stored.

up to here, there shouldn't be any precision loss.

Well, none of your transformations are exactly equivalent. Floating point arithmetic is commutative, but not associative: in general (a+b)+c != a+(b+c). You can't reorder a sum as you wish without changing the result. That might improve the accuracy, but it might also make it worse. You usually want to add values that are not too far from each other in order to avoid cancellation.
As a side note about summation, numpy implements a pairwise summation which has better accuracy than the straightforward sum and is faster (but less accurate) than Kahan summation (a minimal sketch of Kahan summation is given below).
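
For reference, here is a minimal Kahan (compensated) summation sketch in plain Python/NumPy; it is illustrative only and is not taken from the PR or from numpy's internals:

import numpy as np

def kahan_sum(values):
    total = np.float32(0.0)
    c = np.float32(0.0)                # running compensation for lost low bits
    for v in values:
        y = np.float32(v) - c
        t = total + y
        c = (t - total) - y            # what got rounded away in total + y
        total = t
    return total

values = np.full(10**5, np.float32(0.1), dtype=np.float32)

naive = np.float32(0.0)
for v in values:                       # straightforward left-to-right sum
    naive += v

print(naive, kahan_sum(values), values.astype(np.float64).sum())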

Now, we can regroup the 2 loops into:
but this causes loss of precision

What makes you say so? How did you measure it?

Member Author

What makes you say so? How did you measure it?

well, just that the tests fail when I regroup the 2 loops :)


well, just that the tests fail when I regroup the 2 loops :)

Which test? I can't reproduce.

for i in range(y_nnz):
xi = x[y_indices[i]]
partial_x_squared_norm += xi * xi

for i in range(y_nnz):
tmp = y_data[i] - x[y_indices[i]]
result += tmp * tmp
Member

Would using float64 accumulators here help precision and reduce the (low) risk of overflow? Related to #13010

Member Author

Does it force conversion of tmp in result += tmp * tmp in that case? Also, we need to convert the result back to float32 when we want float32.

I should test how it impacts performance.

Member Author

The cost is quite high. In this example:

import numpy as np
from scipy.sparse import csr_matrix

X = np.random.RandomState(0).random_sample((100000, 16)).astype(np.float32)
Y = np.random.RandomState(1).random_sample((100, 16)).astype(np.float32)
X[X < 0.8] = 0
X = csr_matrix(X)

There's a 20% slowdown using a float64 accumulator.

I guess the precision is fine here since it's the exact calculation method.
I'm not sure about the risk of overflow. If the result is too big to be represented in float32, even if you can compute its correct value in float64, when you downcast it to float32 at the end you'll get an inf anyway.
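
A small sketch of that overflow point (illustrative only, not part of the PR): the squared distance can overflow float32 even when a float64 accumulator holds the correct value, and the final downcast produces inf regardless.

import numpy as np

x = np.full(10, np.float32(1e20))
y = np.zeros(10, dtype=np.float32)

acc32 = np.float32(0.0)
for xi, yi in zip(x, y):
    tmp = xi - yi
    acc32 += tmp * tmp                 # (1e20)**2 = 1e40 overflows float32 -> inf
                                       # (numpy may emit an overflow warning)

acc64 = np.float64(0.0)
for xi, yi in zip(x, y):
    tmp = np.float64(xi) - np.float64(yi)
    acc64 += tmp * tmp                 # 1e41 is representable in float64

print(acc32)                           # inf
print(acc64)                           # 1e+41
print(np.float32(acc64))               # downcast to float32 -> inf anyway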


result += x_squared_norm - partial_x_squared_norm

return fmax(result, 0)
212 changes: 172 additions & 40 deletions sklearn/metrics/pairwise.py
@@ -30,6 +30,10 @@
from ..utils._joblib import effective_n_jobs

from .pairwise_fast import _chi2_kernel_fast, _sparse_manhattan
from .pairwise_fast import _euclidean_dense_dense_exact
from .pairwise_fast import _euclidean_dense_dense_fast_sym
from .pairwise_fast import _add_norms
from ._safe_euclidean_sparse import _euclidean_sparse_dense_exact
Member

It might be a bit surprising that these implementations are in different modules. Why did you want them separated?

Member Author

I don't want them to be separated. I had to separate them because I wanted to compile pairwise_fast with the -ffast-math flag. The thing is that with this flag, gcc performs unsafe optimizations for the _safe_euclidean_sparse functions.

The reason I wanted to use this flag for pairwise_fast is that it's much faster, and it should be safe because there's no operation there that gcc would optimize in an unsafe way. By the way, scipy uses this flag for its euclidean distance.
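
For readers unfamiliar with per-extension compiler flags, here is a sketch of how two Cython modules can be built with different options. It is illustrative only and is not the PR's actual setup.py:

# Illustrative build script, not the actual sklearn/metrics/setup.py.
from setuptools import setup, Extension
from Cython.Build import cythonize
import numpy

extensions = [
    # Compiled with -ffast-math: assumed safe because no operation relies
    # on strict IEEE evaluation order.
    Extension("pairwise_fast", sources=["pairwise_fast.pyx"],
              include_dirs=[numpy.get_include()],
              extra_compile_args=["-O3", "-ffast-math"]),
    # No -ffast-math here: gcc would otherwise rearrange the accumulation
    # in an unsafe way.
    Extension("_safe_euclidean_sparse", sources=["_safe_euclidean_sparse.pyx"],
              include_dirs=[numpy.get_include()],
              extra_compile_args=["-O3"]),
]

setup(ext_modules=cythonize(extensions))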



# Utility Functions
@@ -168,20 +172,6 @@ def euclidean_distances(X, Y=None, Y_norm_squared=None, squared=False,
Considering the rows of X (and Y=X) as vectors, compute the
distance matrix between each pair of vectors.

For efficiency reasons, the euclidean distance between a pair of row
vector x and y is computed as::

dist(x, y) = sqrt(dot(x, x) - 2 * dot(x, y) + dot(y, y))

This formulation has two advantages over other ways of computing distances.
First, it is computationally efficient when dealing with sparse data.
Second, if one argument varies but the other remains unchanged, then
`dot(x, x)` and/or `dot(y, y)` can be pre-computed.

However, this is not the most precise way of doing this computation, and
the distance matrix returned by this function may not be exactly
symmetric as required by, e.g., ``scipy.spatial.distance`` functions.

Read more in the :ref:`User Guide <metrics>`.

Parameters
@@ -193,17 +183,43 @@ def euclidean_distances(X, Y=None, Y_norm_squared=None, squared=False,
Y_norm_squared : array-like, shape (n_samples_2, ), optional
Pre-computed dot-products of vectors in Y (e.g.,
``(Y**2).sum(axis=1)``)
May be ignored in some cases, see the note below.

squared : boolean, optional
Return squared Euclidean distances.

X_norm_squared : array-like, shape = [n_samples_1], optional
Pre-computed dot-products of vectors in X (e.g.,
``(X**2).sum(axis=1)``)
May be ignored in some cases, see the note below.

Returns
-------
distances : {array, sparse matrix}, shape (n_samples_1, n_samples_2)
distances : array, shape (n_samples_1, n_samples_2)

Member Author

I kept that change because the original docstring was wrong: the output is never sparse.

Notes
-----
When ``n_features > 32``, the euclidean distance between a pair of row
vectors x and y is computed as::
Member

vector -> vectors


dist(x, y) = sqrt(dot(x, x) - 2 * dot(x, y) + dot(y, y))

This formulation is computationally more efficient than the usual one and
can benefit from pre-computed ``dot(x, x)`` and/or ``dot(y, y)``. When the
input is stored in float32, computations are done by first upcasting ``X``
and ``Y`` to float64 (by chunks, to limit memory usage). In that case,
``X_norm_squared`` and ``Y_norm_squared`` are ignored and recomputed from the
upcast ``X`` and ``Y`` to keep good precision.

However, this is not the most precise way of doing this computation, and
the distance matrix returned by this function may not be exactly
symmetric as required by, e.g., ``scipy.spatial.distance`` functions.

When ``n_features <= 32``, the previous method is not as efficient and is
more likely to suffer from numerical instabilities, so the euclidean
distance between a pair of row vectors x and y is computed as::

dist(x, y) = sqrt(dot(x - y, x - y))

Examples
--------
@@ -224,41 +240,157 @@
"""
X, Y = check_pairwise_arrays(X, Y)

if X_norm_squared is not None:
XX = check_array(X_norm_squared)
if XX.shape == (1, X.shape[0]):
XX = XX.T
elif XX.shape != (X.shape[0], 1):
raise ValueError(
"Incompatible dimensions for X and X_norm_squared")
else:
XX = row_norms(X, squared=True)[:, np.newaxis]
XX, YY = _check_norms(X, Y, X_norm_squared, Y_norm_squared)

n_features = X.shape[1]

# For n_features > 32 we use the 'fast' method to compute the euclidean
# distance, i.e. d(x,y)² = ||x||² + ||y||² - 2 * x.y
# It's faster but less precise.
if n_features > 32:

if X is Y: # shortcut in the common case euclidean_distances(X, X)
YY = XX.T
elif Y_norm_squared is not None:
YY = np.atleast_2d(Y_norm_squared)
# To minimize precision issues with float32, we compute the distance
# matrix on chunks of X and Y upcast to float64
if X.dtype == np.float32:
distances = _euclidean_distances_upcast_fast(X, XX, Y, YY)

if YY.shape != (1, Y.shape[0]):
raise ValueError(
"Incompatible dimensions for Y and Y_norm_squared")
# if dtype is already float64, no need to chunk and upcast
else:
if X is Y and not issparse(X):
# In this case the distance matrix is symmetric, so we only
# need to compute half of it. When X is dense, we can benefit
# from the BLAS triangular matrix matrix multiplication `syrk`.
distances = _euclidean_dense_dense_fast_sym(X, XX)
else:
distances = - 2 * safe_sparse_dot(X, Y.T, dense_output=True)
_add_norms(distances, XX, YY)

# For n_features <= 32, we use the 'exact' method, i.e. the usual method,
# d(x,y)² = ||x - y||².
else:
YY = row_norms(Y, squared=True)[np.newaxis, :]

distances = safe_sparse_dot(X, Y.T, dense_output=True)
distances *= -2
distances += XX
distances += YY
np.maximum(distances, 0, out=distances)
# Since distances are computed between rows of X and Y, it's more efficient
# to work on C-contiguous arrays
if not issparse(X):
X = np.asarray(X, order='C')
if not issparse(Y):
Y = np.asarray(Y, order='C')

# Euclidean distance between 2 sparse vectors is very slow. It's much
# faster to densify one. We densify the smaller one for lower memory
# usage.
if issparse(X) and issparse(Y):
if Y.shape[0] > X.shape[0]:
X = X.toarray()
else:
Y = Y.toarray()

if issparse(X):
distances = _euclidean_sparse_dense_exact(
X.data, X.indices, X.indptr, Y, YY)
elif issparse(Y):
distances = _euclidean_sparse_dense_exact(
Y.data, Y.indices, Y.indptr, X, XX).T
else:
distances = _euclidean_dense_dense_exact(X, Y)

# Ensure that distances between vectors and themselves are set to 0.0.
# This may not be the case due to floating point rounding errors.
if X is Y:
# Ensure that distances between vectors and themselves are set to 0.0.
# This may not be the case due to floating point rounding errors.
distances.flat[::distances.shape[0] + 1] = 0.0
np.fill_diagonal(distances, 0)

return distances if squared else np.sqrt(distances, out=distances)


def _check_norms(X, Y=None, X_norm_squared=None, Y_norm_squared=None):
n_features = X.shape[1]

if n_features > 32 and X.dtype == np.float32:
# In this case, we compute euclidean distances by upcasting to float64.
# It's necessary to compute the norms on the upcast X, and not to upcast
# the norms computed on X to keep good precision, so we don't use
# provided norms.
return None, None
else:
if X_norm_squared is not None:
XX = np.atleast_1d(X_norm_squared).reshape(-1)
if XX.shape != (X.shape[0],):
raise ValueError(
"Incompatible dimensions for X and X_norm_squared")
else:
XX = row_norms(X, squared=True)

if X is Y: # shortcut in the common case euclidean_distances(X, X)
YY = XX
elif Y_norm_squared is not None:
YY = np.atleast_1d(Y_norm_squared).reshape(-1)
if YY.shape != (Y.shape[0],):
raise ValueError(
"Incompatible dimensions for Y and Y_norm_squared")
else:
YY = row_norms(Y, squared=True)

XX = XX.astype(X.dtype, copy=False)
YY = YY.astype(Y.dtype, copy=False)

return XX, YY


def _euclidean_distances_upcast_fast(X, XX, Y, YY):
"""Euclidean distances between X and Y

Assumes X and Y have float32 dtype.
X and Y are upcast to float64 by chunks, whose size is chosen to limit the
memory increase to approximately 10 MiB.
"""
n_samples_X = X.shape[0]
n_samples_Y = Y.shape[0]
n_features = X.shape[1]

distances = np.empty((n_samples_X, n_samples_Y), dtype=np.float32)

maxmem = 10 * 2**17  # this number of float64 values takes 10 MiB of memory.

x_density = X.getnnz() / np.prod(X.shape) if issparse(X) else 1
y_density = Y.getnnz() / np.prod(Y.shape) if issparse(Y) else 1

# The increase in memory usage is:
# - x_density * chunk_size * n_features (copy of chunk of X)
# - y_density * chunk_size * n_features (copy of chunk of Y)
# - chunk_size * chunk_size (chunk of distance matrix)
# Hence x² + (xd+yd)kx = M, where x=chunk_size, k=n_features, M=maxmem
# xd=x_density and yd=y_density
tmp = (x_density + y_density) * n_features
chunk_size = (-tmp + np.sqrt(tmp**2 + 4 * maxmem)) / 2
chunk_size = max(int(chunk_size), 1)
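# Illustrative example (not part of the PR): for dense data
# (x_density = y_density = 1) with n_features = 100,
#   tmp = 2 * 100 = 200
#   chunk_size = (-200 + sqrt(200**2 + 4 * 10 * 2**17)) / 2 ~= 1049
# and indeed 1049**2 + 2 * 1049 * 100 ~= 1.31e6 float64 values, i.e. ~10 MiB.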

n_samples_X_rem = n_samples_X % chunk_size
n_chunks_X = n_samples_X // chunk_size + (n_samples_X_rem != 0)
n_samples_Y_rem = n_samples_Y % chunk_size
n_chunks_Y = n_samples_Y // chunk_size + (n_samples_Y_rem != 0)

for i in range(n_chunks_X):
xs = i * chunk_size
xe = xs + (chunk_size if i < n_chunks_X - 1 else n_samples_X_rem)

X_chunk = X[xs:xe].astype(np.float64)
XX_chunk = row_norms(X_chunk, squared=True)
Member

you won't use the XX and YY that were passed in?

Member Author

In that special case, I won't indeed. The reason is that I only take that path when n_features > 32 and dtype=float32 (for float64 there's no need to upcast, so no need to chunk). In that case I can't use norms computed on float32 data: this is the main reason for the loss of precision. So I need to first upcast X and then compute the norms.
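
A rough sketch of why the float32 norms can't be reused (illustrative only, not from the PR): with offset data, the float32 norm carries an absolute error that can dwarf the true squared distance in the expanded formula.

import numpy as np

rng = np.random.RandomState(0)
X = (rng.random_sample((1, 100)) + 10000).astype(np.float32)
Y = X + np.float32(0.01)                     # true squared distance is tiny

X64, Y64 = X[0].astype(np.float64), Y[0].astype(np.float64)

def expanded_sq_dist(x, y, xx, yy):
    # d(x, y)^2 = ||x||^2 - 2 x.y + ||y||^2
    return xx - 2 * np.dot(x, y) + yy

# Norms computed in float32, then upcast (what reusing XX/YY would amount to).
xx32 = np.float64((X[0] ** 2).sum())
yy32 = np.float64((Y[0] ** 2).sum())

# Norms recomputed on the upcast data.
xx64 = (X64 ** 2).sum()
yy64 = (Y64 ** 2).sum()

print(expanded_sq_dist(X64, Y64, xx32, yy32))   # with float32 norms
print(expanded_sq_dist(X64, Y64, xx64, yy64))   # with float64 norms
print(((X64 - Y64) ** 2).sum())                 # direct reference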

Member

Am I correct to think that you ask for XX and YY to be passed in, but don't use them?

Member Author

Yes I fixed that. I do use them now if they are in float64.


for j in range(n_chunks_Y):
ys = j * chunk_size
ye = ys + (chunk_size if j < n_chunks_Y - 1 else n_samples_Y_rem)

Y_chunk = Y[ys:ye].astype(np.float64)
YY_chunk = row_norms(Y_chunk, squared=True)

d = - 2 * safe_sparse_dot(X_chunk, Y_chunk.T, dense_output=True)
_add_norms(d, XX_chunk, YY_chunk)

distances[xs:xe, ys:ye] = d.astype(np.float32)

return distances


def _argmin_min_reduce(dist, start):
indices = dist.argmin(axis=1)
values = dist[np.arange(dist.shape[0]), indices]