FIX euclidean_distances float32 numerical instabilities #13554

Merged: 105 commits, merged Apr 29, 2019
d38d8a0
vect vect euclidean
jeremiedbb Mar 4, 2019
01767d6
minmem
jeremiedbb Apr 1, 2019
83cada9
docstring
jeremiedbb Apr 1, 2019
ecb3d2c
lint
jeremiedbb Apr 1, 2019
6b140db
unrelated
jeremiedbb Apr 1, 2019
558b988
update tests + fix int
jeremiedbb Apr 1, 2019
de9d217
fix chunk size
jeremiedbb Apr 1, 2019
395be7a
fix sparse slice bounds
jeremiedbb Apr 2, 2019
9c79570
fix what's new
jeremiedbb Apr 2, 2019
4599379
tmp
jeremiedbb Apr 5, 2019
7425821
use gen_batches
jeremiedbb Apr 5, 2019
a2504af
nitpick
jeremiedbb Apr 5, 2019
fee1258
adress what's comments + clean naming
jeremiedbb Apr 12, 2019
fe74973
clearer comment
jeremiedbb Apr 15, 2019
7f5f257
Update doc/whats_new/v0.21.rst
glemaitre Apr 25, 2019
46fb590
Update doc/whats_new/v0.21.rst
glemaitre Apr 25, 2019
d8a2341
Update doc/whats_new/v0.21.rst
glemaitre Apr 25, 2019
e0a0dc5
DOC Fix missing space after backquotes (#13551)
Framartin Apr 1, 2019
4b6e707
FIX Explicitly ignore SparseEfficiencyWarning in DBSCAN (#13539)
peay Apr 1, 2019
dd634db
FIX _estimate_mi discrete_features str and value check (#13497)
hermidalc Apr 1, 2019
0e916a5
DOC Changed the docstring of class LinearSVR to reflect the default v…
mani2106 Apr 2, 2019
78cf2c6
DOC Added doc on how to override estimator tags (#13550)
NicolasHug Apr 2, 2019
1a96e1e
MNT Delete _scipy_sparse_lsqr_backport.py (#13569)
qinhanmin2014 Apr 3, 2019
46c4560
DOC Clarify eps parameter importance in DBSCAN (#13563)
kno10 Apr 3, 2019
dbbd855
DOC Minor typo in GridSearchCV (#13571)
NicolasHug Apr 3, 2019
77b456f
DOC add NicolasHug and thomasjpfan in authors list
TomDLT Apr 3, 2019
6dace0c
[MRG] Changed name n_components to n_connected_components in Agglomer…
Apr 4, 2019
37ef0c3
FIX non deterministic behavior in SGD (#13422)
ClemDoum Apr 4, 2019
1d03e13
ENH Allow nd array for CalibratedClassifierCV (#13485)
wdevazelhes Apr 4, 2019
1d5dca9
DOC reference tree structure example from user guide (#13561)
mani2106 Apr 4, 2019
bf1af7d
DOC correct reference to target in load_linnerud docstring (#13577)
mwestt Apr 4, 2019
b4fb670
FIX DummyEstimator when y is a 2d column vector (#13545)
adrinjalali Apr 5, 2019
2de3969
FIX Take sample weights into account in partial dependence computatio…
samronsin Apr 5, 2019
800ecae
DOC Add lucidfrontier to the emeritus core devs (#13579)
amueller Apr 6, 2019
5eff972
FIX MissingIndicator explicit zeros & output shape (#13562)
jeremiedbb Apr 6, 2019
5ebcef0
ENH cross_val_predict now handles multi-output predict_proba (#8773)
Apr 6, 2019
532f110
Use fixed random seed for generating X in test_mlp.test_gradient() (#…
aditya1702 Apr 7, 2019
37ad253
DOC fix typo in comments for svm/classes.py (#13589)
kfrncs Apr 7, 2019
984719c
DOC fix typo in contributing.rst (#13593)
mabdelaal86 Apr 7, 2019
d0b7441
Improve pipeline parameter error msg (#13536)
okz12 Apr 9, 2019
14a3f16
FEA Parameter for stacking missing indicator into imputer (#12583)
DanilBaibak Apr 9, 2019
1fd5b71
ENH Convert y in GradientBoosting to float64 instead of float32 (#13524)
adrinjalali Apr 9, 2019
7558d50
TST Fixes to make test_pprint.py more resilient to change (#13529)
Apr 9, 2019
b4d0527
FIX Fixed array equality check in pprint (#13584)
NicolasHug Apr 9, 2019
adceb7d
FEA VotingRegressor (#12513)
Apr 10, 2019
127cc41
CI update PyPy image to pypy-3-7.0.0 (#13600)
adrinjalali Apr 10, 2019
37b099b
Fix empty clusters not correctly relocated when using sample_weight(#…
jeremiedbb Apr 12, 2019
bdea46f
DOC Add project_urls to setup.py (#13623)
jarrodmillman Apr 12, 2019
d91682a
MNT Use a common language_level cython directive (#13630)
jeremiedbb Apr 13, 2019
d7815e3
[MRG] DOC Correct coef_ shape in RidgeClassifier (#13633)
qinhanmin2014 Apr 13, 2019
8a966f4
DOC Adds recent core devs to _contributors.rst (#13640)
thomasjpfan Apr 14, 2019
96ff3b0
DOC t-SNE perplexity docstring update (#13069)
kjacks21 Apr 14, 2019
c21be57
FIX feature importances in random forests sum up to 1 (#13636)
adrinjalali Apr 15, 2019
c3e72f2
DOC Removed a typo from the examples of normalized_mutual_info_score …
jfbeaumont Apr 15, 2019
78cb1b3
MAINT: n_jobs=-1 replaced with n_jobs=4 in tests (#13644)
oleksandr-pavlyk Apr 15, 2019
0e4f561
Add parameter for stacking missing indicator into iterative imputer (…
DanilBaibak Apr 15, 2019
383b132
FIX ignore single node trees in gbm's feature importances (#13620)
adrinjalali Apr 16, 2019
3801caf
DOC typo in sklearn.utils.extmath.weighted_mode (#13655)
Masstran Apr 16, 2019
5dc1c46
ENH Extension of v_measure_score metric to include beta parameter (#1…
Apr 16, 2019
b928396
BUG Fix missing 'const' in a few memoryview declaration in trees. (#1…
jeremiedbb Apr 16, 2019
b957edd
BLD: check OpenMP support and add a switch to build without it (#13543)
jeremiedbb Apr 16, 2019
c0fb225
FIX initialise Birch centroid_ in all cases (#13651)
jnothman Apr 16, 2019
95d39ca
DOC Improve performance of the plot_rbm_logistic_classification.py ex…
Framartin Apr 17, 2019
f11f647
MNT Import linear assignment from scipy (#13465)
praths007 Apr 17, 2019
306d202
DOC Fixes formatting issue in webkit (#13657)
thomasjpfan Apr 17, 2019
78c9cb5
Fixing parameter description (for assume_centered) (#13456)
falaktheoptimist Apr 17, 2019
66fa659
MAINT Unvendor joblib (#13531)
rth Apr 17, 2019
f229708
Improve comment in setup.py (#13661)
kfrncs Apr 17, 2019
6b29d38
MAINT Replace absolute imports with relative imports (#13653)
aditya1702 Apr 17, 2019
876908e
[MRG] Fix various solver issues in ridge_regression and Ridge classes…
btel Apr 18, 2019
04fcbce
[MRG + 1] Fix pprint ellipsis (#13436)
NicolasHug Apr 18, 2019
cc2c186
DOC Remove space in "cross-entropy" (#13671)
orestisfl Apr 18, 2019
02e864e
FIX broken references in documentation (#13664)
ogrisel Apr 18, 2019
740c0fb
DOC Remove synonyms in documentation of linear models (#13663)
tommyod Apr 18, 2019
247f0e7
MNT Correctly handle deprecated attribute warnings and docstrings (#1…
NicolasHug Apr 18, 2019
62b5a85
Deprecate "warn_on_dtype" from check_array (#13382)
praths007 Apr 19, 2019
3b54222
Fix MultiOutputClassifier checking for predict_proba method of base e…
rebekahkim Apr 19, 2019
63cd7d4
fix small latex issue (#13680)
NicolasHug Apr 19, 2019
e8be8aa
Fix sample_weight in label_ranking_average_precision_score (#13447)
dpwe Apr 20, 2019
78acc98
DOC Emeritus core devs final call (#13673)
amueller Apr 20, 2019
2a2caff
ENH Add verbose option to Pipeline, FeatureUnion, and ColumnTransform…
thomasjpfan Apr 21, 2019
5a1f05a
MNT Minor clean up in OneVsOneClassifier (#13677)
qinhanmin2014 Apr 22, 2019
e5a8e0b
DOC: Fixes for latest numpydoc (#13670)
larsoner Apr 22, 2019
0ca7c96
Typo (#13693)
jarrodmillman Apr 22, 2019
7005ab4
FIX make sure vectorizers read data from file before analyzing (#13641)
adrinjalali Apr 23, 2019
8e3cc5e
DOC: Add SLEP and Governance in Dev Docs (#13688)
bharatr21 Apr 23, 2019
18dc3a9
MAINT: minor fix to whats_new (#13695)
adrinjalali Apr 23, 2019
32851c1
DOC Describe what's new categories (#13697)
jnothman Apr 23, 2019
f818d86
MNT Minor clean up in OneVsRestClassifier (#13675)
qinhanmin2014 Apr 23, 2019
32ea647
MAINT: reduce example execution time of plot_image_denoising.py (#13683)
xinyuliu12 Apr 23, 2019
9cf6606
DOC better wording in changelog legend
jnothman Apr 24, 2019
8d4bffc
ENH Support Haversine distance in pairwise_distances (#12568)
eamanu Apr 24, 2019
5135252
FEA Partial dependence plots (#12599)
NicolasHug Apr 24, 2019
e15e391
MAINT Uses debian stretch for circleci doc building (#13642)
thomasjpfan Apr 24, 2019
9a5b3ac
FEA Add a stratify option to utils.resample (#13549)
NicolasHug Apr 24, 2019
77ac3df
additional tests for mean_shift algo (#13179)
rajdeepd Apr 25, 2019
184cd2c
DOC use :pr: rather than :issue: in what's new (#13701)
jnothman Apr 25, 2019
fbcfc52
DOC fix remaining :issue: (#13716)
glemaitre Apr 25, 2019
e5e2848
vect vect euclidean
jeremiedbb Mar 4, 2019
32c9f99
merge
jeremiedbb Apr 25, 2019
0f346f5
adress comments
jeremiedbb Apr 25, 2019
fce0713
Merge remote-tracking branch 'upstream/master' into euclidean_dist_up…
jeremiedbb Apr 25, 2019
117529e
astype=False per Roman's comment
jnothman Apr 29, 2019
9c205c1
Mib -> MiB
jeremiedbb Apr 29, 2019
985037a
back to copy=False
jeremiedbb Apr 29, 2019
11 changes: 8 additions & 3 deletions doc/whats_new/v0.21.rst
@@ -347,9 +347,14 @@ Support for Python 3.4 and below has been officially dropped.
:issue:`12855` by :user:`Pawel Sendyk <psendyk>`.

- |API| The parameter ``labels`` in :func:`metrics.hamming_loss` is deprecated
in version 0.21 and will be removed in version 0.23.
:issue:`10580` by :user:`Reshama Shaikh <reshamas>` and :user:`Sandra
Mitrovic <SandraMNE>`.
in version 0.21 and will be removed in version 0.23. :issue:`10580` by
:user:`Reshama Shaikh <reshamas>` and :user:`Sandra Mitrovic <SandraMNE>`.

- |Fix| The function :func:`euclidean_distances`, and therefore the functions
Comment (Member):
maybe more relevant: "and therefore several estimators with metric='euclidean'"

:func:`pairwise_distances` and :func:`pairwise_distances_chunked` with
``metric=euclidean``, suffered from numerical precision issues. Precision has
Comment (Member):
"with float32 features"

been increased for float32. :issue:`13554` by :user:` <Celelibi>` and
Comment (Member):
I think correct syntax is just :user:`Celelibi`

:user:`Jérémie du Boisberranger <jeremiedbb>`.

- |API| :func:`metrics.jaccard_similarity_score` is deprecated in favour of
the more consistent :func:`metrics.jaccard_score`. The former behavior for
99 changes: 90 additions & 9 deletions sklearn/metrics/pairwise.py
@@ -203,7 +203,7 @@ def euclidean_distances(X, Y=None, Y_norm_squared=None, squared=False,

Returns
-------
distances : {array, sparse matrix}, shape (n_samples_1, n_samples_2)
distances : array, shape (n_samples_1, n_samples_2)
Comment (Member):
I put the comment here but I think that it should be written above.

I would either add a note (using sphinx syntax) or mention in the parameter descriptions that X_norm_squared and Y_norm_squared will be ignored if they are passed as float32, since they would lead to inaccurate distances.
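A possible wording for such a note, for illustration only (the exact phrasing is not taken from the PR), shown here as a reST fragment held in a Python string:

note_sketch = """
.. note::
    ``X_norm_squared`` and ``Y_norm_squared`` are ignored if they are passed
    as float32, because they may yield inaccurate distances; in that case the
    squared norms are recomputed in float64 from the data.
"""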


Examples
--------
@@ -231,34 +231,115 @@ def euclidean_distances(X, Y=None, Y_norm_squared=None, squared=False,
elif XX.shape != (X.shape[0], 1):
raise ValueError(
"Incompatible dimensions for X and X_norm_squared")
if XX.dtype == np.float32:
XX = None
Comment (Member):
should we raise a warning that these cannot be used??

Comment (Member Author):
Why not, but I'm afraid it could generate a lot of warnings, for instance when pairwise_distances is called in a loop.

One thing we should certainly do is add an option to row_norms to use a float64 accumulator. That's on my todo list :)
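A minimal sketch of what such an option could look like (a hypothetical helper, not part of this PR; the real row_norms lives in sklearn.utils.extmath):

import numpy as np
from scipy import sparse


def row_norms_float64(X, squared=False):
    # Squared row norms accumulated in float64, even for float32 input,
    # to avoid the precision loss discussed above.
    if sparse.issparse(X):
        X = X.tocsr()
        rows = np.repeat(np.arange(X.shape[0]), np.diff(X.indptr))
        norms = np.zeros(X.shape[0], dtype=np.float64)
        # accumulate the squared non-zero values row by row, in float64
        np.add.at(norms, rows, X.data.astype(np.float64) ** 2)
    else:
        # einsum with an explicit float64 dtype forces double-precision
        # accumulation of the dot products
        norms = np.einsum('ij,ij->i', X, X, dtype=np.float64)
    return norms if squared else np.sqrt(norms)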

elif X.dtype == np.float32:
XX = None
else:
XX = row_norms(X, squared=True)[:, np.newaxis]

if X is Y: # shortcut in the common case euclidean_distances(X, X)
if X is Y and XX is not None:
# shortcut in the common case euclidean_distances(X, X)
YY = XX.T
elif Y_norm_squared is not None:
YY = np.atleast_2d(Y_norm_squared)

if YY.shape != (1, Y.shape[0]):
raise ValueError(
"Incompatible dimensions for Y and Y_norm_squared")
if YY.dtype == np.float32:
YY = None
elif Y.dtype == np.float32:
YY = None
Comment (Member):
At first glance, it is not obvious that None is used by _euclidean_distances_upcast.

Comment (Member Author):
What do you mean? XX and YY are used by _euclidean_distances_upcast no matter their value.

Comment (Member):
Setting XX or YY to None tells _euclidean_distances_upcast to call row_norms(*_chunk, squared=True)[:, np.newaxis].

Without reading _euclidean_distances_upcast, it is difficult to tell why XX or YY are set to None.

Comment (Member):
I think that we would need a small comment to clarify this part.
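For instance, something along these lines next to the assignment (the wording is only a suggestion, not taken from the PR):

if XX.dtype == np.float32:
    # float32 norms are not precise enough; discard them so that
    # _euclidean_distances_upcast recomputes them in float64,
    # chunk by chunk, from the upcast data
    XX = None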

else:
YY = row_norms(Y, squared=True)[np.newaxis, :]

distances = safe_sparse_dot(X, Y.T, dense_output=True)
distances *= -2
distances += XX
distances += YY
if X.dtype == np.float32:
# To minimize precision issues with float32, we compute the distance
# matrix on chunks of X and Y upcast to float64
distances = _euclidean_distances_upcast_fast(X, XX, Y, YY)
else:
# if dtype is already float64, no need to chunk and upcast
distances = - 2 * safe_sparse_dot(X, Y.T, dense_output=True)
distances += XX
distances += YY
np.maximum(distances, 0, out=distances)

# Ensure that distances between vectors and themselves are set to 0.0.
Comment (Member):
why is this out of the if? It's only applicable if the if holds.

(Curiously we currently appear to duplicate this logic in pairwise_distances_chunked but not in pairwise_distances)

Comment (Member Author):
you mean zeroing the diagonal? It applies to both methods.

It's necessary to also do it in pairwise_distances_chunked, because we lose the diagonal info when passing chunks to pairwise_distances.
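A small illustration of the point (made-up data; the residual on the diagonal may or may not be exactly zero depending on the values):

import numpy as np
from sklearn.metrics.pairwise import euclidean_distances

rng = np.random.RandomState(0)
X = rng.random_sample((5, 3)).astype(np.float32)

# full computation: X is Y, so the diagonal is explicitly set to 0
print(np.diag(euclidean_distances(X, X)))

# chunk against the full X: the "X is Y" shortcut no longer applies, so the
# self-distances are only approximately zero (rounding noise) unless the
# caller re-zeroes them afterwards
chunk = euclidean_distances(X[2:4], X)
print(chunk[0, 2], chunk[1, 3])  # the would-be diagonal entries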

Comment (Member):
Yes, but I don't think we zero the diagonal in pairwise_distances at all for other metrics

Comment (Member Author):
I was looking at euclidean_distances, not pairwise_distances... That's strange indeed. Even for metric=euclidean it's not enforced when n_jobs > 1. Let me put that also on my todo list :)

# This may not be the case due to floating point rounding errors.
if X is Y:
# Ensure that distances between vectors and themselves are set to 0.0.
# This may not be the case due to floating point rounding errors.
distances.flat[::distances.shape[0] + 1] = 0.0
np.fill_diagonal(distances, 0)

return distances if squared else np.sqrt(distances, out=distances)


def _euclidean_distances_upcast_fast(X, XX=None, Y=None, YY=None):
Comment (Member):
"_fast" is a bit obscure

Comment (Member Author):
it's a residue of my previous PR implementing the fast and exact methods. I removed it.

"""Euclidean distances between X and Y

Assumes X and Y have float32 dtype.
Assumes XX and YY have float64 dtype or are None.

X and Y are upcast to float64 in chunks, whose size is chosen to limit
memory increase by approximately 10%.
Comment (Member):
Suggested change
memory increase by approximately 10%.
memory increase by approximately 10% (at least 10 MiB).

"""
n_samples_X = X.shape[0]
n_samples_Y = Y.shape[0]
n_features = X.shape[1]

distances = np.empty((n_samples_X, n_samples_Y), dtype=np.float32)

x_density = X.nnz / np.prod(X.shape) if issparse(X) else 1
y_density = Y.nnz / np.prod(Y.shape) if issparse(Y) else 1

# Allow 10% more memory than X, Y and the distance matrix take (at least
# 10Mib)
Comment (Member):
Can we use MiB (or MB), as for all other cache sizes in scikit-learn? The Mib unit can lead to confusion when comparing to the CPU L3 cache size or to working_memory.

maxmem = max(
((x_density * n_samples_X + y_density * n_samples_Y) * n_features
+ (x_density * n_samples_X * y_density * n_samples_Y)) / 10,
Comment (@rth, Member, Apr 27, 2019):
This makes the assumption that sizeof(dtype) == 8 without making it appear in the computation, I think?

Maybe we could add X.dtype.itemsize / 8 even if that is equal to 1, to make it easier to follow what is going on.

Also not that it matters too much, but

  1. for dense, isn't that 5% of (X, Y, distances), as those are float32? i.e. X.nbytes == x_density * n_samples_X * 4 / 8?
  2. for sparse it's roughly 10% if it is CSR and we account for X.data and X.indptr in 32 bit, but again it's far from obvious for a casual reader.

Actually, maybe,

from scipy.sparse import issparse

def get_array_nbytes(X):
    if issparse(X):
        if hasattr(X, 'indices'):
            # CSR or CSC
            return X.data.nbytes + X.indices.nbytes + X.indptr.nbytes
        else:
            # some other sparse format, assume 8 bytes to index
            # one non zero element (e.g. COO)
            return X.data.nbytes + X.nnz * 8
    else:
        return X.nbytes

maxmem = (get_array_nbytes(X) + get_array_nbytes(Y) + get_array_nbytes(distances)) / 10

Comment (Member Author):
> This makes the assumption that sizeof(dtype) == 8 without making it appear in the computation, I think?

This function is only used for float32 X and Y. I've explained it in its docstring.

> for sparse it's roughly 10% if it is CSR and we account for X.data and X.indptr in 32 bit, but again it's far from obvious for a casual reader.

When you change the type of the csr matrix, only a copy of data is made. indices and indptr are still int, so I think this formula is correct

Comment (Member):
> When you change the type of the csr matrix, only a copy of data is made.

Are you sure?

>>> import scipy.sparse
>>> import numpy as np
>>> X = scipy.sparse.csr_matrix(np.ones((10, 10)))
>>> X.dtype
dtype('float64')
>>> Y = X.astype(np.float32)
>>> np.shares_memory(Y.data, Y.data)
True
>>> np.shares_memory(Y.data, X.data)
False
>>> np.shares_memory(Y.indices, X.indices)
False

That's not critical; my point here is that it might be better (not necessarily in this PR) to use ndarray.nbytes rather than trying to re-compute it from scratch. If we get this calculation wrong we won't likely know, since no test will fail and we will just be reasoning with the wrong memory sizes.

10 * 2**17)
Comment:
Why this choice?
In my tests for #11271 I found that the computation time slowly increases with the memory size. It's in the first few plots of this comment: #11271 (comment).
I'm mostly curious about this choice. I don't think this should prevent this PR from being merged.

Comment (Member Author):
A fixed 10Mib maxmem can lead to very small chunks when the number of features is very large (an extreme case, I agree). In that case the drop in performance can be quite severe.

Another possibility could be a fixed maxmem with a minimum chunk_size. I think it's kind of equivalent, but in that case the memory increase can be bigger than expected, whereas in the previous case the memory increase is bounded. That's why I went for the first one.

Comment:
Large chunks would also decrease cache efficiency. Although I'm not a big fan of having an unbounded amount of additional memory, 10% of the memory already used is probably fair.

When I look at my old benchmarks, I think that there might be something going on under the hood. Maybe the optimal block size is actually constant. Moreover, if the number of features is large, it can also be chunked. That might deserve to be explored as well.
I think we can add to the todo-list to benchmark the influence of the memory size and chunking to get a proper understanding of the performance.

Comment (Member):
If the input data is memmapped, calculating this as a function of n_features may not be fair.

Comment (Member):
I think my comment about memmapping was incorrect in any case.

Comment (Member):
This logic seems to be more complicated than necessary, but okay.

Comment (Member):
I am fine to merge with this heuristic.

In the future, I am wondering if we could expose a chunk_size parameter which by default would apply such a heuristic and could be fine-tuned by the user. @jeremiedbb mentioned to me that it might not be that easy, since it would impact pairwise_distances, where we would also need to expose this parameter, etc. Just a thought.


# The increase amount of memory is:
Comment (Member):
add "in 8-byte blocks"... in fact where do you account for this?

Comment (Member Author):
maxmem is the max allowed number of 8-byte blocks for extra memory usage.
10 MiB is 10 * 2**20 bytes = 10 * 2**17 8-byte blocks.
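To make the orders of magnitude concrete, here is the heuristic applied to made-up dense shapes (densities of 1; the numbers are purely illustrative):

import numpy as np

n_samples_X, n_samples_Y, n_features = 100_000, 100_000, 100

# extra memory allowance, counted in 8-byte (float64) elements
maxmem = max(
    ((n_samples_X + n_samples_Y) * n_features
     + n_samples_X * n_samples_Y) / 10,
    10 * 2**17)  # 10 MiB = 10 * 2**20 bytes = 10 * 2**17 8-byte blocks

# solve batch_size**2 + (xd + yd) * n_features * batch_size = maxmem
tmp = 2 * n_features  # (xd + yd) * n_features, with both densities equal to 1
batch_size = max(int((-tmp + np.sqrt(tmp ** 2 + 4 * maxmem)) / 2), 1)
print(batch_size)  # ~31500 rows per chunk for these shapes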

# - x_density * batch_size * n_features (copy of chunk of X)
# - y_density * batch_size * n_features (copy of chunk of Y)
# - batch_size * batch_size (chunk of distance matrix)
# Hence x² + (xd+yd)kx = M, where x=batch_size, k=n_features, M=maxmem
Comment (Member):
When a grade school student asks, "Why are we learning the quadratic equation?", we have an answer.

# xd=x_density and yd=y_density
tmp = (x_density + y_density) * n_features
batch_size = (-tmp + np.sqrt(tmp**2 + 4 * maxmem)) / 2
batch_size = max(int(batch_size), 1)
rth marked this conversation as resolved.

x_batches = gen_batches(X.shape[0], batch_size)
y_batches = gen_batches(Y.shape[0], batch_size)

for i, x_slice in enumerate(x_batches):
X_chunk = X[x_slice].astype(np.float64)
if XX is None:
XX_chunk = row_norms(X_chunk, squared=True)[:, np.newaxis]
else:
XX_chunk = XX[x_slice]

for j, y_slice in enumerate(y_batches):
if X is Y and j < i:
# when X is Y the distance matrix is symmetric so we only need
# to compute half of it.
d = distances[y_slice, x_slice].T
Comment (@rth, Member, Apr 27, 2019):
It must be correct since the corresponding tests pass, but after thinking about it for 20s I'm still not confident that when we compute distances[x_slice, y_slice], distances[y_slice, x_slice] is already computed, is not null and can be copied from.

How about just adding a continue here, then adding a second loop below:

if X is Y:
    # the result is symmetric: copy distances from the already-computed
    # blocks of the other triangle (x_batches / y_batches would need to be
    # lists to be iterated a second time)
    for i, x_slice in enumerate(x_batches):
        for j, y_slice in enumerate(y_batches):
            if j < i:
                distances[x_slice, y_slice] = distances[y_slice, x_slice].T

it shouldn't matter too much for run time I think, but it might make things a bit more explicit?

Comment (Member Author):
> still not confident that when we compute distances[x_slice, y_slice], distances[y_slice, x_slice] is already computed

It is, since we compute the chunks with i (rows) as the outer loop. This implies that every chunk of the upper right triangular part of the distance matrix is computed before its symmetric chunk in the lower left triangular part.
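A quick way to check that ordering with a toy example (gen_batches is the real sklearn.utils helper; the sizes are arbitrary):

from sklearn.utils import gen_batches

n, batch_size = 5, 2
order = []
for i, x_slice in enumerate(gen_batches(n, batch_size)):
    for j, y_slice in enumerate(gen_batches(n, batch_size)):
        order.append((i, j))

# every lower-triangle block (i, j) with j < i is visited after its
# symmetric upper-triangle block (j, i), so the copy always reads
# already-computed values
for i, j in order:
    if j < i:
        assert order.index((j, i)) < order.index((i, j))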


else:
Y_chunk = Y[y_slice].astype(np.float64)
if YY is None:
YY_chunk = row_norms(Y_chunk, squared=True)[np.newaxis, :]
else:
YY_chunk = YY[:, y_slice]

d = -2 * safe_sparse_dot(X_chunk, Y_chunk.T, dense_output=True)
d += XX_chunk
d += YY_chunk

distances[x_slice, y_slice] = d.astype(np.float32)
Comment (Member):
Add copy=False: in the case where d = distances[y_slice, x_slice].T, it is already float32, and astype makes a copy by default.

Comment (Member Author):
this function only computes euclidean distances in float64 and downcasts the result to float32 at the end, so there will always be a copy. Actually I find it clearer not to add the copy=False parameter, to emphasize that.
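For reference, the NumPy copy semantics this exchange hinges on (a standalone check, not code from the PR):

import numpy as np

d64 = np.ones((2, 2), dtype=np.float64)
d32 = np.ones((2, 2), dtype=np.float32)

# astype always copies when the dtype actually changes, even with copy=False
assert not np.shares_memory(d64.astype(np.float32, copy=False), d64)
# copy=False only avoids the copy when the array is already float32
assert np.shares_memory(d32.astype(np.float32, copy=False), d32)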


return distances


def _argmin_min_reduce(dist, start):
indices = dist.argmin(axis=1)
values = dist[np.arange(dist.shape[0]), indices]
113 changes: 93 additions & 20 deletions sklearn/metrics/tests/test_pairwise.py
@@ -540,41 +540,114 @@ def test_pairwise_distances_chunked():
assert_raises(StopIteration, next, gen)


def test_euclidean_distances():
# Check the pairwise Euclidean distances computation
X = [[0]]
Y = [[1], [2]]
@pytest.mark.parametrize("x_array_constr", [np.array, csr_matrix],
ids=["dense", "sparse"])
@pytest.mark.parametrize("y_array_constr", [np.array, csr_matrix],
ids=["dense", "sparse"])
def test_euclidean_distances_known_result(x_array_constr, y_array_constr):
# Check the pairwise Euclidean distances computation on known result
X = x_array_constr([[0]])
Y = y_array_constr([[1], [2]])
D = euclidean_distances(X, Y)
assert_array_almost_equal(D, [[1., 2.]])
assert_allclose(D, [[1., 2.]])

X = csr_matrix(X)
Y = csr_matrix(Y)
D = euclidean_distances(X, Y)
assert_array_almost_equal(D, [[1., 2.]])

@pytest.mark.parametrize("dtype", [np.float32, np.float64])
@pytest.mark.parametrize("y_array_constr", [np.array, csr_matrix],
ids=["dense", "sparse"])
def test_euclidean_distances_with_norms(dtype, y_array_constr):
# check that we still get the right answers with {X,Y}_norm_squared
# and that we get a wrong answer with wrong {X,Y}_norm_squared
rng = np.random.RandomState(0)
X = rng.random_sample((10, 4))
Y = rng.random_sample((20, 4))
X_norm_sq = (X ** 2).sum(axis=1).reshape(1, -1)
Y_norm_sq = (Y ** 2).sum(axis=1).reshape(1, -1)
X = rng.random_sample((10, 10)).astype(dtype, copy=False)
Y = rng.random_sample((20, 10)).astype(dtype, copy=False)

# norms will only be used if their dtype is float64
X_norm_sq = (X.astype(np.float64) ** 2).sum(axis=1).reshape(1, -1)
Y_norm_sq = (Y.astype(np.float64) ** 2).sum(axis=1).reshape(1, -1)

Y = y_array_constr(Y)

# check that we still get the right answers with {X,Y}_norm_squared
D1 = euclidean_distances(X, Y)
D2 = euclidean_distances(X, Y, X_norm_squared=X_norm_sq)
D3 = euclidean_distances(X, Y, Y_norm_squared=Y_norm_sq)
D4 = euclidean_distances(X, Y, X_norm_squared=X_norm_sq,
Y_norm_squared=Y_norm_sq)
assert_array_almost_equal(D2, D1)
assert_array_almost_equal(D3, D1)
assert_array_almost_equal(D4, D1)
assert_allclose(D2, D1)
assert_allclose(D3, D1)
assert_allclose(D4, D1)

# check we get the wrong answer with wrong {X,Y}_norm_squared
X_norm_sq *= 0.5
Y_norm_sq *= 0.5
wrong_D = euclidean_distances(X, Y,
X_norm_squared=np.zeros_like(X_norm_sq),
Y_norm_squared=np.zeros_like(Y_norm_sq))
assert_greater(np.max(np.abs(wrong_D - D1)), .01)
with pytest.raises(AssertionError):
assert_allclose(wrong_D, D1)


@pytest.mark.parametrize("dtype", [np.float32, np.float64])
@pytest.mark.parametrize("x_array_constr", [np.array, csr_matrix],
ids=["dense", "sparse"])
@pytest.mark.parametrize("y_array_constr", [np.array, csr_matrix],
ids=["dense", "sparse"])
def test_euclidean_distances(dtype, x_array_constr, y_array_constr):
# check that euclidean distances gives same result as scipy cdist
# when X and Y != X are provided
rng = np.random.RandomState(0)
X = rng.random_sample((100, 10)).astype(dtype, copy=False)
X[X < 0.8] = 0
Y = rng.random_sample((10, 10)).astype(dtype, copy=False)
Y[Y < 0.8] = 0

expected = cdist(X, Y)

X = x_array_constr(X)
Y = y_array_constr(Y)
distances = euclidean_distances(X, Y)

assert_allclose(distances, expected, rtol=1e-6)
Comment (Member):
a small comment regarding rtol=1e-6 would be welcome

assert distances.dtype == dtype


@pytest.mark.parametrize("dtype", [np.float32, np.float64])
@pytest.mark.parametrize("x_array_constr", [np.array, csr_matrix],
ids=["dense", "sparse"])
def test_euclidean_distances_sym(dtype, x_array_constr):
# check that euclidean distances gives same result as scipy pdist
# when only X is provided
rng = np.random.RandomState(0)
X = rng.random_sample((100, 10)).astype(dtype, copy=False)
X[X < 0.8] = 0

expected = squareform(pdist(X))

X = x_array_constr(X)
distances = euclidean_distances(X)

assert_allclose(distances, expected, rtol=1e-6)
assert distances.dtype == dtype


@pytest.mark.parametrize("dtype, s",
Comment (Member):
Maybe we could have a better name for s

[(np.float32, 1e-4),
(np.float64, 1e-8)])
Comment (Member):
Suggested change
(np.float64, 1e-8)])
pytest.param(np.float64, 1e-8, marks=pytest.mark.xfail(reason='failing due to lack of precision'))])

@pytest.mark.parametrize("dim", [1, 1000000])
def test_euclidean_distances_extreme_values(dtype, s, dim):
# check that euclidean distances is correct where 'fast' method wouldn't be
Comment (Member):
fast method?

Comment (Member Author):
I made the comment clearer

# on float32 thanks to upcasting and is not on float64.
X = np.array([[1.] * dim], dtype=dtype)
Y = np.array([[1. + s] * dim], dtype=dtype)

distances = euclidean_distances(X, Y)
expected = cdist(X, Y)

if dtype == np.float64:
# This is expected to fail for float64 due to lack of precision of the
# fast method of euclidean distances.
with pytest.raises(AssertionError, match='Not equal to tolerance'):
assert_allclose(distances, expected, rtol=(1 - 0.001))
else:
assert_allclose(distances, expected, rtol=1e-5)


def test_cosine_distances():