scikit-learn · Celelibi · Jun 12, 2018 · Jun 12, 2018 · Jun 12, 2018 · Jun 12, 2018
diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py
@@ -22,6 +22,7 @@
 from ..utils import check_array
 from ..utils import gen_even_slices
 from ..utils import gen_batches, get_chunk_n_rows
+from ..utils.fixes import _cast_if_needed
 from ..utils.extmath import row_norms, safe_sparse_dot
 from ..preprocessing import normalize
 from ..externals.joblib import Parallel
@@ -161,6 +162,121 @@ def check_paired_arrays(X, Y):
 
 
 # Pairwise distances
+def _euclidean_distances_float64(X, Y, Y_norm_squared=None, squared=False,
+                                 X_norm_squared=None):
+    """
+    Compute the euclidean distances between X and Y when both are arrays of
+    float64.
+    """
+    if X_norm_squared is not None:
+        XX = X_norm_squared
+    else:
+        XX = row_norms(X, squared=True)[:, np.newaxis]
+
+    if X is Y:  # shortcut in the common case euclidean_distances(X, X)
+        YY = XX.T
+    elif Y_norm_squared is not None:
+        YY = Y_norm_squared
+    else:
+        YY = row_norms(Y, squared=True)[np.newaxis, :]
+
+    distances = safe_sparse_dot(X, Y.T, dense_output=True)
+    distances *= -2
+    distances += XX
+    distances += YY
+    np.maximum(distances, 0, out=distances)
+
+    if X is Y:
+        # Ensure that distances between vectors and themselves are set to 0.0.
+        # This may not be the case due to floating point rounding errors.
+        distances.flat[::distances.shape[0] + 1] = 0.0
+
+    return distances if squared else np.sqrt(distances, out=distances)
+
+
+def _euclidean_distances_cast(X, Y, outdtype, Y_norm_squared=None,
+                              squared=False, X_norm_squared=None):
+    """
+    Compute the euclidean distance matrix between X and Y by casting to float64
+    for a greater numerical stability.
+
+    The computation is done by blocks to limit additional memory usage.
+    """
+    # For performance reasons, swap X and Y if I got X_norm_squared but not
+    # Y_norm_squared
+    if X_norm_squared is not None and Y_norm_squared is None:
+        swap = True
+        X, Y = Y, X
+        X_norm_squared, Y_norm_squared = None, X_norm_squared.T
+    else:
+        swap = False
+
+    # No more than 10MB of additional memory will be used to cast X and Y to
+    # float64 and to get the float64 result.
+    maxmem = 10*1024*1024
+    itemsize = np.float64(0).itemsize
+    maxmem /= itemsize
+
+    # Compute the block size that won't use more than maxmem bytes of
+    # additional (temporary) memory.
+    # The amount of additional memory required is:
+    # - a float64 copy of a block of the rows of X if needed;
+    # - a float64 copy of a block of the rows of Y if needed;
+    # - the float64 block result;
+    # - a float64 copy of a block of X_norm_squared if needed;
+    # - a float64 copy of a block of Y_norm_squared if needed.
+    # This is a quadratic equation that we solve to compute the block size that
+    # would use maxmem bytes.
+    XYmem = 0
+    if X.dtype != np.float64:
+        XYmem += X.shape[1]
+    if Y.dtype != np.float64:
+        XYmem += Y.shape[1]  # Note that Y.shape[1] == X.shape[1]
+    if X_norm_squared is None or X_norm_squared.dtype != np.float64:
+        XYmem += 1
+    if Y_norm_squared is None or Y_norm_squared.dtype != np.float64:
+        XYmem += 1
+
+    delta = XYmem ** 2 + 4 * maxmem
+    bs = int((-XYmem + np.sqrt(delta)) // 2)
+    bs = max(1, bs)
+
+    distances = np.empty((X.shape[0], Y.shape[0]), dtype=outdtype)
+
+    for i in range(0, X.shape[0], bs):
+        ipbs = min(i + bs, X.shape[0])
+        Xc = _cast_if_needed(X[i:ipbs, :], np.float64)
+
+        if X_norm_squared is not None:
+            Xnc = _cast_if_needed(X_norm_squared[i:ipbs, :], np.float64)
+        else:
+            Xnc = row_norms(Xc, squared=True)[:, np.newaxis]
+
+        for j in range(i, Y.shape[0], bs):
+            jpbs = min(j + bs, Y.shape[0])
+            if X is Y and i == j:
+                Yc = Xc
+            else:
+                Yc = _cast_if_needed(Y[j:jpbs, :], np.float64)
+
+            if Y_norm_squared is not None:
+                Ync = _cast_if_needed(Y_norm_squared[:, j:jpbs], np.float64)
+            else:
+                Ync = None
+
+            d = _euclidean_distances_float64(Xc, Yc, Y_norm_squared=Ync,
+                                             squared=True, X_norm_squared=Xnc)
+            distances[i:ipbs, j:jpbs] = d
+
+            if X is Y and j > i:
+                distances[j:jpbs, i:ipbs] = d.T
+
+    if swap:
+        distances = distances.T
+
+    return distances if squared else np.sqrt(distances, out=distances)
+
+
 def euclidean_distances(X, Y=None, Y_norm_squared=None, squared=False,
                         X_norm_squared=None):
     """
@@ -223,39 +339,42 @@ def euclidean_distances(X, Y=None, Y_norm_squared=None, squared=False,
     """
     X, Y = check_pairwise_arrays(X, Y)
 
-    if X_norm_squared is not None:
-        XX = check_array(X_norm_squared)
-        if XX.shape == (1, X.shape[0]):
-            XX = XX.T
-        elif XX.shape != (X.shape[0], 1):
-            raise ValueError(
-                "Incompatible dimensions for X and X_norm_squared")
-    else:
-        XX = row_norms(X, squared=True)[:, np.newaxis]
+    if X_norm_squared is not None and X_norm_squared.dtype != X.dtype:
+        warnings.warn("X_norm_squared has a different dtype than X")
 
-    if X is Y:  # shortcut in the common case euclidean_distances(X, X)
-        YY = XX.T
-    elif Y_norm_squared is not None:
-        YY = np.atleast_2d(Y_norm_squared)
+    if Y_norm_squared is not None and Y_norm_squared.dtype != Y.dtype:
+        warnings.warn("Y_norm_squared has a different dtype than Y")
 
-        if YY.shape != (1, Y.shape[0]):
+    if X_norm_squared is not None:
+        X_norm_squared = np.atleast_2d(X_norm_squared)
+        X_norm_squared = check_array(X_norm_squared)
+        if X_norm_squared.shape == (1, X.shape[0]):
+            X_norm_squared = X_norm_squared.T
+        elif X_norm_squared.shape != (X.shape[0], 1):
             raise ValueError(
-                "Incompatible dimensions for Y and Y_norm_squared")
-    else:
-        YY = row_norms(Y, squared=True)[np.newaxis, :]
+                    "Incompatible dimensions between X and X_norm_squared")
 
-    distances = safe_sparse_dot(X, Y.T, dense_output=True)
-    distances *= -2
-    distances += XX
-    distances += YY
-    np.maximum(distances, 0, out=distances)
+    if Y_norm_squared is not None:
+        Y_norm_squared = np.atleast_2d(Y_norm_squared)
+        Y_norm_squared = check_array(Y_norm_squared)
+        if Y_norm_squared.shape != (1, Y.shape[0]):
+            raise ValueError(
+                    "Incompatible dimensions between Y and Y_norm_squared")
 
     if X is Y:
-        # Ensure that distances between vectors and themselves are set to 0.0.
-        # This may not be the case due to floating point rounding errors.
-        distances.flat[::distances.shape[0] + 1] = 0.0
+        if X_norm_squared is None and Y_norm_squared is not None:
+            X_norm_squared = Y_norm_squared.T
+        if X_norm_squared is not None and Y_norm_squared is None:
+            Y_norm_squared = X_norm_squared.T
 
-    return distances if squared else np.sqrt(distances, out=distances)
+    outdtype = (X[0, 0] + Y[0, 0]).dtype
+
+    if X.dtype == np.float64 and Y.dtype == np.float64:
+        return _euclidean_distances_float64(X, Y, Y_norm_squared, squared,
+                                            X_norm_squared)
+    else:
+        return _euclidean_distances_cast(X, Y, outdtype, Y_norm_squared,
+                                         squared, X_norm_squared)
 
 
 def _argmin_min_reduce(dist, start):

diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py
@@ -556,6 +556,38 @@ def test_euclidean_distances():
                                   Y_norm_squared=np.zeros_like(Y_norm_sq))
     assert_greater(np.max(np.abs(wrong_D - D1)), .01)
 
+    # Check euclidean_distances when using float32 for one or both arguments
+    X32 = X.astype(np.float32)
+    Y32 = Y.astype(np.float32)
+    D64 = euclidean_distances(X, Y)
+    D64_32 = euclidean_distances(X, Y32)
+    D32_64 = euclidean_distances(X32, Y)
+    D32_32 = euclidean_distances(X32, Y32)
+    assert_equal(D64_32.dtype, np.float64)
+    assert_equal(D32_64.dtype, np.float64)
+    assert_equal(D32_32.dtype, np.float32)
+    assert_array_almost_equal(D64_32, D64)
+    assert_array_almost_equal(D32_64, D64)
+    assert_array_almost_equal(D32_32, D64)
+
+    # Check that {X,Y}_norm_squared are used with float32 arguments
+    X32_norm_sq = 0.5 * (X32 ** 2).sum(axis=1).reshape(1, -1)
+    Y32_norm_sq = 0.5 * (Y32 ** 2).sum(axis=1).reshape(1, -1)
+    DYN = euclidean_distances(X32, Y32, Y_norm_squared=Y32_norm_sq)
+    DXN = euclidean_distances(X32, Y32, X_norm_squared=X32_norm_sq)
+    DXYN = euclidean_distances(X32, Y32, X_norm_squared=X32_norm_sq,
+                               Y_norm_squared=Y32_norm_sq)
+    assert_greater(np.max(np.abs(DYN - D64)), .01)
+    assert_greater(np.max(np.abs(DXN - D64)), .01)
+    assert_greater(np.max(np.abs(DXYN - D64)), .01)
+
+    # Check that the accuracy with float32 is not too bad
+    X = np.array([[0.9215765222065525, 0.9682577158572608],
+                  [0.9221151778782808, 0.9681831844652774]])
+    D64 = euclidean_distances(X)
+    D32 = euclidean_distances(X.astype(np.float32))
+    assert_array_almost_equal(D32, D64)
+
 
 def test_cosine_distances():
     # Check the pairwise Cosine distances computation

diff --git a/sklearn/utils/fixes.py b/sklearn/utils/fixes.py
@@ -295,3 +295,15 @@ def _object_dtype_isnan(X):
 else:
     def _object_dtype_isnan(X):
         return np.frompyfunc(lambda x: x != x, 1, 1)(X).astype(bool)
+
+
+if sp_version < (1, 0):
+    # Before scipy 1.0, sp.sparse.csr_matrix.astype didn't have a 'copy'
+    # argument.
+    def _cast_if_needed(arr, dtype):
+        if arr.dtype == dtype:
+            return arr
+        return arr.astype(dtype)
+else:
+    def _cast_if_needed(arr, dtype):
+        return arr.astype(dtype, copy=False)