Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] Stable and fast float32 implementation of euclidean_distances #11271

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
171 changes: 145 additions & 26 deletions sklearn/metrics/pairwise.py
Expand Up @@ -22,6 +22,7 @@
from ..utils import check_array
from ..utils import gen_even_slices
from ..utils import gen_batches, get_chunk_n_rows
from ..utils.fixes import _cast_if_needed
from ..utils.extmath import row_norms, safe_sparse_dot
from ..preprocessing import normalize
from ..externals.joblib import Parallel
Expand Down Expand Up @@ -161,6 +162,121 @@ def check_paired_arrays(X, Y):


# Pairwise distances
def _euclidean_distances_float64(X, Y, Y_norm_squared=None, squared=False,
X_norm_squared=None):
"""
Compute the euclidean distances between X and Y when both are arrays of
float64.
"""
if X_norm_squared is not None:
XX = X_norm_squared
else:
XX = row_norms(X, squared=True)[:, np.newaxis]

if X is Y: # shortcut in the common case euclidean_distances(X, X)
YY = XX.T
elif Y_norm_squared is not None:
YY = Y_norm_squared
else:
YY = row_norms(Y, squared=True)[np.newaxis, :]

distances = safe_sparse_dot(X, Y.T, dense_output=True)
distances *= -2
distances += XX
distances += YY
np.maximum(distances, 0, out=distances)

if X is Y:
# Ensure that distances between vectors and themselves are set to 0.0.
# This may not be the case due to floating point rounding errors.
distances.flat[::distances.shape[0] + 1] = 0.0

return distances if squared else np.sqrt(distances, out=distances)


def _euclidean_distances_cast(X, Y, outdtype, Y_norm_squared=None,
squared=False, X_norm_squared=None):
"""
Compute the euclidean distance matrix between X and Y by casting to float64
for a greater numerical stability.

The computation is done by blocks to limit additional memory usage.
"""
# For performance reasons, swap X and Y if I got X_norm_squared but not
# Y_norm_squared
if X_norm_squared is not None and Y_norm_squared is None:
swap = True
X, Y = Y, X
X_norm_squared, Y_norm_squared = None, X_norm_squared.T
else:
swap = False

# No more than 10MB of additional memory will be used to cast X and Y to
# float64 and to get the float64 result.
maxmem = 10*1024*1024
itemsize = np.float64(0).itemsize
maxmem /= itemsize

# Compute the block size that won't use more than maxmem bytes of
# additional (temporary) memory.
# The amount of additional memory required is:
# - a float64 copy of a block of the rows of X if needed;
# - a float64 copy of a block of the rows of Y if needed;
# - the float64 block result;
# - a float64 copy of a block of X_norm_squared if needed;
# - a float64 copy of a block of Y_norm_squared if needed.
# This is a quadratic equation that we solve to compute the block size that
# would use maxmem bytes.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What happens if we use a bit more? The performance is decreased?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The performance decreases a bit, yes. But it's only noticeable for a maxmem several order of magnitude larger than required. If you look at the first set of plots, you'll see a trend for larger memory sizes.

XYmem = 0
if X.dtype != np.float64:
XYmem += X.shape[1]
if Y.dtype != np.float64:
XYmem += Y.shape[1] # Note that Y.shape[1] == X.shape[1]
if X_norm_squared is None or X_norm_squared.dtype != np.float64:
XYmem += 1
if Y_norm_squared is None or Y_norm_squared.dtype != np.float64:
XYmem += 1

delta = XYmem ** 2 + 4 * maxmem
bs = int((-XYmem + np.sqrt(delta)) // 2)
bs = max(1, bs)

distances = np.empty((X.shape[0], Y.shape[0]), dtype=outdtype)

for i in range(0, X.shape[0], bs):
ipbs = min(i + bs, X.shape[0])
Xc = _cast_if_needed(X[i:ipbs, :], np.float64)

if X_norm_squared is not None:
Xnc = _cast_if_needed(X_norm_squared[i:ipbs, :], np.float64)
else:
Xnc = row_norms(Xc, squared=True)[:, np.newaxis]

for j in range(i, Y.shape[0], bs):
jpbs = min(j + bs, Y.shape[0])
if X is Y and i == j:
Yc = Xc
else:
Yc = _cast_if_needed(Y[j:jpbs, :], np.float64)

if Y_norm_squared is not None:
Ync = _cast_if_needed(Y_norm_squared[:, j:jpbs], np.float64)
else:
Ync = None

d = _euclidean_distances_float64(Xc, Yc, Y_norm_squared=Ync,
squared=True, X_norm_squared=Xnc)
distances[i:ipbs, j:jpbs] = d

if X is Y and j > i:
distances[j:jpbs, i:ipbs] = d.T
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Line not covered by tests, probably need to add a test for this


if swap:
distances = distances.T

return distances if squared else np.sqrt(distances, out=distances)


def euclidean_distances(X, Y=None, Y_norm_squared=None, squared=False,
X_norm_squared=None):
"""
Expand Down Expand Up @@ -223,39 +339,42 @@ def euclidean_distances(X, Y=None, Y_norm_squared=None, squared=False,
"""
X, Y = check_pairwise_arrays(X, Y)

if X_norm_squared is not None:
XX = check_array(X_norm_squared)
if XX.shape == (1, X.shape[0]):
XX = XX.T
elif XX.shape != (X.shape[0], 1):
raise ValueError(
"Incompatible dimensions for X and X_norm_squared")
else:
XX = row_norms(X, squared=True)[:, np.newaxis]
if X_norm_squared is not None and X_norm_squared.dtype != X.dtype:
warnings.warn("X_norm_squared has a different dtype than X")

if X is Y: # shortcut in the common case euclidean_distances(X, X)
YY = XX.T
elif Y_norm_squared is not None:
YY = np.atleast_2d(Y_norm_squared)
if Y_norm_squared is not None and Y_norm_squared.dtype != Y.dtype:
warnings.warn("Y_norm_squared has a different dtype than Y")

if YY.shape != (1, Y.shape[0]):
if X_norm_squared is not None:
X_norm_squared = np.atleast_2d(X_norm_squared)
X_norm_squared = check_array(X_norm_squared)
if X_norm_squared.shape == (1, X.shape[0]):
X_norm_squared = X_norm_squared.T
elif X_norm_squared.shape != (X.shape[0], 1):
raise ValueError(
"Incompatible dimensions for Y and Y_norm_squared")
else:
YY = row_norms(Y, squared=True)[np.newaxis, :]
"Incompatible dimensions between X and X_norm_squared")

distances = safe_sparse_dot(X, Y.T, dense_output=True)
distances *= -2
distances += XX
distances += YY
np.maximum(distances, 0, out=distances)
if Y_norm_squared is not None:
Y_norm_squared = np.atleast_2d(Y_norm_squared)
Y_norm_squared = check_array(Y_norm_squared)
if Y_norm_squared.shape != (1, Y.shape[0]):
raise ValueError(
"Incompatible dimensions between Y and Y_norm_squared")

if X is Y:
# Ensure that distances between vectors and themselves are set to 0.0.
# This may not be the case due to floating point rounding errors.
distances.flat[::distances.shape[0] + 1] = 0.0
if X_norm_squared is None and Y_norm_squared is not None:
X_norm_squared = Y_norm_squared.T
if X_norm_squared is not None and Y_norm_squared is None:
Y_norm_squared = X_norm_squared.T
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also not covered by tests..


return distances if squared else np.sqrt(distances, out=distances)
outdtype = (X[0, 0] + Y[0, 0]).dtype

if X.dtype == np.float64 and Y.dtype == np.float64:
return _euclidean_distances_float64(X, Y, Y_norm_squared, squared,
X_norm_squared)
else:
return _euclidean_distances_cast(X, Y, outdtype, Y_norm_squared,
squared, X_norm_squared)


def _argmin_min_reduce(dist, start):
Expand Down
32 changes: 32 additions & 0 deletions sklearn/metrics/tests/test_pairwise.py
Expand Up @@ -556,6 +556,38 @@ def test_euclidean_distances():
Y_norm_squared=np.zeros_like(Y_norm_sq))
assert_greater(np.max(np.abs(wrong_D - D1)), .01)

# Check euclidean_distances when using float32 for one or both arguments
X32 = X.astype(np.float32)
Y32 = Y.astype(np.float32)
D64 = euclidean_distances(X, Y)
D64_32 = euclidean_distances(X, Y32)
D32_64 = euclidean_distances(X32, Y)
D32_32 = euclidean_distances(X32, Y32)
assert_equal(D64_32.dtype, np.float64)
assert_equal(D32_64.dtype, np.float64)
assert_equal(D32_32.dtype, np.float32)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

assert D32_32.dtype == np.float32 etc.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

With the adoption of pytest, we are phasing out use of test helpers assert_equal, assert_true, etc. Please use bare assert statements, e.g. assert x == y, assert not x, etc.

assert_array_almost_equal(D64_32, D64)
assert_array_almost_equal(D32_64, D64)
assert_array_almost_equal(D32_32, D64)

# Check that {X,Y}_norm_squared are used with float32 arguments
X32_norm_sq = 0.5 * (X32 ** 2).sum(axis=1).reshape(1, -1)
Y32_norm_sq = 0.5 * (Y32 ** 2).sum(axis=1).reshape(1, -1)
DYN = euclidean_distances(X32, Y32, Y_norm_squared=Y32_norm_sq)
DXN = euclidean_distances(X32, Y32, X_norm_squared=X32_norm_sq)
DXYN = euclidean_distances(X32, Y32, X_norm_squared=X32_norm_sq,
Y_norm_squared=Y32_norm_sq)
assert_greater(np.max(np.abs(DYN - D64)), .01)
assert_greater(np.max(np.abs(DXN - D64)), .01)
assert_greater(np.max(np.abs(DXYN - D64)), .01)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

assert np.max(np.abs(DXYN - D64)) > .01


# Check that the accuracy with float32 is not too bad
X = np.array([[0.9215765222065525, 0.9682577158572608],
[0.9221151778782808, 0.9681831844652774]])
D64 = euclidean_distances(X)
D32 = euclidean_distances(X.astype(np.float32))
assert_array_almost_equal(D32, D64)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

On master I get,

In [5]: D64 = euclidean_distances(X)

In [6]: D64
Out[6]: 
array([[0.        , 0.00054379],
       [0.00054379, 0.        ]])

In [7]: D32 = euclidean_distances(X.astype(np.float32))

In [8]: D32
Out[8]: 
array([[0.        , 0.00059802],
       [0.00059802, 0.        ]], dtype=float32)

and assert_array_almost_equal uses 6 decimals by default, so since this passes, it would increase the accuracy from 1 to 3 significant digits. Maybe use assert_allclose with rtol to make it more explicit?



def test_cosine_distances():
# Check the pairwise Cosine distances computation
Expand Down
12 changes: 12 additions & 0 deletions sklearn/utils/fixes.py
Expand Up @@ -295,3 +295,15 @@ def _object_dtype_isnan(X):
else:
def _object_dtype_isnan(X):
return np.frompyfunc(lambda x: x != x, 1, 1)(X).astype(bool)


if sp_version < (1, 0):
# Before scipy 1.0, sp.sparse.csr_matrix.astype didn't have a 'copy'
# argument.
def _cast_if_needed(arr, dtype):
if arr.dtype == dtype:
return arr
return arr.astype(dtype)
else:
def _cast_if_needed(arr, dtype):
return arr.astype(dtype, copy=False)