New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
FIX euclidean_distances float32 numerical instabilities #13554
Changes from 103 commits
d38d8a0
01767d6
83cada9
ecb3d2c
6b140db
558b988
de9d217
395be7a
9c79570
4599379
7425821
a2504af
fee1258
fe74973
7f5f257
46fb590
d8a2341
e0a0dc5
4b6e707
dd634db
0e916a5
78cf2c6
1a96e1e
46c4560
dbbd855
77b456f
6dace0c
37ef0c3
1d03e13
1d5dca9
bf1af7d
b4fb670
2de3969
800ecae
5eff972
5ebcef0
532f110
37ad253
984719c
d0b7441
14a3f16
1fd5b71
7558d50
b4d0527
adceb7d
127cc41
37b099b
bdea46f
d91682a
d7815e3
8a966f4
96ff3b0
c21be57
c3e72f2
78cb1b3
0e4f561
383b132
3801caf
5dc1c46
b928396
b957edd
c0fb225
95d39ca
f11f647
306d202
78c9cb5
66fa659
f229708
6b29d38
876908e
04fcbce
cc2c186
02e864e
740c0fb
247f0e7
62b5a85
3b54222
63cd7d4
e8be8aa
78acc98
2a2caff
5a1f05a
e5a8e0b
0ca7c96
7005ab4
8e3cc5e
18dc3a9
32851c1
f818d86
32ea647
9cf6606
8d4bffc
5135252
e15e391
9a5b3ac
77ac3df
184cd2c
fbcfc52
e5e2848
32c9f99
0f346f5
fce0713
117529e
9c205c1
985037a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -193,17 +193,24 @@ def euclidean_distances(X, Y=None, Y_norm_squared=None, squared=False, | |
Y_norm_squared : array-like, shape (n_samples_2, ), optional | ||
Pre-computed dot-products of vectors in Y (e.g., | ||
``(Y**2).sum(axis=1)``) | ||
May be ignored in some cases, see the note below. | ||
|
||
squared : boolean, optional | ||
Return squared Euclidean distances. | ||
|
||
X_norm_squared : array-like, shape = [n_samples_1], optional | ||
Pre-computed dot-products of vectors in X (e.g., | ||
``(X**2).sum(axis=1)``) | ||
May be ignored in some cases, see the note below. | ||
|
||
Notes | ||
----- | ||
To achieve better accuracy, `X_norm_squared` and `Y_norm_squared` may be | ||
unused if they are passed as ``float32``. | ||
|
||
Returns | ||
------- | ||
distances : {array, sparse matrix}, shape (n_samples_1, n_samples_2) | ||
distances : array, shape (n_samples_1, n_samples_2) | ||
|
||
Examples | ||
-------- | ||
|
@@ -224,41 +231,125 @@ def euclidean_distances(X, Y=None, Y_norm_squared=None, squared=False, | |
""" | ||
X, Y = check_pairwise_arrays(X, Y) | ||
|
||
# If norms are passed as float32, they are unused. If arrays are passed as | ||
# float32, norms needs to be recomputed on upcast chunks. | ||
# TODO: use a float64 accumulator in row_norms to avoid the latter. | ||
if X_norm_squared is not None: | ||
XX = check_array(X_norm_squared) | ||
if XX.shape == (1, X.shape[0]): | ||
XX = XX.T | ||
elif XX.shape != (X.shape[0], 1): | ||
raise ValueError( | ||
"Incompatible dimensions for X and X_norm_squared") | ||
if XX.dtype == np.float32: | ||
XX = None | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. should we raise a warning that these cannot be used?? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why not but I'm afraid it could generate a lot of warnings. For instance when pairwise_distances is called in a loop. One thing we should certainly do is adding an option to |
||
elif X.dtype == np.float32: | ||
XX = None | ||
else: | ||
XX = row_norms(X, squared=True)[:, np.newaxis] | ||
|
||
if X is Y: # shortcut in the common case euclidean_distances(X, X) | ||
if X is Y and XX is not None: | ||
# shortcut in the common case euclidean_distances(X, X) | ||
YY = XX.T | ||
elif Y_norm_squared is not None: | ||
YY = np.atleast_2d(Y_norm_squared) | ||
|
||
if YY.shape != (1, Y.shape[0]): | ||
raise ValueError( | ||
"Incompatible dimensions for Y and Y_norm_squared") | ||
if YY.dtype == np.float32: | ||
YY = None | ||
elif Y.dtype == np.float32: | ||
YY = None | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. At first glance, it is not obvious that There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What do you mean ? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Setting Without reading There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think that we would need a small comment to clarify this part. |
||
else: | ||
YY = row_norms(Y, squared=True)[np.newaxis, :] | ||
|
||
distances = safe_sparse_dot(X, Y.T, dense_output=True) | ||
distances *= -2 | ||
distances += XX | ||
distances += YY | ||
if X.dtype == np.float32: | ||
# To minimize precision issues with float32, we compute the distance | ||
# matrix on chunks of X and Y upcast to float64 | ||
distances = _euclidean_distances_upcast(X, XX, Y, YY) | ||
else: | ||
# if dtype is already float64, no need to chunk and upcast | ||
distances = - 2 * safe_sparse_dot(X, Y.T, dense_output=True) | ||
distances += XX | ||
distances += YY | ||
np.maximum(distances, 0, out=distances) | ||
|
||
# Ensure that distances between vectors and themselves are set to 0.0. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why is this out of the if? It's only applicable if the (Curiously we currently appear to duplicate this logic in There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. you mean zeroing the diagonal ? It applies using both methods. It's necessary to also do it in pairwise_distances_chunked, because we lose the diagonal info when passing chunks to pairwise_distances. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, but I don't think we zero the diagonal in pairwise_distances at all for other metrics There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I was looking at euclidean_distances, not pairwise_distances... That's strange indeed. Even for metric=euclidean it's not enforced when n_jobs > 1. Let me put that also on my todo list :) |
||
# This may not be the case due to floating point rounding errors. | ||
if X is Y: | ||
# Ensure that distances between vectors and themselves are set to 0.0. | ||
# This may not be the case due to floating point rounding errors. | ||
distances.flat[::distances.shape[0] + 1] = 0.0 | ||
np.fill_diagonal(distances, 0) | ||
|
||
return distances if squared else np.sqrt(distances, out=distances) | ||
|
||
|
||
def _euclidean_distances_upcast(X, XX=None, Y=None, YY=None): | ||
"""Euclidean distances between X and Y | ||
|
||
Assumes X and Y have float32 dtype. | ||
Assumes XX and YY have float64 dtype or are None. | ||
|
||
X and Y are upcast to float64 by chunks, which size is chosen to limit | ||
memory increase by approximately 10% (at least 10Mib). | ||
""" | ||
n_samples_X = X.shape[0] | ||
n_samples_Y = Y.shape[0] | ||
n_features = X.shape[1] | ||
|
||
distances = np.empty((n_samples_X, n_samples_Y), dtype=np.float32) | ||
|
||
x_density = X.nnz / np.prod(X.shape) if issparse(X) else 1 | ||
y_density = Y.nnz / np.prod(Y.shape) if issparse(Y) else 1 | ||
|
||
# Allow 10% more memory than X, Y and the distance matrix take (at least | ||
# 10Mib) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can we use |
||
maxmem = max( | ||
((x_density * n_samples_X + y_density * n_samples_Y) * n_features | ||
+ (x_density * n_samples_X * y_density * n_samples_Y)) / 10, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This makes the assumption that Maybe we could add Also not that it matters too much, but
Actually, maybe:

def get_array_nbytes(X):
if issparse(X):
if hasattr(X, 'indices'):
# CSR or CSC
return X.data.nbytes + X.indices.nbytes + X.indptr.nbytes
else:
# some other sparse format, assume 8 bytes to index
# one non zero element (e.g. COO)
return X.data.nbytes + X.nnz*8
else:
return X.nbytes
maxmem = (get_array_nbytes(X) + get_array_nbytes(Y) + get_array_nbytes(distances)) / 10 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
This function is only used for float32 X and Y. I've explained it in its docstring.
When you change the type of the csr matrix, only a copy of There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Are you sure? >>> import scipy.sparse
>>> import numpy as nop
>>> import numpy as np
>>> X = scipy.sparse.csr_matrix(np.ones((10, 10)))
>>> X.dtype
dtype('float64')
>>> Y = X.astype(np.float32)
>>> np.shares_memory(Y.data, Y.data)
True
>>> np.shares_memory(Y.data, X.data)
False
>>> np.shares_memory(Y.indices, X.indices)
False That's not critical, my point here is that it might be beter (not necessarily in this PR) to use |
||
10 * 2**17) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why this choice? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. A fixed 10Mib maxmem can lead to very small chunks when the number of features is very large (extreme case I agree). In that case the drop of performance can be quite severe. Another possibility could be to have a fixed maxmem with a min chunk_size. I think it's kind of equivalent, but in this case you can have a memory increase bigger than expected whereas in the previous case the memory increase is bounded. That's why I went for the first one. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Large chunks would also decrease cache efficiency. Although I'm not a big fan of having an unbounded amount of additional memory, 10% of the memory already used is probably fair. When I look at my old benchmarks, I think that there might be something going on under the hood. Maybe the optimal block size is actually constant. Moreover, if the number of features is large, it can also be chunked. That might deserve to be explored as well. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If the input data is memmapped, calculating this as a function of n_features may not be fair. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think my comment about memmapping was incorrect in any case. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. 
Learn more. This logic seems to be more complicated than necessary, but okay. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I am fine to merge with this heuristic. In the future, I am wondering if we could expose a |
||
|
||
# The increase amount of memory in 8-byte blocks is: | ||
# - x_density * batch_size * n_features (copy of chunk of X) | ||
# - y_density * batch_size * n_features (copy of chunk of Y) | ||
# - batch_size * batch_size (chunk of distance matrix) | ||
# Hence x² + (xd+yd)kx = M, where x=batch_size, k=n_features, M=maxmem | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. When a grade school student ask, "Why are we learning the quadratic equation?", we have an answer. |
||
# xd=x_density and yd=y_density | ||
tmp = (x_density + y_density) * n_features | ||
batch_size = (-tmp + np.sqrt(tmp**2 + 4 * maxmem)) / 2 | ||
batch_size = max(int(batch_size), 1) | ||
rth marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
x_batches = gen_batches(X.shape[0], batch_size) | ||
y_batches = gen_batches(Y.shape[0], batch_size) | ||
|
||
for i, x_slice in enumerate(x_batches): | ||
X_chunk = X[x_slice].astype(np.float64) | ||
if XX is None: | ||
XX_chunk = row_norms(X_chunk, squared=True)[:, np.newaxis] | ||
else: | ||
XX_chunk = XX[x_slice] | ||
|
||
for j, y_slice in enumerate(y_batches): | ||
if X is Y and j < i: | ||
# when X is Y the distance matrix is symmetric so we only need | ||
# to compute half of it. | ||
d = distances[y_slice, x_slice].T | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It must be correct since the corresponding tests pass, but after thinking about it for 20s I'm still not confident that when we compute How about just adding a if X is Y:
# the result is symmetric, copy distances from the lower triangular matrix
for i, x_slice in enumerate(x_batches):
for j, y_slice in enumerate(y_batches):
if j < i:
distances[x_slice, y_slice] = distances[x_slice, y_slice].T it shouldn't matter for run time too much time I think, but it might make things a bit more explicit? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
It is since we compute the chunks with |
||
|
||
else: | ||
Y_chunk = Y[y_slice].astype(np.float64) | ||
if YY is None: | ||
YY_chunk = row_norms(Y_chunk, squared=True)[np.newaxis, :] | ||
else: | ||
YY_chunk = YY[:, y_slice] | ||
|
||
d = -2 * safe_sparse_dot(X_chunk, Y_chunk.T, dense_output=True) | ||
d += XX_chunk | ||
d += YY_chunk | ||
|
||
distances[x_slice, y_slice] = d.astype(np.float32, copy=False) | ||
|
||
return distances | ||
|
||
|
||
def _argmin_min_reduce(dist, start): | ||
indices = dist.argmin(axis=1) | ||
values = dist[np.arange(dist.shape[0]), indices] | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -584,41 +584,115 @@ def test_pairwise_distances_chunked(): | |
assert_raises(StopIteration, next, gen) | ||
|
||
|
||
def test_euclidean_distances(): | ||
# Check the pairwise Euclidean distances computation | ||
X = [[0]] | ||
Y = [[1], [2]] | ||
@pytest.mark.parametrize("x_array_constr", [np.array, csr_matrix], | ||
ids=["dense", "sparse"]) | ||
@pytest.mark.parametrize("y_array_constr", [np.array, csr_matrix], | ||
ids=["dense", "sparse"]) | ||
def test_euclidean_distances_known_result(x_array_constr, y_array_constr): | ||
# Check the pairwise Euclidean distances computation on known result | ||
X = x_array_constr([[0]]) | ||
Y = y_array_constr([[1], [2]]) | ||
D = euclidean_distances(X, Y) | ||
assert_array_almost_equal(D, [[1., 2.]]) | ||
assert_allclose(D, [[1., 2.]]) | ||
|
||
X = csr_matrix(X) | ||
Y = csr_matrix(Y) | ||
D = euclidean_distances(X, Y) | ||
assert_array_almost_equal(D, [[1., 2.]]) | ||
|
||
@pytest.mark.parametrize("dtype", [np.float32, np.float64]) | ||
@pytest.mark.parametrize("y_array_constr", [np.array, csr_matrix], | ||
ids=["dense", "sparse"]) | ||
def test_euclidean_distances_with_norms(dtype, y_array_constr): | ||
# check that we still get the right answers with {X,Y}_norm_squared | ||
# and that we get a wrong answer with wrong {X,Y}_norm_squared | ||
rng = np.random.RandomState(0) | ||
X = rng.random_sample((10, 4)) | ||
Y = rng.random_sample((20, 4)) | ||
X_norm_sq = (X ** 2).sum(axis=1).reshape(1, -1) | ||
Y_norm_sq = (Y ** 2).sum(axis=1).reshape(1, -1) | ||
X = rng.random_sample((10, 10)).astype(dtype, copy=False) | ||
Y = rng.random_sample((20, 10)).astype(dtype, copy=False) | ||
|
||
# norms will only be used if their dtype is float64 | ||
X_norm_sq = (X.astype(np.float64) ** 2).sum(axis=1).reshape(1, -1) | ||
Y_norm_sq = (Y.astype(np.float64) ** 2).sum(axis=1).reshape(1, -1) | ||
|
||
Y = y_array_constr(Y) | ||
|
||
# check that we still get the right answers with {X,Y}_norm_squared | ||
D1 = euclidean_distances(X, Y) | ||
D2 = euclidean_distances(X, Y, X_norm_squared=X_norm_sq) | ||
D3 = euclidean_distances(X, Y, Y_norm_squared=Y_norm_sq) | ||
D4 = euclidean_distances(X, Y, X_norm_squared=X_norm_sq, | ||
Y_norm_squared=Y_norm_sq) | ||
assert_array_almost_equal(D2, D1) | ||
assert_array_almost_equal(D3, D1) | ||
assert_array_almost_equal(D4, D1) | ||
assert_allclose(D2, D1) | ||
assert_allclose(D3, D1) | ||
assert_allclose(D4, D1) | ||
|
||
# check we get the wrong answer with wrong {X,Y}_norm_squared | ||
X_norm_sq *= 0.5 | ||
Y_norm_sq *= 0.5 | ||
wrong_D = euclidean_distances(X, Y, | ||
X_norm_squared=np.zeros_like(X_norm_sq), | ||
Y_norm_squared=np.zeros_like(Y_norm_sq)) | ||
assert_greater(np.max(np.abs(wrong_D - D1)), .01) | ||
with pytest.raises(AssertionError): | ||
assert_allclose(wrong_D, D1) | ||
|
||
|
||
@pytest.mark.parametrize("dtype", [np.float32, np.float64]) | ||
@pytest.mark.parametrize("x_array_constr", [np.array, csr_matrix], | ||
ids=["dense", "sparse"]) | ||
@pytest.mark.parametrize("y_array_constr", [np.array, csr_matrix], | ||
ids=["dense", "sparse"]) | ||
def test_euclidean_distances(dtype, x_array_constr, y_array_constr): | ||
# check that euclidean distances gives same result as scipy cdist | ||
# when X and Y != X are provided | ||
rng = np.random.RandomState(0) | ||
X = rng.random_sample((100, 10)).astype(dtype, copy=False) | ||
X[X < 0.8] = 0 | ||
Y = rng.random_sample((10, 10)).astype(dtype, copy=False) | ||
Y[Y < 0.8] = 0 | ||
|
||
expected = cdist(X, Y) | ||
|
||
X = x_array_constr(X) | ||
Y = y_array_constr(Y) | ||
distances = euclidean_distances(X, Y) | ||
|
||
# the default rtol=1e-7 is too close to the float32 precision | ||
# and fails due to rounding errors. | ||
assert_allclose(distances, expected, rtol=1e-6) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. a small comments regarding |
||
assert distances.dtype == dtype | ||
|
||
|
||
@pytest.mark.parametrize("dtype", [np.float32, np.float64]) | ||
@pytest.mark.parametrize("x_array_constr", [np.array, csr_matrix], | ||
ids=["dense", "sparse"]) | ||
def test_euclidean_distances_sym(dtype, x_array_constr): | ||
# check that euclidean distances gives same result as scipy pdist | ||
# when only X is provided | ||
rng = np.random.RandomState(0) | ||
X = rng.random_sample((100, 10)).astype(dtype, copy=False) | ||
X[X < 0.8] = 0 | ||
|
||
expected = squareform(pdist(X)) | ||
|
||
X = x_array_constr(X) | ||
distances = euclidean_distances(X) | ||
|
||
# the default rtol=1e-7 is too close to the float32 precision | ||
# and fails due to rounding errors. | ||
assert_allclose(distances, expected, rtol=1e-6) | ||
assert distances.dtype == dtype | ||
|
||
|
||
@pytest.mark.parametrize( | ||
"dtype, eps, rtol", | ||
[(np.float32, 1e-4, 1e-5), | ||
pytest.param( | ||
np.float64, 1e-8, 0.99, | ||
marks=pytest.mark.xfail(reason='failing due to lack of precision'))]) | ||
@pytest.mark.parametrize("dim", [1, 1000000]) | ||
def test_euclidean_distances_extreme_values(dtype, eps, rtol, dim): | ||
# check that euclidean distances is correct with float32 input thanks to | ||
# upcasting. On float64 there are still precision issues. | ||
X = np.array([[1.] * dim], dtype=dtype) | ||
Y = np.array([[1. + eps] * dim], dtype=dtype) | ||
|
||
distances = euclidean_distances(X, Y) | ||
expected = cdist(X, Y) | ||
|
||
assert_allclose(distances, expected, rtol=1e-5) | ||
|
||
|
||
def test_cosine_distances(): | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I put the comment here but I think that it should be written above.
I would either add a note (using sphinx syntax) or within the parameter description that
X_norm_squared
and `Y_norm_squared`
will be ignored if they are passed as `float32`
since they will lead to inaccurate distances.