Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: Decrease wall time of ma.cov and ma.corrcoef #26285

Open
wants to merge 13 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
44 changes: 44 additions & 0 deletions benchmarks/benchmarks/bench_ma.py
Expand Up @@ -265,3 +265,47 @@ def time_where(self, mtype, msize):
fun(self.nmxs > 2, self.nmxs, self.nmys)
elif msize == 'big':
fun(self.nmxl > 2, self.nmxl, self.nmyl)


class Cov(Benchmark):
param_names = ["size"]
params = [["small", "large"]]

def setup(self, size):
# Set the proportion of masked values.
prop_mask = 0.2
# Set up a "small" array with 10 vars and 10 obs.
rng = np.random.default_rng()
data = rng.random((10, 10), dtype=np.float32)
self.small = np.ma.array(data, mask=(data <= prop_mask))
# Set up a "large" array with 100 vars and 100 obs.
data = rng.random((100, 100), dtype=np.float32)
self.large = np.ma.array(data, mask=(data <= prop_mask))

def time_cov(self, size):
if size == "small":
np.ma.cov(self.small)
if size == "large":
np.ma.cov(self.large)


class Corrcoef(Benchmark):
param_names = ["size"]
params = [["small", "large"]]

def setup(self, size):
# Set the proportion of masked values.
prop_mask = 0.2
# Set up a "small" array with 10 vars and 10 obs.
rng = np.random.default_rng()
data = rng.random((10, 10), dtype=np.float32)
self.small = np.ma.array(data, mask=(data <= prop_mask))
# Set up a "large" array with 100 vars and 100 obs.
data = rng.random((100, 100), dtype=np.float32)
self.large = np.ma.array(data, mask=(data <= prop_mask))

def time_corrcoef(self, size):
if size == "small":
np.ma.corrcoef(self.small)
if size == "large":
np.ma.corrcoef(self.large)
13 changes: 13 additions & 0 deletions doc/release/upcoming_changes/26285.change.rst
@@ -0,0 +1,13 @@
``ma.corrcoef`` may return a slightly different result
------------------------------------------------------
A pairwise observation approach is currently used in `ma.corrcoef` to
calculate the standard deviations for each pair of variables. This has been
changed as it is being used to normalise the covariance, estimated using
`ma.cov`, which does not consider the observations for each variable in a
pairwise manner, rendering it unnecessary. The normalisation has been
replaced by the more appropriate standard deviation for each variable,
which significantly reduces the wall time, but will return slightly different
estimates of the correlation coefficients in cases where the observations
between a pair of variables are not aligned. However, it will return the same
estimates in all other cases, including returning the same correlation matrix
as `corrcoef` when using a masked array with no masked values.
5 changes: 5 additions & 0 deletions doc/release/upcoming_changes/26285.performance.rst
@@ -0,0 +1,5 @@
``ma.cov`` and ``ma.corrcoef`` are now significantly faster
-----------------------------------------------------------
The private function has been refactored along with `ma.cov` and
`ma.corrcoef`. They are now significantly faster, particularly on large,
masked arrays.
72 changes: 35 additions & 37 deletions numpy/ma/extras.py
Expand Up @@ -1569,7 +1569,14 @@ def _covhelper(x, y=None, rowvar=True, allow_masked=True):
tup = (None, slice(None))
#
if y is None:
xnotmask = np.logical_not(xmask).astype(int)
# Check if we can guarantee that the integers in the (N - ddof)
# normalisation can be accurately represented with single-precision
# before computing the dot product.
if x.shape[0] > 2 ** 24 or x.shape[1] > 2 ** 24:
xnm_dtype = np.float64
else:
xnm_dtype = np.float32
xnotmask = np.logical_not(xmask).astype(xnm_dtype)
else:
y = array(y, copy=False, ndmin=2, dtype=float)
ymask = ma.getmaskarray(y)
Expand All @@ -1584,7 +1591,16 @@ def _covhelper(x, y=None, rowvar=True, allow_masked=True):
x._sharedmask = False
y._sharedmask = False
x = ma.concatenate((x, y), axis)
xnotmask = np.logical_not(np.concatenate((xmask, ymask), axis)).astype(int)
# Check if we can guarantee that the integers in the (N - ddof)
# normalisation can be accurately represented with single-precision
# before computing the dot product.
if x.shape[0] > 2 ** 24 or x.shape[1] > 2 ** 24:
xnm_dtype = np.float64
else:
xnm_dtype = np.float32
xnotmask = np.logical_not(np.concatenate((xmask, ymask), axis)).astype(
xnm_dtype
)
x -= x.mean(axis=rowvar)[tup]
return (x, xnotmask, rowvar)

Expand Down Expand Up @@ -1671,11 +1687,17 @@ def cov(x, y=None, rowvar=True, bias=False, allow_masked=True, ddof=None):

(x, xnotmask, rowvar) = _covhelper(x, y, rowvar, allow_masked)
if not rowvar:
fact = np.dot(xnotmask.T, xnotmask) * 1. - ddof
result = (dot(x.T, x.conj(), strict=False) / fact).squeeze()
fact = np.dot(xnotmask.T, xnotmask) - ddof
mask = np.less_equal(fact, 0, dtype=bool)
with np.errstate(divide="ignore", invalid="ignore"):
data = np.dot(filled(x.T, 0), filled(x.conj(), 0)) / fact
result = ma.array(data, mask=mask).squeeze()
else:
fact = np.dot(xnotmask, xnotmask.T) * 1. - ddof
result = (dot(x, x.T.conj(), strict=False) / fact).squeeze()
fact = np.dot(xnotmask, xnotmask.T) - ddof
mask = np.less_equal(fact, 0, dtype=bool)
with np.errstate(divide="ignore", invalid="ignore"):
data = np.dot(filled(x, 0), filled(x.T.conj(), 0)) / fact
result = ma.array(data, mask=mask).squeeze()
return result


Expand Down Expand Up @@ -1744,39 +1766,15 @@ def corrcoef(x, y=None, rowvar=True, bias=np._NoValue, allow_masked=True,
if bias is not np._NoValue or ddof is not np._NoValue:
# 2015-03-15, 1.10
warnings.warn(msg, DeprecationWarning, stacklevel=2)
# Get the data
(x, xnotmask, rowvar) = _covhelper(x, y, rowvar, allow_masked)
# Compute the covariance matrix
if not rowvar:
fact = np.dot(xnotmask.T, xnotmask) * 1.
c = (dot(x.T, x.conj(), strict=False) / fact).squeeze()
else:
fact = np.dot(xnotmask, xnotmask.T) * 1.
c = (dot(x, x.T.conj(), strict=False) / fact).squeeze()
# Check whether we have a scalar
# Estimate the covariance matrix.
corr = cov(x, y, rowvar, allow_masked=allow_masked)
# The non-masked version returns a masked value for a scalar.
try:
diag = ma.diagonal(c)
std = ma.sqrt(ma.diagonal(corr))
except ValueError:
return 1
#
if xnotmask.all():
_denom = ma.sqrt(ma.multiply.outer(diag, diag))
else:
_denom = diagflat(diag)
_denom._sharedmask = False # We know return is always a copy
n = x.shape[1 - rowvar]
if rowvar:
for i in range(n - 1):
for j in range(i + 1, n):
_x = mask_cols(vstack((x[i], x[j]))).var(axis=1)
_denom[i, j] = _denom[j, i] = ma.sqrt(ma.multiply.reduce(_x))
else:
for i in range(n - 1):
for j in range(i + 1, n):
_x = mask_cols(
vstack((x[:, i], x[:, j]))).var(axis=1)
_denom[i, j] = _denom[j, i] = ma.sqrt(ma.multiply.reduce(_x))
return c / _denom
return ma.MaskedConstant()
corr /= ma.multiply.outer(std, std)
return corr

#####--------------------------------------------------------------------------
#---- --- Concatenation helpers ---
Expand Down
22 changes: 21 additions & 1 deletion numpy/ma/tests/test_extras.py
Expand Up @@ -29,7 +29,7 @@
ediff1d, apply_over_axes, apply_along_axis, compress_nd, compress_rowcols,
mask_rowcols, clump_masked, clump_unmasked, flatnotmasked_contiguous,
notmasked_contiguous, notmasked_edges, masked_all, masked_all_like, isin,
diagflat, ndenumerate, stack, vstack
diagflat, ndenumerate, stack, vstack, _covhelper
)


Expand Down Expand Up @@ -1287,6 +1287,26 @@ class TestCov:
def setup_method(self):
self.data = array(np.random.rand(12))

def test_covhelper(self):
x = self.data
# Test not mask output type is a float.
assert_(_covhelper(x, rowvar=True)[1].dtype, np.float32)
assert_(_covhelper(x, y=x, rowvar=False)[1].dtype, np.float32)
# Test not mask output is equal after casting to float.
mask = x > 0.5
assert_array_equal(
_covhelper(
np.ma.masked_array(x, mask), rowvar=True
)[1].astype(bool),
~mask.reshape(1, -1),
)
assert_array_equal(
_covhelper(
np.ma.masked_array(x, mask), y=x, rowvar=False
)[1].astype(bool),
np.vstack((~mask, ~mask)),
)

def test_1d_without_missing(self):
# Test cov on 1D variable w/o missing values
x = self.data
Expand Down