numpy · christopher-titchen · Mar 28, 2024 · Mar 28, 2024 · Mar 28, 2024 · Mar 28, 2024
diff --git a/benchmarks/benchmarks/bench_ma.py b/benchmarks/benchmarks/bench_ma.py
@@ -265,3 +265,47 @@ def time_where(self, mtype, msize):
             fun(self.nmxs > 2, self.nmxs, self.nmys)
         elif msize == 'big':
             fun(self.nmxl > 2, self.nmxl, self.nmyl)
+
+
+class Cov(Benchmark):
+    param_names = ["size"]
+    params = [["small", "large"]]
+
+    def setup(self, size):
+        # Set the proportion of masked values.
+        prop_mask = 0.2
+        # Set up a "small" array with 10 vars and 10 obs.
+        rng = np.random.default_rng()
+        data = rng.random((10, 10), dtype=np.float32)
+        self.small = np.ma.array(data, mask=(data <= prop_mask))
+        # Set up a "large" array with 100 vars and 100 obs.
+        data = rng.random((100, 100), dtype=np.float32)
+        self.large = np.ma.array(data, mask=(data <= prop_mask))
+
+    def time_cov(self, size):
+        if size == "small":
+            np.ma.cov(self.small)
+        if size == "large":
+            np.ma.cov(self.large)
+
+
+class Corrcoef(Benchmark):
+    param_names = ["size"]
+    params = [["small", "large"]]
+
+    def setup(self, size):
+        # Set the proportion of masked values.
+        prop_mask = 0.2
+        # Set up a "small" array with 10 vars and 10 obs.
+        rng = np.random.default_rng()
+        data = rng.random((10, 10), dtype=np.float32)
+        self.small = np.ma.array(data, mask=(data <= prop_mask))
+        # Set up a "large" array with 100 vars and 100 obs.
+        data = rng.random((100, 100), dtype=np.float32)
+        self.large = np.ma.array(data, mask=(data <= prop_mask))
+
+    def time_corrcoef(self, size):
+        if size == "small":
+            np.ma.corrcoef(self.small)
+        if size == "large":
+            np.ma.corrcoef(self.large)
diff --git a/doc/release/upcoming_changes/26285.change.rst b/doc/release/upcoming_changes/26285.change.rst
@@ -0,0 +1,13 @@
+``ma.corrcoef`` may return a slightly different result
+------------------------------------------------------
+A pairwise observation approach is currently used in `ma.corrcoef` to 
+calculate the standard deviations for each pair of variables. This has been 
+changed as it is being used to normalise the covariance, estimated using 
+`ma.cov`, which does not consider the observations for each variable in a 
+pairwise manner, rendering it unnecessary. The normalisation has been 
+replaced by the more appropriate standard deviation for each variable, 
+which significantly reduces the wall time, but will return slightly different 
+estimates of the correlation coefficients in cases where the observations 
+between a pair of variables are not aligned. However, it will return the same 
+estimates in all other cases, including returning the same correlation matrix 
+as `corrcoef` when using a masked array with no masked values.
diff --git a/doc/release/upcoming_changes/26285.performance.rst b/doc/release/upcoming_changes/26285.performance.rst
@@ -0,0 +1,5 @@
+``ma.cov`` and ``ma.corrcoef`` are now significantly faster
+-----------------------------------------------------------
+The private function has been refactored along with `ma.cov` and 
+`ma.corrcoef`. They are now significantly faster, particularly on large, 
+masked arrays.
diff --git a/numpy/ma/extras.py b/numpy/ma/extras.py
@@ -1569,7 +1569,14 @@ def _covhelper(x, y=None, rowvar=True, allow_masked=True):
         tup = (None, slice(None))
     #
     if y is None:
-        xnotmask = np.logical_not(xmask).astype(int)
+        # Check if we can guarantee that the integers in the (N - ddof) 
+        # normalisation can be accurately represented with single-precision 
+        # before computing the dot product.
+        if x.shape[0] > 2 ** 24 or x.shape[1] > 2 ** 24:
+            xnm_dtype = np.float64
+        else:
+            xnm_dtype = np.float32
+        xnotmask = np.logical_not(xmask).astype(xnm_dtype)
     else:
         y = array(y, copy=False, ndmin=2, dtype=float)
         ymask = ma.getmaskarray(y)
@@ -1584,7 +1591,16 @@ def _covhelper(x, y=None, rowvar=True, allow_masked=True):
                     x._sharedmask = False
                     y._sharedmask = False
         x = ma.concatenate((x, y), axis)
-        xnotmask = np.logical_not(np.concatenate((xmask, ymask), axis)).astype(int)
+        # Check if we can guarantee that the integers in the (N - ddof) 
+        # normalisation can be accurately represented with single-precision 
+        # before computing the dot product.
+        if x.shape[0] > 2 ** 24 or x.shape[1] > 2 ** 24:
+            xnm_dtype = np.float64
+        else:
+            xnm_dtype = np.float32
+        xnotmask = np.logical_not(np.concatenate((xmask, ymask), axis)).astype(
+            xnm_dtype
+        )
     x -= x.mean(axis=rowvar)[tup]
     return (x, xnotmask, rowvar)
 
@@ -1671,11 +1687,17 @@ def cov(x, y=None, rowvar=True, bias=False, allow_masked=True, ddof=None):
 
     (x, xnotmask, rowvar) = _covhelper(x, y, rowvar, allow_masked)
     if not rowvar:
-        fact = np.dot(xnotmask.T, xnotmask) * 1. - ddof
-        result = (dot(x.T, x.conj(), strict=False) / fact).squeeze()
+        fact = np.dot(xnotmask.T, xnotmask) - ddof
+        mask = np.less_equal(fact, 0, dtype=bool)
+        with np.errstate(divide="ignore", invalid="ignore"):
+            data = np.dot(filled(x.T, 0), filled(x.conj(), 0)) / fact
+        result = ma.array(data, mask=mask).squeeze()
     else:
-        fact = np.dot(xnotmask, xnotmask.T) * 1. - ddof
-        result = (dot(x, x.T.conj(), strict=False) / fact).squeeze()
+        fact = np.dot(xnotmask, xnotmask.T) - ddof
+        mask = np.less_equal(fact, 0, dtype=bool)
+        with np.errstate(divide="ignore", invalid="ignore"):
+            data = np.dot(filled(x, 0), filled(x.T.conj(), 0)) / fact
+        result = ma.array(data, mask=mask).squeeze()
     return result
 
 
@@ -1744,39 +1766,15 @@ def corrcoef(x, y=None, rowvar=True, bias=np._NoValue, allow_masked=True,
     if bias is not np._NoValue or ddof is not np._NoValue:
         # 2015-03-15, 1.10
         warnings.warn(msg, DeprecationWarning, stacklevel=2)
-    # Get the data
-    (x, xnotmask, rowvar) = _covhelper(x, y, rowvar, allow_masked)
-    # Compute the covariance matrix
-    if not rowvar:
-        fact = np.dot(xnotmask.T, xnotmask) * 1.
-        c = (dot(x.T, x.conj(), strict=False) / fact).squeeze()
-    else:
-        fact = np.dot(xnotmask, xnotmask.T) * 1.
-        c = (dot(x, x.T.conj(), strict=False) / fact).squeeze()
-    # Check whether we have a scalar
+    # Estimate the covariance matrix.
+    corr = cov(x, y, rowvar, allow_masked=allow_masked)
+    # The non-masked version returns a masked value for a scalar.
     try:
-        diag = ma.diagonal(c)
+        std = ma.sqrt(ma.diagonal(corr))
     except ValueError:
-        return 1
-    #
-    if xnotmask.all():
-        _denom = ma.sqrt(ma.multiply.outer(diag, diag))
-    else:
-        _denom = diagflat(diag)
-        _denom._sharedmask = False  # We know return is always a copy
-        n = x.shape[1 - rowvar]
-        if rowvar:
-            for i in range(n - 1):
-                for j in range(i + 1, n):
-                    _x = mask_cols(vstack((x[i], x[j]))).var(axis=1)
-                    _denom[i, j] = _denom[j, i] = ma.sqrt(ma.multiply.reduce(_x))
-        else:
-            for i in range(n - 1):
-                for j in range(i + 1, n):
-                    _x = mask_cols(
-                            vstack((x[:, i], x[:, j]))).var(axis=1)
-                    _denom[i, j] = _denom[j, i] = ma.sqrt(ma.multiply.reduce(_x))
-    return c / _denom
+        return ma.MaskedConstant()
+    corr /= ma.multiply.outer(std, std)
+    return corr
 
 #####--------------------------------------------------------------------------
 #---- --- Concatenation helpers ---

diff --git a/numpy/ma/tests/test_extras.py b/numpy/ma/tests/test_extras.py
@@ -29,7 +29,7 @@
     ediff1d, apply_over_axes, apply_along_axis, compress_nd, compress_rowcols,
     mask_rowcols, clump_masked, clump_unmasked, flatnotmasked_contiguous,
     notmasked_contiguous, notmasked_edges, masked_all, masked_all_like, isin,
-    diagflat, ndenumerate, stack, vstack
+    diagflat, ndenumerate, stack, vstack, _covhelper
     )
 
 
@@ -1287,6 +1287,26 @@ class TestCov:
     def setup_method(self):
         self.data = array(np.random.rand(12))
 
+    def test_covhelper(self):
+        x = self.data
+        # Test not mask output type is a float.
+        assert_(_covhelper(x, rowvar=True)[1].dtype, np.float32)
+        assert_(_covhelper(x, y=x, rowvar=False)[1].dtype, np.float32)
+        # Test not mask output is equal after casting to float.
+        mask = x > 0.5
+        assert_array_equal(
+            _covhelper(
+                np.ma.masked_array(x, mask), rowvar=True
+            )[1].astype(bool),
+            ~mask.reshape(1, -1),
+        )
+        assert_array_equal(
+            _covhelper(
+                np.ma.masked_array(x, mask), y=x, rowvar=False
+            )[1].astype(bool),
+            np.vstack((~mask, ~mask)),
+        )
+
     def test_1d_without_missing(self):
         # Test cov on 1D variable w/o missing values
         x = self.data