Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: Add new function nancov #14784

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
7 changes: 7 additions & 0 deletions doc/release/upcoming_changes/14784.new_function.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
new function numpy.nancov added to numpy
---------------------------------------------------------------------
``numpy.nancov`` will estimate a covariance matrix while ignoring nan-values.

Using ``np.nancov(x, pairwise=True)`` calculates pairwise covariance where the covariance of 2 variables will include all observations where neither variable is nan.

The default behavior is ``pairwise=False`` to ignore all observations where any variable is nan.
1 change: 1 addition & 0 deletions doc/source/reference/routines.statistics.rst
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ Correlating
corrcoef
correlate
cov
nancov

Histograms
----------
Expand Down
134 changes: 133 additions & 1 deletion numpy/lib/nanfunctions.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
- `nancumprod` -- cumulative product of non-NaN values
- `nanmean` -- mean of non-NaN values
- `nanvar` -- variance of non-NaN values
- `nancov` -- covariance of non-NaN values
- `nanstd` -- standard deviation of non-NaN values
- `nanmedian` -- median of non-NaN values
- `nanquantile` -- qth quantile of non-NaN values
Expand All @@ -35,7 +36,7 @@

__all__ = [
'nansum', 'nanmax', 'nanmin', 'nanargmax', 'nanargmin', 'nanmean',
'nanmedian', 'nanpercentile', 'nanvar', 'nanstd', 'nanprod',
'nanmedian', 'nanpercentile', 'nanvar', 'nancov', 'nanstd', 'nanprod',
'nancumsum', 'nancumprod', 'nanquantile'
]

Expand Down Expand Up @@ -1562,6 +1563,137 @@ def nanvar(a, axis=None, dtype=None, out=None, ddof=0, keepdims=np._NoValue):
return var


def _nancov_dispatcher(m, y=None, pairwise=None, ddof=None):
return (m, y, pairwise)


@array_function_dispatch(_nancov_dispatcher)
def nancov(m, y=None, pairwise=False, ddof=1):
"""
Estimate a covariance matrix, while ignoring NaNs.

Returns a covariance matrix using only slices with all non-nan values.
All slices with nan values will be ignored by default.

If `pairwise=True` is set, the covariance of variables x and y will be
computed using slices where x and y are non-nan values. Only slices
where x and y are nan will be ignored. The returned covariance matrix
my not be a positive definite matrix.

For all-NaN slices or slices with zero degrees of freedom, NaN is
returned.

Parameters
----------
m : array_like
A 1-D or 2-D array containing multiple variables and observations.
Each row of `m` represents a variable, and each column a single
observation of all those variables.
y : array_like, optional
An additional set of variables and observations. `y` has the same form
as that of `m`.
pairwise : bool, option
By default (False), any observation(column) with nan values are
ignored. If `pairwise` is True, then the covariance of x and y will
be computed pairwise, using rows where both x and y are non-nan values
and ignoring only rows where either x or y is a nan value.
ddof : int, optional
"Delta Degrees of Freedom": the divisor used in the calculation is
``N - ddof``, where ``N`` represents the number of
observations(columns) with no nan-values. If `pairwise` is set to True,
``N`` is computed separately for each pair of variables as the number
of observations where both x and y are non-nan values.
By default ddof is 1.


Returns
-------
out : ndarray
The covariance matrix of the variables

See Also
--------
cov : Estimated covariance matrix

Notes
-----
In the regular covariance matrix function, `cov`, if the i'th variable
has a nan value in any observation, all entries :math:`C_{i,j}` and
:math:`C_{j,i}` for all variables `j` will have a nan value in the
returned matrix.


With `nancov`, if `pairwise` is False, then the covariance matrix will
be calculated after removing all columns(observations) with nan values.

If `pairwise` is True, then pairwise covariances will be calculated where
the covariance of `x` and `y` will include all columns where `x` and `y`
are not nan values.

In either case, the normalization will be equal to the number of columns
used in calculating the covariance of two variables minus `ddof`.


Examples
--------

>>> x = np.array([
[1, 2, 3],
[2, np.nan, 6],
[3, 1.5, 1]])

Without pairwise covariances::
Notice how all covariances are calculated without the center column.

>>> np.nancov(x)
>>> array([
[ 2, 4, -2],
[ 4, 8, -4],
[-2, -4, 2]])

With pairwise covariances::
Covariance between variables :math:`x_0` and :math:`x_2` are calculated
with all 3 columns since they do not have any nan values, but all
covariances involving :math:`x_1` exclude the 2nd column with a nan value.

>>> np.nancov(x, pairwise=True)
>>> array([
[ 1, 4, -1],
[ 4, 8, -4],
[-1, -4, 1.08333333]])

>>> np.nanvar(a)
1.5555555555555554
>>> np.nanvar(a, axis=0)
array([1., 0.])
>>> np.nanvar(a, axis=1)
array([0., 0.25]) # may vary

"""

X = np.array(m, ndmin=2)
if y is not None:
y = np.array(y, ndmin=2)
X = np.concatenate((X, y), axis=0)

if not pairwise:
print(X)
nanmask = np.any(np.isnan(X.T), axis=1)
X = np.delete(X, np.argwhere(nanmask), axis=1)
print(nanmask)
print(X)
return (np.cov(X, ddof=ddof))
else:
rows = X.shape[0]
out = np.empty((rows, rows))
for i in range(rows):
out[i][i] = np.nanvar(X[i], ddof=ddof)
for j in range(i+1, rows):
out[i][j] = np.nancov(X[i], X[j], ddof=ddof)[0][1]
out[j][i] = out[i][j]
return out


def _nanstd_dispatcher(
a, axis=None, dtype=None, out=None, ddof=None, keepdims=None):
return (a, out)
Expand Down
64 changes: 63 additions & 1 deletion numpy/lib/tests/test_nanfunctions.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from numpy.lib.nanfunctions import _nan_mask
from numpy.testing import (
assert_, assert_equal, assert_almost_equal, assert_no_warnings,
assert_raises, assert_array_equal, suppress_warnings
assert_raises, assert_array_equal, assert_allclose, suppress_warnings
)


Expand Down Expand Up @@ -298,6 +298,11 @@ def test_nanvar(self):
for mat in self.integer_arrays():
assert_equal(np.nanvar(mat, ddof=1), tgt)

def test_nancov(self):
tgt = np.cov(self.mat)
for mat in self.integer_arrays():
assert_equal(np.nancov(mat), tgt)

def test_nanstd(self):
tgt = np.std(self.mat)
for mat in self.integer_arrays():
Expand Down Expand Up @@ -447,6 +452,63 @@ def test_empty(self):
assert_equal(res, tgt)


class TestNanFunctions_NanCov(object):
x1 = np.array([
[0, 1, 4],
[1, 1, 3],
[2, 1, 2],
[3, 1, 1],
[4, 1, 0]])
res1 = np.array([
[2.5, 0., -2.5],
[0., 0., 0.],
[-2.5, 0., 2.5]])

x2 = np.array([
[1, np.nan, 2],
[np.nan, 3, 1],
[5, 2, np.nan],
[9, 4, 2],
[3, 8, 9]])
res2_default = np.array([
[18, -12, -21],
[-12, 8, 14],
[-21, 14, 24.5]])
res2_pairwise = np.array([
[11.6666666, -4.6666666, -4.6666666],
[-4.6666666, 6.9166666, 11.5],
[-4.6666666, 11.5, 13.6666666]])

x3 = np.array([0, 1, 2, 3, 4])
y3 = np.array([4, 3, np.nan, 1, 0])
res3_default = np.array([
[3.3333333, -3.3333333],
[-3.3333333, 3.3333333]])
res3_pairwise = np.array([
[2.5, -3.3333333],
[-3.3333333, 3.3333333]])

def test_default_no_nans(self):
assert_allclose(np.nancov(self.x1.T), self.res1)

def test_pairwise_no_nans(self):
assert_allclose(np.nancov(self.x1.T, pairwise=True), self.res1)

def test_default_with_nans(self):
assert_allclose(np.nancov(self.x2.T), self.res2_default)

def test_pairwise_with_nans(self):
assert_allclose(np.nancov(self.x2.T, pairwise=True),
self.res2_pairwise)

def test_default_xy(self):
assert_allclose(np.nancov(self.x3.T, self.y3), self.res3_default)

def test_pairwise_xy(self):
assert_allclose(np.nancov(self.x3.T, self.y3, pairwise=True),
self.res3_pairwise)


class TestNanFunctions_CumSumProd(SharedNanFunctionsTestsMixin):

nanfuncs = [np.nancumsum, np.nancumprod]
Expand Down