Skip to content

Commit

Permalink
ENH: added new function nancov
Browse files Browse the repository at this point in the history
Added a new user facing function nancov, which calculates the covariance
of variables while ignoring nan values. Partially addresses features
requested in issue numpy#14414 and improves upon PR numpy#14688.
  • Loading branch information
Harry-Kwon committed Oct 26, 2019
1 parent 142f291 commit 1190088
Show file tree
Hide file tree
Showing 2 changed files with 196 additions and 2 deletions.
134 changes: 133 additions & 1 deletion numpy/lib/nanfunctions.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
- `nancumprod` -- cumulative product of non-NaN values
- `nanmean` -- mean of non-NaN values
- `nanvar` -- variance of non-NaN values
- `nancov` -- covariance of non-NaN values
- `nanstd` -- standard deviation of non-NaN values
- `nanmedian` -- median of non-NaN values
- `nanquantile` -- qth quantile of non-NaN values
Expand All @@ -35,7 +36,7 @@

__all__ = [
'nansum', 'nanmax', 'nanmin', 'nanargmax', 'nanargmin', 'nanmean',
'nanmedian', 'nanpercentile', 'nanvar', 'nanstd', 'nanprod',
'nanmedian', 'nanpercentile', 'nanvar', 'nancov', 'nanstd', 'nanprod',
'nancumsum', 'nancumprod', 'nanquantile'
]

Expand Down Expand Up @@ -1562,6 +1563,137 @@ def nanvar(a, axis=None, dtype=None, out=None, ddof=0, keepdims=np._NoValue):
return var


def _nancov_dispatcher(m, y=None, pairwise=None, ddof=None):
return (m, y, pairwise)


@array_function_dispatch(_nancov_dispatcher)
def nancov(m, y=None, pairwise=False, ddof=1):
"""
Estimate a covariance matrix, while ignoring NaNs.
Returns a covariance matrix using only slices with all non-nan values.
All slices with nan values will be ignored by default.
If `pairwise=True` is set, the covariance of variables x and y will be
computed using slices where x and y are non-nan values. Only slices
where x and y are nan will be ignored. The returned covariance matrix
my not be a positive definite matrix.
For all-NaN slices or slices with zero degrees of freedom, NaN is
returned.
Parameters
----------
m : array_like
A 1-D or 2-D array containing multiple variables and observations.
Each row of `m` represents a variable, and each column a single
observation of all those variables.
y : array_like, optional
An additional set of variables and observations. `y` has the same form
as that of `m`.
pairwise : bool, option
By default (False), any observation(column) with nan values are
ignored. If `pairwise` is True, then the covariance of x and y will
be computed pairwise, using rows where both x and y are non-nan values
and ignoring only rows where either x or y is a nan value.
ddof : int, optional
"Delta Degrees of Freedom": the divisor used in the calculation is
``N - ddof``, where ``N`` represents the number of
observations(columns) with no nan-values. If `pairwise` is set to True,
``N`` is computed separately for each pair of variables as the number
of observations where both x and y are non-nan values.
By default ddof is 1.
Returns
-------
out : ndarray
The covariance matrix of the variables
See Also
--------
cov : Estimated covariance matrix
Notes
-----
In the regular covariance matrix function, `cov`, if the i'th variable
has a nan value in any observation, all entries :math:`C_{i,j}` and
:math:`C_{j,i}` for all variables `j` will have a nan value in the
returned matrix.
With `nancov`, if `pairwise` is False, then the covariance matrix will
be calculated after removing all columns(observations) with nan values.
If `pairwise` is True, then pairwise covariances will be calculated where
the covariance of `x` and `y` will include all columns where `x` and `y`
are not nan values.
In either case, the normalization will be equal to the number of columns
used in calculating the covariance of two variables minus `ddof`.
Examples
--------
>>> x = np.array([
[1, 2, 3],
[2, np.nan, 6],
[3, 1.5, 1]])
Without pairwise covariances::
Notice how all covariances are calculated without the center column.
>>> np.nancov(x)
>>> array([
[ 2, 4, -2],
[ 4, 8, -4],
[-2, -4, 2]])
With pairwise covariances::
Covariance between variables :math:`x_0` and :math:`x_2` are calculated
with all 3 columns since they do not have any nan values, but all
covariances involving :math:`x_1` exclude the 2nd column with a nan value.
>>> np.nancov(x, pairwise=True)
>>> array([
[ 1, 4, -1],
[ 4, 8, -4],
[-1, -4, 1.08333333]])
>>> np.nanvar(a)
1.5555555555555554
>>> np.nanvar(a, axis=0)
array([1., 0.])
>>> np.nanvar(a, axis=1)
array([0., 0.25]) # may vary
"""

X = np.array(m, ndmin=2)
if y is not None:
y = np.array(y, ndmin=2)
X = np.concatenate((X, y), axis=0)

if not pairwise:
print(X)
nanmask = np.any(np.isnan(X.T), axis=1)
X = np.delete(X, np.argwhere(nanmask), axis=1)
print(nanmask)
print(X)
return (np.cov(X, ddof=ddof))
else:
rows = X.shape[0]
out = np.empty((rows, rows))
for i in range(rows):
out[i][i] = np.nanvar(X[i], ddof=ddof)
for j in range(i+1, rows):
out[i][j] = np.nancov(X[i], X[j], ddof=ddof)[0][1]
out[j][i] = out[i][j]
return out


def _nanstd_dispatcher(
a, axis=None, dtype=None, out=None, ddof=None, keepdims=None):
return (a, out)
Expand Down
64 changes: 63 additions & 1 deletion numpy/lib/tests/test_nanfunctions.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from numpy.lib.nanfunctions import _nan_mask
from numpy.testing import (
assert_, assert_equal, assert_almost_equal, assert_no_warnings,
assert_raises, assert_array_equal, suppress_warnings
assert_raises, assert_array_equal, assert_allclose, suppress_warnings
)


Expand Down Expand Up @@ -298,6 +298,11 @@ def test_nanvar(self):
for mat in self.integer_arrays():
assert_equal(np.nanvar(mat, ddof=1), tgt)

def test_nancov(self):
tgt = np.cov(self.mat)
for mat in self.integer_arrays():
assert_equal(np.nancov(mat), tgt)

def test_nanstd(self):
tgt = np.std(self.mat)
for mat in self.integer_arrays():
Expand Down Expand Up @@ -447,6 +452,63 @@ def test_empty(self):
assert_equal(res, tgt)


class TestNanFunctions_NanCov(object):
x1 = np.array([
[0, 1, 4],
[1, 1, 3],
[2, 1, 2],
[3, 1, 1],
[4, 1, 0]])
res1 = np.array([
[2.5, 0., -2.5],
[0., 0., 0.],
[-2.5, 0., 2.5]])

x2 = np.array([
[1, np.nan, 2],
[np.nan, 3, 1],
[5, 2, np.nan],
[9, 4, 2],
[3, 8, 9]])
res2_default = np.array([
[18, -12, -21],
[-12, 8, 14],
[-21, 14, 24.5]])
res2_pairwise = np.array([
[11.6666666, -4.6666666, -4.6666666],
[-4.6666666, 6.9166666, 11.5],
[-4.6666666, 11.5, 13.6666666]])

x3 = np.array([0, 1, 2, 3, 4])
y3 = np.array([4, 3, np.nan, 1, 0])
res3_default = np.array([
[3.3333333, -3.3333333],
[-3.3333333, 3.3333333]])
res3_pairwise = np.array([
[2.5, -3.3333333],
[-3.3333333, 3.3333333]])

def test_default_no_nans(self):
assert_allclose(np.nancov(self.x1.T), self.res1)

def test_pairwise_no_nans(self):
assert_allclose(np.nancov(self.x1.T, pairwise=True), self.res1)

def test_default_with_nans(self):
assert_allclose(np.nancov(self.x2.T), self.res2_default)

def test_pairwise_with_nans(self):
assert_allclose(np.nancov(self.x2.T, pairwise=True),
self.res2_pairwise)

def test_default_xy(self):
assert_allclose(np.nancov(self.x3.T, self.y3), self.res3_default)

def test_pairwise_xy(self):
assert_allclose(np.nancov(self.x3.T, self.y3, pairwise=True),
self.res3_pairwise)


class TestNanFunctions_CumSumProd(SharedNanFunctionsTestsMixin):

nanfuncs = [np.nancumsum, np.nancumprod]
Expand Down

0 comments on commit 1190088

Please sign in to comment.