Skip to content

Commit

Permalink
ENH: add CovDetMCD preliminary version
Browse files Browse the repository at this point in the history
  • Loading branch information
josef-pkt committed Apr 22, 2024
1 parent c22837f commit 021cd2d
Showing 1 changed file with 254 additions and 0 deletions.
254 changes: 254 additions & 0 deletions statsmodels/robust/covariance.py
Expand Up @@ -32,6 +32,7 @@
from statsmodels.tools.testing import Holder

mad0 = lambda x: mad(x, center=0) # noqa: E731
median = lambda x: np.median(x, axis=0) # noqa: E731


# from scikit-learn
Expand Down Expand Up @@ -1162,3 +1163,256 @@ def _cov_starting(data, standardize=False, quantile=0.5):

# TODO: rescale back to original space using center and std
return cov_all


# ####### Det, CovDet and helper functions, might be moved to separate module


class _Standardize():
"""Robust standardization of random variable
"""

def __init__(self, x, func_center=None, func_scale=None):
# naming mean or center
# maybe also allow str func_scale for robust.scale, e.g. for not
# vectorized Qn
if func_center is None:
center = np.median(x, axis=0)

Check warning on line 1181 in statsmodels/robust/covariance.py

View check run for this annotation

Codecov / codecov/patch

statsmodels/robust/covariance.py#L1181

Added line #L1181 was not covered by tests
else:
center = func_center(x)
xdm = x - center

Check warning on line 1184 in statsmodels/robust/covariance.py

View check run for this annotation

Codecov / codecov/patch

statsmodels/robust/covariance.py#L1183-L1184

Added lines #L1183 - L1184 were not covered by tests
if func_scale is None:
scale = mad(xdm, center=0)

Check warning on line 1186 in statsmodels/robust/covariance.py

View check run for this annotation

Codecov / codecov/patch

statsmodels/robust/covariance.py#L1186

Added line #L1186 was not covered by tests
else:
# assumes vectorized
scale = func_scale(x)

Check warning on line 1189 in statsmodels/robust/covariance.py

View check run for this annotation

Codecov / codecov/patch

statsmodels/robust/covariance.py#L1189

Added line #L1189 was not covered by tests

self.x_stand = xdm / scale

Check warning on line 1191 in statsmodels/robust/covariance.py

View check run for this annotation

Codecov / codecov/patch

statsmodels/robust/covariance.py#L1191

Added line #L1191 was not covered by tests

self.center = center
self.scale = scale

Check warning on line 1194 in statsmodels/robust/covariance.py

View check run for this annotation

Codecov / codecov/patch

statsmodels/robust/covariance.py#L1193-L1194

Added lines #L1193 - L1194 were not covered by tests

def transform(self, x):
return (x - self.center) / self.scale

Check warning on line 1197 in statsmodels/robust/covariance.py

View check run for this annotation

Codecov / codecov/patch

statsmodels/robust/covariance.py#L1197

Added line #L1197 was not covered by tests

def untransform_mom(self, m, c):
mean = self.center + m
cov = c * np.outer(self.scale, self.scale)
return mean, cov

Check warning on line 1202 in statsmodels/robust/covariance.py

View check run for this annotation

Codecov / codecov/patch

statsmodels/robust/covariance.py#L1200-L1202

Added lines #L1200 - L1202 were not covered by tests


def _orthogonalize_det(z, corr, loc_func, scale_func):
"""Orthogonalize
This is a simplified version of the OGK method.
version from DetMCD works on zscored data
(does not return mean and cov of original data)
so we drop the compensation for scaling in zscoring
z is the data here, zscored with robust estimators,
e.g. median and Qn in DetMCD
"""
evals, evecs = np.linalg.eigh(corr) # noqa: F841
z = z.dot(evecs)
transf0 = evecs

Check warning on line 1218 in statsmodels/robust/covariance.py

View check run for this annotation

Codecov / codecov/patch

statsmodels/robust/covariance.py#L1216-L1218

Added lines #L1216 - L1218 were not covered by tests

scale_z = scale_func(z) # scale of principal components
cov = (transf0 * scale_z**2).dot(transf0.T)

Check warning on line 1221 in statsmodels/robust/covariance.py

View check run for this annotation

Codecov / codecov/patch

statsmodels/robust/covariance.py#L1220-L1221

Added lines #L1220 - L1221 were not covered by tests
# extra step in DetMCD, sphering data with new cov to compute center
# I think this is equivalent to scaling z
loc_z = loc_func(z / scale_z) # center of principal components
loc = (transf0 * scale_z).dot(loc_z)

Check warning on line 1225 in statsmodels/robust/covariance.py

View check run for this annotation

Codecov / codecov/patch

statsmodels/robust/covariance.py#L1224-L1225

Added lines #L1224 - L1225 were not covered by tests

return loc, cov

Check warning on line 1227 in statsmodels/robust/covariance.py

View check run for this annotation

Codecov / codecov/patch

statsmodels/robust/covariance.py#L1227

Added line #L1227 was not covered by tests


def _get_detcov_startidx(z, h, options_start=None, methods_cov="all"):
"""Starting sets for deterministic robust covariance estimators.
These are intended as starting sets for DetMCD, DetS and DetMM.
"""
if options_start is None:
options_start = {}

Check warning on line 1236 in statsmodels/robust/covariance.py

View check run for this annotation

Codecov / codecov/patch

statsmodels/robust/covariance.py#L1236

Added line #L1236 was not covered by tests

loc_func = options_start.get("loc_func", median)
scale_func = options_start.get("scale_func", mad)

Check warning on line 1239 in statsmodels/robust/covariance.py

View check run for this annotation

Codecov / codecov/patch

statsmodels/robust/covariance.py#L1238-L1239

Added lines #L1238 - L1239 were not covered by tests

cov_all = _cov_starting(z, standardize=True, quantile=0.5)

Check warning on line 1241 in statsmodels/robust/covariance.py

View check run for this annotation

Codecov / codecov/patch

statsmodels/robust/covariance.py#L1241

Added line #L1241 was not covered by tests

idx_all = []

Check warning on line 1243 in statsmodels/robust/covariance.py

View check run for this annotation

Codecov / codecov/patch

statsmodels/robust/covariance.py#L1243

Added line #L1243 was not covered by tests
for c in cov_all:
if not hasattr(c, "method"):
continue
method = c.method
mean, cov = _orthogonalize_det(z, c.cov, loc_func, scale_func)
d = mahalanobis(z, mean, cov)
idx_sel = np.argpartition(d, h)[:h]
idx_all.append((idx_sel, method))

Check warning on line 1251 in statsmodels/robust/covariance.py

View check run for this annotation

Codecov / codecov/patch

statsmodels/robust/covariance.py#L1246-L1251

Added lines #L1246 - L1251 were not covered by tests

return idx_all

Check warning on line 1253 in statsmodels/robust/covariance.py

View check run for this annotation

Codecov / codecov/patch

statsmodels/robust/covariance.py#L1253

Added line #L1253 was not covered by tests


class CovDetMCD:
"""Minimum covariance determinant estimator with deterministic starts.
preliminary version
reproducability:
This uses deterministic starting sets and there is no randomness in the
estimator.
However, this will not be reprodusible across statsmodels versions
when the methods for starting sets or tuning parameters for the
optimization change.
"""

def __init__(self):
# no options yet, methods were written as functions
pass

def cstep_mcd(self, x, mean, cov, h, maxiter=2, tol=1e-8):
"""C-step for mcd iteration
x is data, perc is percentile h / nobs, don't need perc when we
use np.argpartition
requires starting mean and cov
"""

converged = False

Check warning on line 1282 in statsmodels/robust/covariance.py

View check run for this annotation

Codecov / codecov/patch

statsmodels/robust/covariance.py#L1282

Added line #L1282 was not covered by tests

for i in range(maxiter):
d = mahalanobis(x, mean, cov)
idx_sel = np.argpartition(d, h)[:h]
x_sel = x[idx_sel]
mean = x_sel.mean(0)
cov_new = np.cov(x_sel.T, ddof=1)

Check warning on line 1289 in statsmodels/robust/covariance.py

View check run for this annotation

Codecov / codecov/patch

statsmodels/robust/covariance.py#L1285-L1289

Added lines #L1285 - L1289 were not covered by tests

if ((cov - cov_new)**2).mean() < tol:
cov = cov_new
converged = True

Check notice

Code scanning / CodeQL

Unused local variable Note

Variable converged is not used.
break

Check warning on line 1294 in statsmodels/robust/covariance.py

View check run for this annotation

Codecov / codecov/patch

statsmodels/robust/covariance.py#L1292-L1294

Added lines #L1292 - L1294 were not covered by tests

return mean, cov

Check warning on line 1296 in statsmodels/robust/covariance.py

View check run for this annotation

Codecov / codecov/patch

statsmodels/robust/covariance.py#L1296

Added line #L1296 was not covered by tests

def _mcd_one(self, x, idx, h, maxiter=2, mean=None, cov=None):
"""Compute mcd for one starting set of observations.
Parameters
----------
x : ndarray
Data.
idx : ndarray
Indices or mask of observation in starting set, used as ``x[idx]``
h : int
Number of observations in evaluation set for cov.
maxiter : int
Maximum number of c-steps.
Returns
-------
mean : ndarray
Estimated mean.
cov : ndarray
Estimated covariance.
det : float
Determinant of estimated covariance matrix.
Notes
-----
This does not do any preprocessing of the data and returns the empirical mean
and covariance of evaluation set of the data ``x``.
"""
x_sel = x[idx]

Check warning on line 1326 in statsmodels/robust/covariance.py

View check run for this annotation

Codecov / codecov/patch

statsmodels/robust/covariance.py#L1326

Added line #L1326 was not covered by tests
if mean is None:
mean = x_sel.mean(0)

Check warning on line 1328 in statsmodels/robust/covariance.py

View check run for this annotation

Codecov / codecov/patch

statsmodels/robust/covariance.py#L1328

Added line #L1328 was not covered by tests
if cov is None:
cov = np.cov(x_sel.T, ddof=1)

Check warning on line 1330 in statsmodels/robust/covariance.py

View check run for this annotation

Codecov / codecov/patch

statsmodels/robust/covariance.py#L1330

Added line #L1330 was not covered by tests

# updated with c-step
mean, cov = self.cstep_mcd(x, mean, cov, h, maxiter=maxiter)
det = np.linalg.det(cov)

Check warning on line 1334 in statsmodels/robust/covariance.py

View check run for this annotation

Codecov / codecov/patch

statsmodels/robust/covariance.py#L1333-L1334

Added lines #L1333 - L1334 were not covered by tests

return mean, cov, det

Check warning on line 1336 in statsmodels/robust/covariance.py

View check run for this annotation

Codecov / codecov/patch

statsmodels/robust/covariance.py#L1336

Added line #L1336 was not covered by tests

def mcd_det(self, x, h, *, h_start=None, mean_func=None, scale_func=None,
maxiter=100, options_start=None, reweight=True):
"""
Compute minimum covariance determinant estimate of mean and covariance.
x : array-like
Data with observation in rows and variables in columns.
h : int
Number of observations in evaluation set for minimimizing
determinant.
h_start : int
Number of observations used in starting mean and covariance.
mean_func, scale_func : callable or None.
Mean and scale function for initial standardization.
Current defaults, if they are None, are median and mad, but
default scale_func will likely change.
options_start : None or dict
Options for the starting estimators.
TODO: which options? e.g. for OGK
Returns
-------
Holder instance with results
"""

x = np.asarray(x)
nobs, k_vars = x.shape

Check warning on line 1364 in statsmodels/robust/covariance.py

View check run for this annotation

Codecov / codecov/patch

statsmodels/robust/covariance.py#L1363-L1364

Added lines #L1363 - L1364 were not covered by tests

if h is None:
h = (nobs + k_vars + 1) // 2 # check with literature

Check warning on line 1367 in statsmodels/robust/covariance.py

View check run for this annotation

Codecov / codecov/patch

statsmodels/robust/covariance.py#L1367

Added line #L1367 was not covered by tests
if mean_func is None:
mean_func = lambda x: np.median(x, axis=0) # noqa
if scale_func is None:
scale_func = mad

Check warning on line 1371 in statsmodels/robust/covariance.py

View check run for this annotation

Codecov / codecov/patch

statsmodels/robust/covariance.py#L1371

Added line #L1371 was not covered by tests
if options_start is None:
options_start = {}

Check warning on line 1373 in statsmodels/robust/covariance.py

View check run for this annotation

Codecov / codecov/patch

statsmodels/robust/covariance.py#L1373

Added line #L1373 was not covered by tests
if h_start is None:
nobs, k_vars = x.shape
h_start = max(nobs // 2 + 1, k_vars + 1)

Check warning on line 1376 in statsmodels/robust/covariance.py

View check run for this annotation

Codecov / codecov/patch

statsmodels/robust/covariance.py#L1375-L1376

Added lines #L1375 - L1376 were not covered by tests

m = mean_func(x)
s = scale_func(x)
z = (x - m) / s

Check warning on line 1380 in statsmodels/robust/covariance.py

View check run for this annotation

Codecov / codecov/patch

statsmodels/robust/covariance.py#L1378-L1380

Added lines #L1378 - L1380 were not covered by tests
# get initial mean, cov of standardized data, we only need ranking
# of obs
starts = _get_detcov_startidx(z, h_start, options_start)

Check warning on line 1383 in statsmodels/robust/covariance.py

View check run for this annotation

Codecov / codecov/patch

statsmodels/robust/covariance.py#L1383

Added line #L1383 was not covered by tests

fac_trunc = coef_normalize_cov_truncated(h / nobs, k_vars)

Check warning on line 1385 in statsmodels/robust/covariance.py

View check run for this annotation

Codecov / codecov/patch

statsmodels/robust/covariance.py#L1385

Added line #L1385 was not covered by tests

res = {}

Check warning on line 1387 in statsmodels/robust/covariance.py

View check run for this annotation

Codecov / codecov/patch

statsmodels/robust/covariance.py#L1387

Added line #L1387 was not covered by tests
for ii, ini in enumerate(starts):
idx_sel, method = ini
mean, cov, det = self._mcd_one(x, idx_sel, h, maxiter=maxiter)
res[ii] = Holder(

Check warning on line 1391 in statsmodels/robust/covariance.py

View check run for this annotation

Codecov / codecov/patch

statsmodels/robust/covariance.py#L1389-L1391

Added lines #L1389 - L1391 were not covered by tests
mean=mean,
cov=cov * fac_trunc,
det_subset=det,
method=method
)

det_all = np.array([i.det_subset for i in res.values()])
idx_best = np.argmin(det_all)
best = res[idx_best]

Check warning on line 1400 in statsmodels/robust/covariance.py

View check run for this annotation

Codecov / codecov/patch

statsmodels/robust/covariance.py#L1399-L1400

Added lines #L1399 - L1400 were not covered by tests
# mean = best.mean
# cov = best.cov

# need to c-step to convergence for best,
# is with best 2 in original DetMCD
if maxiter < 100:
mean, cov, det = self._mcd_one(x, None, h, maxiter=100,

Check notice

Code scanning / CodeQL

Unused local variable Note

Variable mean is not used.

Check notice

Code scanning / CodeQL

Unused local variable Note

Variable cov is not used.

Check notice

Code scanning / CodeQL

Unused local variable Note

Variable det is not used.

Check warning on line 1407 in statsmodels/robust/covariance.py

View check run for this annotation

Codecov / codecov/patch

statsmodels/robust/covariance.py#L1407

Added line #L1407 was not covered by tests
mean=best.mean, cov=best.cov)

# include extra info in returned Holder instance
best.det_all = det_all
best.idx_best = idx_best

Check warning on line 1412 in statsmodels/robust/covariance.py

View check run for this annotation

Codecov / codecov/patch

statsmodels/robust/covariance.py#L1411-L1412

Added lines #L1411 - L1412 were not covered by tests

best.tmean = m
best.tscale = s

Check warning on line 1415 in statsmodels/robust/covariance.py

View check run for this annotation

Codecov / codecov/patch

statsmodels/robust/covariance.py#L1414-L1415

Added lines #L1414 - L1415 were not covered by tests

#return Holder(mean=mean, cov=cov, start_best=best)
return best # is Holder instance already

Check warning on line 1418 in statsmodels/robust/covariance.py

View check run for this annotation

Codecov / codecov/patch

statsmodels/robust/covariance.py#L1418

Added line #L1418 was not covered by tests

0 comments on commit 021cd2d

Please sign in to comment.