ENH/REF: add other cov to deterministic start, some refactoring and bug fixes
josef-pkt committed Apr 25, 2024
1 parent 68a79f9 commit c79c89d
Showing 3 changed files with 99 additions and 23 deletions.
68 changes: 59 additions & 9 deletions statsmodels/robust/covariance.py
@@ -28,9 +28,11 @@
import numpy as np
from scipy import stats, linalg
from scipy.linalg.lapack import dtrtri
-from .scale import mad
+from .scale import mad, qn_scale

from statsmodels.tools.testing import Holder

from statsmodels.stats.covariance import corr_rank, corr_normal_scores

mad0 = lambda x: mad(x, center=0) # noqa: E731
median = lambda x: np.median(x, axis=0) # noqa: E731

@@ -313,6 +315,7 @@ def mahalanobis(data, cov=None, cov_inv=None, sqrt=False):
"""Mahalanobis distance squared
Note: this is without taking the square root.
assumes data is already centered.
Parameters
----------
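
The docstring hunk is truncated here. As a point of reference, a minimal NumPy sketch of the squared Mahalanobis distance the docstring describes (an illustration, not the statsmodels implementation; it assumes the rows of x are already centered):

```python
import numpy as np

def mahalanobis_sq(x, cov):
    """Squared Mahalanobis distance of each (already centered) row of x."""
    cov_inv = np.linalg.inv(cov)
    # d_i = x_i' cov^{-1} x_i for every observation
    return np.einsum("ij,jk,ik->i", x, cov_inv, x)

x = np.random.default_rng(0).standard_normal((5, 3))
d2 = mahalanobis_sq(x, np.cov(x, rowvar=False))
```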
@@ -1085,7 +1088,7 @@ def _cov_iter(data, weights_func, weights_args=None, cov_init=None,
return res


-def _cov_starting(data, standardize=False, quantile=0.5):
+def _cov_starting(data, standardize=False, quantile=0.5, retransform=False):
"""compute some robust starting covariances
The returned covariance matrices are intended as starting values
@@ -1129,7 +1132,7 @@ def _cov_starting(data, standardize=False, quantile=0.5):

cov_all = []
d = mahalanobis(xs, cov=None, cov_inv=np.eye(k_vars))
-percentiles = [(k_vars+2) / nobs * 100 * 2, 25, 50]
+percentiles = [(k_vars+2) / nobs * 100 * 2, 25, 50, 85]
cutoffs = np.percentile(d, percentiles)
for p, cutoff in zip(percentiles, cutoffs):
xsp = xs[d < cutoff]
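
The loop body is cut off in this hunk. Assuming the straightforward reading of the lines above, each percentile cutoff keeps the most central observations and their plain sample covariance becomes one candidate start; a small self-contained sketch:

```python
import numpy as np

rng = np.random.default_rng(1)
xs = rng.standard_normal((200, 4))          # stand-in for standardized data
nobs, k_vars = xs.shape

d = (xs ** 2).sum(axis=1)                   # Mahalanobis distance w.r.t. identity cov
percentiles = [(k_vars + 2) / nobs * 100 * 2, 25, 50, 85]
cov_trimmed = []
for cutoff in np.percentile(d, percentiles):
    xsp = xs[d < cutoff]                    # keep the most central observations
    cov_trimmed.append(np.cov(xsp, rowvar=False))
```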
@@ -1151,16 +1154,56 @@
c03 = _cov_iter(xs, weights_quantile, weights_args=(quantile,),
rescale="med", cov_init=c02.cov, maxiter=100)

-if standardize:
+if not standardize or not retransform:
cov_all.extend([c0, c01, c02, c03])
else:
# compensate for initial rescaling
# TODO: this does not return list of Holder anymore
s = np.outer(std, std)
cov_all.extend([r.cov * s for r in [c0, c01, c02, c03]])

c2 = cov_ogk(xs)
cov_all.append(c2)

c2raw = Holder(
cov=c2.cov_raw,
mean=c2.loc_raw * std + center,
method="ogk_raw",
)
cov_all.append(c2raw)

z_tanh = np.tanh(xs)
c_th = Holder(
cov=np.corrcoef(z_tanh.T), # not consistently scaled for cov
mean=center, # TODO: do we add inverted mean z_tanh ?
method="tanh",
)
cov_all.append(c_th)

x_spatial = xs / np.sqrt(np.sum(xs**2, axis=1))[:, None]
c_th = Holder(
cov=np.cov(x_spatial.T),
mean=center,
method="spatial",
)
cov_all.append(c_th)

c_th = Holder(
# not consistently scaled for cov
# cov=stats.spearmanr(xs)[0], # not correct shape if k=1 or 2
cov=corr_rank(xs), # always returns matrix, np.corrcoef result
mean=center,
method="spearman",
)
cov_all.append(c_th)

c_ns = Holder(
cov=corr_normal_scores(xs), # not consistently scaled for cov
mean=center, # TODO: do we add inverted mean z_tanh ?
method="normal-scores",
)
cov_all.append(c_ns)

# TODO: rescale back to original space using center and std
return cov_all
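
The new starting estimates appended above are all cheap pairwise constructions. Below is a self-contained sketch of three of them (tanh, spatial sign, Spearman rank) written directly with NumPy/SciPy rather than the statsmodels internals; as the comments in the diff note, these are correlation-shaped and not consistently scaled as covariances:

```python
import numpy as np
from scipy import stats

rng = np.random.default_rng(12345)
xs = rng.standard_normal((200, 3))           # stand-in for robustly standardized data
xs[:10] += 5                                 # a few outlying rows

# bounded tanh transform, then ordinary correlation
corr_tanh = np.corrcoef(np.tanh(xs), rowvar=False)

# spatial sign covariance: project each observation onto the unit sphere first
x_spatial = xs / np.sqrt((xs ** 2).sum(axis=1))[:, None]
cov_spatial = np.cov(x_spatial, rowvar=False)

# Spearman rank correlation = Pearson correlation of columnwise ranks
ranks = np.apply_along_axis(stats.rankdata, 0, xs)
corr_spearman = np.corrcoef(ranks, rowvar=False)
```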

@@ -1202,7 +1245,7 @@ def untransform_mom(self, m, c):
return mean, cov


-def _orthogonalize_det(z, corr, loc_func, scale_func):
+def _orthogonalize_det(x, corr, loc_func, scale_func):
"""Orthogonalize
This is a simplified version of the OGK method.
@@ -1214,15 +1257,20 @@ def _orthogonalize_det(z, corr, loc_func, scale_func):
e.g. median and Qn in DetMCD
"""
evals, evecs = np.linalg.eigh(corr) # noqa: F841
-z = z.dot(evecs)
+z = x.dot(evecs)
transf0 = evecs

scale_z = scale_func(z) # scale of principal components
cov = (transf0 * scale_z**2).dot(transf0.T)

# extra step in DetMCD, sphering data with new cov to compute center
# I think this is equivalent to scaling z
-loc_z = loc_func(z / scale_z) # center of principal components
-loc = (transf0 * scale_z).dot(loc_z)
+#loc_z = loc_func(z / scale_z) * scale_z # center of principal components
+#loc = (transf0 * scale_z).dot(loc_z)
transf1 = (transf0 * scale_z).dot(transf0.T)

#transf1inv = (transf0 * scale_z**(-1)).dot(transf0.T)

#loc = loc_func(x @ transf1inv) @ transf1
loc = loc_func((z / scale_z).dot(transf0.T)) @ transf1

return loc, cov
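
For readers unfamiliar with the OGK-style step above, here is a compact standalone sketch of the same idea (an assumed simplification, with MAD as the robust scale and the componentwise median mapped back as the location, rather than the sphering variant used in DetMCD):

```python
import numpy as np

def mad(a, axis=0):
    # median absolute deviation, scaled to be consistent at the normal
    med = np.median(a, axis=axis)
    return np.median(np.abs(a - med), axis=axis) / 0.67448975

def orthogonalize_sketch(x, corr):
    evals, evecs = np.linalg.eigh(corr)
    z = x @ evecs                              # rotate to principal components
    scale_z = mad(z, axis=0)                   # robust scale per component
    cov = (evecs * scale_z ** 2) @ evecs.T     # back-transform the scales
    loc = evecs @ np.median(z, axis=0)         # simplified robust center
    return loc, cov

rng = np.random.default_rng(2)
x = rng.standard_normal((300, 3)) @ np.array([[1, 0, 0], [0.8, 1, 0], [0, 0.5, 1.0]])
loc, cov = orthogonalize_sketch(x, np.corrcoef(x, rowvar=False))
```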

@@ -1232,13 +1280,15 @@ def _get_detcov_startidx(z, h, options_start=None, methods_cov="all"):
These are intended as starting sets for DetMCD, DetS and DetMM.
"""

if options_start is None:
options_start = {}

loc_func = options_start.get("loc_func", median)
scale_func = options_start.get("scale_func", mad)
z = (z - loc_func(z)) / scale_func(z)

-cov_all = _cov_starting(z, standardize=True, quantile=0.5)
+cov_all = _cov_starting(z, standardize=False, quantile=0.5)

idx_all = []

for c in cov_all:
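
The rest of this loop is truncated in the diff. Based on the docstring, each candidate covariance is presumably converted into a starting index set of the h most central observations; a hedged sketch of that DetMCD/DetS-style step, independent of the actual statsmodels code:

```python
import numpy as np

def start_idx_from_cov(z, cov, h):
    # h observations with the smallest squared Mahalanobis distance under
    # this candidate covariance form one deterministic starting subset
    d = (z @ np.linalg.inv(cov) * z).sum(axis=1)
    return np.argsort(d)[:h]
```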
42 changes: 28 additions & 14 deletions statsmodels/robust/resistant_linear_model.py
@@ -60,18 +60,21 @@ def _get_start_params(self, h):
]
return start_params_all

-def _fit_once(self, start_params, maxiter=100):
+def _fit_one(self, start_params, maxiter=100):
mod = RLM(self.endog, self.exog, M=self.norm)
res = mod.fit(start_params=start_params,
scale_est=self.mscale,
maxiter=maxiter)
return res

-def fit(self, h, maxiter=100, maxiter_step=5):
+def fit(self, h, maxiter=100, maxiter_step=5, start_params_extra=None):

start_params_all = self._get_start_params(h)

if start_params_extra:
start_params_all.extend(start_params_extra)
res = {}

-for ii, sp in enumerate(self._get_start_params(h)):
-    res_ii = self._fit_once(sp, maxiter=maxiter_step)
+for ii, sp in enumerate(start_params_all):
+    res_ii = self._fit_one(sp, maxiter=maxiter_step)
res[ii] = Holder(
scale=res_ii.scale,
params=res_ii.params,
@@ -83,7 +86,7 @@ def fit(self, h, maxiter=100, maxiter_step=5):
best_idx = scale_sorted[0]

# TODO: iterate until convergence if start fits are not converged
-res_best = self._fit_once(res[best_idx].params, maxiter=maxiter)
+res_best = self._fit_one(res[best_idx].params, maxiter=maxiter)

# TODO: add extra start and convergence info
res_best._results.results_iter = res
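
The strategy above is: run a short fit from every candidate start, keep the fit with the smallest robust scale, then iterate that winner to convergence. A sketch of the same idea using the public RLM API; the data, norm, and candidate start vectors below are illustrative assumptions (in RLMDetS the starts come from the robust covariance candidates), not part of the diff:

```python
import numpy as np
import statsmodels.api as sm

rng = np.random.default_rng(0)
n = 100
exog = sm.add_constant(rng.standard_normal(n))
endog = 1 + 2 * exog[:, 1] + rng.standard_normal(n)
endog[:10] += 10                                  # contamination

norm = sm.robust.norms.TukeyBiweight()
starts = [np.zeros(2), np.array([1.0, 2.0]), np.array([5.0, -5.0])]  # hypothetical starts

# short fits from each start, then refine the one with the smallest robust scale
short_fits = [sm.RLM(endog, exog, M=norm).fit(start_params=sp, maxiter=5)
              for sp in starts]
best = min(short_fits, key=lambda r: r.scale)
res_best = sm.RLM(endog, exog, M=norm).fit(start_params=best.params, maxiter=100)
```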

@@ -97,25 +100,36 @@ class RLMDetSMM(RLMDetS):
"""

-def fit(self, h=None, binding=False):
+def fit(self, h=None, binding=False, start=None):
norm_m = rnorms.TukeyBiweight(c=4.685061)

-res_s = super().fit(h)
-mod_m = RLM(res_s.model.endog, res_s.model.exog, M=norm_m)
+if start is None:
+    res_s = super().fit(h)
+    start_params = np.asarray(res_s.params)
+    start_scale = res_s.scale
+else:
+    start_params, start_scale = start
+    res_s = None

# mod_m = RLM(res_s.model.endog, res_s.model.exog, M=norm_m)
mod_m = RLM(self.endog, self.exog, M=norm_m)
res_mm = mod_m.fit(
-start_params=np.asarray(res_s.params),
-start_scale=res_s.scale,
+start_params=start_params,
+start_scale=start_scale,
update_scale=False
)

if not binding:
# we can compute this first and skip MM if scale decrease
-mod_sm = RLM(res_s.model.endog, res_s.model.exog, M=norm_m)
+mod_sm = RLM(self.endog, self.exog, M=norm_m)
res_sm = mod_sm.fit(
-start_params=res_s.params,
+start_params=start_params,
scale_est=self.mscale
)

if not binding and res_sm.scale < res_mm.scale:
-    return res_sm
+    res = res_sm
else:
-    return res_mm
+    res = res_mm

res._results.results_dets = res_s
return res
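
A sketch of the MM step coded above, mirroring the keyword usage in this diff (start_params, start_scale, update_scale=False). It assumes a statsmodels version whose RLM.fit accepts start_scale, and the S-step results below are stand-in values rather than an actual S-estimate:

```python
import numpy as np
import statsmodels.api as sm

rng = np.random.default_rng(3)
n = 200
exog = sm.add_constant(rng.standard_normal(n))
endog = 0.5 + 1.5 * exog[:, 1] + rng.standard_normal(n)
endog[:20] -= 8                                    # contamination

start_params = np.array([0.5, 1.5])                # stand-in for S-estimate params
start_scale = 1.0                                  # stand-in for the S-estimate scale

norm_m = sm.robust.norms.TukeyBiweight(c=4.685061)  # efficiency tuning used in the diff
res_mm = sm.RLM(endog, exog, M=norm_m).fit(
    start_params=start_params,
    start_scale=start_scale,
    update_scale=False,        # hold the S scale fixed during the M step
)
```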

12 changes: 12 additions & 0 deletions statsmodels/stats/covariance.py
@@ -149,6 +149,18 @@ def transform_corr_normal(corr, method, return_var=False, possdef=True):
return corr_n


def corr_rank(data):
"""Spearman rank correlation
simplified version of scipy.stats.spearmanr
"""
x = np.asarray(data)
axisout = 0
ar = np.apply_along_axis(stats.rankdata, axisout, x)
corr = np.corrcoef(ar, rowvar=False)
return corr
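
A quick check of corr_rank against scipy.stats.spearmanr, plus a sketch of the Gaussian-rank idea behind corr_normal_scores (the normal-scores formula below is an assumption for illustration, not necessarily the statsmodels implementation):

```python
import numpy as np
from scipy import stats

rng = np.random.default_rng(42)
x = rng.multivariate_normal([0, 0], [[1, 0.6], [0.6, 1]], size=500)

# Spearman: Pearson correlation of columnwise ranks, as in corr_rank
ranks = np.apply_along_axis(stats.rankdata, 0, x)
corr_spearman = np.corrcoef(ranks, rowvar=False)
assert np.allclose(corr_spearman[0, 1], stats.spearmanr(x)[0])

# Gaussian rank (normal scores): map ranks to normal quantiles first
n = x.shape[0]
corr_gauss_rank = np.corrcoef(stats.norm.ppf(ranks / (n + 1)), rowvar=False)
```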


def corr_normal_scores(data):
"""Gaussian rank (normal scores) correlation
