ENH/REF: add other cov to deterministic start, some refactoring and bug fixes
josef-pkt committed Apr 25, 2024
1 parent 68a79f9 commit c79c89d
Showing 3 changed files with 99 additions and 23 deletions.
68 changes: 59 additions & 9 deletions statsmodels/robust/covariance.py
@@ -28,9 +28,11 @@
import numpy as np
from scipy import stats, linalg
from scipy.linalg.lapack import dtrtri
-from .scale import mad
+from .scale import mad, qn_scale

from statsmodels.tools.testing import Holder

from statsmodels.stats.covariance import corr_rank, corr_normal_scores

mad0 = lambda x: mad(x, center=0) # noqa: E731
median = lambda x: np.median(x, axis=0) # noqa: E731

@@ -313,6 +315,7 @@ def mahalanobis(data, cov=None, cov_inv=None, sqrt=False):
"""Mahalanobis distance squared
Note: this is without taking the square root.
assumes data is already centered.
Parameters
----------
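
The docstring hunk is truncated here. As a point of reference, a minimal NumPy sketch of the squared Mahalanobis distance the docstring describes (an illustration, not the statsmodels implementation; it assumes the rows of x are already centered):

```python
import numpy as np

def mahalanobis_sq(x, cov):
    """Squared Mahalanobis distance of each (already centered) row of x."""
    cov_inv = np.linalg.inv(cov)
    # d_i = x_i' cov^{-1} x_i for every observation
    return np.einsum("ij,jk,ik->i", x, cov_inv, x)

x = np.random.default_rng(0).standard_normal((5, 3))
d2 = mahalanobis_sq(x, np.cov(x, rowvar=False))
```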
@@ -1085,7 +1088,7 @@ def _cov_iter(data, weights_func, weights_args=None, cov_init=None,
return res


-def _cov_starting(data, standardize=False, quantile=0.5):
+def _cov_starting(data, standardize=False, quantile=0.5, retransform=False):
"""compute some robust starting covariances
The returned covariance matrices are intended as starting values
@@ -1129,7 +1132,7 @@ def _cov_starting(data, standardize=False, quantile=0.5):

cov_all = []
d = mahalanobis(xs, cov=None, cov_inv=np.eye(k_vars))
-percentiles = [(k_vars+2) / nobs * 100 * 2, 25, 50]
+percentiles = [(k_vars+2) / nobs * 100 * 2, 25, 50, 85]
cutoffs = np.percentile(d, percentiles)
for p, cutoff in zip(percentiles, cutoffs):
xsp = xs[d < cutoff]
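
The loop body is cut off in this hunk. Assuming the straightforward reading of the lines above, each percentile cutoff keeps the most central observations and their plain sample covariance becomes one candidate start; a small self-contained sketch:

```python
import numpy as np

rng = np.random.default_rng(1)
xs = rng.standard_normal((200, 4))          # stand-in for standardized data
nobs, k_vars = xs.shape

d = (xs ** 2).sum(axis=1)                   # Mahalanobis distance w.r.t. identity cov
percentiles = [(k_vars + 2) / nobs * 100 * 2, 25, 50, 85]
cov_trimmed = []
for cutoff in np.percentile(d, percentiles):
    xsp = xs[d < cutoff]                    # keep the most central observations
    cov_trimmed.append(np.cov(xsp, rowvar=False))
```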
@@ -1151,16 +1154,56 @@
c03 = _cov_iter(xs, weights_quantile, weights_args=(quantile,),
rescale="med", cov_init=c02.cov, maxiter=100)

-if standardize:
+if not standardize or not retransform:
cov_all.extend([c0, c01, c02, c03])
else:
# compensate for initial rescaling
# TODO: this does not return list of Holder anymore
s = np.outer(std, std)
cov_all.extend([r.cov * s for r in [c0, c01, c02, c03]])

c2 = cov_ogk(xs)
cov_all.append(c2)

c2raw = Holder(
cov=c2.cov_raw,
mean=c2.loc_raw * std + center,
method="ogk_raw",
)
cov_all.append(c2raw)

z_tanh = np.tanh(xs)
c_th = Holder(
cov=np.corrcoef(z_tanh.T), # not consistently scaled for cov
mean=center, # TODO: do we add inverted mean z_tanh ?
method="tanh",
)
cov_all.append(c_th)

x_spatial = xs / np.sqrt(np.sum(xs**2, axis=1))[:, None]
c_th = Holder(
cov=np.cov(x_spatial.T),
mean=center,
method="spatial",
)
cov_all.append(c_th)

c_th = Holder(
# not consistently scaled for cov
# cov=stats.spearmanr(xs)[0], # not correct shape if k=1 or 2
cov=corr_rank(xs), # always returns matrix, np.corrcoef result
mean=center,
method="spearman",
)
cov_all.append(c_th)

c_ns = Holder(
cov=corr_normal_scores(xs), # not consistently scaled for cov
mean=center, # TODO: do we add inverted mean z_tanh ?
method="normal-scores",
)
cov_all.append(c_ns)

# TODO: rescale back to original space using center and std
return cov_all
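
The new starting estimates appended above are all cheap pairwise constructions. Below is a self-contained sketch of three of them (tanh, spatial sign, Spearman rank) written directly with NumPy/SciPy rather than the statsmodels internals; as the comments in the diff note, these are correlation-shaped and not consistently scaled as covariances:

```python
import numpy as np
from scipy import stats

rng = np.random.default_rng(12345)
xs = rng.standard_normal((200, 3))           # stand-in for robustly standardized data
xs[:10] += 5                                 # a few outlying rows

# bounded tanh transform, then ordinary correlation
corr_tanh = np.corrcoef(np.tanh(xs), rowvar=False)

# spatial sign covariance: project each observation onto the unit sphere first
x_spatial = xs / np.sqrt((xs ** 2).sum(axis=1))[:, None]
cov_spatial = np.cov(x_spatial, rowvar=False)

# Spearman rank correlation = Pearson correlation of columnwise ranks
ranks = np.apply_along_axis(stats.rankdata, 0, xs)
corr_spearman = np.corrcoef(ranks, rowvar=False)
```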

@@ -1202,7 +1245,7 @@ def untransform_mom(self, m, c):
return mean, cov


-def _orthogonalize_det(z, corr, loc_func, scale_func):
+def _orthogonalize_det(x, corr, loc_func, scale_func):
"""Orthogonalize
This is a simplified version of the OGK method.
@@ -1214,15 +1257,20 @@ def _orthogonalize_det(z, corr, loc_func, scale_func):
e.g. median and Qn in DetMCD
"""
evals, evecs = np.linalg.eigh(corr) # noqa: F841
-z = z.dot(evecs)
+z = x.dot(evecs)
transf0 = evecs

scale_z = scale_func(z) # scale of principal components
cov = (transf0 * scale_z**2).dot(transf0.T)

# extra step in DetMCD, sphering data with new cov to compute center
# I think this is equivalent to scaling z
-loc_z = loc_func(z / scale_z) # center of principal components
-loc = (transf0 * scale_z).dot(loc_z)
+#loc_z = loc_func(z / scale_z) * scale_z # center of principal components
+#loc = (transf0 * scale_z).dot(loc_z)
transf1 = (transf0 * scale_z).dot(transf0.T)

#transf1inv = (transf0 * scale_z**(-1)).dot(transf0.T)

#loc = loc_func(x @ transf1inv) @ transf1
loc = loc_func((z / scale_z).dot(transf0.T)) @ transf1

return loc, cov
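
For readers unfamiliar with the OGK-style step above, here is a compact standalone sketch of the same idea (an assumed simplification, with MAD as the robust scale and the componentwise median mapped back as the location, rather than the sphering variant used in DetMCD):

```python
import numpy as np

def mad(a, axis=0):
    # median absolute deviation, scaled to be consistent at the normal
    med = np.median(a, axis=axis)
    return np.median(np.abs(a - med), axis=axis) / 0.67448975

def orthogonalize_sketch(x, corr):
    evals, evecs = np.linalg.eigh(corr)
    z = x @ evecs                              # rotate to principal components
    scale_z = mad(z, axis=0)                   # robust scale per component
    cov = (evecs * scale_z ** 2) @ evecs.T     # back-transform the scales
    loc = evecs @ np.median(z, axis=0)         # simplified robust center
    return loc, cov

rng = np.random.default_rng(2)
x = rng.standard_normal((300, 3)) @ np.array([[1, 0, 0], [0.8, 1, 0], [0, 0.5, 1.0]])
loc, cov = orthogonalize_sketch(x, np.corrcoef(x, rowvar=False))
```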

@@ -1232,13 +1280,15 @@ def _get_detcov_startidx(z, h, options_start=None, methods_cov="all"):
These are intended as starting sets for DetMCD, DetS and DetMM.
"""

if options_start is None:
options_start = {}

loc_func = options_start.get("loc_func", median)
scale_func = options_start.get("scale_func", mad)
z = (z - loc_func(z)) / scale_func(z)

-cov_all = _cov_starting(z, standardize=True, quantile=0.5)
+cov_all = _cov_starting(z, standardize=False, quantile=0.5)

idx_all = []

for c in cov_all:
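
The rest of this loop is truncated in the diff. Based on the docstring, each candidate covariance is presumably converted into a starting index set of the h most central observations; a hedged sketch of that DetMCD/DetS-style step, independent of the actual statsmodels code:

```python
import numpy as np

def start_idx_from_cov(z, cov, h):
    # h observations with the smallest squared Mahalanobis distance under
    # this candidate covariance form one deterministic starting subset
    d = (z @ np.linalg.inv(cov) * z).sum(axis=1)
    return np.argsort(d)[:h]
```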
42 changes: 28 additions & 14 deletions statsmodels/robust/resistant_linear_model.py
@@ -60,18 +60,21 @@ def _get_start_params(self, h):
]
return start_params_all

-def _fit_once(self, start_params, maxiter=100):
+def _fit_one(self, start_params, maxiter=100):
mod = RLM(self.endog, self.exog, M=self.norm)
res = mod.fit(start_params=start_params,
scale_est=self.mscale,
maxiter=maxiter)
return res

-def fit(self, h, maxiter=100, maxiter_step=5):
+def fit(self, h, maxiter=100, maxiter_step=5, start_params_extra=None):

start_params_all = self._get_start_params(h)

if start_params_extra:
start_params_all.extend(start_params_extra)
res = {}

-for ii, sp in enumerate(self._get_start_params(h)):
-    res_ii = self._fit_once(sp, maxiter=maxiter_step)
+for ii, sp in enumerate(start_params_all):
+    res_ii = self._fit_one(sp, maxiter=maxiter_step)
res[ii] = Holder(
scale=res_ii.scale,
params=res_ii.params,
@@ -83,7 +86,7 @@ def fit(self, h, maxiter=100, maxiter_step=5):
best_idx = scale_sorted[0]

# TODO: iterate until convergence if start fits are not converged
-res_best = self._fit_once(res[best_idx].params, maxiter=maxiter)
+res_best = self._fit_one(res[best_idx].params, maxiter=maxiter)

# TODO: add extra start and convergence info
res_best._results.results_iter = res
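
The strategy above is: run a short fit from every candidate start, keep the fit with the smallest robust scale, then iterate that winner to convergence. A sketch of the same idea using the public RLM API; the data, norm, and candidate start vectors below are illustrative assumptions (in RLMDetS the starts come from the robust covariance candidates), not part of the diff:

```python
import numpy as np
import statsmodels.api as sm

rng = np.random.default_rng(0)
n = 100
exog = sm.add_constant(rng.standard_normal(n))
endog = 1 + 2 * exog[:, 1] + rng.standard_normal(n)
endog[:10] += 10                                  # contamination

norm = sm.robust.norms.TukeyBiweight()
starts = [np.zeros(2), np.array([1.0, 2.0]), np.array([5.0, -5.0])]  # hypothetical starts

# short fits from each start, then refine the one with the smallest robust scale
short_fits = [sm.RLM(endog, exog, M=norm).fit(start_params=sp, maxiter=5)
              for sp in starts]
best = min(short_fits, key=lambda r: r.scale)
res_best = sm.RLM(endog, exog, M=norm).fit(start_params=best.params, maxiter=100)
```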

@@ -97,25 +100,36 @@ class RLMDetSMM(RLMDetS):
"""

-def fit(self, h=None, binding=False):
+def fit(self, h=None, binding=False, start=None):
norm_m = rnorms.TukeyBiweight(c=4.685061)

-res_s = super().fit(h)
-mod_m = RLM(res_s.model.endog, res_s.model.exog, M=norm_m)
+if start is None:
+    res_s = super().fit(h)
+    start_params = np.asarray(res_s.params)
+    start_scale = res_s.scale
+else:
+    start_params, start_scale = start
+    res_s = None

# mod_m = RLM(res_s.model.endog, res_s.model.exog, M=norm_m)
mod_m = RLM(self.endog, self.exog, M=norm_m)
res_mm = mod_m.fit(
-start_params=np.asarray(res_s.params),
-start_scale=res_s.scale,
+start_params=start_params,
+start_scale=start_scale,
update_scale=False
)

if not binding:
# we can compute this first and skip MM if scale decrease
-mod_sm = RLM(res_s.model.endog, res_s.model.exog, M=norm_m)
+mod_sm = RLM(self.endog, self.exog, M=norm_m)
res_sm = mod_sm.fit(
-start_params=res_s.params,
+start_params=start_params,
scale_est=self.mscale
)

if not binding and res_sm.scale < res_mm.scale:
-    return res_sm
+    res = res_sm
else:
-    return res_mm
+    res = res_mm

res._results.results_dets = res_s
return res
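
A sketch of the MM step coded above, mirroring the keyword usage in this diff (start_params, start_scale, update_scale=False). It assumes a statsmodels version whose RLM.fit accepts start_scale, and the S-step results below are stand-in values rather than an actual S-estimate:

```python
import numpy as np
import statsmodels.api as sm

rng = np.random.default_rng(3)
n = 200
exog = sm.add_constant(rng.standard_normal(n))
endog = 0.5 + 1.5 * exog[:, 1] + rng.standard_normal(n)
endog[:20] -= 8                                    # contamination

start_params = np.array([0.5, 1.5])                # stand-in for S-estimate params
start_scale = 1.0                                  # stand-in for the S-estimate scale

norm_m = sm.robust.norms.TukeyBiweight(c=4.685061)  # efficiency tuning used in the diff
res_mm = sm.RLM(endog, exog, M=norm_m).fit(
    start_params=start_params,
    start_scale=start_scale,
    update_scale=False,        # hold the S scale fixed during the M step
)
```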

12 changes: 12 additions & 0 deletions statsmodels/stats/covariance.py
@@ -149,6 +149,18 @@ def transform_corr_normal(corr, method, return_var=False, possdef=True):
return corr_n


def corr_rank(data):
"""Spearman rank correlation
simplified version of scipy.stats.spearmanr
"""
x = np.asarray(data)
axisout = 0
ar = np.apply_along_axis(stats.rankdata, axisout, x)
corr = np.corrcoef(ar, rowvar=False)
return corr
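
A quick check of corr_rank against scipy.stats.spearmanr, plus a sketch of the Gaussian-rank idea behind corr_normal_scores (the normal-scores formula below is an assumption for illustration, not necessarily the statsmodels implementation):

```python
import numpy as np
from scipy import stats

rng = np.random.default_rng(42)
x = rng.multivariate_normal([0, 0], [[1, 0.6], [0.6, 1]], size=500)

# Spearman: Pearson correlation of columnwise ranks, as in corr_rank
ranks = np.apply_along_axis(stats.rankdata, 0, x)
corr_spearman = np.corrcoef(ranks, rowvar=False)
assert np.allclose(corr_spearman[0, 1], stats.spearmanr(x)[0])

# Gaussian rank (normal scores): map ranks to normal quantiles first
n = x.shape[0]
corr_gauss_rank = np.corrcoef(stats.norm.ppf(ranks / (n + 1)), rowvar=False)
```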


def corr_normal_scores(data):
"""Gaussian rank (normal scores) correlation
