Skip to content

Commit

Permalink
ENH/REF: norms in resistant models, smaller ENH in older code
Browse files Browse the repository at this point in the history
  • Loading branch information
josef-pkt committed May 2, 2024
1 parent a3e739a commit 41b4bb2
Show file tree
Hide file tree
Showing 5 changed files with 199 additions and 19 deletions.
45 changes: 42 additions & 3 deletions statsmodels/robust/norms.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import numpy as np

# TODO: add plots to weighting functions for online docs.
from . import tools as rtools


def _cabs(x):
Expand Down Expand Up @@ -41,6 +41,9 @@ class RobustNorm:

continuous = 1

def __repr__(self):
return self.__class__.__name__

def rho(self, z):
"""
The robust criterion estimator function.
Expand Down Expand Up @@ -862,12 +865,48 @@ class TukeyBiweight(RobustNorm):
def __init__(self, c=4.685):
    # Tuning constant of the biweight rho function. The default 4.685
    # corresponds to roughly 95% asymptotic efficiency (compare the
    # 0.95 entry of tools.tukeybiweight_eff, c=4.685065).
    self.c = c

def _set_tuning_param(self, c):
def __repr__(self):
return f"{self.__class__.__name__}(c={self.c})"

@classmethod
def get_tuning(cls, bp=None, eff=None):
    """Tuning parameter for given breakdown point or efficiency.

    This currently only returns values from a precomputed table
    (``tools.tukeybiweight_bp`` and ``tools.tukeybiweight_eff``).

    Parameters
    ----------
    bp : float in [0.05, 0.5] or None
        Required breakdown point.
        Either bp or eff has to be specified, but not both.
    eff : float or None
        Required asymptotic efficiency.
        Either bp or eff has to be specified, but not both.

    Returns
    -------
    tuple
        Table row for the requested value: the tuning parameter followed
        by related quantities (efficiency and scale bias for ``bp``,
        breakdown point for ``eff``).

    Raises
    ------
    ValueError
        If both or neither of bp and eff are specified, or if the
        requested value is not one of the tabulated keys.
    """
    if ((bp is None and eff is None) or
            (bp is not None and eff is not None)):
        raise ValueError("exactly one of bp and eff needs to be provided")

    if bp is not None:
        table, key, kind = rtools.tukeybiweight_bp, bp, "bp"
    else:
        table, key, kind = rtools.tukeybiweight_eff, eff, "eff"
    try:
        return table[key]
    except KeyError:
        # Only tabulated keys are supported; raise an informative error
        # instead of surfacing a bare KeyError to the caller.
        raise ValueError(
            f"{kind}={key} is not available, choose one of {sorted(table)}"
        ) from None

def _set_tuning_param(self, c, inplace=False):
"""Set and change the tuning parameter of the Norm.
Warning: this needs to wipe cached attributes that depend on the param.
"""
self.c = c
if inplace:
self.c = c
return self

Check warning on line 907 in statsmodels/robust/norms.py

View check run for this annotation

Codecov / codecov/patch

statsmodels/robust/norms.py#L906-L907

Added lines #L906 - L907 were not covered by tests
else:
return self.__class__(c=c)

def max_rho(self):
    """Return the largest value of rho, evaluated at the tuning constant c."""
    cutoff = self.c
    return self.rho(cutoff)
Expand Down
138 changes: 122 additions & 16 deletions statsmodels/robust/resistant_linear_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,25 +18,53 @@

class RLMDetS(Model):

Check warning on line 19 in statsmodels/robust/resistant_linear_model.py

View check run for this annotation

Codecov / codecov/patch

statsmodels/robust/resistant_linear_model.py#L19

Added line #L19 was not covered by tests
"""S-estimator for linear model with deterministic starts.
Notes
-----
This estimator combines the method of Fast-S regression (Salibian-Barrera
et al 2006) using starting sets similar to the deterministic estimation of
multivariate location and scatter DetS and DetMM of Hubert et al (2012).
References
----------
.. [1] Hubert, Mia, Peter J. Rousseeuw, and Tim Verdonck. 2012. “A
Deterministic Algorithm for Robust Location and Scatter.” Journal of
Computational and Graphical Statistics 21 (3): 618–37.
https://doi.org/10.1080/10618600.2012.672100.
.. [2] Hubert, Mia, Peter Rousseeuw, Dina Vanpaemel, and Tim Verdonck.
2015. “The DetS and DetMM Estimators for Multivariate Location and
Scatter.” Computational Statistics & Data Analysis 81 (January): 64–75.
https://doi.org/10.1016/j.csda.2014.07.013.
.. [3] Rousseeuw, Peter J., Stefan Van Aelst, Katrien Van Driessen, and
Jose Agulló. 2004. “Robust Multivariate Regression.”
Technometrics 46 (3): 293–305.
.. [4] Salibian-Barrera, Matías, and Víctor J. Yohai. 2006. “A Fast
Algorithm for S-Regression Estimates.” Journal of Computational and
Graphical Statistics 15 (2): 414–27.
"""

def __init__(self, endog, exog, norm=None, breakdown_point=0.5,

Check warning on line 54 in statsmodels/robust/resistant_linear_model.py

View check run for this annotation

Codecov / codecov/patch

statsmodels/robust/resistant_linear_model.py#L54

Added line #L54 was not covered by tests
col_indices=None, include_endog=False):
super().__init__(endog, exog)

Check warning on line 56 in statsmodels/robust/resistant_linear_model.py

View check run for this annotation

Codecov / codecov/patch

statsmodels/robust/resistant_linear_model.py#L56

Added line #L56 was not covered by tests

if norm is None:
norm = rnorms.TukeyBiweight(c=1.547)
scale_bias = 0.1995
self.mscale = rscale.MScale(norm, scale_bias)
else:
raise ValueError()
norm = rnorms.TukeyBiweight()

Check warning on line 59 in statsmodels/robust/resistant_linear_model.py

View check run for this annotation

Codecov / codecov/patch

statsmodels/robust/resistant_linear_model.py#L59

Added line #L59 was not covered by tests

tune = norm.get_tuning(bp=breakdown_point)
c = tune[0]
scale_bias = tune[2]
norm = norm._set_tuning_param(c, inplace=False)
self.mscale = rscale.MScale(norm, scale_bias)

Check warning on line 65 in statsmodels/robust/resistant_linear_model.py

View check run for this annotation

Codecov / codecov/patch

statsmodels/robust/resistant_linear_model.py#L61-L65

Added lines #L61 - L65 were not covered by tests

self.norm = norm
# need tuning params for given breakdown point
# if type(norm) is a class then create an instance
# if norm is an instance, then just use it
# self.norm._set_tuning_param(???)
# data for robust mahalanobis distance of starting sets
self.breakdown_point = breakdown_point

Check warning on line 68 in statsmodels/robust/resistant_linear_model.py

View check run for this annotation

Codecov / codecov/patch

statsmodels/robust/resistant_linear_model.py#L67-L68

Added lines #L67 - L68 were not covered by tests

# TODO: detect constant
Expand All @@ -45,6 +73,7 @@ def __init__(self, endog, exog, norm=None, breakdown_point=0.5,
else:
exog_start = self.exog[:, col_indices]

Check warning on line 74 in statsmodels/robust/resistant_linear_model.py

View check run for this annotation

Codecov / codecov/patch

statsmodels/robust/resistant_linear_model.py#L74

Added line #L74 was not covered by tests

# data for robust mahalanobis distance of starting sets
if include_endog:
self.data_start = np.column_stack((endog, exog_start))

Check warning on line 78 in statsmodels/robust/resistant_linear_model.py

View check run for this annotation

Codecov / codecov/patch

statsmodels/robust/resistant_linear_model.py#L78

Added line #L78 was not covered by tests
else:
Expand Down Expand Up @@ -104,12 +133,90 @@ def fit(self, h, maxiter=100, maxiter_step=5, start_params_extra=None):


class RLMDetSMM(RLMDetS):

Check warning on line 135 in statsmodels/robust/resistant_linear_model.py

View check run for this annotation

Codecov / codecov/patch

statsmodels/robust/resistant_linear_model.py#L135

Added line #L135 was not covered by tests
"""MM-estimator with S-estimator starting values
"""MM-estimator with S-estimator starting values.
Parameters
----------
endog : array-like, 1-dim
Dependent, endogenous variable.
exog : array-like, 2-dim
    Independent, exogenous regressor variables.
norm : robust norm
Redescending robust norm used for S- and MM-estimation.
Default is TukeyBiweight.
efficiency : float in (0, 1)
Asymptotic efficiency of the MM-estimator (used in second stage).
breakdown_point : float in (0, 0.5)
Breakdown point of the preliminary S-estimator.
col_indices : None or array-like of ints
Index of columns of exog to use in the mahalanobis distance computation
for the starting sets of the S-estimator.
Default is all exog except first column (constant). Todo: will change
when we autodetect the constant column
include_endog : bool
If true, then the endog variable is combined with the exog variables
to compute the mahalanobis distances for the starting sets of the
S-estimator.
"""
def __init__(self, endog, exog, norm=None, efficiency=0.95,
             breakdown_point=0.5, col_indices=None, include_endog=False):
    # First stage: the parent sets up the S-estimator, tuning (a copy of)
    # the norm for the requested breakdown point.
    super().__init__(
        endog,
        exog,
        norm=norm,
        breakdown_point=breakdown_point,
        col_indices=col_indices,
        include_endog=include_endog
        )

    self.efficiency = efficiency
    if norm is None:
        norm = rnorms.TukeyBiweight()

    # Second stage: retune a copy of the norm for the requested asymptotic
    # efficiency; this norm is used in the M-estimation of the mean
    # parameters (see ``fit``). ``inplace=False`` leaves the caller's norm
    # instance unchanged.
    c = norm.get_tuning(eff=efficiency)[0]
    norm = norm._set_tuning_param(c, inplace=False)
    self.norm_mean = norm

def fit(self, h=None, scale_binding=False, start=None):

Check warning on line 181 in statsmodels/robust/resistant_linear_model.py

View check run for this annotation

Codecov / codecov/patch

statsmodels/robust/resistant_linear_model.py#L181

Added line #L181 was not covered by tests
"""Estimate the model
Parameters
----------
h : int
The size of the initial sets for the S-estimator.
Default is .... (todo)
scale_binding : bool
If true, then the scale is fixed in the second stage M-estimation,
i.e. this is the MM-estimator.
If false, then the high breakdown point M-scale is used also in the
second stage M-estimation if that estimated scale is smaller than
the scale of the preliminary, first stage S-estimator.
start : tuple or None
If None, then the starting parameters and scale for the second
stage M-estimation are taken from the first stage S-estimator.
Alternatively, the starting parameters and starting scale can be
provided by the user as tuple (start_params, start_scale). In this
case the first stage S-estimation is skipped.
maxiter, other optimization parameters are still missing (todo)
Returns
-------
results instance
Notes
-----
If scale_binding is true, then the estimator is a standard
MM-estimator with fixed scale in the second stage M-estimation.
If scale_binding is false, then the estimator will try to find an
estimate with lower M-scale using the same scale-norm rho as in the
first stage S-estimator. If the estimated scale is not smaller than
the scale estimated in the first stage S-estimator, then the
fixed scale MM-estimator is returned.
"""
norm_m = self.norm_mean

Check warning on line 219 in statsmodels/robust/resistant_linear_model.py

View check run for this annotation

Codecov / codecov/patch

statsmodels/robust/resistant_linear_model.py#L219

Added line #L219 was not covered by tests
if start is None:
res_s = super().fit(h)
start_params = np.asarray(res_s.params)
Expand All @@ -118,23 +225,22 @@ def fit(self, h=None, binding=False, start=None):
start_params, start_scale = start
res_s = None

Check warning on line 226 in statsmodels/robust/resistant_linear_model.py

View check run for this annotation

Codecov / codecov/patch

statsmodels/robust/resistant_linear_model.py#L225-L226

Added lines #L225 - L226 were not covered by tests

# mod_m = RLM(res_s.model.endog, res_s.model.exog, M=norm_m)
mod_m = RLM(self.endog, self.exog, M=norm_m)
res_mm = mod_m.fit(

Check warning on line 229 in statsmodels/robust/resistant_linear_model.py

View check run for this annotation

Codecov / codecov/patch

statsmodels/robust/resistant_linear_model.py#L228-L229

Added lines #L228 - L229 were not covered by tests
start_params=start_params,
start_scale=start_scale,
update_scale=False
)

if not binding:
if not scale_binding:
# we can compute this first and skip MM if scale decrease
mod_sm = RLM(self.endog, self.exog, M=norm_m)
res_sm = mod_sm.fit(

Check warning on line 238 in statsmodels/robust/resistant_linear_model.py

View check run for this annotation

Codecov / codecov/patch

statsmodels/robust/resistant_linear_model.py#L237-L238

Added lines #L237 - L238 were not covered by tests
start_params=start_params,
scale_est=self.mscale
)

if not binding and res_sm.scale < res_mm.scale:
if not scale_binding and res_sm.scale < res_mm.scale:
res = res_sm

Check warning on line 244 in statsmodels/robust/resistant_linear_model.py

View check run for this annotation

Codecov / codecov/patch

statsmodels/robust/resistant_linear_model.py#L244

Added line #L244 was not covered by tests
else:
res = res_mm

Check warning on line 246 in statsmodels/robust/resistant_linear_model.py

View check run for this annotation

Codecov / codecov/patch

statsmodels/robust/resistant_linear_model.py#L246

Added line #L246 was not covered by tests
Expand Down
2 changes: 2 additions & 0 deletions statsmodels/robust/robust_linear_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -277,6 +277,8 @@ def fit(self, maxiter=50, tol=1e-8, scale_est='mad', init=None, cov='H1',
self.scale = self._estimate_scale(wls_results.resid)
elif start_scale:
self.scale = start_scale
if not update_scale:
self.scale_est = scale_est = "fixed"

Check warning on line 281 in statsmodels/robust/robust_linear_model.py

View check run for this annotation

Codecov / codecov/patch

statsmodels/robust/robust_linear_model.py#L281

Added line #L281 was not covered by tests

history = dict(params=[np.inf], scale=[])
if conv == 'coefs':
Expand Down
3 changes: 3 additions & 0 deletions statsmodels/robust/scale.py
Original file line number Diff line number Diff line change
Expand Up @@ -426,6 +426,9 @@ def __init__(self, chi_func, scale_bias):
self.chi_func = chi_func
self.scale_bias = scale_bias

def __repr__(self):
return repr(self.chi_func)

def __call__(self, data, **kwds):
    """Make the instance callable as a shorthand for ``fit``."""
    fit_kwds = kwds
    return self.fit(data, **fit_kwds)

Expand Down
30 changes: 30 additions & 0 deletions statsmodels/robust/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -242,3 +242,33 @@ def func(c):
)

return res2

# ##### tables

# Precomputed tuning-parameter tables for the TukeyBiweight norm,
# consumed by ``TukeyBiweight.get_tuning`` in ``robust.norms``.
# Only these tabulated keys are supported.

tukeybiweight_bp = {
    # breakdown point : (tuning parameter, efficiency, scale bias)
    0.50: (1.547645, 0.286826, 0.199600),
    0.45: (1.756059, 0.369761, 0.231281),
    0.40: (1.987965, 0.461886, 0.263467),
    0.35: (2.251831, 0.560447, 0.295793),
    0.30: (2.560843, 0.661350, 0.327896),
    0.25: (2.937015, 0.759040, 0.359419),
    0.20: (3.420681, 0.846734, 0.390035),
    0.15: (4.096255, 0.917435, 0.419483),
    0.10: (5.182361, 0.966162, 0.447614),
    0.05: (7.545252, 0.992424, 0.474424),
}

tukeybiweight_eff = {
    # efficiency : (tuning parameter, breakdown point)
    0.65: (2.523102, 0.305646),
    0.70: (2.697221, 0.280593),
    0.75: (2.897166, 0.254790),
    0.80: (3.136909, 0.227597),
    0.85: (3.443690, 0.197957),
    0.90: (3.882662, 0.163779),
    0.95: (4.685065, 0.119414),
    0.97: (5.596823, 0.087088),
    0.98: (5.920719, 0.078604),
    0.99: (7.041392, 0.056969),
}

0 comments on commit 41b4bb2

Please sign in to comment.