Commit
Merge pull request #9210 from josef-pkt/robust_mscale
BUG/ENH: fix scale.Huber and add robust MScale
josef-pkt committed Apr 19, 2024
2 parents d6466f7 + fbaa0d5 commit c22837f
Showing 6 changed files with 248 additions and 130 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -70,6 +70,7 @@ coverage_html_report/

 # VS Code
 .vscode/
+.vs/

 # Project specific
 statsmodels/version.py
101 changes: 0 additions & 101 deletions statsmodels/robust/covariance.py
@@ -308,107 +308,6 @@ def _outlier_gy(d, distr=None, k_endog=1, trim_prob=0.975):

 # ## GK and OGK ###

-def _weight_mean(x, c):
-    """Tukey-biweight, bisquare weights used in tau scale.
-
-    Parameters
-    ----------
-    x : ndarray
-        Data
-    c : float
-        Parameter for bisquare weights
-
-    Returns
-    -------
-    ndarray : weights
-    """
-    x = np.asarray(x)
-    w = (1 - (x / c)**2)**2 * (np.abs(x) <= c)
-    return w
-
-
-def _winsor(x, c):
-    """Winsorized squared data used in tau scale.
-
-    Parameters
-    ----------
-    x : ndarray
-        Data
-    c : float
-        threshold
-
-    Returns
-    -------
-    winsorized squared data, ``np.minimum(x**2, c**2)``
-    """
-    return np.minimum(x**2, c**2)
-
-
-def scale_tau(data, cm=4.5, cs=3, weight_mean=_weight_mean,
-              weight_scale=_winsor, normalize=True, ddof=0):
-    """Tau estimator of univariate scale.
-
-    Experimental, API will change.
-
-    Parameters
-    ----------
-    data : array_like, 1-D or 2-D
-        If data is 2d, then the location and scale estimates
-        are calculated for each column
-    cm : float
-        constant used in call to weight_mean
-    cs : float
-        constant used in call to weight_scale
-    weight_mean : callable
-        function to calculate weights for weighted mean
-    weight_scale : callable
-        function to calculate scale, "rho" function
-    normalize : bool
-        rescale the scale estimate so it is consistent when the data is
-        normally distributed. The computation assumes winsorized (truncated)
-        variance.
-
-    Returns
-    -------
-    mean : ndarray
-        robust mean
-    std : ndarray
-        robust estimate of scale (standard deviation)
-
-    Notes
-    -----
-    Uses the definition of Maronna and Zamar 2002, with weighted mean and
-    trimmed variance.
-    The normalization has been added to match R robustbase.
-    R robustbase uses ddof=0 by default, with an option to set it to 2.
-
-    References
-    ----------
-    .. [1] Maronna, Ricardo A, and Ruben H Zamar. “Robust Estimates of
-       Location and Dispersion for High-Dimensional Datasets.” Technometrics
-       44, no. 4 (November 1, 2002): 307–17.
-       https://doi.org/10.1198/004017002188618509.
-    """
-    x = np.asarray(data)
-    nobs = x.shape[0]
-
-    med_x = np.median(x, 0)
-    xdm = x - med_x
-    mad_x = np.median(np.abs(xdm), 0)
-    wm = weight_mean(xdm / mad_x, cm)
-    mean = (wm * x).sum(0) / wm.sum(0)
-    var = (mad_x**2 * weight_scale((x - mean) / mad_x, cs).sum(0) /
-           (nobs - ddof))
-
-    cf = 1
-    if normalize:
-        c = cs * stats.norm.ppf(0.75)
-        cf = 2 * ((1 - c**2) * stats.norm.cdf(c) - c * stats.norm.pdf(c)
-                  + c**2) - 1
-    # return Holder(loc=mean, scale=np.sqrt(var / cf))
-    return mean, np.sqrt(var / cf)


 def mahalanobis(data, cov=None, cov_inv=None, sqrt=False):
     """Mahalanobis distance squared
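For context, the tau-scale helpers removed above combine a bisquare-weighted mean with a winsorized variance. A minimal usage sketch, assuming the estimator remains importable as statsmodels.robust.scale.scale_tau after this reorganization (the data below is made up):

import numpy as np
from statsmodels.robust import scale

# Made-up contaminated sample: mostly N(0, 1) plus a few gross outliers.
rng = np.random.default_rng(0)
x = rng.standard_normal(200)
x[:10] += 8

# Tau estimates of location and scale stay close to 0 and 1 ...
mean, std = scale.scale_tau(x)
print(mean, std)

# ... while the classical estimates are pulled up by the outliers.
print(x.mean(), x.std())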
16 changes: 13 additions & 3 deletions statsmodels/robust/robust_linear_model.py
@@ -191,10 +191,12 @@ def _estimate_scale(self, resid):
         elif isinstance(self.scale_est, scale.HuberScale):
             return self.scale_est(self.df_resid, self.nobs, resid)
         else:
-            return scale.scale_est(self, resid) ** 2
+            # use df correction to match HuberScale
+            return self.scale_est(resid) * np.sqrt(self.nobs / self.df_resid)

     def fit(self, maxiter=50, tol=1e-8, scale_est='mad', init=None, cov='H1',
-            update_scale=True, conv='dev', start_params=None):
+            update_scale=True, conv='dev', start_params=None, start_scale=None,
+            ):
         """
         Fits the model using iteratively reweighted least squares.
@@ -217,6 +219,7 @@ def fit(self, maxiter=50, tol=1e-8, scale_est='mad', init=None, cov='H1',
             Specifies method for the initial estimates of the parameters.
             Default is None, which means that the least squares estimate
             is used. Currently it is the only available choice.
+            Deprecated and will be removed. There is no choice here.
         maxiter : int
             The maximum number of iterations to try. Default is 50.
         scale_est : str or HuberScale()
@@ -236,6 +239,11 @@ def fit(self, maxiter=50, tol=1e-8, scale_est='mad', init=None, cov='H1',
         start_params : array_like, optional
             Initial guess of the solution of the optimizer. If not provided,
             the initial parameters are computed using OLS.
+        start_scale : float, optional
+            Initial scale. If update_scale is False, then the scale will be
+            fixed at this level for the estimation of the mean parameters
+            during iteration. If not provided, then the initial scale is
+            estimated from the OLS residuals.

         Returns
         -------
@@ -264,8 +272,10 @@ def fit(self, maxiter=50, tol=1e-8, scale_est='mad', init=None, cov='H1',
                                              check_weights=False)
             wls_results = fake_wls.results(start_params)

-        if not init:
+        if not init and not start_scale:
             self.scale = self._estimate_scale(wls_results.resid)
+        elif start_scale:
+            self.scale = start_scale

         history = dict(params=[np.inf], scale=[])
         if conv == 'coefs':
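The _estimate_scale fix above replaces a call to the nonexistent scale.scale_est with the user-supplied callable and applies a sqrt(nobs / df_resid) degrees-of-freedom correction so the result matches HuberScale. A standalone sketch of the corrected computation, with made-up residuals and scale.mad standing in for a user-supplied estimator:

import numpy as np
from statsmodels.robust import scale

# Made-up residuals from a fit with nobs = 100 and 3 estimated parameters.
rng = np.random.default_rng(1)
resid = rng.standard_normal(100)
nobs, df_resid = 100, 97

# What the corrected else-branch computes for a callable scale_est;
# scale.mad stands in for the user-supplied callable.
scale_est = scale.mad
print(scale_est(resid) * np.sqrt(nobs / df_resid))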

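And a usage sketch of the new start_scale option (model and data made up): with update_scale=False the IRLS iterations hold the scale fixed at the supplied value, while the default path still estimates it from the OLS residuals.

import numpy as np
import statsmodels.api as sm

# Made-up regression data with a few gross outliers.
rng = np.random.default_rng(2)
x = rng.standard_normal(100)
y = 1.0 + 2.0 * x + rng.standard_normal(100)
y[:5] += 10
exog = sm.add_constant(x)

# Default: the scale is re-estimated from the residuals at each iteration.
res_free = sm.RLM(y, exog).fit()

# New in this PR: supply an initial scale and keep it fixed.
res_fixed = sm.RLM(y, exog).fit(start_scale=1.0, update_scale=False)

print(res_free.scale)   # robust scale estimated during IRLS
print(res_fixed.scale)  # remains at the supplied 1.0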