Commit
Merge pull request #9210 from josef-pkt/robust_mscale
BUG/ENH: fix scale.Huber and add robust MScale
josef-pkt committed Apr 19, 2024
2 parents d6466f7 + fbaa0d5 commit c22837f
Showing 6 changed files with 248 additions and 130 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -70,6 +70,7 @@ coverage_html_report/

 # VS Code
 .vscode/
+.vs/

 # Project specific
 statsmodels/version.py
101 changes: 0 additions & 101 deletions statsmodels/robust/covariance.py
@@ -308,107 +308,6 @@ def _outlier_gy(d, distr=None, k_endog=1, trim_prob=0.975):

 # ## GK and OGK ###

-def _weight_mean(x, c):
-    """Tukey-biweight, bisquare weights used in tau scale.
-
-    Parameters
-    ----------
-    x : ndarray
-        Data
-    c : float
-        Parameter for bisquare weights
-
-    Returns
-    -------
-    ndarray : weights
-    """
-    x = np.asarray(x)
-    w = (1 - (x / c)**2)**2 * (np.abs(x) <= c)
-    return w
-
-
-def _winsor(x, c):
-    """Winsorized squared data used in tau scale.
-
-    Parameters
-    ----------
-    x : ndarray
-        Data
-    c : float
-        threshold
-
-    Returns
-    -------
-    winsorized squared data, ``np.minimum(x**2, c**2)``
-    """
-    return np.minimum(x**2, c**2)
-
-
-def scale_tau(data, cm=4.5, cs=3, weight_mean=_weight_mean,
-              weight_scale=_winsor, normalize=True, ddof=0):
-    """Tau estimator of univariate scale.
-
-    Experimental, API will change.
-
-    Parameters
-    ----------
-    data : array_like, 1-D or 2-D
-        If data is 2d, then the location and scale estimates
-        are calculated for each column
-    cm : float
-        constant used in call to weight_mean
-    cs : float
-        constant used in call to weight_scale
-    weight_mean : callable
-        function to calculate weights for weighted mean
-    weight_scale : callable
-        function to calculate scale, "rho" function
-    normalize : bool
-        rescale the scale estimate so it is consistent when the data is
-        normally distributed. The computation assumes winsorized (truncated)
-        variance.
-
-    Returns
-    -------
-    mean : ndarray
-        robust mean
-    std : ndarray
-        robust estimate of scale (standard deviation)
-
-    Notes
-    -----
-    Uses the definition of Maronna and Zamar 2002, with weighted mean and
-    trimmed variance.
-    The normalization has been added to match R robustbase.
-    R robustbase uses ddof=0 by default, with an option to set it to 2.
-
-    References
-    ----------
-    .. [1] Maronna, Ricardo A, and Ruben H Zamar. “Robust Estimates of
-       Location and Dispersion for High-Dimensional Datasets.” Technometrics
-       44, no. 4 (November 1, 2002): 307–17.
-       https://doi.org/10.1198/004017002188618509.
-    """
-    x = np.asarray(data)
-    nobs = x.shape[0]
-
-    med_x = np.median(x, 0)
-    xdm = x - med_x
-    mad_x = np.median(np.abs(xdm), 0)
-    wm = weight_mean(xdm / mad_x, cm)
-    mean = (wm * x).sum(0) / wm.sum(0)
-    var = (mad_x**2 * weight_scale((x - mean) / mad_x, cs).sum(0) /
-           (nobs - ddof))
-
-    cf = 1
-    if normalize:
-        c = cs * stats.norm.ppf(0.75)
-        cf = 2 * ((1 - c**2) * stats.norm.cdf(c) - c * stats.norm.pdf(c)
-                  + c**2) - 1
-    # return Holder(loc=mean, scale=np.sqrt(var / cf))
-    return mean, np.sqrt(var / cf)


 def mahalanobis(data, cov=None, cov_inv=None, sqrt=False):
     """Mahalanobis distance squared
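For context, the tau-scale helpers removed above combine a bisquare-weighted mean with a winsorized variance. A minimal usage sketch, assuming the estimator remains importable as statsmodels.robust.scale.scale_tau after this reorganization (the data below is made up):

import numpy as np
from statsmodels.robust import scale

# Made-up contaminated sample: mostly N(0, 1) plus a few gross outliers.
rng = np.random.default_rng(0)
x = rng.standard_normal(200)
x[:10] += 8

# Tau estimates of location and scale stay close to 0 and 1 ...
mean, std = scale.scale_tau(x)
print(mean, std)

# ... while the classical estimates are pulled up by the outliers.
print(x.mean(), x.std())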
16 changes: 13 additions & 3 deletions statsmodels/robust/robust_linear_model.py
@@ -191,10 +191,12 @@ def _estimate_scale(self, resid):
         elif isinstance(self.scale_est, scale.HuberScale):
             return self.scale_est(self.df_resid, self.nobs, resid)
         else:
-            return scale.scale_est(self, resid) ** 2
+            # use df correction to match HuberScale
+            return self.scale_est(resid) * np.sqrt(self.nobs / self.df_resid)

     def fit(self, maxiter=50, tol=1e-8, scale_est='mad', init=None, cov='H1',
-            update_scale=True, conv='dev', start_params=None):
+            update_scale=True, conv='dev', start_params=None, start_scale=None,
+            ):
         """
         Fits the model using iteratively reweighted least squares.
@@ -217,6 +219,7 @@ def fit(self, maxiter=50, tol=1e-8, scale_est='mad', init=None, cov='H1',
             Specifies method for the initial estimates of the parameters.
             Default is None, which means that the least squares estimate
             is used. Currently it is the only available choice.
+            Deprecated and will be removed. There is no choice here.
         maxiter : int
             The maximum number of iterations to try. Default is 50.
         scale_est : str or HuberScale()
@@ -236,6 +239,11 @@ def fit(self, maxiter=50, tol=1e-8, scale_est='mad', init=None, cov='H1',
         start_params : array_like, optional
             Initial guess of the solution of the optimizer. If not provided,
             the initial parameters are computed using OLS.
+        start_scale : float, optional
+            Initial scale. If update_scale is False, then the scale will be
+            fixed at this level for the estimation of the mean parameters
+            during iteration. If not provided, then the initial scale is
+            estimated from the OLS residuals.

         Returns
         -------
@@ -264,8 +272,10 @@ def fit(self, maxiter=50, tol=1e-8, scale_est='mad', init=None, cov='H1',
                                              check_weights=False)
             wls_results = fake_wls.results(start_params)

-        if not init:
+        if not init and not start_scale:
             self.scale = self._estimate_scale(wls_results.resid)
+        elif start_scale:
+            self.scale = start_scale

         history = dict(params=[np.inf], scale=[])
         if conv == 'coefs':
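The _estimate_scale fix above replaces a call to the nonexistent scale.scale_est with the user-supplied callable and applies a sqrt(nobs / df_resid) degrees-of-freedom correction so the result matches HuberScale. A standalone sketch of the corrected computation, with made-up residuals and scale.mad standing in for a user-supplied estimator:

import numpy as np
from statsmodels.robust import scale

# Made-up residuals from a fit with nobs = 100 and 3 estimated parameters.
rng = np.random.default_rng(1)
resid = rng.standard_normal(100)
nobs, df_resid = 100, 97

# What the corrected else-branch computes for a callable scale_est;
# scale.mad stands in for the user-supplied callable.
scale_est = scale.mad
print(scale_est(resid) * np.sqrt(nobs / df_resid))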

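And a usage sketch of the new start_scale option (model and data made up): with update_scale=False the IRLS iterations hold the scale fixed at the supplied value, while the default path still estimates it from the OLS residuals.

import numpy as np
import statsmodels.api as sm

# Made-up regression data with a few gross outliers.
rng = np.random.default_rng(2)
x = rng.standard_normal(100)
y = 1.0 + 2.0 * x + rng.standard_normal(100)
y[:5] += 10
exog = sm.add_constant(x)

# Default: the scale is re-estimated from the residuals at each iteration.
res_free = sm.RLM(y, exog).fit()

# New in this PR: supply an initial scale and keep it fixed.
res_fixed = sm.RLM(y, exog).fit(start_scale=1.0, update_scale=False)

print(res_free.scale)   # robust scale estimated during IRLS
print(res_fixed.scale)  # remains at the supplied 1.0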