Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: Stop unstable lowess iterations #9220

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
77 changes: 27 additions & 50 deletions statsmodels/nonparametric/_smoothers_lowess.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,11 @@ def lowess(np.ndarray[DTYPE_t, ndim = 1] endog,
np.ndarray[DTYPE_t, ndim = 1] weights
DTYPE_t xval, radius

# overhead if not given_xvals
np.ndarray std_resid = np.empty(np.PyArray_DIMS(exog)[0])
double median, scale
double* std_resid_data = <double *>np.PyArray_DATA(std_resid)

y = endog # now just alias
x = exog

Expand Down Expand Up @@ -219,7 +224,27 @@ def lowess(np.ndarray[DTYPE_t, ndim = 1] endog,

# Calculate residual weights
if not given_xvals:
resid_weights = calculate_residual_weights(y, y_fit)

for j in range(n):
std_resid_data[j] = fabs(y[j] - y_fit[j])

median = <double>np.median(std_resid)
# algorithm is numerically unstable
if median <= 1e-7:
break
else:
scale = 6.0 * median
for j in range(n):
std_resid_data[j] /= scale

# Some trimming of outlier residuals.
for j in range(n):
if std_resid_data[j] > 1:
std_resid_data[j] = 1.0
# std_resid[std_resid >= 0.999] = 1.0
# std_resid[std_resid <= 0.001] = 0.0

resid_weights = bisquare(std_resid)

return np.array([xvals, y_fit]).T, resid_weights

Expand Down Expand Up @@ -546,53 +571,6 @@ cpdef update_indices(const double[::1] xvals,
return i, last_fit_i


cpdef np.ndarray calculate_residual_weights(const double[::1] y, const double[::1] y_fit):
    """
    Compute the residual weights used by the next `robustifying` pass.

    Parameters
    ----------
    y: 1-D numpy array
        The observed y-values.
    y_fit: 1-D numpy array
        The fitted y-values produced by the current iteration.

    Returns
    -------
    resid_weights: 1-D numpy array
        The result of applying `bisquare` to the scaled, clipped
        absolute residuals; used in the next iteration of regressions.
    """
    cdef:
        Py_ssize_t i
        np.npy_intp n_obs = y.size
        np.ndarray abs_resid = np.empty(n_obs)
        double med, denom, value
        double* buf = <double *>np.PyArray_DATA(abs_resid)

    # Absolute residuals, written directly into the array's data buffer.
    for i in range(n_obs):
        buf[i] = fabs(y[i] - y_fit[i])

    med = <double>np.median(abs_resid)
    if med != 0:
        # Scale by six times the median absolute residual and clip at 1
        # so that outlying residuals are trimmed.
        denom = 6.0 * med
        for i in range(n_obs):
            value = buf[i] / denom
            buf[i] = 1.0 if value > 1 else value
    else:
        # A zero median cannot be used as a scale: replace each residual
        # with a 0/1 indicator of being strictly positive instead.
        for i in range(n_obs):
            buf[i] = 1.0 if buf[i] > 0 else 0.0

    return bisquare(abs_resid)
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I just copy-pasted this into the loops, so I could check the median and break. There are other ways to do this...



cdef void tricube(double[::1] x):
"""
The tri-cubic function (1 - x**3)**3. Used to weight neighboring
Expand Down Expand Up @@ -649,8 +627,7 @@ cpdef np.ndarray bisquare(const double[::1] x):
"""
The bi-square function (1 - x**2)**2.

Used to weight the residuals in the `robustifying`
iterations. Called by the calculate_residual_weights function.
Used to weight the residuals in the `robustifying` iterations.

Parameters
----------
Expand Down
4 changes: 3 additions & 1 deletion statsmodels/nonparametric/smoothers_lowess.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,9 @@ def lowess(endog, exog, frac=2.0/3.0, it=3, delta=0.0, xvals=None, is_sorted=Fal
are less problematic. The weights downgrade the influence of
points with large residuals. In the extreme case, points whose
residuals are larger than 6 times the median absolute residual
are given weight 0.
are given weight 0. If, during the iterations, the median absolute
residual drops to 1e-7 or below (effectively zero), the iterations
are stopped early because the algorithm becomes numerically unstable.

`delta` can be used to save computations. For each `x_i`, regressions
are skipped for points closer than `delta`. The next regression is
Expand Down
11 changes: 9 additions & 2 deletions statsmodels/nonparametric/tests/test_lowess.py
Original file line number Diff line number Diff line change
Expand Up @@ -289,13 +289,20 @@ def test_exog_predict(self):
with pytest.raises(ValueError):
lowess(y, x, xvals=np.array([[5], [10]]))

def test_stop_iter(self):
    # See GH-2108: a second robustifying iteration must not change the
    # result of the first one for this step-shaped input.
    endog = [0] * 10 + [1] * 10
    exog = np.arange(20)
    expected = lowess(endog, exog, it=1)
    result = lowess(endog, exog, it=2)
    assert_equal(expected, result)

def test_returns_inputs():

def test_nan_regression():
# see 1960
y = [0] * 10 + [1] * 10
x = np.arange(20)
result = lowess(y, x, frac=0.4)
assert_almost_equal(result, np.column_stack((x, y)))
out = lowess([0] * 7 + [1] * 7, np.arange(14), frac=.2, it=1)[:, 1]
assert not np.any(np.isnan(out))


def test_xvals_dtype(reset_randomstate):
Expand Down