Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: Fix lowess extrapolation #9221

Draft
wants to merge 6 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
95 changes: 37 additions & 58 deletions statsmodels/nonparametric/_smoothers_lowess.pyx
Expand Up @@ -147,6 +147,11 @@ def lowess(np.ndarray[DTYPE_t, ndim = 1] endog,
np.ndarray[DTYPE_t, ndim = 1] weights
DTYPE_t xval, radius

# overhead if not given_xvals
np.ndarray std_resid = np.empty(np.PyArray_DIMS(exog)[0])
double median, scale
double* std_resid_data = <double *>np.PyArray_DATA(std_resid)

y = endog # now just alias
x = exog

Expand Down Expand Up @@ -198,8 +203,12 @@ def lowess(np.ndarray[DTYPE_t, ndim = 1] endog,
# Calculate the weights for the regression in this neighborhood.
# Determine if at least some weights are positive, so a regression
# is ok.
reg_ok = calculate_weights(x, weights, resid_weights, xval, left_end,
right_end, radius)
if radius > 1e-8:
reg_ok = calculate_weights(x, weights, resid_weights, xval,
left_end, right_end, radius)
else: # no variance to scale weights
reg_ok = 0


# If ok, run the regression
calculate_y_fit(x, y, i, xval, y_fit, weights, left_end, right_end,
Expand All @@ -219,7 +228,27 @@ def lowess(np.ndarray[DTYPE_t, ndim = 1] endog,

# Calculate residual weights
if not given_xvals:
resid_weights = calculate_residual_weights(y, y_fit)

for j in range(n):
std_resid_data[j] = fabs(y[j] - y_fit[j])

median = <double>np.median(std_resid)
# stop iterating: scaling by a near-zero median is numerically unstable
if median <= 1e-7:
break
else:
scale = 6.0 * median
for j in range(n):
std_resid_data[j] /= scale

# Some trimming of outlier residuals.
for j in range(n):
if std_resid_data[j] > 1:
std_resid_data[j] = 1.0
# std_resid[std_resid >= 0.999] = 1.0
# std_resid[std_resid <= 0.001] = 0.0

resid_weights = bisquare(std_resid)

return np.array([xvals, y_fit]).T, resid_weights

Expand Down Expand Up @@ -344,13 +373,11 @@ cdef bint calculate_weights(np.ndarray[DTYPE_t, ndim = 1] x,
resid_weights[left_end:right_end])

sum_weights = np.sum(weights[left_end:right_end])
for j in range(left_end, right_end):
num_nonzero_weights += weights[j] > 1e-12

if num_nonzero_weights < 2:
# Need at least 2 non-zero weights to get an okay regression fit
# see 1960
if sum_weights <= 1e-12:
# Need at least one non-zero weight to get a regression fit
# see 1960/9220
return 0 # False

for j in range(left_end, right_end):
weights[j] /= sum_weights

Expand Down Expand Up @@ -546,53 +573,6 @@ cpdef update_indices(const double[::1] xvals,
return i, last_fit_i


cpdef np.ndarray calculate_residual_weights(const double[::1] y, const double[::1] y_fit):
    """
    Build the robustness weights used by the next `robustifying` pass.

    The absolute residuals ``|y - y_fit|`` are divided by six times their
    median, capped at 1, and passed through the bi-square function. When
    the median is exactly zero the residuals are instead mapped to 0
    (residual is zero) or 1 (residual is positive) before the bi-square
    step.

    Parameters
    ----------
    y: 1-D numpy array
        Observed y-values.
    y_fit: 1-D numpy array
        Fitted y-values produced by the current iteration.

    Returns
    -------
    resid_weights: 1-D numpy array
        Residual weights for the next iteration of regressions.
    """
    cdef:
        Py_ssize_t idx
        np.npy_intp count = y.size
        np.ndarray abs_resid = np.empty(count)
        double med, denom
        double* buf = <double *>np.PyArray_DATA(abs_resid)

    # Absolute residuals, written straight into the array's buffer.
    for idx in range(count):
        buf[idx] = fabs(y[idx] - y_fit[idx])

    med = <double>np.median(abs_resid)
    if med != 0:
        denom = 6.0 * med
        for idx in range(count):
            buf[idx] /= denom
            # Trim outlier residuals so they end up with weight zero.
            if buf[idx] > 1:
                buf[idx] = 1.0
    else:
        # Zero median: reduce each residual to a 0/1 indicator; no
        # trimming is needed because nothing can exceed 1.
        for idx in range(count):
            buf[idx] = <double>(buf[idx] > 0)

    return bisquare(abs_resid)


cdef void tricube(double[::1] x):
"""
The tri-cubic function (1 - x**3)**3. Used to weight neighboring
Expand Down Expand Up @@ -649,8 +629,7 @@ cpdef np.ndarray bisquare(const double[::1] x):
"""
The bi-square function (1 - x**2)**2.

Used to weight the residuals in the `robustifying`
iterations. Called by the calculate_residual_weights function.
Used to weight the residuals in the `robustifying` iterations.

Parameters
----------
Expand Down
4 changes: 3 additions & 1 deletion statsmodels/nonparametric/smoothers_lowess.py
Expand Up @@ -84,7 +84,9 @@ def lowess(endog, exog, frac=2.0/3.0, it=3, delta=0.0, xvals=None, is_sorted=Fal
are less problematic. The weights downgrade the influence of
points with large residuals. In the extreme case, points whose
residuals are larger than 6 times the median absolute residual
are given weight 0.
are given weight 0. If, during the iterations, the median absolute
residual drops to 1e-7 or below (effectively zero), the iterations
are stopped because the algorithm becomes numerically unstable.

`delta` can be used to save computations. For each `x_i`, regressions
are skipped for points closer than `delta`. The next regression is
Expand Down
33 changes: 31 additions & 2 deletions statsmodels/nonparametric/tests/test_lowess.py
Expand Up @@ -289,13 +289,20 @@ def test_exog_predict(self):
with pytest.raises(ValueError):
lowess(y, x, xvals=np.array([[5], [10]]))

def test_stop_iter(self):
    """Check that it=1 and it=2 give equal results on step data."""
    # See GH-2108
    # Fresh inputs are built for each call, first with it=1, then it=2.
    expected, result = (
        lowess([0] * 10 + [1] * 10, np.arange(20), it=n_iter)
        for n_iter in (1, 2)
    )
    assert_equal(expected, result)

def test_returns_inputs():

def test_nan_regression():
    """Check lowess on 0/1 step data: inputs reproduced and no NaN output."""
    # see 1960
    # Step-shaped data: ten zeros followed by ten ones on an integer grid.
    y = [0] * 10 + [1] * 10
    x = np.arange(20)
    result = lowess(y, x, frac=0.4)
    # The result is expected to match the (x, y) inputs column-wise.
    assert_almost_equal(result, np.column_stack((x, y)))
    # Shorter step data with frac=.2 and it=1; only the second column
    # (presumably the fitted values - confirm against lowess's return)
    # is checked for NaN.
    out = lowess([0] * 7 + [1] * 7, np.arange(14), frac=.2, it=1)[:, 1]
    assert not np.any(np.isnan(out))


def test_xvals_dtype(reset_randomstate):
Expand All @@ -304,3 +311,25 @@ def test_xvals_dtype(reset_randomstate):
# Previously raised ValueError: Buffer dtype mismatch
results_xvals = lowess(y, x, frac=0.4, xvals=x[:5])
assert_allclose(results_xvals, np.zeros(5), atol=1e-12)


def test_interpolated_output():
    """Check that lowess returns xvals for y == x data at several frac values."""
    # see #7337
    x = np.arange(5, 15, dtype=float)
    y = np.arange(5, 15, dtype=float)
    # Evaluation grid 0..19, wider than the observed x range 5..14.
    xvals = np.arange(0, 20, dtype=float)

    for frac in (0.6, 0.5, 0.4, 0.3):
        result = lowess(y, x, xvals=xvals, frac=frac)
        assert_allclose(result, xvals, atol=1e-8)

    # Evaluating at the observed x values must return them unchanged.
    result = lowess(y, x, xvals=x, frac=0.5)
    assert_allclose(result, x, atol=1e-8)