statsmodels · jseabold · Apr 18, 2024 · Apr 18, 2024 · Apr 18, 2024 · Apr 18, 2024
diff --git a/statsmodels/nonparametric/_smoothers_lowess.pyx b/statsmodels/nonparametric/_smoothers_lowess.pyx
@@ -147,6 +147,11 @@ def lowess(np.ndarray[DTYPE_t, ndim = 1] endog,
         np.ndarray[DTYPE_t, ndim = 1] weights
         DTYPE_t xval, radius
 
+        # scratch for residual weights, used only if not given_xvals
+        np.ndarray std_resid = np.empty(np.PyArray_DIMS(exog)[0])
+        double median, scale
+        double* std_resid_data = <double *>np.PyArray_DATA(std_resid)
+
     y = endog   # now just alias
     x = exog
 
@@ -198,8 +203,12 @@ def lowess(np.ndarray[DTYPE_t, ndim = 1] endog,
             # Calculate the weights for the regression in this neighborhood.
             # Determine if at least some weights are positive, so a regression
             # is ok.
-            reg_ok = calculate_weights(x, weights, resid_weights, xval, left_end,
-                                       right_end, radius)
+            if radius > 1e-8:
+                reg_ok = calculate_weights(x, weights, resid_weights, xval,
+                                           left_end, right_end, radius)
+            else:  # no variance to scale weights
+                reg_ok = 0
+
 
             # If ok, run the regression
             calculate_y_fit(x, y, i, xval, y_fit, weights, left_end, right_end,
@@ -219,7 +228,27 @@ def lowess(np.ndarray[DTYPE_t, ndim = 1] endog,
 
         # Calculate residual weights
         if not given_xvals:
-            resid_weights = calculate_residual_weights(y, y_fit)
+
+            for j in range(n):
+                std_resid_data[j] = fabs(y[j] - y_fit[j])
+
+            median = <double>np.median(std_resid)
+            # stop iterating: scaling by a (near) zero median is unstable
+            if median <= 1e-7:
+                break
+            else:
+                scale = 6.0 * median
+                for j in range(n):
+                    std_resid_data[j] /= scale
+
+            # Some trimming of outlier residuals.
+            for j in range(n):
+                if std_resid_data[j] > 1:
+                    std_resid_data[j] = 1.0
+            # std_resid[std_resid >= 0.999] = 1.0
+            # std_resid[std_resid <= 0.001] = 0.0
+
+            resid_weights = bisquare(std_resid)
 
     return np.array([xvals, y_fit]).T, resid_weights
 
@@ -344,13 +373,11 @@ cdef bint calculate_weights(np.ndarray[DTYPE_t, ndim = 1] x,
                                       resid_weights[left_end:right_end])
 
     sum_weights = np.sum(weights[left_end:right_end])
-    for j in range(left_end, right_end):
-        num_nonzero_weights += weights[j] > 1e-12
-
-    if num_nonzero_weights < 2:
-        # Need at least 2 non-zero weights to get an okay regression fit
-        # see 1960
+    if sum_weights <= 1e-12:
+        # Need at least 1 non-zero weight to get a regression fit
+        # see 1960/9220
         return 0  # False
+
     for j in range(left_end, right_end):
         weights[j] /= sum_weights
 
@@ -546,53 +573,6 @@ cpdef update_indices(const double[::1] xvals,
     return i, last_fit_i
 
 
-cpdef np.ndarray calculate_residual_weights(const double[::1] y, const double[::1] y_fit):
-    """
-    Calculate residual weights for the next `robustifying` iteration.
-
-    Parameters
-    ----------
-    y: 1-D numpy array
-        The vector of actual input y-values.
-    y_fit: 1-D numpy array
-        The vector of fitted y-values from the current
-        iteration.
-
-    Returns
-    -------
-    resid_weights: 1-D numpy array
-        The vector of residual weights, to be used in the
-        next iteration of regressions.
-    """
-    cdef:
-        Py_ssize_t j
-        np.npy_intp n = y.size
-        np.ndarray std_resid = np.empty(n)
-        double median, scale
-        double* std_resid_data = <double *>np.PyArray_DATA(std_resid)
-
-    for j in range(n):
-        std_resid_data[j] = fabs(y[j] - y_fit[j])
-
-    median = <double>np.median(std_resid)
-    if median == 0:
-        for j in range(n):
-            std_resid_data[j] = <double>(std_resid_data[j] > 0)
-    else:
-        scale = 6.0 * median
-        for j in range(n):
-            std_resid_data[j] /= scale
-
-    # Some trimming of outlier residuals.
-    for j in range(n):
-        if std_resid_data[j] > 1:
-            std_resid_data[j] = 1.0
-    # std_resid[std_resid >= 0.999] = 1.0
-    # std_resid[std_resid <= 0.001] = 0.0
-
-    return bisquare(std_resid)
-
-
 cdef void tricube(double[::1] x):
     """
     The tri-cubic function (1 - x**3)**3. Used to weight neighboring
@@ -649,8 +629,7 @@ cpdef np.ndarray bisquare(const double[::1] x):
     """
     The bi-square function (1 - x**2)**2.
 
-    Used to weight the residuals in the `robustifying`
-    iterations. Called by the calculate_residual_weights function.
+    Used to weight the residuals in the `robustifying` iterations.
 
     Parameters
     ----------

diff --git a/statsmodels/nonparametric/smoothers_lowess.py b/statsmodels/nonparametric/smoothers_lowess.py
@@ -84,7 +84,9 @@ def lowess(endog, exog, frac=2.0/3.0, it=3, delta=0.0, xvals=None, is_sorted=Fal
     are less problematic. The weights downgrade the influence of
     points with large residuals. In the extreme case, points whose
     residuals are larger than 6 times the median absolute residual
-    are given weight 0.
+    are given weight 0. If the median absolute residual drops to 1e-7
+    or below (effectively zero) during the iterations, the iterations
+    are stopped because the algorithm becomes numerically unstable.
 
     `delta` can be used to save computations. For each `x_i`, regressions
     are skipped for points closer than `delta`. The next regression is

diff --git a/statsmodels/nonparametric/tests/test_lowess.py b/statsmodels/nonparametric/tests/test_lowess.py
@@ -289,13 +289,20 @@ def test_exog_predict(self):
         with pytest.raises(ValueError):
             lowess(y, x, xvals=np.array([[5], [10]]))
 
+    def test_stop_iter(self):
+        # See GH-2108
+        expected = lowess([0] * 10 + [1] * 10, np.arange(20), it=1)
+        result = lowess([0] * 10 + [1] * 10, np.arange(20), it=2)
+        assert_equal(expected, result)
 
-def test_returns_inputs():
+
+def test_nan_regression():
     # see 1960
     y = [0] * 10 + [1] * 10
     x = np.arange(20)
     result = lowess(y, x, frac=0.4)
-    assert_almost_equal(result, np.column_stack((x, y)))
+    out = lowess([0] * 7 + [1] * 7, np.arange(14), frac=.2, it=1)[:, 1]
+    assert not np.any(np.isnan(out))
 
 
 def test_xvals_dtype(reset_randomstate):
@@ -304,3 +311,25 @@ def test_xvals_dtype(reset_randomstate):
     # Previously raised ValueError: Buffer dtype mismatch
     results_xvals = lowess(y, x, frac=0.4, xvals=x[:5])
     assert_allclose(results_xvals, np.zeros(5), atol=1e-12)
+
+
+def test_interpolated_output():
+    # see #7337: linear data must be reproduced at xvals beyond exog's range
+    y = np.arange(5, 15, dtype=float)
+    x = np.arange(5, 15, dtype=float)
+    xvals = np.arange(0, 20, dtype=float)
+
+    result = lowess(y, x, xvals=xvals, frac=0.6)
+    assert_allclose(result, xvals, atol=1e-8)
+
+    result = lowess(y, x, xvals=xvals, frac=0.5)
+    assert_allclose(result, xvals, atol=1e-8)
+
+    result = lowess(y, x, xvals=xvals, frac=0.4)
+    assert_allclose(result, xvals, atol=1e-8)
+
+    result = lowess(y, x, xvals=xvals, frac=0.3)
+    assert_allclose(result, xvals, atol=1e-8)
+
+    result = lowess(y, x, xvals=x, frac=0.5)
+    assert_allclose(result, x, atol=1e-8)