ENH: GLM models now save the names of input Pandas Series

When offset, exposure, freq_weights or var_weights are passed as pandas Series, the name of each Series is saved on the model object and can be read through the new offset_name, exposure_name, freq_weights_name and var_weights_name properties. Closes #9100
statsmodels · Jan 18, 2024 · 5076ca5 · 5076ca5
1 parent e797469
commit 5076ca5
Show file tree

Hide file tree

Showing 4 changed files with 94 additions and 5 deletions.
diff --git a/statsmodels/genmod/generalized_linear_model.py b/statsmodels/genmod/generalized_linear_model.py
@@ -41,6 +41,7 @@
     cached_data,
     cached_value,
 )
+from statsmodels.tools.data import _as_array_with_name
 from statsmodels.tools.docstring import Docstring
 from statsmodels.tools.sm_exceptions import (
     DomainWarning,
@@ -307,15 +308,21 @@ def __init__(self, endog, exog, family=None, offset=None,
                            f"{type(family).__name__} family."),
                           DomainWarning)
 
+        self._exposure_name = None
+        self._offset_name = None
+        self._freq_weights_name = None
+        self._var_weights_name = None
+
         if exposure is not None:
-            exposure = np.log(exposure)
+            exposure_array, self._exposure_name = _as_array_with_name(exposure)
+            exposure = np.log(exposure_array)
         if offset is not None:  # this should probably be done upstream
-            offset = np.asarray(offset)
+            offset, self._offset_name = _as_array_with_name(offset)
 
         if freq_weights is not None:
-            freq_weights = np.asarray(freq_weights)
+            freq_weights, self._freq_weights_name = _as_array_with_name(freq_weights)
         if var_weights is not None:
-            var_weights = np.asarray(var_weights)
+            var_weights, self._var_weights_name = _as_array_with_name(var_weights)
 
         self.freq_weights = freq_weights
         self.var_weights = var_weights
@@ -1558,6 +1565,34 @@ def fit_constrained(self, constraints, start_params=None, **fit_kwds):
         res._results.results_constrained = res_constr
         return res
 
+    @property
+    def offset_name(self):
+        """
+        Name of offset if passed as a named pandas Series, else None.
+        """
+        return self._offset_name
+
+    @property
+    def exposure_name(self):
+        """
+        Name of exposure if passed as a named pandas Series, else None.
+        """
+        return self._exposure_name
+
+    @property
+    def freq_weights_name(self):
+        """
+        Name of freq_weights if passed as a named pandas Series, else None.
+        """
+        return self._freq_weights_name
+
+    @property
+    def var_weights_name(self):
+        """
+        Name of var_weights if passed as a named pandas Series, else None.
+        """
+        return self._var_weights_name
+
 
 get_prediction_doc = Docstring(pred.get_prediction_glm.__doc__)
 get_prediction_doc.remove_parameters("pred_kwds")

diff --git a/statsmodels/genmod/tests/test_glm.py b/statsmodels/genmod/tests/test_glm.py
@@ -2661,3 +2661,31 @@ def test_tweedie_score():
             nhess = approx_hess_cs(pa, lambda x: model.loglike(x, scale=1))
             ahess = model.hessian(pa, scale=1)
             assert_allclose(nhess, ahess, atol=5e-8, rtol=5e-8)
+
+def test_names():
+    """Check the ``*_name`` properties when inputs are named pandas Series.
+
+    The data values are irrelevant here; only the stored names are checked.
+    """
+    y = pd.Series([0, 1], name="endog")
+    x = pd.DataFrame({"a": [1, 1], "b": [1, 0]})
+    exposure = pd.Series([0, 0], name="exposure")  # GLM takes np.log -> -inf
+    freq_weights = pd.Series([0, 0], name="freq_weights")
+    offset = pd.Series([0, 0], name="offset")
+    var_weights = pd.Series([0, 0], name="var_weights")
+
+    model = GLM(
+        endog=y,
+        exog=x,
+        exposure=exposure,
+        freq_weights=freq_weights,
+        offset=offset,
+        var_weights=var_weights,
+        family=sm.families.Tweedie(),
+    )
+    assert model.offset_name == "offset"
+    assert model.exposure_name == "exposure"
+    assert model.freq_weights_name == "freq_weights"
+    assert model.var_weights_name == "var_weights"
+    assert model.endog_names == "endog"
+    assert model.exog_names == ["a", "b"]
diff --git a/statsmodels/tools/data.py b/statsmodels/tools/data.py
@@ -19,7 +19,8 @@ def _check_period_index(x, freq="M"):
     if not inferred_freq.startswith(freq):
         raise ValueError("Expected frequency {}. Got {}".format(freq,
                                                                 inferred_freq))
-
+def is_series(obj):
+    return isinstance(obj, pd.Series)  # True only for pandas Series objects
 
 def is_data_frame(obj):
     return isinstance(obj, pd.DataFrame)
@@ -121,3 +122,16 @@ def _is_recarray(data):
         return isinstance(data, np.core.recarray)
     else:
         return isinstance(data, np.rec.recarray)
+
+def _as_array_with_name(obj):
+    """
+    Call np.asarray() on obj and also return its name if it is a Series.
+
+    Returns
+    -------
+    array_and_name : tuple[np.ndarray, str | None]
+        The data cast to np.ndarray, and the Series name or None.
+    """
+    if is_series(obj):
+        return (np.asarray(obj), obj.name)
+    return (np.asarray(obj), None)
diff --git a/statsmodels/tools/tests/test_data.py b/statsmodels/tools/tests/test_data.py
@@ -33,3 +33,15 @@ def test_patsy_577():
     np.testing.assert_(data._is_using_patsy(endog, None))
     exog = dmatrix("var2 - 1", df)
     np.testing.assert_(data._is_using_patsy(endog, exog))
+
+
+def test_as_array_with_name_series():
+    arr, name = data._as_array_with_name(pandas.Series([1], name="hello"))
+    np.testing.assert_array_equal(np.array([1]), arr)
+    assert name == "hello"  # the Series name is returned alongside the array
+
+
+def test_as_array_with_name_array():
+    arr, name = data._as_array_with_name(np.array([1]))
+    np.testing.assert_array_equal(np.array([1]), arr)
+    assert name is None  # non-Series input yields no name