Skip to content

Commit

Permalink
ENH: GLM models now save the names of input Pandas Series
Browse files Browse the repository at this point in the history
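When offset, exposure, freq_weights or var_weights are passed as a pandas
Series, the name of the series is saved on the model object. The names can
be accessed via the corresponding model properties (offset_name,
exposure_name, freq_weights_name and var_weights_name).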

Closes #9100
  • Loading branch information
jmahlik committed Jan 18, 2024
1 parent e797469 commit 5076ca5
Show file tree
Hide file tree
Showing 4 changed files with 94 additions and 5 deletions.
43 changes: 39 additions & 4 deletions statsmodels/genmod/generalized_linear_model.py
Expand Up @@ -41,6 +41,7 @@
cached_data,
cached_value,
)
from statsmodels.tools.data import _as_array_with_name
from statsmodels.tools.docstring import Docstring
from statsmodels.tools.sm_exceptions import (
DomainWarning,
Expand Down Expand Up @@ -307,15 +308,21 @@ def __init__(self, endog, exog, family=None, offset=None,
f"{type(family).__name__} family."),
DomainWarning)

self._exposure_name = None
self._offset_name = None
self._freq_weights_name = None
self._var_weights_name = None

if exposure is not None:
exposure = np.log(exposure)
exposure_array, self._exposure_name = _as_array_with_name(exposure)
exposure = np.log(exposure_array)
if offset is not None: # this should probably be done upstream
offset = np.asarray(offset)
offset, self._offset_name = _as_array_with_name(offset)

if freq_weights is not None:
freq_weights = np.asarray(freq_weights)
freq_weights, self._freq_weights_name = _as_array_with_name(freq_weights)
if var_weights is not None:
var_weights = np.asarray(var_weights)
var_weights, self._var_weights_name = _as_array_with_name(var_weights)

self.freq_weights = freq_weights
self.var_weights = var_weights
Expand Down Expand Up @@ -1558,6 +1565,34 @@ def fit_constrained(self, constraints, start_params=None, **fit_kwds):
res._results.results_constrained = res_constr
return res

@property
def offset_name(self):
    """
    Name of the offset variable, if available.

    Returns
    -------
    name
        The ``name`` of the pandas Series passed as ``offset``; None if
        no name was recorded when the model was created.
    """
    name = self._offset_name
    return name

@property
def exposure_name(self):
    """
    Name of the exposure variable, if available.

    Returns
    -------
    name
        The ``name`` of the pandas Series passed as ``exposure``; None if
        no name was recorded when the model was created.
    """
    name = self._exposure_name
    return name

@property
def freq_weights_name(self):
    """
    Name of the frequency weights variable, if available.

    Returns
    -------
    name
        The ``name`` of the pandas Series passed as ``freq_weights``; None
        if no name was recorded when the model was created.
    """
    name = self._freq_weights_name
    return name

@property
def var_weights_name(self):
    """
    Name of the variance weights variable, if available.

    Returns
    -------
    name
        The ``name`` of the pandas Series passed as ``var_weights``; None
        if no name was recorded when the model was created.
    """
    name = self._var_weights_name
    return name


get_prediction_doc = Docstring(pred.get_prediction_glm.__doc__)
get_prediction_doc.remove_parameters("pred_kwds")
Expand Down
28 changes: 28 additions & 0 deletions statsmodels/genmod/tests/test_glm.py
Expand Up @@ -2661,3 +2661,31 @@ def test_tweedie_score():
nhess = approx_hess_cs(pa, lambda x: model.loglike(x, scale=1))
ahess = model.hessian(pa, scale=1)
assert_allclose(nhess, ahess, atol=5e-8, rtol=5e-8)

def test_names():
    """Test the name properties if using a pandas series.

    Don't care about the data here, only testing the name properties.
    """
    y = pd.Series([0, 1], name="endog")
    x = pd.DataFrame({"a": [1, 1], "b": [1, 0]})
    # GLM.__init__ applies np.log to exposure; use ones rather than zeros so
    # the fixture does not compute log(0) = -inf and emit a RuntimeWarning.
    exposure = pd.Series([1, 1], name="exposure")
    freq_weights = pd.Series([0, 0], name="freq_weights")
    offset = pd.Series([0, 0], name="offset")
    var_weights = pd.Series([0, 0], name="var_weights")

    model = GLM(
        endog=y,
        exog=x,
        exposure=exposure,
        freq_weights=freq_weights,
        offset=offset,
        var_weights=var_weights,
        family=sm.families.Tweedie(),
    )

    # Names of the optional Series inputs are exposed via properties.
    assert model.offset_name == "offset"
    assert model.exposure_name == "exposure"
    assert model.freq_weights_name == "freq_weights"
    assert model.var_weights_name == "var_weights"
    # endog/exog names keep working as before.
    assert model.endog_names == "endog"
    assert model.exog_names == ["a", "b"]
assert model.exog_names == ["a", "b"]
16 changes: 15 additions & 1 deletion statsmodels/tools/data.py
Expand Up @@ -19,7 +19,8 @@ def _check_period_index(x, freq="M"):
if not inferred_freq.startswith(freq):
raise ValueError("Expected frequency {}. Got {}".format(freq,
inferred_freq))

def is_series(obj):
    """Return True if ``obj`` is a pandas Series, False otherwise."""
    result = isinstance(obj, pd.Series)
    return result

def is_data_frame(obj):
    """Return True if ``obj`` is a pandas DataFrame, False otherwise."""
    result = isinstance(obj, pd.DataFrame)
    return result
Expand Down Expand Up @@ -121,3 +122,16 @@ def _is_recarray(data):
return isinstance(data, np.core.recarray)
else:
return isinstance(data, np.rec.recarray)

def _as_array_with_name(obj):
    """
    Convert ``obj`` with np.asarray() and get its name if it is a Series.

    Parameters
    ----------
    obj : array_like
        The data to convert.

    Returns
    -------
    array_and_name : tuple[np.ndarray, str | None]
        The data cast to np.ndarray and the ``name`` attribute of the
        Series, or None if ``obj`` is not a pandas Series.
    """
    values = np.asarray(obj)
    # Only a pandas Series is asked for a name; anything else yields None.
    name = obj.name if is_series(obj) else None
    return (values, name)
12 changes: 12 additions & 0 deletions statsmodels/tools/tests/test_data.py
Expand Up @@ -33,3 +33,15 @@ def test_patsy_577():
np.testing.assert_(data._is_using_patsy(endog, None))
exog = dmatrix("var2 - 1", df)
np.testing.assert_(data._is_using_patsy(endog, exog))


def test_as_array_with_name_series():
    """A named Series is converted to an array and its name is returned."""
    series = pandas.Series([1], name="hello")
    expected = np.array([1])

    arr, name = data._as_array_with_name(series)

    np.testing.assert_array_equal(expected, arr)
    assert name == "hello"


def test_as_array_with_name_array():
    """A plain ndarray is passed through and no name is reported."""
    values = np.array([1])
    expected = np.array([1])

    arr, name = data._as_array_with_name(values)

    np.testing.assert_array_equal(expected, arr)
    assert name is None

0 comments on commit 5076ca5

Please sign in to comment.