Skip to content

Commit

Permalink
ENH: GLM models now save the names of input Pandas Series
Browse files Browse the repository at this point in the history
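When offset, exposure, freq_weights or var_weights is passed as a pandas
Series, the name of that Series is now saved on the model object. The names
can be accessed via the offset_name, exposure_name, freq_weights_name and
var_weights_name properties.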

Closes statsmodels#9100
  • Loading branch information
jmahlik committed Apr 15, 2024
1 parent a0eca86 commit c025465
Show file tree
Hide file tree
Showing 4 changed files with 139 additions and 5 deletions.
48 changes: 44 additions & 4 deletions statsmodels/genmod/generalized_linear_model.py
Expand Up @@ -41,6 +41,7 @@
cached_data,
cached_value,
)
from statsmodels.tools.data import _as_array_with_name
from statsmodels.tools.docstring import Docstring
from statsmodels.tools.sm_exceptions import (
DomainWarning,
Expand Down Expand Up @@ -307,15 +308,21 @@ def __init__(self, endog, exog, family=None, offset=None,
f"{type(family).__name__} family."),
DomainWarning)

self._exposure_name = None
self._offset_name = None
self._freq_weights_name = None
self._var_weights_name = None

if exposure is not None:
exposure = np.log(exposure)
exposure_array, self._exposure_name = _as_array_with_name(exposure, "exposure")
exposure = np.log(exposure_array)
if offset is not None: # this should probably be done upstream
offset = np.asarray(offset)
offset, self._offset_name = _as_array_with_name(offset, "offset")

if freq_weights is not None:
freq_weights = np.asarray(freq_weights)
freq_weights, self._freq_weights_name = _as_array_with_name(freq_weights, "freq_weights")
if var_weights is not None:
var_weights = np.asarray(var_weights)
var_weights, self._var_weights_name = _as_array_with_name(var_weights, "var_weights")

self.freq_weights = freq_weights
self.var_weights = var_weights
Expand Down Expand Up @@ -1558,6 +1565,39 @@ def fit_constrained(self, constraints, start_params=None, **fit_kwds):
res._results.results_constrained = res_constr
return res

@property
def offset_name(self):
    """
    Name associated with the offset variable.

    This is the value stored in ``_offset_name`` when the model was
    constructed. For a pandas Series it is taken from the Series' ``name``;
    for any other array-like it is the default 'offset'. It is None if the
    model was created without an offset.
    """
    return self._offset_name

@property
def exposure_name(self):
    """
    Name associated with the exposure variable.

    This is the value stored in ``_exposure_name`` when the model was
    constructed. For a pandas Series it is taken from the Series' ``name``;
    for any other array-like it is the default 'exposure'. It is None if the
    model was created without an exposure.
    """
    return self._exposure_name

@property
def freq_weights_name(self):
    """
    Name associated with the frequency weights variable.

    This is the value stored in ``_freq_weights_name`` when the model was
    constructed. For a pandas Series it is taken from the Series' ``name``;
    for any other array-like it is the default 'freq_weights'. It is None if
    the model was created without freq_weights.
    """
    return self._freq_weights_name

@property
def var_weights_name(self):
    """
    Name associated with the variance weights variable.

    This is the value stored in ``_var_weights_name`` when the model was
    constructed. For a pandas Series it is taken from the Series' ``name``;
    for any other array-like it is the default 'var_weights'. It is None if
    the model was created without var_weights.
    """
    return self._var_weights_name


get_prediction_doc = Docstring(pred.get_prediction_glm.__doc__)
get_prediction_doc.remove_parameters("pred_kwds")
Expand Down
59 changes: 59 additions & 0 deletions statsmodels/genmod/tests/test_glm.py
Expand Up @@ -2661,3 +2661,62 @@ def test_tweedie_score():
nhess = approx_hess_cs(pa, lambda x: model.loglike(x, scale=1))
ahess = model.hessian(pa, scale=1)
assert_allclose(nhess, ahess, atol=5e-8, rtol=5e-8)

def test_names():
    """Check that GLM reports the names of named pandas inputs.

    Every optional input is a Series carrying a non-default name, so each
    ``*_name`` property must return that name instead of its fallback.
    The numeric values are irrelevant; only the name properties are checked.
    """
    endog = pd.Series([0, 1], name="endog_not_default")
    exog = pd.DataFrame({"a": [1, 1], "b": [1, 0]})
    named_inputs = {
        "exposure": pd.Series([0, 0], name="exposure_not_default"),
        "freq_weights": pd.Series([0, 0], name="freq_weights_not_default"),
        "offset": pd.Series([0, 0], name="offset_not_default"),
        "var_weights": pd.Series([0, 0], name="var_weights_not_default"),
    }

    model = GLM(
        endog=endog,
        exog=exog,
        family=sm.families.Tweedie(),
        **named_inputs,
    )

    assert model.offset_name == "offset_not_default"
    assert model.exposure_name == "exposure_not_default"
    assert model.freq_weights_name == "freq_weights_not_default"
    assert model.var_weights_name == "var_weights_not_default"
    assert model.endog_names == "endog_not_default"
    assert model.exog_names == ["a", "b"]


def test_names_default():
    """Check that GLM falls back to default names for numpy inputs.

    Plain arrays carry no name, so each ``*_name`` property must return its
    default. The numeric values are irrelevant; only the name properties are
    checked.
    """
    endog = np.array([0, 1])
    exog = np.array([[1, 1], [1, 0]])
    unnamed_inputs = {
        key: np.array([0, 0])
        for key in ("exposure", "freq_weights", "offset", "var_weights")
    }

    model = GLM(
        endog=endog,
        exog=exog,
        family=sm.families.Tweedie(),
        **unnamed_inputs,
    )

    assert model.offset_name == "offset"
    assert model.exposure_name == "exposure"
    assert model.freq_weights_name == "freq_weights"
    assert model.var_weights_name == "var_weights"
    assert model.endog_names == "y"
    assert model.exog_names == ["const", "x1"]
24 changes: 23 additions & 1 deletion statsmodels/tools/data.py
Expand Up @@ -19,7 +19,8 @@ def _check_period_index(x, freq="M"):
if not inferred_freq.startswith(freq):
raise ValueError("Expected frequency {}. Got {}".format(freq,
inferred_freq))

def is_series(obj):
    """Return True if obj is an instance of pd.Series, False otherwise."""
    return isinstance(obj, pd.Series)

def is_data_frame(obj):
    """Return True if obj is an instance of pd.DataFrame, False otherwise."""
    return isinstance(obj, pd.DataFrame)
Expand Down Expand Up @@ -121,3 +122,24 @@ def _is_recarray(data):
return isinstance(data, np.core.recarray)
else:
return isinstance(data, np.rec.recarray)

def _as_array_with_name(obj, default_name):
    """
    Call np.asarray() on obj and determine the name to associate with it.

    Parameters
    ----------
    obj : array_like
        Data to convert to an array. If it is a pd.Series, its ``name`` is
        used when one is set.
    default_name : str
        The name to return when the object isn't a pd.Series, or is a
        pd.Series whose ``name`` is None.

    Returns
    -------
    array_and_name : tuple[np.ndarray, str]
        The data cast to np.ndarray, and either the Series name or
        ``default_name``.
    """
    name = default_name
    # An unnamed Series has ``name`` set to None; fall back to the default
    # so callers always get a usable label.
    if isinstance(obj, pd.Series) and obj.name is not None:
        name = obj.name
    return (np.asarray(obj), name)
13 changes: 13 additions & 0 deletions statsmodels/tools/tests/test_data.py
Expand Up @@ -33,3 +33,16 @@ def test_patsy_577():
np.testing.assert_(data._is_using_patsy(endog, None))
exog = dmatrix("var2 - 1", df)
np.testing.assert_(data._is_using_patsy(endog, exog))


def test_as_array_with_name_series():
    """A named Series yields its values and its own name, not the default."""
    series = pandas.Series([1], name="hello")
    values, label = data._as_array_with_name(series, "not_used")
    assert label == "hello"
    np.testing.assert_array_equal(np.array([1]), values)


def test_as_array_with_name_array():
    """A plain ndarray yields its values and the supplied default name."""
    source = np.array([1])
    values, label = data._as_array_with_name(source, "default")
    assert label == "default"
    np.testing.assert_array_equal(np.array([1]), values)

0 comments on commit c025465

Please sign in to comment.