diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 6e077de2134..b15e683f290 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -27,7 +27,7 @@ jobs: steps: - name: Checkout repository - uses: actions/checkout@v3 + uses: actions/checkout@v4 # Initializes the CodeQL tools for scanning. - name: Initialize CodeQL diff --git a/.github/workflows/generate-documentation.yml b/.github/workflows/generate-documentation.yml index 60730e23c1b..2fe0b96278c 100644 --- a/.github/workflows/generate-documentation.yml +++ b/.github/workflows/generate-documentation.yml @@ -25,18 +25,18 @@ jobs: python-version: ["3.10"] steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 - name: Checkout statsmodels.github.io without token - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 1 repository: statsmodels/statsmodels.github.io path: statsmodels.github.io if: ${{ github.event_name == 'pull_request' }} - name: Checkout statsmodels.github.io with token - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 1 repository: statsmodels/statsmodels.github.io @@ -52,7 +52,7 @@ jobs: - name: Install graphviz uses: ts-graphviz/setup-graphviz@v1 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} cache: "pip" diff --git a/.gitignore b/.gitignore index 38442588fb7..0c32bf53261 100644 --- a/.gitignore +++ b/.gitignore @@ -18,10 +18,13 @@ examples/executed #setuptools-scm generated version file statsmodels/_version.py -# generated c source and built extensions +# Generated c source and built extensions +# and Byte-compiled / optimized / DLL files *.c *.so -*.pyd +*.py[cod] +__pycache__/ + # repository directories for bzr-git .bzr @@ -104,3 +107,6 @@ statsmodels/tsa/statespace/_smoothers/_classical.pyx # Temporary copies for packaging statsmodels/setup.cfg statsmodels/LICENSE.txt + +# Dataset downloads +statsmodels/datasets/tests/*.zip \ No newline at end of file diff --git a/INSTALL.txt b/INSTALL.txt index 6e7aa94f8dd..fb77f2f5269 100644 --- a/INSTALL.txt +++ b/INSTALL.txt @@ -21,7 +21,7 @@ patsy >= 0.5.2 patsy.readthedocs.org -cython >= 0.29.26 +cython >= 0.29.33 and < 4.0.0 https://cython.org/ diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 41cbd092543..bf8ed8bbbd9 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -16,7 +16,6 @@ variables: VML_NUM_THREADS: 1 OPENBLAS_NUM_THREADS: 1 PYTHONHASHSEED: 0 # Ensure tests are correctly gathered by xdist - SETUPTOOLS_USE_DISTUTILS: "stdlib" USE_MATPLOTLIB: true jobs: diff --git a/docs/source/install.rst b/docs/source/install.rst index 8d2741d131d..e35e7677c9c 100644 --- a/docs/source/install.rst +++ b/docs/source/install.rst @@ -134,7 +134,7 @@ The current minimum dependencies are: Cython is required to build from a git checkout but not to run or install from PyPI: -* `Cython `__ >= 0.29.26 is required to build the code from +* `Cython `__ >= 0.29.33 is required to build the code from github but not from a source distribution. Given the long release cycle, statsmodels follows a loose time-based policy for diff --git a/pyproject.toml b/pyproject.toml index 0b6e99556b8..a8af48ebc85 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,14 +2,15 @@ # These are strictly build requirements. Runtime requirements are listed in # INSTALL_REQUIRES in setup.py requires = [ - "setuptools>=59.2.0", - "cython>=0.29.26,<3", # Sync with CYTHON_MIN_VER in setup + "setuptools>=69.0.2; python_version>='3.12'", + "setuptools>=63.4.3", + "cython>=0.29.33,<4", # Sync with CYTHON_MIN_VER in setup # Workaround for oldest supported numpy using 1.21.6, but SciPy 1.9.2+ requiring 1.22.3+ "oldest-supported-numpy; python_version!='3.10' or platform_system!='Windows' or platform_python_implementation=='PyPy'", "numpy>=1.22.3,<2; python_version=='3.10' and platform_system=='Windows' and platform_python_implementation != 'PyPy'", "numpy<2; python_version>='3.12'", "scipy>=1.4", - "setuptools_scm[toml]>=7,<8" + "setuptools_scm[toml]>=8,<9" ] build-backend = "setuptools.build_meta" diff --git a/requirements-dev.txt b/requirements-dev.txt index 06fcef5930d..aa05144d6e9 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,6 +1,6 @@ # build -cython>=0.29.28,<3.0.0 -setuptools_scm[toml]~=7.0 +cython>=0.29.33,<4.0.0 +setuptools_scm[toml]~=8.0 oldest-supported-numpy>=2022.4.18 # run @@ -14,6 +14,7 @@ joblib pytest>=7.3.0 pytest-randomly pytest-xdist +pytest-cov # Pin on Win32 pywinpty; os_name == "nt" diff --git a/requirements.txt b/requirements.txt index 3e293f86e31..2d5bf7d9e2f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,5 +4,5 @@ numpy >=1.18,<2 # released December 2019 scipy>=1.4,!=1.9.2 # released December 2019 scipy>=1.4,!=1.9.2; sys_platform == "win32" # Blocked 1.9.2 due to Windows issues pandas>=1.0,!=2.1.0 # released January 2020, 2.1.0 blocked due to bug -patsy>=0.5.2 # released January 2018 +patsy>=0.5.4 # released January 2018 packaging>=21.3 # released Nov 2021 diff --git a/setup.cfg b/setup.cfg index 57c41e0abff..f18bd97ff67 100644 --- a/setup.cfg +++ b/setup.cfg @@ -23,7 +23,6 @@ filterwarnings = ignore:The --strict option is deprecated:pytest.PytestDeprecationWarning: error:genfromdta:FutureWarning error:StataReader:FutureWarning - error:Estimation of VARMA:statsmodels.tools.sm_exceptions.EstimationWarning error:Care should be used:UserWarning error::statsmodels.tools.sm_exceptions.HypothesisTestWarning error::statsmodels.tools.sm_exceptions.SpecificationWarning @@ -79,6 +78,21 @@ filterwarnings = error:A value is trying to be set on a copy:: error:Conversion of an array with ndim:DeprecationWarning: error:Series.__getitem__ treating keys:FutureWarning: + error:'Y' is deprecated, please use 'YE' instead:FutureWarning + error:'A' is deprecated, please use 'YE' instead:FutureWarning + error:'H' is deprecated and will:FutureWarning + error:'M' is deprecated, please use:FutureWarning + error:'Q' is deprecated, please use 'QE' instead:FutureWarning + error:'Q-DEC' is deprecated, please use 'QE-DEC' instead:FutureWarning + error:'Q-JAN' is deprecated, please use 'QE-JAN' instead:FutureWarning + error:'BQ-MAR' is deprecated, please use 'BQE-MAR' instead:FutureWarning + error:'AS-MAR' is deprecated and will be removed in a future version:FutureWarning + error:Downcasting object dtype arrays on:FutureWarning + error:The previous implementation of stack is deprecated:FutureWarning + error:Series.__setitem__ treating keys as positions is deprecated:FutureWarning + error:The provided callable:FutureWarning + error:divide by zero encountered in log1p:RuntimeWarning + markers = example: mark a test that runs example code matplotlib: mark a test that requires matplotlib diff --git a/setup.py b/setup.py index fb4adc7b4e3..7a353eff111 100644 --- a/setup.py +++ b/setup.py @@ -8,6 +8,7 @@ from setuptools import Command, Extension, find_packages, setup from setuptools.dist import Distribution +from packaging.version import parse from collections import defaultdict import fnmatch import inspect @@ -25,15 +26,16 @@ FORCE_C = int(os.environ.get("SM_FORCE_C", 0)) if FORCE_C: raise ImportError("Force import error for testing") - from Cython import Tempita + from Cython import Tempita, __version__ as cython_version from Cython.Build import cythonize from Cython.Distutils import build_ext HAS_CYTHON = True + CYTHON_3 = parse(cython_version) >= parse("3.0") except ImportError: from setuptools.command.build_ext import build_ext - HAS_CYTHON = False + HAS_CYTHON = CYTHON_3 = False try: import numpy # noqa: F401 @@ -57,7 +59,7 @@ for line in req.readlines(): DEVELOP_REQUIRES.append(line.split("#")[0].strip()) -CYTHON_MIN_VER = "0.29.26" # released 2020 +CYTHON_MIN_VER = "0.29.33" # released January 2023 EXTRAS_REQUIRE = { "build": ["cython>=" + CYTHON_MIN_VER], @@ -346,8 +348,9 @@ def process_tempita(source_name): for extension in extensions: requires_math = extension.name in EXT_REQUIRES_NUMPY_MATH_LIBS update_extension(extension, requires_math=requires_math) - if HAS_CYTHON: + if CYTHON_3: + COMPILER_DIRECTIVES["cpow"] = True extensions = cythonize( extensions, compiler_directives=COMPILER_DIRECTIVES, diff --git a/statsmodels/base/tests/test_optimize.py b/statsmodels/base/tests/test_optimize.py index 9a17f43a972..e550dafe678 100644 --- a/statsmodels/base/tests/test_optimize.py +++ b/statsmodels/base/tests/test_optimize.py @@ -85,7 +85,13 @@ def test_full_output_false(reset_randomstate): else: xopt, retvals = func( - dummy_func, dummy_score, [1], (), {}, full_output=False, disp=0 + dummy_func, + dummy_score, + [1.0], + (), + {}, + full_output=False, + disp=0 ) assert_(retvals is None) if method == "powell" and SP_LT_15: @@ -102,7 +108,7 @@ def test_full_output(reset_randomstate): xopt, retvals = func( dummy_func, dummy_score, - [1], + [1.0], (), {}, hess=dummy_hess, @@ -112,7 +118,13 @@ def test_full_output(reset_randomstate): else: xopt, retvals = func( - dummy_func, dummy_score, [1], (), {}, full_output=True, disp=0 + dummy_func, + dummy_score, + [1.0], + (), + {}, + full_output=True, + disp=0 ) assert_(retvals is not None) @@ -130,7 +142,7 @@ def test_minimize_scipy_slsqp(): xopt, _ = func( dummy_bounds_constraint_func, None, - (2, 0), + (2.0, 0.0), (), { "min_method": "SLSQP", diff --git a/statsmodels/compat/pandas.py b/statsmodels/compat/pandas.py index 5b804942032..3af29c946d5 100644 --- a/statsmodels/compat/pandas.py +++ b/statsmodels/compat/pandas.py @@ -32,11 +32,17 @@ "call_cached_func", "PD_LT_1_4", "PD_LT_2", + "MONTH_END", + "QUARTER_END", + "YEAR_END", + "FUTURE_STACK", ] version = parse(pd.__version__) -PD_LT_1_0_0 = version < Version("1.0.0") +PD_LT_2_2_0 = version < Version("2.1.99") +PD_LT_2_1_0 = version < Version("2.0.99") +PD_LT_1_0_0 = version < Version("0.99.0") PD_LT_1_4 = version < Version("1.3.99") PD_LT_2 = version < Version("1.9.99") @@ -174,3 +180,9 @@ def call_cached_func(cached_prop, *args, **kwargs): def get_cached_doc(cached_prop) -> Optional[str]: return get_cached_func(cached_prop).__doc__ + + +MONTH_END = "M" if PD_LT_2_2_0 else "ME" +QUARTER_END = "Q" if PD_LT_2_2_0 else "QE" +YEAR_END = "Y" if PD_LT_2_2_0 else "YE" +FUTURE_STACK = {} if PD_LT_2_1_0 else {"future_stack": True} diff --git a/statsmodels/discrete/tests/test_discrete.py b/statsmodels/discrete/tests/test_discrete.py index fb79818e3f8..c74d324212a 100644 --- a/statsmodels/discrete/tests/test_discrete.py +++ b/statsmodels/discrete/tests/test_discrete.py @@ -2586,16 +2586,15 @@ def test_cov_confint_pandas(): assert isinstance(ci.index, pd.MultiIndex) -def test_t_test(): +def test_mlogit_t_test(): # GH669, check t_test works in multivariate model - data = load_anes96() + data = sm.datasets.anes96.load() exog = sm.add_constant(data.exog, prepend=False) res1 = sm.MNLogit(data.endog, exog).fit(disp=0) r = np.ones(res1.cov_params().shape[0]) t1 = res1.t_test(r) f1 = res1.f_test(r) - data = sm.datasets.anes96.load() exog = sm.add_constant(data.exog, prepend=False) endog, exog = np.asarray(data.endog), np.asarray(exog) res2 = sm.MNLogit(endog, exog).fit(disp=0) @@ -2604,3 +2603,20 @@ def test_t_test(): assert_allclose(t1.effect, t2.effect) assert_allclose(f1.statistic, f2.statistic) + + tt = res1.t_test(np.eye(np.size(res2.params))) + assert_allclose(tt.tvalue.reshape(6,6, order="F"), res1.tvalues.to_numpy()) + tt = res2.t_test(np.eye(np.size(res2.params))) + assert_allclose(tt.tvalue.reshape(6,6, order="F"), res2.tvalues) + + wt = res1.wald_test(np.eye(np.size(res2.params))[0], scalar=True) + assert_allclose(wt.pvalue, res1.pvalues.to_numpy()[0, 0]) + + + tt = res1.t_test("y1_logpopul") + wt = res1.wald_test("y1_logpopul", scalar=True) + assert_allclose(tt.pvalue, wt.pvalue) + + wt = res1.wald_test("y1_logpopul, y2_logpopul", scalar=True) + # regression test + assert_allclose(wt.statistic, 5.68660562, rtol=1e-8) diff --git a/statsmodels/discrete/tests/test_sandwich_cov.py b/statsmodels/discrete/tests/test_sandwich_cov.py index 2d641207602..70738aa9415 100644 --- a/statsmodels/discrete/tests/test_sandwich_cov.py +++ b/statsmodels/discrete/tests/test_sandwich_cov.py @@ -9,6 +9,8 @@ import os import numpy as np import pandas as pd +import pytest + import statsmodels.discrete.discrete_model as smd from statsmodels.genmod.generalized_linear_model import GLM from statsmodels.genmod import families @@ -499,6 +501,21 @@ def test_score_test(self): res_lm2 = res2.score_test(exog_extra, cov_type='nonrobust') assert_allclose(np.hstack(res_lm1), np.hstack(res_lm2), rtol=5e-7) + def test_margeff(self): + if (isinstance(self.res2.model, OLS) or + hasattr(self.res1.model, "offset")): + pytest.skip("not available yet") + + marg1 = self.res1.get_margeff() + marg2 = self.res2.get_margeff() + assert_allclose(marg1.margeff, marg2.margeff, rtol=1e-10) + assert_allclose(marg1.margeff_se, marg2.margeff_se, rtol=1e-10) + + marg1 = self.res1.get_margeff(count=True, dummy=True) + marg2 = self.res2.get_margeff(count=True, dummy=True) + assert_allclose(marg1.margeff, marg2.margeff, rtol=1e-10) + assert_allclose(marg1.margeff_se, marg2.margeff_se, rtol=1e-10) + class TestGLMPoisson(CheckDiscreteGLM): diff --git a/statsmodels/emplike/originregress.py b/statsmodels/emplike/originregress.py index 15013c36639..28956a5e3b6 100644 --- a/statsmodels/emplike/originregress.py +++ b/statsmodels/emplike/originregress.py @@ -207,7 +207,7 @@ def el_test(self, b0_vals, param_nums, method='nm', def conf_int_el(self, param_num, upper_bound=None, lower_bound=None, sig=.05, method='nm', - stochastic_exog=1): + stochastic_exog=True): """ Returns the confidence interval for a regression parameter when the regression is forced through the origin. @@ -217,22 +217,20 @@ def conf_int_el(self, param_num, upper_bound=None, param_num : int The parameter number to be tested. Note this uses python indexing but the '0' parameter refers to the intercept term. - upper_bound : float The maximum value the upper confidence limit can be. The closer this is to the confidence limit, the quicker the computation. Default is .00001 confidence limit under normality. - lower_bound : float The minimum value the lower confidence limit can be. Default is .00001 confidence limit under normality. - sig : float, optional The significance level. Default .05. - method : str, optional Algorithm to optimize of nuisance params. Can be 'nm' or 'powell'. Default is 'nm'. + stochastic_exog : bool + Default is True. Returns ------- @@ -247,9 +245,14 @@ def conf_int_el(self, param_num, upper_bound=None, if lower_bound is None: ci = np.asarray(self.model.fit().conf_int(.0001)) lower_bound = (np.squeeze(ci[param_num])[0]) - f = lambda b0: self.el_test(np.array([b0]), param_num, - method=method, - stochastic_exog=stochastic_exog)[0] - r0 + + def f(b0): + b0 = np.array([b0]) + val = self.el_test( + b0, param_num, method=method, stochastic_exog=stochastic_exog + ) + return val[0] - r0 + _param = np.squeeze(self.params[param_num]) lowerl = optimize.brentq(f, np.squeeze(lower_bound), _param) upperl = optimize.brentq(f, _param, np.squeeze(upper_bound)) diff --git a/statsmodels/genmod/generalized_linear_model.py b/statsmodels/genmod/generalized_linear_model.py index c0799b3d7e8..849e7c61aaa 100644 --- a/statsmodels/genmod/generalized_linear_model.py +++ b/statsmodels/genmod/generalized_linear_model.py @@ -54,6 +54,7 @@ # need import in module instead of lazily to copy `__doc__` from . import families + __all__ = ['GLM', 'PredictionResultsMean'] @@ -624,6 +625,90 @@ def information(self, params, scale=None): scale = float_like(scale, "scale", optional=True) return self.hessian(params, scale=scale, observed=False) + def _derivative_exog(self, params, exog=None, transform="dydx", + dummy_idx=None, count_idx=None, + offset=None, exposure=None): + """ + Derivative of mean, expected endog with respect to the parameters + """ + if exog is None: + exog = self.exog + if (offset is not None) or (exposure is not None): + raise NotImplementedError("offset and exposure not supported") + + lin_pred = self.predict(params, exog, which="linear", + offset=offset, exposure=exposure) + + k_extra = getattr(self, 'k_extra', 0) + params_exog = params if k_extra == 0 else params[:-k_extra] + + margeff = (self.family.link.inverse_deriv(lin_pred)[:, None] * + params_exog) + if 'ex' in transform: + margeff *= exog + if 'ey' in transform: + mean = self.family.link.inverse(lin_pred) + margeff /= mean[:,None] + + return self._derivative_exog_helper(margeff, params, exog, + dummy_idx, count_idx, transform) + + def _derivative_exog_helper(self, margeff, params, exog, dummy_idx, + count_idx, transform): + """ + Helper for _derivative_exog to wrap results appropriately + """ + from statsmodels.discrete.discrete_margins import ( + _get_count_effects, + _get_dummy_effects, + ) + + if count_idx is not None: + margeff = _get_count_effects(margeff, exog, count_idx, transform, + self, params) + if dummy_idx is not None: + margeff = _get_dummy_effects(margeff, exog, dummy_idx, transform, + self, params) + + return margeff + + def _derivative_predict(self, params, exog=None, transform='dydx', + offset=None, exposure=None): + """ + Derivative of the expected endog with respect to the parameters. + + Parameters + ---------- + params : ndarray + parameter at which score is evaluated + exog : ndarray or None + Explanatory variables at which derivative are computed. + If None, then the estimation exog is used. + offset, exposure : None + Not yet implemented. + + Returns + ------- + The value of the derivative of the expected endog with respect + to the parameter vector. + """ + # core part is same as derivative_mean_params + # additionally handles exog and transform + if exog is None: + exog = self.exog + if (offset is not None) or (exposure is not None) or ( + getattr(self, 'offset', None) is not None): + raise NotImplementedError("offset and exposure not supported") + + lin_pred = self.predict(params, exog=exog, which="linear") + idl = self.family.link.inverse_deriv(lin_pred) + dmat = exog * idl[:, None] + if 'ey' in transform: + mean = self.family.link.inverse(lin_pred) + dmat /= mean[:, None] + + return dmat + def _deriv_mean_dparams(self, params): """ Derivative of the expected endog with respect to the parameters. @@ -2210,6 +2295,92 @@ def get_distribution(self, exog=None, exposure=None, mu, scale, var_weights=var_weights, **kwds) return distr + def get_margeff(self, at='overall', method='dydx', atexog=None, + dummy=False, count=False): + """Get marginal effects of the fitted model. + + Warning: offset, exposure and weights (var_weights and freq_weights) + are not supported by margeff. + + Parameters + ---------- + at : str, optional + Options are: + + - 'overall', The average of the marginal effects at each + observation. + - 'mean', The marginal effects at the mean of each regressor. + - 'median', The marginal effects at the median of each regressor. + - 'zero', The marginal effects at zero for each regressor. + - 'all', The marginal effects at each observation. If `at` is all + only margeff will be available from the returned object. + + Note that if `exog` is specified, then marginal effects for all + variables not specified by `exog` are calculated using the `at` + option. + method : str, optional + Options are: + + - 'dydx' - dy/dx - No transformation is made and marginal effects + are returned. This is the default. + - 'eyex' - estimate elasticities of variables in `exog` -- + d(lny)/d(lnx) + - 'dyex' - estimate semi-elasticity -- dy/d(lnx) + - 'eydx' - estimate semi-elasticity -- d(lny)/dx + + Note that tranformations are done after each observation is + calculated. Semi-elasticities for binary variables are computed + using the midpoint method. 'dyex' and 'eyex' do not make sense + for discrete variables. For interpretations of these methods + see notes below. + atexog : array_like, optional + Optionally, you can provide the exogenous variables over which to + get the marginal effects. This should be a dictionary with the key + as the zero-indexed column number and the value of the dictionary. + Default is None for all independent variables less the constant. + dummy : bool, optional + If False, treats binary variables (if present) as continuous. This + is the default. Else if True, treats binary variables as + changing from 0 to 1. Note that any variable that is either 0 or 1 + is treated as binary. Each binary variable is treated separately + for now. + count : bool, optional + If False, treats count variables (if present) as continuous. This + is the default. Else if True, the marginal effect is the + change in probabilities when each observation is increased by one. + + Returns + ------- + DiscreteMargins : marginal effects instance + Returns an object that holds the marginal effects, standard + errors, confidence intervals, etc. See + `statsmodels.discrete.discrete_margins.DiscreteMargins` for more + information. + + Notes + ----- + Interpretations of methods: + + - 'dydx' - change in `endog` for a change in `exog`. + - 'eyex' - proportional change in `endog` for a proportional change + in `exog`. + - 'dyex' - change in `endog` for a proportional change in `exog`. + - 'eydx' - proportional change in `endog` for a change in `exog`. + + When using after Poisson, returns the expected number of events per + period, assuming that the model is loglinear. + + Status : unsupported features offset, exposure and weights. Default + handling of freq_weights for average effect "overall" might change. + + """ + if getattr(self.model, "offset", None) is not None: + raise NotImplementedError("Margins with offset are not available.") + if (np.any(self.model.var_weights != 1) or + np.any(self.model.freq_weights != 1)): + warnings.warn("weights are not taken into account by margeff") + from statsmodels.discrete.discrete_margins import DiscreteMargins + return DiscreteMargins(self, (at, method, atexog, dummy, count)) @Appender(base.LikelihoodModelResults.remove_data.__doc__) def remove_data(self): diff --git a/statsmodels/graphics/tests/test_tsaplots.py b/statsmodels/graphics/tests/test_tsaplots.py index 2776bbc9b35..5ba24f02a0f 100644 --- a/statsmodels/graphics/tests/test_tsaplots.py +++ b/statsmodels/graphics/tests/test_tsaplots.py @@ -1,3 +1,4 @@ +from statsmodels.compat.pandas import MONTH_END from statsmodels.compat.python import lmap import calendar @@ -362,7 +363,9 @@ def test_predict_plot(use_pandas, model_and_args, alpha): y[i] += 1.8 * y[i - 1] - 0.9 * y[i - 2] y = y[100:] if use_pandas: - index = pd.date_range("1960-1-1", freq="M", periods=y.shape[0] + 24) + index = pd.date_range( + "1960-1-1", freq=MONTH_END, periods=y.shape[0] + 24 + ) start = index[index.shape[0] // 2] end = index[-1] y = pd.Series(y, index=index[:-24]) @@ -372,3 +375,14 @@ def test_predict_plot(use_pandas, model_and_args, alpha): res = model(y, **kwargs).fit() fig = plot_predict(res, start, end, alpha=alpha) assert isinstance(fig, plt.Figure) + + +@pytest.mark.matplotlib +def test_plot_pacf_small_sample(): + idx = [pd.Timestamp.now() + pd.Timedelta(seconds=i) for i in range(10)] + df = pd.DataFrame( + index=idx, + columns=["a"], + data=list(range(10)) + ) + plot_pacf(df) diff --git a/statsmodels/graphics/tsaplots.py b/statsmodels/graphics/tsaplots.py index ddeec5293cb..c3358cd2829 100644 --- a/statsmodels/graphics/tsaplots.py +++ b/statsmodels/graphics/tsaplots.py @@ -17,7 +17,7 @@ def _prepare_data_corr_plot(x, lags, zero): if lags is None: # GH 4663 - use a sensible default value nobs = x.shape[0] - lim = min(int(np.ceil(10 * np.log10(nobs))), nobs - 1) + lim = min(int(np.ceil(10 * np.log10(nobs))), nobs // 2) lags = np.arange(not zero, lim + 1) elif np.isscalar(lags): lags = np.arange(not zero, int(lags) + 1) # +1 for zero lag diff --git a/statsmodels/imputation/tests/test_mice.py b/statsmodels/imputation/tests/test_mice.py index 9086c509e1b..7d89260cdde 100644 --- a/statsmodels/imputation/tests/test_mice.py +++ b/statsmodels/imputation/tests/test_mice.py @@ -36,17 +36,17 @@ def gendat(): Create a data set with missing values. """ - np.random.seed(34243) + gen = np.random.RandomState(34243) n = 200 p = 5 - exog = np.random.normal(size=(n, p)) + exog = gen.normal(size=(n, p)) exog[:, 0] = exog[:, 1] - exog[:, 2] + 2*exog[:, 4] - exog[:, 0] += np.random.normal(size=n) + exog[:, 0] += gen.normal(size=n) exog[:, 2] = 1*(exog[:, 2] > 0) - endog = exog.sum(1) + np.random.normal(size=n) + endog = exog.sum(1) + gen.normal(size=n) df = pd.DataFrame(exog) df.columns = ["x%d" % k for k in range(1, p+1)] @@ -97,7 +97,12 @@ def test_default(self): fml = 'x1 ~ x2 + x3 + x4 + x5 + y' assert_equal(imp_data.conditional_formula['x1'], fml) - assert_equal(imp_data._cycle_order, ['x5', 'x3', 'x4', 'y', 'x2', 'x1']) + # Order of 3 and 4 is not deterministic + # since both have 10 missing + assert tuple(imp_data._cycle_order) in ( + ('x5', 'x3', 'x4', 'y', 'x2', 'x1'), + ('x5', 'x4', 'x3', 'y', 'x2', 'x1') + ) # Should make a copy assert not (df is imp_data.data) @@ -161,17 +166,21 @@ def test_pertmeth(self): assert_equal(imp_data.data.shape[1], ncol) assert_allclose(orig[mx], imp_data.data[mx]) - assert_equal(imp_data._cycle_order, ['x5', 'x3', 'x4', 'y', 'x2', 'x1']) - + # Order of 3 and 4 is not deterministic + # since both have 10 missing + assert tuple(imp_data._cycle_order) in ( + ('x5', 'x3', 'x4', 'y', 'x2', 'x1'), + ('x5', 'x4', 'x3', 'y', 'x2', 'x1') + ) def test_phreg(self): - np.random.seed(8742) + gen = np.random.RandomState(8742) n = 300 - x1 = np.random.normal(size=n) - x2 = np.random.normal(size=n) - event_time = np.random.exponential(size=n) * np.exp(x1) - obs_time = np.random.exponential(size=n) + x1 = gen.normal(size=n) + x2 = gen.normal(size=n) + event_time = gen.exponential(size=n) * np.exp(x1) + obs_time = gen.exponential(size=n) time = np.where(event_time < obs_time, event_time, obs_time) status = np.where(time == event_time, 1, 0) df = pd.DataFrame({"time": time, "status": status, "x1": x1, "x2": x2}) @@ -236,7 +245,13 @@ def test_set_imputer(self): fml = 'x4 ~ x1 + x2 + x3 + x5 + y' assert_equal(imp_data.conditional_formula['x4'], fml) - assert_equal(imp_data._cycle_order, ['x5', 'x3', 'x4', 'y', 'x2', 'x1']) + # Order of 3 and 4 is not deterministic + # since both have 10 missing + assert tuple(imp_data._cycle_order) in ( + ('x5', 'x3', 'x4', 'y', 'x2', 'x1'), + ('x5', 'x4', 'x3', 'y', 'x2', 'x1') + ) + @pytest.mark.matplotlib @@ -354,10 +369,10 @@ def test_MICE2(self): @pytest.mark.slow def t_est_combine(self): - np.random.seed(3897) - x1 = np.random.normal(size=300) - x2 = np.random.normal(size=300) - y = x1 + x2 + np.random.normal(size=300) + gen = np.random.RandomState(3897) + x1 = gen.normal(size=300) + x2 = gen.normal(size=300) + y = x1 + x2 + gen.normal(size=300) x1[0:100] = np.nan x2[250:] = np.nan df = pd.DataFrame({"x1": x1, "x2": x2, "y": y}) @@ -377,8 +392,8 @@ def t_est_combine(self): def test_micedata_miss1(): # test for #4375 - np.random.seed(0) - data = pd.DataFrame(np.random.rand(50, 4)) + gen = np.random.RandomState(3897) + data = pd.DataFrame(gen.rand(50, 4)) data.columns = ['var1', 'var2', 'var3', 'var4'] # one column with a single missing value data.iloc[1, 1] = np.nan diff --git a/statsmodels/iolib/summary.py b/statsmodels/iolib/summary.py index 01f850b8886..88f9f364c02 100644 --- a/statsmodels/iolib/summary.py +++ b/statsmodels/iolib/summary.py @@ -450,7 +450,11 @@ def summary_params(results, yname=None, xname=None, alpha=.05, use_t=True, params_stubs = xname exog_idx = lrange(len(xname)) - + params = np.asarray(params) + std_err = np.asarray(std_err) + tvalues = np.asarray(tvalues) + pvalues = np.asarray(pvalues) + conf_int = np.asarray(conf_int) params_data = lzip([forg(params[i], prec=4) for i in exog_idx], [forg(std_err[i]) for i in exog_idx], [forg(tvalues[i]) for i in exog_idx], @@ -469,7 +473,8 @@ def summary_params(results, yname=None, xname=None, alpha=.05, use_t=True, def summary_params_frame(results, yname=None, xname=None, alpha=.05, use_t=True): - '''create a summary table for the parameters + """ + Create a summary table for the parameters Parameters ---------- @@ -492,7 +497,7 @@ def summary_params_frame(results, yname=None, xname=None, alpha=.05, Returns ------- params_table : SimpleTable instance - ''' + """ # Parameters part of the summary table # ------------------------------------ @@ -529,7 +534,7 @@ def summary_params_frame(results, yname=None, xname=None, alpha=.05, def summary_params_2d(result, extras=None, endog_names=None, exog_names=None, title=None): - '''create summary table of regression parameters with several equations + """create summary table of regression parameters with several equations This allows interleaving of parameters with bse and/or tvalues @@ -555,7 +560,7 @@ def summary_params_2d(result, extras=None, endog_names=None, exog_names=None, the merged table with results concatenated for each row of the parameter array - ''' + """ if endog_names is None: # TODO: note the [1:] is specific to current MNLogit endog_names = ['endog_%d' % i for i in @@ -590,7 +595,7 @@ def summary_params_2d(result, extras=None, endog_names=None, exog_names=None, def summary_params_2dflat(result, endog_names=None, exog_names=None, alpha=0.05, use_t=True, keep_headers=True, endog_cols=False): - '''summary table for parameters that are 2d, e.g. multi-equation models + """summary table for parameters that are 2d, e.g. multi-equation models Parameters ---------- @@ -621,7 +626,7 @@ def summary_params_2dflat(result, endog_names=None, exog_names=None, alpha=0.05, the merged table with results concatenated for each row of the parameter array - ''' + """ res = result params = res.params @@ -669,7 +674,7 @@ def summary_params_2dflat(result, endog_names=None, exog_names=None, alpha=0.05, def table_extend(tables, keep_headers=True): - '''extend a list of SimpleTables, adding titles to header of subtables + """extend a list of SimpleTables, adding titles to header of subtables This function returns the merged table as a deepcopy, in contrast to the SimpleTable extend method. @@ -686,7 +691,7 @@ def table_extend(tables, keep_headers=True): table_all : SimpleTable merged tables as a single SimpleTable instance - ''' + """ from copy import deepcopy for ii, t in enumerate(tables[:]): #[1:]: t = deepcopy(t) @@ -762,11 +767,11 @@ def __repr__(self): return str(type(self)) + '\n"""\n' + self.__str__() + '\n"""' def _repr_html_(self): - '''Display as HTML in IPython notebook.''' + """Display as HTML in IPython notebook.""" return self.as_html() def _repr_latex_(self): - '''Display as LaTeX when converting IPython notebook to PDF.''' + """Display as LaTeX when converting IPython notebook to PDF.""" return self.as_latex() def add_table_2cols(self, res, title=None, gleft=None, gright=None, @@ -799,7 +804,7 @@ def add_table_2cols(self, res, title=None, gleft=None, gright=None, def add_table_params(self, res, yname=None, xname=None, alpha=.05, use_t=True): - '''create and add a table for the parameter estimates + """create and add a table for the parameter estimates Parameters ---------- @@ -820,7 +825,7 @@ def add_table_params(self, res, yname=None, xname=None, alpha=.05, ------- None : table is attached - ''' + """ if res.params.ndim == 1: table = summary_params(res, yname=yname, xname=xname, alpha=alpha, use_t=use_t) @@ -833,32 +838,32 @@ def add_table_params(self, res, yname=None, xname=None, alpha=.05, self.tables.append(table) def add_extra_txt(self, etext): - '''add additional text that will be added at the end in text format + """add additional text that will be added at the end in text format Parameters ---------- etext : list[str] string with lines that are added to the text output. - ''' + """ self.extra_txt = '\n'.join(etext) def as_text(self): - '''return tables as string + """return tables as string Returns ------- txt : str summary tables and extra text as one string - ''' + """ txt = summary_return(self.tables, return_fmt='text') if self.extra_txt is not None: txt = txt + '\n\n' + self.extra_txt return txt def as_latex(self): - '''return tables as string + """return tables as string Returns ------- @@ -871,35 +876,35 @@ def as_latex(self): It is recommended to use `as_latex_tabular` directly on the individual tables. - ''' + """ latex = summary_return(self.tables, return_fmt='latex') if self.extra_txt is not None: latex = latex + '\n\n' + self.extra_txt.replace('\n', ' \\newline\n ') return latex def as_csv(self): - '''return tables as string + """return tables as string Returns ------- csv : str concatenated summary tables in comma delimited format - ''' + """ csv = summary_return(self.tables, return_fmt='csv') if self.extra_txt is not None: csv = csv + '\n\n' + self.extra_txt return csv def as_html(self): - '''return tables as string + """return tables as string Returns ------- html : str concatenated summary tables in HTML format - ''' + """ html = summary_return(self.tables, return_fmt='html') if self.extra_txt is not None: html = html + '

' + self.extra_txt.replace('\n', '
') diff --git a/statsmodels/iolib/summary2.py b/statsmodels/iolib/summary2.py index 64d85acdf27..d874096a29e 100644 --- a/statsmodels/iolib/summary2.py +++ b/statsmodels/iolib/summary2.py @@ -1,3 +1,4 @@ +from statsmodels.compat.pandas import FUTURE_STACK from statsmodels.compat.python import lzip import datetime @@ -416,7 +417,7 @@ def _col_params(result, float_format='%.4f', stars=True, include_r2=False): res.loc[idx, res.columns[0]] = res.loc[idx, res.columns[0]] + '*' # Stack Coefs and Std.Errors res = res.iloc[:, :2] - res = res.stack() + res = res.stack(**FUTURE_STACK) # Add R-squared if include_r2: @@ -520,10 +521,21 @@ def summary_col(results, float_format='%.4f', model_names=(), stars=False, cols[i].columns = [colnames[i]] def merg(x, y): - return x.merge(y, how='outer', right_index=True, - left_index=True) + return x.merge(y, how='outer', right_index=True, left_index=True) + + # Changes due to how pandas 2.2.0 handles merge + index = list(cols[0].index) + for col in cols[1:]: + for key in col.index: + if key not in index: + index.append(key) + for special in (('R-squared', ''), ('R-squared Adj.', '')): + if special in index: + index.remove(special) + index.insert(len(index), special) summ = reduce(merg, cols) + summ = summ.reindex(index) if regressor_order: varnames = summ.index.get_level_values(0).tolist() @@ -538,7 +550,7 @@ def merg(x, y): if drop_omitted: for uo in unordered: new_order.remove(uo) - summ = summ.loc[new_order] + summ = summ.reindex(new_order, level=0) idx = [] index = summ.index.get_level_values(0) @@ -561,10 +573,6 @@ def merg(x, y): for df, name in zip(cols, _make_unique([df.columns[0] for df in cols])): df.columns = [name] - def merg(x, y): - return x.merge(y, how='outer', right_index=True, - left_index=True) - info = reduce(merg, cols) dat = pd.DataFrame(np.vstack([summ, info])) # pd.concat better, but error dat.columns = summ.columns diff --git a/statsmodels/iolib/tests/test_summary2.py b/statsmodels/iolib/tests/test_summary2.py index 0d342949a40..f18c45ea09b 100644 --- a/statsmodels/iolib/tests/test_summary2.py +++ b/statsmodels/iolib/tests/test_summary2.py @@ -70,13 +70,14 @@ def test_summarycol_float_format(self): reg2 = OLS(y2, x).fit() actual = summary_col([reg1, reg2], float_format='%0.1f').as_text() actual = '%s\n' % actual - assert_equal(actual, desired) starred = summary_col([reg1, reg2], stars=True, float_format='%0.1f') assert "7.7***" in str(starred) assert "12.4**" in str(starred) assert "12.4***" not in str(starred) + assert_equal(actual, desired) + def test_summarycol_drop_omitted(self): # gh-3702 x = [1, 5, 7, 3, 5] diff --git a/statsmodels/multivariate/cancorr.py b/statsmodels/multivariate/cancorr.py index f714c0fe8b4..408d9548ce0 100644 --- a/statsmodels/multivariate/cancorr.py +++ b/statsmodels/multivariate/cancorr.py @@ -34,9 +34,9 @@ class CanCorr(Model): See Parameters. cancorr : ndarray The canonical correlation values - y_cancoeff : ndarray + y_cancoef : ndarray The canonical coefficients for endog - x_cancoeff : ndarray + x_cancoef : ndarray The canonical coefficients for exog References diff --git a/statsmodels/regression/linear_model.py b/statsmodels/regression/linear_model.py index baab82c287a..34166588e93 100644 --- a/statsmodels/regression/linear_model.py +++ b/statsmodels/regression/linear_model.py @@ -796,7 +796,7 @@ def loglike(self, params): -\frac{n}{2}\left(1+\log\left(\frac{2\pi}{n}\right)\right) +\frac{1}{2}\log\left(\left|W\right|\right) - where :math:`W` is a diagonal weight matrix and + where :math:`W` is a diagonal weight matrix, :math:`\left|W\right|` is its determinant, and :math:`SSR=\left(Y-\hat{Y}\right)^\prime W \left(Y-\hat{Y}\right)` is the sum of the squared weighted residuals. diff --git a/statsmodels/regression/tests/test_lme.py b/statsmodels/regression/tests/test_lme.py index 3e453f956e0..571dcc8d5a7 100644 --- a/statsmodels/regression/tests/test_lme.py +++ b/statsmodels/regression/tests/test_lme.py @@ -1076,7 +1076,11 @@ def test_summary_col(): mod2 = MixedLM.from_formula('X ~ Y', d, groups=d['IDS']) results2 = mod2.fit(start_params=sp2) - out = summary_col([results1, results2], stars=True) + out = summary_col( + [results1, results2], + stars=True, + regressor_order=["Group Var", "Intercept", "X", "Y"] + ) s = ('\n=============================\n Y X \n' '-----------------------------\nGroup Var 0.1955 1.3854 \n' ' (0.6032) (2.7377) \nIntercept -1.2672 3.4842* \n' diff --git a/statsmodels/src/math.pxd b/statsmodels/src/math.pxd index 9ed4555268a..6812e2f0197 100644 --- a/statsmodels/src/math.pxd +++ b/statsmodels/src/math.pxd @@ -6,16 +6,16 @@ from libc.string cimport memcpy cdef extern from "numpy/npy_math.h": np.float64_t NPY_PI - np.float64_t npy_cabs(np.npy_cdouble z) nogil - np.npy_cdouble npy_clog(np.npy_cdouble z) nogil - np.npy_cdouble npy_cexp(np.npy_cdouble z) nogil + np.float64_t npy_cabs(np.npy_cdouble z) noexcept nogil + np.npy_cdouble npy_clog(np.npy_cdouble z) noexcept nogil + np.npy_cdouble npy_cexp(np.npy_cdouble z) noexcept nogil -cdef inline np.float64_t zabs(np.complex128_t z) nogil: +cdef inline np.float64_t zabs(np.complex128_t z) noexcept nogil: cdef np.npy_cdouble x memcpy(&x, &z, sizeof(z)) return npy_cabs(x) -cdef inline np.complex128_t zlog(np.complex128_t z) nogil: +cdef inline np.complex128_t zlog(np.complex128_t z) noexcept nogil: cdef np.npy_cdouble x cdef np.complex128_t out memcpy(&x, &z, sizeof(z)) @@ -23,7 +23,7 @@ cdef inline np.complex128_t zlog(np.complex128_t z) nogil: memcpy(&out, &x, sizeof(x)) return out -cdef inline np.complex128_t zexp(np.complex128_t z) nogil: +cdef inline np.complex128_t zexp(np.complex128_t z) noexcept nogil: cdef np.npy_cdouble x cdef np.complex128_t out memcpy(&x, &z, sizeof(z)) diff --git a/statsmodels/stats/_adnorm.py b/statsmodels/stats/_adnorm.py index 05b01620b3f..bed38c4fd02 100644 --- a/statsmodels/stats/_adnorm.py +++ b/statsmodels/stats/_adnorm.py @@ -5,6 +5,8 @@ Author: Josef Perktold and Scipy developers License : BSD-3 """ +import warnings + import numpy as np from scipy import stats @@ -64,8 +66,12 @@ def anderson_statistic(x, dist='norm', fit=True, params=(), axis=0): sl2 = [slice(None)] * x.ndim sl2[axis] = slice(None, None, -1) sl2 = tuple(sl2) - s = np.sum((2 * i[sl1] - 1.0) / nobs * (np.log(z) + np.log1p(-z[sl2])), - axis=axis) + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", message="divide by zero encountered in log1p" + ) + ad_values = (2 * i[sl1] - 1.0) / nobs * (np.log(z) + np.log1p(-z[sl2])) + s = np.sum(ad_values, axis=axis) a2 = -nobs - s return a2 diff --git a/statsmodels/stats/_delta_method.py b/statsmodels/stats/_delta_method.py index 931080a6128..956e1531617 100644 --- a/statsmodels/stats/_delta_method.py +++ b/statsmodels/stats/_delta_method.py @@ -151,6 +151,9 @@ def var(self): """ g = self.grad() var = (np.dot(g, self.cov_params) * g).sum(-1) + + if var.ndim == 2: + var = var.T return var def se_vectorized(self): diff --git a/statsmodels/stats/anova.py b/statsmodels/stats/anova.py index e6bb583fe2e..e32bc7b85e7 100644 --- a/statsmodels/stats/anova.py +++ b/statsmodels/stats/anova.py @@ -1,14 +1,18 @@ +from statsmodels.compat.python import lrange + import numpy as np -from scipy import stats import pandas as pd from pandas import DataFrame, Index import patsy +from scipy import stats -from statsmodels.regression.linear_model import OLS -from statsmodels.compat.python import lrange -from statsmodels.formula.formulatools import (_remove_intercept_patsy, - _has_intercept, _intercept_idx) +from statsmodels.formula.formulatools import ( + _has_intercept, + _intercept_idx, + _remove_intercept_patsy, +) from statsmodels.iolib import summary2 +from statsmodels.regression.linear_model import OLS def _get_covariance(model, robust): @@ -489,7 +493,7 @@ def __init__(self, data, depvar, subject, within=None, between=None, self.subject = subject if aggregate_func == 'mean': - self.aggregate_func = np.mean + self.aggregate_func = pd.Series.mean else: self.aggregate_func = aggregate_func @@ -640,7 +644,9 @@ def summary(self): if __name__ == "__main__": import pandas + from statsmodels.formula.api import ols + # in R #library(car) #write.csv(Moore, "moore.csv", row.names=FALSE) diff --git a/statsmodels/stats/descriptivestats.py b/statsmodels/stats/descriptivestats.py index 751c554047b..b4366e1b5c8 100644 --- a/statsmodels/stats/descriptivestats.py +++ b/statsmodels/stats/descriptivestats.py @@ -1,10 +1,11 @@ -from statsmodels.compat.pandas import Appender, is_numeric_dtype +from statsmodels.compat.pandas import PD_LT_2, Appender, is_numeric_dtype from statsmodels.compat.scipy import SP_LT_19 -from statsmodels.compat.pandas import PD_LT_2 + from typing import Sequence, Union import numpy as np import pandas as pd + if PD_LT_2: from pandas.core.dtypes.common import is_categorical_dtype else: @@ -438,7 +439,8 @@ def _mode(ser): _df = df.copy() for col in df: if is_extension_array_dtype(df[col].dtype): - _df[col] = _df[col].astype(object).fillna(np.nan) + if _df[col].isnull().any(): + _df[col] = _df[col].fillna(np.nan) except ImportError: pass @@ -587,7 +589,8 @@ def summary(self) -> SimpleTable: A table instance supporting export to text, csv and LaTeX """ df = self.frame.astype(object) - df = df.fillna("") + if df.isnull().any().any(): + df = df.fillna("") cols = [str(col) for col in df.columns] stubs = [str(idx) for idx in df.index] data = [] diff --git a/statsmodels/stats/diagnostic.py b/statsmodels/stats/diagnostic.py index 504061bff79..07c1c4ecdce 100644 --- a/statsmodels/stats/diagnostic.py +++ b/statsmodels/stats/diagnostic.py @@ -32,12 +32,22 @@ from scipy import stats from statsmodels.regression.linear_model import OLS, RegressionResultsWrapper +from statsmodels.stats._adnorm import anderson_statistic, normal_ad +from statsmodels.stats._lilliefors import ( + kstest_exponential, + kstest_fit, + kstest_normal, + lilliefors, +) +from statsmodels.tools.validation import ( + array_like, + bool_like, + dict_like, + float_like, + int_like, + string_like, +) from statsmodels.tsa.tsatools import lagmat -from statsmodels.tools.validation import (array_like, int_like, bool_like, - string_like, dict_like, float_like) -from statsmodels.stats._lilliefors import (kstest_fit, lilliefors, - kstest_normal, kstest_exponential) -from statsmodels.stats._adnorm import normal_ad, anderson_statistic __all__ = ["kstest_fit", "lilliefors", "kstest_normal", "kstest_exponential", "normal_ad", "compare_cox", "compare_j", "acorr_breusch_godfrey", @@ -1062,7 +1072,7 @@ def linear_reset(res, power=3, test_type="fitted", use_f=False, raise ValueError("power must contains distinct integers all >= 2") exog = res.model.exog if test_type == "fitted": - aug = res.fittedvalues[:, None] + aug = np.asarray(res.fittedvalues)[:, None] elif test_type == "exog": # Remove constant and binary aug = res.model.exog @@ -1283,7 +1293,7 @@ def linear_lm(resid, exog, func=None): if func is None: def func(x): return np.power(x, 2) - + exog = np.asarray(exog) exog_aux = np.column_stack((exog, func(exog[:, 1:]))) nobs, k_vars = exog.shape @@ -1609,7 +1619,7 @@ def breaks_cusumolsresid(resid, ddof=0): Ploberger, Werner, and Walter Kramer. “The Cusum Test with OLS Residuals.” Econometrica 60, no. 2 (March 1992): 271-285. """ - resid = resid.ravel() + resid = np.asarray(resid).ravel() nobs = len(resid) nobssigma2 = (resid ** 2).sum() if ddof > 0: diff --git a/statsmodels/stats/oneway.py b/statsmodels/stats/oneway.py index 04dbd1d97da..943006d9644 100644 --- a/statsmodels/stats/oneway.py +++ b/statsmodels/stats/oneway.py @@ -586,7 +586,7 @@ def anova_oneway(data, groups=None, use_var="unequal", welch_correction=True, This is the default. "equal" : Variances are assumed to be equal across samples. This is the standard Anova. - "bf: Variances are not assumed to be equal across samples. + "bf" : Variances are not assumed to be equal across samples. The method is Browne-Forsythe (1971) for testing equality of means with the corrected degrees of freedom by Merothra. The original BF degrees of freedom are available as additional attributes in the diff --git a/statsmodels/stats/tests/test_anova_rm.py b/statsmodels/stats/tests/test_anova_rm.py index 007cb0f3218..b5b97e49717 100644 --- a/statsmodels/stats/tests/test_anova_rm.py +++ b/statsmodels/stats/tests/test_anova_rm.py @@ -1,10 +1,13 @@ from statsmodels.compat.pandas import assert_frame_equal +from numpy.testing import ( + assert_array_almost_equal, + assert_equal, + assert_raises, +) import pandas as pd -import numpy as np + from statsmodels.stats.anova import AnovaRM -from numpy.testing import (assert_array_almost_equal, assert_raises, - assert_equal) DV = [7, 3, 6, 6, 5, 8, 6, 7, 7, 11, 9, 11, 10, 10, 11, 11, @@ -141,7 +144,7 @@ def test_repeated_measures_aggregation(): df1 = AnovaRM(data, 'DV', 'id', within=['A', 'B', 'D']).fit() double_data = pd.concat([data, data], axis=0) df2 = AnovaRM(double_data, 'DV', 'id', within=['A', 'B', 'D'], - aggregate_func=np.mean).fit() + aggregate_func=pd.Series.mean).fit() assert_frame_equal(df1.anova_table, df2.anova_table) @@ -152,7 +155,7 @@ def test_repeated_measures_aggregation_one_subject_duplicated(): data2 = data2.reset_index() df2 = AnovaRM(data2, 'DV', 'id', within=['A', 'B', 'D'], - aggregate_func=np.mean).fit() + aggregate_func=pd.Series.mean).fit() assert_frame_equal(df1.anova_table, df2.anova_table) @@ -163,9 +166,9 @@ def test_repeated_measures_aggregate_func(): within=['A', 'B', 'D']) m1 = AnovaRM(double_data, 'DV', 'id', within=['A', 'B', 'D'], - aggregate_func=np.mean) + aggregate_func=pd.Series.mean) m2 = AnovaRM(double_data, 'DV', 'id', within=['A', 'B', 'D'], - aggregate_func=np.median) + aggregate_func=pd.Series.median) assert_raises(AssertionError, assert_equal, m1.aggregate_func, m2.aggregate_func) @@ -175,7 +178,7 @@ def test_repeated_measures_aggregate_func(): def test_repeated_measures_aggregate_func_mean(): double_data = pd.concat([data, data], axis=0) m1 = AnovaRM(double_data, 'DV', 'id', within=['A', 'B', 'D'], - aggregate_func=np.mean) + aggregate_func=pd.Series.mean) m2 = AnovaRM(double_data, 'DV', 'id', within=['A', 'B', 'D'], aggregate_func='mean') @@ -197,7 +200,7 @@ def test_repeated_measures_aggregate_compare_with_ezANOVA(): double_data = pd.concat([data, data], axis=0) df = (AnovaRM(double_data, 'DV', 'id', within=['A', 'B', 'D'], - aggregate_func=np.mean) + aggregate_func=pd.Series.mean) .fit() .anova_table) diff --git a/statsmodels/stats/tests/test_diagnostic.py b/statsmodels/stats/tests/test_diagnostic.py index b1f803ed7eb..b05698e6414 100644 --- a/statsmodels/stats/tests/test_diagnostic.py +++ b/statsmodels/stats/tests/test_diagnostic.py @@ -1957,3 +1957,43 @@ def test_small_skip(reset_randomstate): # M2 + fit(M1)-exp(fit(M2)) < 2.22e-16 *** # --- # Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 + +@pytest.mark.smoke +def test_diagnostics_pandas(reset_randomstate): + # GH 8879 + n = 100 + df = pd.DataFrame( + { + "y": np.random.rand(n), + "x": np.random.rand(n), + "z": np.random.rand(n)} + ) + y, x = df["y"], add_constant(df["x"]) + + res = OLS(df["y"], add_constant(df[["x"]])).fit() + res_large = OLS(df["y"], add_constant(df[["x", "z"]])).fit() + res_other = OLS(df["y"], add_constant(df[["z"]])).fit() + smsdia.linear_reset(res_large) + smsdia.linear_reset(res_large, test_type="fitted") + smsdia.linear_reset(res_large, test_type="exog") + smsdia.linear_reset(res_large, test_type="princomp") + smsdia.het_goldfeldquandt(y, x) + smsdia.het_breuschpagan(res.resid, x) + smsdia.het_white(res.resid, x) + smsdia.het_arch(res.resid) + smsdia.acorr_breusch_godfrey(res) + smsdia.acorr_ljungbox(y) + smsdia.linear_rainbow(res) + smsdia.linear_lm(res.resid, x) + smsdia.linear_harvey_collier(res) + smsdia.acorr_lm(res.resid) + smsdia.breaks_cusumolsresid(res.resid) + smsdia.breaks_hansen(res) + smsdia.compare_cox(res, res_other) + smsdia.compare_encompassing(res, res_other) + smsdia.compare_j(res, res_other) + smsdia.recursive_olsresiduals(res) + smsdia.recursive_olsresiduals( + res, order_by=np.arange(y.shape[0] - 1, 0 - 1, -1) + ) + smsdia.spec_white(res.resid, x) diff --git a/statsmodels/stats/tests/test_weightstats.py b/statsmodels/stats/tests/test_weightstats.py index 3ac7f9a39f6..60a4a37bd12 100644 --- a/statsmodels/stats/tests/test_weightstats.py +++ b/statsmodels/stats/tests/test_weightstats.py @@ -677,6 +677,45 @@ def test_ztest_ztost(): ztest_larger_mu_1s.method = 'One-sample z-Test' ztest_larger_mu_1s.data_name = 'x' +# > zt = z.test(x, sigma.x=0.46436662631627995, y, sigma.y=0.7069805008424409) +# > cat_items(zt, "ztest_unequal.") +ztest_unequal = Holder() +ztest_unequal.statistic = 6.12808151466544 +ztest_unequal.p_value = 8.89450168270109e-10 +ztest_unequal.conf_int = np.array([1.19415646579981, 2.31720717056382]) +ztest_unequal.estimate = np.array([7.01818181818182, 5.2625]) +ztest_unequal.null_value = 0 +ztest_unequal.alternative = 'two.sided' +ztest_unequal.usevar = 'unequal' +ztest_unequal.method = 'Two-sample z-Test' +ztest_unequal.data_name = 'x and y' + +# > zt = z.test(x, sigma.x=0.46436662631627995, y, sigma.y=0.7069805008424409, alternative="less") +# > cat_items(zt, "ztest_smaller_unequal.") +ztest_smaller_unequal = Holder() +ztest_smaller_unequal.statistic = 6.12808151466544 +ztest_smaller_unequal.p_value = 0.999999999555275 +ztest_smaller_unequal.conf_int = np.array([np.nan, 2.22692874913371]) +ztest_smaller_unequal.estimate = np.array([7.01818181818182, 5.2625]) +ztest_smaller_unequal.null_value = 0 +ztest_smaller_unequal.alternative = 'less' +ztest_smaller_unequal.usevar = 'unequal' +ztest_smaller_unequal.method = 'Two-sample z-Test' +ztest_smaller_unequal.data_name = 'x and y' + +# > zt = z.test(x, sigma.x=0.46436662631627995, y, sigma.y=0.7069805008424409, alternative="greater") +# > cat_items(zt, "ztest_larger_unequal.") +ztest_larger_unequal = Holder() +ztest_larger_unequal.statistic = 6.12808151466544 +ztest_larger_unequal.p_value = 4.44725034576265e-10 +ztest_larger_unequal.conf_int = np.array([1.28443488722992, np.nan]) +ztest_larger_unequal.estimate = np.array([7.01818181818182, 5.2625]) +ztest_larger_unequal.null_value = 0 +ztest_larger_unequal.alternative = 'greater' +ztest_larger_unequal.usevar = 'unequal' +ztest_larger_unequal.method = 'Two-sample z-Test' +ztest_larger_unequal.data_name = 'x and y' + alternatives = {'less': 'smaller', 'greater': 'larger', @@ -733,6 +772,14 @@ def test(self): alternative=alternatives[tc.alternative]) assert_allclose(ci, tc_conf_int - tc.null_value, rtol=1e-10) + # unequal variances + for tc in [ztest_unequal, ztest_smaller_unequal, ztest_larger_unequal]: + zstat, pval = ztest(x1, x2, value=tc.null_value, + alternative=alternatives[tc.alternative], + usevar="unequal") + assert_allclose(zstat, tc.statistic, rtol=1e-10) + assert_allclose(pval, tc.p_value, rtol=1e-10, atol=1e-16) + # 1 sample test copy-paste d1 = self.d1 for tc in [ztest_mu_1s, ztest_smaller_mu_1s, ztest_larger_mu_1s]: diff --git a/statsmodels/stats/weightstats.py b/statsmodels/stats/weightstats.py index 9af1f5eed7e..57f136c9e91 100644 --- a/statsmodels/stats/weightstats.py +++ b/statsmodels/stats/weightstats.py @@ -1510,10 +1510,10 @@ def ztest( 'larger' : H1: difference in means larger than value 'smaller' : H1: difference in means smaller than value - usevar : str, 'pooled' - Currently, only 'pooled' is implemented. + usevar : str, 'pooled' or 'unequal' If ``pooled``, then the standard deviation of the samples is assumed to be - the same. see CompareMeans.ztest_ind for different options. + the same. If ``unequal``, then the standard deviation of the sample is + assumed to be different. ddof : int Degrees of freedom use in the calculation of the variance of the mean estimate. In the case of comparing means this is one, however it can @@ -1528,35 +1528,38 @@ def ztest( Notes ----- - usevar not implemented, is always pooled in two sample case - use CompareMeans instead. + usevar can be pooled or unequal in two sample case """ # TODO: this should delegate to CompareMeans like ttest_ind # However that does not implement ddof - # usevar is not used, always pooled + # usevar can be pooled or unequal - if usevar != "pooled": - raise NotImplementedError('only usevar="pooled" is implemented') + if usevar not in {"pooled", "unequal"}: + raise NotImplementedError('usevar can only be "pooled" or "unequal"') x1 = np.asarray(x1) nobs1 = x1.shape[0] x1_mean = x1.mean(0) x1_var = x1.var(0) + if x2 is not None: x2 = np.asarray(x2) nobs2 = x2.shape[0] x2_mean = x2.mean(0) x2_var = x2.var(0) - var_pooled = nobs1 * x1_var + nobs2 * x2_var - var_pooled /= nobs1 + nobs2 - 2 * ddof - var_pooled *= 1.0 / nobs1 + 1.0 / nobs2 + if usevar == "pooled": + var = nobs1 * x1_var + nobs2 * x2_var + var /= nobs1 + nobs2 - 2 * ddof + var *= 1.0 / nobs1 + 1.0 / nobs2 + elif usevar == "unequal": + var = x1_var / (nobs1 - ddof) + x2_var / (nobs2 - ddof) else: - var_pooled = x1_var / (nobs1 - ddof) + var = x1_var / (nobs1 - ddof) x2_mean = 0 - std_diff = np.sqrt(var_pooled) + std_diff = np.sqrt(var) # stat = x1_mean - x2_mean - value return _zstat_generic(x1_mean, x2_mean, std_diff, alternative, diff=value) diff --git a/statsmodels/tools/_testing.py b/statsmodels/tools/_testing.py index 54f5c2861d4..3c1408b413d 100644 --- a/statsmodels/tools/_testing.py +++ b/statsmodels/tools/_testing.py @@ -40,6 +40,7 @@ def __call__(self, extra_args=None, exit=False): print('Running pytest ' + ' '.join(cmd)) status = pytest.main(cmd) if exit: + print(f"Exit status: {status}") sys.exit(status) except ImportError: raise ImportError('pytest>=3 required to run the test') diff --git a/statsmodels/tools/numdiff.py b/statsmodels/tools/numdiff.py index 10296aa6108..c77df81b4e2 100644 --- a/statsmodels/tools/numdiff.py +++ b/statsmodels/tools/numdiff.py @@ -93,7 +93,7 @@ def _get_epsilon(x, s, epsilon, n): if epsilon is None: - h = EPS**(1. / s) * np.maximum(np.abs(x), 0.1) + h = EPS**(1. / s) * np.maximum(np.abs(np.asarray(x)), 0.1) else: if np.isscalar(epsilon): h = np.empty(n) diff --git a/statsmodels/tsa/ardl/_pss_critical_values/pss-process.py b/statsmodels/tsa/ardl/_pss_critical_values/pss-process.py index 256dcd428a3..6cd0fc75cd5 100644 --- a/statsmodels/tsa/ardl/_pss_critical_values/pss-process.py +++ b/statsmodels/tsa/ardl/_pss_critical_values/pss-process.py @@ -1,3 +1,5 @@ +from statsmodels.compat.pandas import FUTURE_STACK + from collections import defaultdict import glob import os @@ -45,7 +47,7 @@ crit_vals[key] = [round(val, 7) for val in list(row)] def setup_regressors(df, low_pow=3, high_pow=3, cut=70, log=False): - s = df.stack().reset_index() + s = df.stack(**FUTURE_STACK).reset_index() q = s.level_0 / 10000 y = stats.norm.ppf(q) cv = s[0] diff --git a/statsmodels/tsa/base/tests/test_base.py b/statsmodels/tsa/base/tests/test_base.py index ccdb1daef93..0dc9dc7c345 100644 --- a/statsmodels/tsa/base/tests/test_base.py +++ b/statsmodels/tsa/base/tests/test_base.py @@ -1,3 +1,5 @@ +from statsmodels.compat.pandas import PD_LT_2_2_0 + from datetime import datetime import numpy as np @@ -9,6 +11,7 @@ from statsmodels.tools.testing import assert_equal from statsmodels.tsa.base.tsa_model import TimeSeriesModel +YE_APR = "A-APR" if PD_LT_2_2_0 else "YE-APR" def test_pandas_nodates_index(): @@ -43,12 +46,13 @@ def test_predict_freq(): # there's a bug in pandas up to 0.10.2 for YearBegin #dates = date_range("1972-4-1", "2007-4-1", freq="AS-APR") - dates = pd.date_range("1972-4-30", "2006-4-30", freq="A-APR") + + dates = pd.date_range("1972-4-30", "2006-4-30", freq=YE_APR) series = pd.Series(x, index=dates) model = TimeSeriesModel(series) #npt.assert_(model.data.freq == "AS-APR") # two possabilities due to future changes in pandas 2.2+ - assert model._index.freqstr in ("Y-APR", "A-APR") + assert model._index.freqstr in ("Y-APR", "A-APR", "YE-APR") start, end, out_of_sample, _ = ( model._get_prediction_index("2006-4-30", "2016-4-30")) @@ -57,7 +61,7 @@ def test_predict_freq(): #expected_dates = date_range("2006-12-31", "2016-12-31", # freq="AS-APR") - expected_dates = pd.date_range("2006-4-30", "2016-4-30", freq="A-APR") + expected_dates = pd.date_range("2006-4-30", "2016-4-30", freq=YE_APR) assert_equal(predict_dates, expected_dates) #ptesting.assert_series_equal(predict_dates, expected_dates) @@ -66,7 +70,7 @@ def test_keyerror_start_date(): x = np.arange(1,36.) # dates = date_range("1972-4-1", "2007-4-1", freq="AS-APR") - dates = pd.date_range("1972-4-30", "2006-4-30", freq="A-APR") + dates = pd.date_range("1972-4-30", "2006-4-30", freq=YE_APR) series = pd.Series(x, index=dates) model = TimeSeriesModel(series) diff --git a/statsmodels/tsa/base/tests/test_prediction.py b/statsmodels/tsa/base/tests/test_prediction.py index 1fc46a5b44c..4721be67be4 100644 --- a/statsmodels/tsa/base/tests/test_prediction.py +++ b/statsmodels/tsa/base/tests/test_prediction.py @@ -1,3 +1,5 @@ +from statsmodels.compat.pandas import MONTH_END + import numpy as np import pandas as pd import pytest @@ -12,7 +14,7 @@ def data(request): variance = np.arange(1, 11.0) if not request.param: return mean, variance - idx = pd.date_range("2000-1-1", periods=10, freq="M") + idx = pd.date_range("2000-1-1", periods=10, freq=MONTH_END) return pd.Series(mean, index=idx), pd.Series(variance, index=idx) diff --git a/statsmodels/tsa/base/tests/test_tsa_indexes.py b/statsmodels/tsa/base/tests/test_tsa_indexes.py index 0d607fc3931..ae6939ddd2f 100644 --- a/statsmodels/tsa/base/tests/test_tsa_indexes.py +++ b/statsmodels/tsa/base/tests/test_tsa_indexes.py @@ -9,7 +9,7 @@ Author: Chad Fulton License: BSD-3 """ -from statsmodels.compat.pandas import is_int_index +from statsmodels.compat.pandas import PD_LT_2_2_0, YEAR_END, is_int_index import warnings @@ -30,15 +30,16 @@ pd.DataFrame(base_dta), ] +TWO_QE_DEC = "2Q-DEC" if PD_LT_2_2_0 else "2QE-DEC" base_date_indexes = [ # (usual candidates) pd.date_range(start="1950-01-01", periods=nobs, freq="D"), pd.date_range(start="1950-01-01", periods=nobs, freq="W"), pd.date_range(start="1950-01-01", periods=nobs, freq="MS"), pd.date_range(start="1950-01-01", periods=nobs, freq="QS"), - pd.date_range(start="1950-01-01", periods=nobs, freq="Y"), + pd.date_range(start="1950-01-01", periods=nobs, freq=YEAR_END), # (some more complicated frequencies) - pd.date_range(start="1950-01-01", periods=nobs, freq="2Q-DEC"), + pd.date_range(start="1950-01-01", periods=nobs, freq=TWO_QE_DEC), pd.date_range(start="1950-01-01", periods=nobs, freq="2QS"), pd.date_range(start="1950-01-01", periods=nobs, freq="5s"), pd.date_range(start="1950-01-01", periods=nobs, freq="1D10min"), @@ -221,7 +222,7 @@ def test_instantiation_valid(): if freq is None: freq = ix.freq if not isinstance(freq, str): - freq = freq.freqstr + freq = ix.freqstr assert_equal( isinstance(mod._index, (pd.DatetimeIndex, pd.PeriodIndex)), True, @@ -242,7 +243,7 @@ def test_instantiation_valid(): if freq is None: freq = ix.freq if not isinstance(freq, str): - freq = freq.freqstr + freq = ix.freqstr assert_equal( isinstance(mod._index, (pd.DatetimeIndex, pd.PeriodIndex)), True, @@ -296,7 +297,7 @@ def test_instantiation_valid(): if freq is None: freq = ix.freq if not isinstance(freq, str): - freq = freq.freqstr + freq = ix.freqstr assert_equal( isinstance(mod._index, (pd.DatetimeIndex, pd.PeriodIndex)), True, diff --git a/statsmodels/tsa/deterministic.py b/statsmodels/tsa/deterministic.py index 8d45c8189e7..6944730310e 100644 --- a/statsmodels/tsa/deterministic.py +++ b/statsmodels/tsa/deterministic.py @@ -1,4 +1,9 @@ -from statsmodels.compat.pandas import Appender, is_int_index, to_numpy +from statsmodels.compat.pandas import ( + PD_LT_2_2_0, + Appender, + is_int_index, + to_numpy, +) from abc import ABC, abstractmethod import datetime as dt @@ -742,19 +747,31 @@ class CalendarSeasonality(CalendarDeterministicTerm): _is_dummy = True # out_of: freq - _supported = { - "W": {"H": 24 * 7, "B": 5, "D": 7, "h": 24 * 7}, - "D": {"H": 24, "h": 24}, - "Q": {"M": 3, "MS": 3}, - "A": {"M": 12, "MS": 12, "Q": 4} - } + if PD_LT_2_2_0: + _supported = { + "W": {"B": 5, "D": 7, "h": 24 * 7, "H": 24 * 7}, + "D": {"h": 24, "H": 24}, + "Q": {"MS": 3, "M": 3}, + "A": {"MS": 12, "M": 12}, + "Y": {"MS": 12, "Q": 4, "M": 12}, + } + else: + _supported = { + "W": {"B": 5, "D": 7, "h": 24 * 7}, + "D": {"h": 24}, + "Q": {"MS": 3, "ME": 3}, + "A": {"MS": 12, "ME": 12, "QE": 4}, + "Y": {"MS": 12, "ME": 12, "QE": 4}, + "QE": {"ME": 3}, + "YE": {"ME": 12, "QE": 4}, + } def __init__(self, freq: str, period: str) -> None: freq_options: Set[str] = set() freq_options.update( *[list(val.keys()) for val in self._supported.values()] ) - period_options = list(self._supported.keys()) + period_options = tuple(self._supported.keys()) freq = string_like( freq, "freq", options=tuple(freq_options), lower=False @@ -784,7 +801,7 @@ def period(self) -> str: def _weekly_to_loc( self, index: Union[pd.DatetimeIndex, pd.PeriodIndex] ) -> np.ndarray: - if self._freq.freqstr == "H": + if self._freq.freqstr in ("h", "H"): return index.hour + 24 * index.dayofweek elif self._freq.freqstr == "D": return index.dayofweek @@ -811,7 +828,7 @@ def _quarterly_to_loc( def _annual_to_loc( self, index: Union[pd.DatetimeIndex, pd.PeriodIndex] ) -> np.ndarray: - if self._freq.freqstr == "M": + if self._freq.freqstr in ("M", "ME", "MS"): return index.month - 1 else: # "Q" return index.quarter - 1 @@ -823,9 +840,9 @@ def _get_terms( locs = self._daily_to_loc(index) elif self._period == "W": locs = self._weekly_to_loc(index) - elif self._period == "Q": + elif self._period in ("Q", "QE"): locs = self._quarterly_to_loc(index) - else: # "A": + else: # "A", "Y": locs = self._annual_to_loc(index) full_cycle = self._supported[self._period][self._freq_str] terms = np.zeros((locs.shape[0], full_cycle)) diff --git a/statsmodels/tsa/filters/tests/test_filters.py b/statsmodels/tsa/filters/tests/test_filters.py index 836efc5afd2..30fafc2b785 100644 --- a/statsmodels/tsa/filters/tests/test_filters.py +++ b/statsmodels/tsa/filters/tests/test_filters.py @@ -1,4 +1,8 @@ -from statsmodels.compat.pandas import assert_frame_equal, make_dataframe +from statsmodels.compat.pandas import ( + QUARTER_END, + assert_frame_equal, + make_dataframe, +) from datetime import datetime @@ -566,7 +570,7 @@ def test_cfitz_filter(): def test_bking_pandas(): # 1d dta = macrodata.load_pandas().data - index = date_range(start='1959-01-01', end='2009-10-01', freq='Q') + index = date_range(start='1959-01-01', end='2009-10-01', freq=QUARTER_END) dta.index = index filtered = bkfilter(dta["infl"]) nd_filtered = bkfilter(dta['infl'].values) @@ -587,7 +591,7 @@ def test_bking_pandas(): def test_cfitz_pandas(): # 1d dta = macrodata.load_pandas().data - index = date_range(start='1959-01-01', end='2009-10-01', freq='Q') + index = date_range(start='1959-01-01', end='2009-10-01', freq=QUARTER_END) dta.index = index cycle, trend = cffilter(dta["infl"]) ndcycle, ndtrend = cffilter(dta['infl'].values) @@ -607,7 +611,7 @@ def test_cfitz_pandas(): def test_hpfilter_pandas(): dta = macrodata.load_pandas().data - index = date_range(start='1959-01-01', end='2009-10-01', freq='Q') + index = date_range(start='1959-01-01', end='2009-10-01', freq=QUARTER_END) dta.index = index cycle, trend = hpfilter(dta["realgdp"]) ndcycle, ndtrend = hpfilter(dta['realgdp'].values) @@ -626,11 +630,11 @@ def setup_class(cls): 232, 429, 3, 98, 43, -141, -77, -13, 125, 361, -45, 184] cls.data = DataFrame(data, date_range(start='1/1/1951', periods=len(data), - freq='Q')) + freq=QUARTER_END)) data[9] = np.nan cls.datana = DataFrame(data, date_range(start='1/1/1951', periods=len(data), - freq='Q')) + freq=QUARTER_END)) from .results import filter_results cls.expected = filter_results diff --git a/statsmodels/tsa/forecasting/tests/test_stl.py b/statsmodels/tsa/forecasting/tests/test_stl.py index f673886b5e7..9ad1b0995c7 100644 --- a/statsmodels/tsa/forecasting/tests/test_stl.py +++ b/statsmodels/tsa/forecasting/tests/test_stl.py @@ -1,3 +1,5 @@ +from statsmodels.compat.pandas import MONTH_END + import numpy as np from numpy.testing import assert_allclose import pandas as pd @@ -20,7 +22,7 @@ def data(request): rs = np.random.RandomState(987654321) err = rs.standard_normal(500) - index = pd.date_range("1980-1-1", freq="M", periods=500) + index = pd.date_range("1980-1-1", freq=MONTH_END, periods=500) fourier = Fourier(12, 1) terms = fourier.in_sample(index) det = np.squeeze(np.asarray(terms @ np.array([[2], [1]]))) diff --git a/statsmodels/tsa/holtwinters/tests/test_holtwinters.py b/statsmodels/tsa/holtwinters/tests/test_holtwinters.py index f9a1e8cbba8..6521eac13f9 100644 --- a/statsmodels/tsa/holtwinters/tests/test_holtwinters.py +++ b/statsmodels/tsa/holtwinters/tests/test_holtwinters.py @@ -2,6 +2,7 @@ Author: Terence L van Zyl Modified: Kevin Sheppard """ +from statsmodels.compat.pandas import MONTH_END from statsmodels.compat.pytest import pytest_warns import os @@ -108,7 +109,7 @@ def ses(): for i in range(1, 1200): y[i] = y[i - 1] + e[i] - 0.2 * e[i - 1] y = y[200:] - index = pd.date_range("2000-1-1", periods=y.shape[0], freq="M") + index = pd.date_range("2000-1-1", periods=y.shape[0], freq=MONTH_END) return pd.Series(y, index=index, name="y") @@ -1995,7 +1996,7 @@ def test_forecast_index_types(ses, index_typ): if index_typ == "period": index = pd.period_range("2000-1-1", periods=nobs + 36, freq="M") elif index_typ == "date_range": - index = pd.date_range("2000-1-1", periods=nobs + 36, freq="M") + index = pd.date_range("2000-1-1", periods=nobs + 36, freq=MONTH_END) elif index_typ == "range": index = pd.RangeIndex(nobs + 36) kwargs["seasonal_periods"] = 12 diff --git a/statsmodels/tsa/regime_switching/_hamilton_filter.pyx.in b/statsmodels/tsa/regime_switching/_hamilton_filter.pyx.in index 2528eca4ce1..6cba3be6880 100644 --- a/statsmodels/tsa/regime_switching/_hamilton_filter.pyx.in +++ b/statsmodels/tsa/regime_switching/_hamilton_filter.pyx.in @@ -114,7 +114,7 @@ cdef void {{prefix}}hamilton_filter_log_iteration(int t, int k_regimes, int orde {{cython_type}} [:] curr_predicted_joint_probabilities, {{cython_type}} [:] prev_filtered_joint_probabilities, {{cython_type}} [:] curr_filtered_joint_probabilities, - {{cython_type}} [:] tmp_predicted_joint_probabilities) nogil: + {{cython_type}} [:] tmp_predicted_joint_probabilities) noexcept nogil: cdef int i, j, k, ix cdef: int k_regimes_order_m1 = k_regimes**(order - 1) diff --git a/statsmodels/tsa/regime_switching/_kim_smoother.pyx.in b/statsmodels/tsa/regime_switching/_kim_smoother.pyx.in index d207a627da8..d42dc3377b8 100644 --- a/statsmodels/tsa/regime_switching/_kim_smoother.pyx.in +++ b/statsmodels/tsa/regime_switching/_kim_smoother.pyx.in @@ -85,7 +85,7 @@ cdef void {{prefix}}kim_smoother_log_iteration(int tt, int k_regimes, int order, {{cython_type}} [:] predicted_joint_probabilities, {{cython_type}} [:] filtered_joint_probabilities, {{cython_type}} [:] prev_smoothed_joint_probabilities, - {{cython_type}} [:] next_smoothed_joint_probabilities) nogil: + {{cython_type}} [:] next_smoothed_joint_probabilities) noexcept nogil: cdef int t, i, j, k, ix cdef: int k_regimes_order_m1 = k_regimes**(order - 1) diff --git a/statsmodels/tsa/statespace/_tools.pyx.in b/statsmodels/tsa/statespace/_tools.pyx.in index 1dd17d44942..8947329168f 100644 --- a/statsmodels/tsa/statespace/_tools.pyx.in +++ b/statsmodels/tsa/statespace/_tools.pyx.in @@ -1189,7 +1189,7 @@ cpdef _{{prefix}}compute_smoothed_state_weights( {{cython_type}} scale, int compute_prior_weights=False): cdef: - int t, i, j, n_t, n_j, min_j, max_j, p_t, p_j + int t, i, j, n_t, n_j, min_j, max_j, p_t, p_j, t0 int n = model.nobs int p = model.k_endog int m = model.k_states @@ -1231,26 +1231,31 @@ cpdef _{{prefix}}compute_smoothed_state_weights( t = compute_t[i] store_t[t] = True + # Determine the first index that we need to compute + t0 = min(min_j, compute_t[0]) + # Let double-letters denote indexing that starts from t0 + nn = n - t0 + # Flags filter_collapsed = kfilter.filter_method & 0x20 # Create required storage and temporary matrices - # Hi_W = np.zeros((p, m, n), order='F') - # K = np.zeros((p, m, n), order='F') - # weights = np.zeros((m, p, n, n), order='F') * np.nan - # state_intercept_weights = np.zeros((m, m, n, n), order='F') * np.nan + # Hi_W = np.zeros((p, m, nn), order='F') + # K = np.zeros((p, m, nn), order='F') + # weights = np.zeros((m, p, nn, nn), order='F') * np.nan + # state_intercept_weights = np.zeros((m, m, nn, nn), order='F') * np.nan # tmp = np.zeros((m, m), order='F') # F = np.zeros((p, p), order='F') # Fi = np.zeros((p, p), order='F') # FiZ = np.zeros((p, m), order='F') - dim4[0] = m; dim4[1] = p; dim4[2] = n; dim4[3] = n; + dim4[0] = m; dim4[1] = p; dim4[2] = nn; dim4[3] = nn; weights = np.PyArray_ZEROS(4, dim4, {{typenum}}, FORTRAN) * np.nan - dim4[0] = m; dim4[1] = m; dim4[2] = n; dim4[3] = n; + dim4[0] = m; dim4[1] = m; dim4[2] = nn; dim4[3] = nn; state_intercept_weights = np.PyArray_ZEROS(4, dim4, {{typenum}}, FORTRAN) * np.nan - dim3[0] = p; dim3[1] = m; dim3[2] = n; + dim3[0] = p; dim3[1] = m; dim3[2] = nn; Hi_W = np.PyArray_ZEROS(3, dim3, {{typenum}}, FORTRAN) K = np.PyArray_ZEROS(3, dim3, {{typenum}}, FORTRAN) - dim3[0] = m; dim3[1] = m; dim3[2] = n; + dim3[0] = m; dim3[1] = m; dim3[2] = nn; # Note: the order here is (m, l, t), where the `l` dimension refers to the # element of the initial state vector, and the `m, t` dimensions refer to # the element / time point of the smoothed state vector. @@ -1264,7 +1269,12 @@ cpdef _{{prefix}}compute_smoothed_state_weights( FiZ = np.PyArray_ZEROS(2, dim2, {{typenum}}, FORTRAN) # First pass to compute required quantities, if necessary - for t in range(n): + for t in range(t0, n): + # Recall that double-letters denotes indexes that start from t0 + # Hi_W[:, :, nn], K[:, :, nn], weights[:, :, nn, nn], + # state_intercept_weights[:, :, nn, nn], prior_weights[:, :, nn] + tt = t - t0 + p_t = p - model.nmissing[t] # Handling of exact diffuse periods is not currently available @@ -1273,9 +1283,9 @@ cpdef _{{prefix}}compute_smoothed_state_weights( if (t < kfilter.nobs_diffuse or (not store_t[t] and (t < min_j or t > max_j))): # No need to set this when using the dictionary - # K[t] = np.zeros((m, p_t)) * np.nan - # Hi_W[t] is already initialized to NaNs, so can skip: - # Hi_W[t, :] = np.nan + # K[tt] = np.zeros((m, p_t)) * np.nan + # Hi_W[tt] is already initialized to NaNs, so can skip: + # Hi_W[tt, :] = np.nan continue # Move the statespace representation to the time period, but do not @@ -1286,8 +1296,8 @@ cpdef _{{prefix}}compute_smoothed_state_weights( if p_t == 0: # No need to set this when using the dictionary # K.append(np.zeros((m, 0))) - # Hi_W[t] is already initialized to NaNs, so can skip: - # Hi_W[t, :] = np.nan + # Hi_W[tt] is already initialized to NaNs, so can skip: + # Hi_W[tt, :] = np.nan continue # Get filtered objects @@ -1329,10 +1339,10 @@ cpdef _{{prefix}}compute_smoothed_state_weights( blas.{{prefix}}gemm("N", "N", &model._k_states, &model._k_endog, &model._k_states, &alpha, model._transition, &kfilter.k_states, &tmp[0, 0], &kfilter.k_states, - &beta, &K[0, 0, t], &kfilter.k_states) + &beta, &K[0, 0, tt], &kfilter.k_states) _FiZ = &FiZ[0, 0] - _K = &K[0, 0, t] + _K = &K[0, 0, tt] else: _FiZ = &kfilter.tmp3[0, 0, t] _K = &kfilter.kalman_gain[0, 0, t] @@ -1340,7 +1350,7 @@ cpdef _{{prefix}}compute_smoothed_state_weights( # Compute required quantites # W_j = H_j (F_j^{-1} Z_j - K_j' N_j L_j) # => H_j^{-1} W_j = F_j^{-1} Z_j - K_j' N_j L_j - blas.{{prefix}}copy(&pm, _FiZ, &inc, &Hi_W[0, 0, t], &inc) + blas.{{prefix}}copy(&pm, _FiZ, &inc, &Hi_W[0, 0, tt], &inc) scalar = 1. / scale blas.{{prefix}}gemm("N", "N", &model._k_states, &model._k_states, &model._k_states, @@ -1351,11 +1361,16 @@ cpdef _{{prefix}}compute_smoothed_state_weights( blas.{{prefix}}gemm("T", "N", &model._k_endog, &model._k_states, &model._k_states, &gamma, _K, &kfilter.k_states, kfilter._tmp0, &kfilter.k_states, - &scalar, &Hi_W[0, 0, t], &model.k_endog) + &scalar, &Hi_W[0, 0, tt], &model.k_endog) # Second pass to compute weights # for t in compute_t: for t in range(n): + # Recall that double-letters denotes indexes that start from t0 + # Hi_W[:, :, nn], K[:, :, nn], weights[:, :, nn, nn], + # state_intercept_weights[:, :, nn, nn], prior_weights[:, :, nn] + tt = t - t0 + if t < kfilter.nobs_diffuse or not store_t[t]: continue @@ -1374,9 +1389,13 @@ cpdef _{{prefix}}compute_smoothed_state_weights( # This is the weight of the prior on the smoothed state at # time t == 0 if t == 0 and compute_prior_weights: - blas.{{prefix}}copy(&kfilter.k_states2, kfilter._tmp0, &inc, &prior_weights[0, 0, t], &inc) + blas.{{prefix}}copy(&kfilter.k_states2, kfilter._tmp0, &inc, &prior_weights[0, 0, tt], &inc) for j in range(t - 1, min_j - 1, -1): + # Again, recall that double-letters denotes indexes that start + # from t0 + jj = j - t0 + # Since we are not handling any diffuse observations now, we can # stop as soon as we get to the diffuse periods if j < kfilter.nobs_diffuse: @@ -1386,7 +1405,7 @@ cpdef _{{prefix}}compute_smoothed_state_weights( compute_K = kfilter.univariate_filter[t] or filter_collapsed if compute_K: - _K = &K[0, 0, j] + _K = &K[0, 0, jj] else: _K = &kfilter.kalman_gain[0, 0, j] @@ -1399,15 +1418,15 @@ cpdef _{{prefix}}compute_smoothed_state_weights( # L[j] = \prod_i L_{j, i} if store_j[j]: if p_j > 0: - # weights[:, :p_j, t, j] = tmp @ K[j][:, :p_j] + # weights[:, :p_j, tt, jj] = tmp @ K[jj][:, :p_j] blas.{{prefix}}gemm("N", "N", &model._k_states, &p_j, &model._k_states, &alpha, kfilter._tmp0, &kfilter.k_states, _K, &kfilter.k_states, - &beta, &weights[0, 0, t, j], &kfilter.k_states) + &beta, &weights[0, 0, tt, jj], &kfilter.k_states) - # state_intercept_weights[:, :, t, j] = -tmp + # state_intercept_weights[:, :, tt, jj] = -tmp blas.{{prefix}}copy(&kfilter.k_states2, kfilter._tmp0, &inc, - &state_intercept_weights[0, 0, t, j], &inc) + &state_intercept_weights[0, 0, tt, jj], &inc) if j > min_j or (j == 0 and compute_prior_weights): # tmp = tmp @ L @@ -1421,7 +1440,7 @@ cpdef _{{prefix}}compute_smoothed_state_weights( # This is the weight of the prior on the smoothed state at # time t > 0 if j == 0 and compute_prior_weights: - blas.{{prefix}}copy(&kfilter.k_states2, kfilter._tmp0, &inc, &prior_weights[0, 0, t], &inc) + blas.{{prefix}}copy(&kfilter.k_states2, kfilter._tmp0, &inc, &prior_weights[0, 0, tt], &inc) # For j == t # For the univariate case, some notes: @@ -1434,16 +1453,17 @@ cpdef _{{prefix}}compute_smoothed_state_weights( # incorporated all the information from time t. Then j = t but i < p_t # would follow the j < t formula. j = t + jj = tt p_j = p - model.nmissing[t] if store_j[j]: if p_j > 0: - # weights[:, :p_j, t, j] = Pt @ Hi_W[j, :p_j].T + # weights[:, :p_j, tt, jj] = Pt @ Hi_W[j, :p_j].T blas.{{prefix}}gemm("N", "T", &model._k_states, &p_j, &model._k_states, &scale, &kfilter.predicted_state_cov[0, 0, t], &kfilter.k_states, - &Hi_W[0, 0, j], &model.k_endog, - &beta, &weights[0, 0, t, j], &kfilter.k_states) + &Hi_W[0, 0, jj], &model.k_endog, + &beta, &weights[0, 0, tt, jj], &kfilter.k_states) - # state_intercept_weights[:, :, t, j] = -(Pt @ Lt' @ Nt) + # state_intercept_weights[:, :, tt, jj] = -(Pt @ Lt' @ Nt) # Note: normally would need to multiply Pt by scale and divide Nt # by `scale`, but since they're being multiplied here, the scale terms # cancel out. @@ -1454,7 +1474,7 @@ cpdef _{{prefix}}compute_smoothed_state_weights( blas.{{prefix}}gemm("N", "N", &model._k_states, &model._k_states, &model._k_states, &alpha, &kfilter.predicted_state_cov[0, 0, t], &kfilter.k_states, kfilter._tmp00, &kfilter.k_states, - &beta, &state_intercept_weights[0, 0, t, j], &kfilter.k_states) + &beta, &state_intercept_weights[0, 0, tt, jj], &kfilter.k_states) # Forwards from j = t+1, t+2, ..., max_j # tmp = Pt @@ -1471,14 +1491,17 @@ cpdef _{{prefix}}compute_smoothed_state_weights( kfilter._tmp0, &inc) for j in range(t + 1, max_j + 1): + # Again, recall that double-letters denotes indexes that start + # from t0 + jj = j - t0 p_j = p - model.nmissing[j] if store_j[j] and p_j > 0: - # weights[:, :p_j, t, j] = tmp @ Hi_W[j, :p_j].T + # weights[:, :p_j, tt, jj] = tmp @ Hi_W[j, :p_j].T blas.{{prefix}}gemm("N", "T", &model._k_states, &p_j, &model._k_states, &alpha, kfilter._tmp0, &kfilter.k_states, - &Hi_W[0, 0, j], &p, - &beta, &weights[0, 0, t, j], &kfilter.k_states) + &Hi_W[0, 0, jj], &p, + &beta, &weights[0, 0, tt, jj], &kfilter.k_states) # tmp = tmp @ L[j].T blas.{{prefix}}gemm("N", "T", &model._k_states, &model._k_states, &model._k_states, @@ -1489,12 +1512,12 @@ cpdef _{{prefix}}compute_smoothed_state_weights( kfilter._tmp0, &inc) if store_j[j]: - # state_intercept_weights[:, :, t, j] = -(tmp @ N_j) + # state_intercept_weights[:, :, tt, jj] = -(tmp @ N_j) scalar = -1 / scale blas.{{prefix}}gemm("N", "N", &model._k_states, &model._k_states, &model._k_states, &scalar, kfilter._tmp00, &kfilter.k_states, &smoother.scaled_smoothed_estimator_cov[0, 0, j + 1], &kfilter.k_states, - &beta, &state_intercept_weights[0, 0, t, j], &kfilter.k_states) + &beta, &state_intercept_weights[0, 0, tt, jj], &kfilter.k_states) return np.array(weights), np.array(state_intercept_weights), np.array(prior_weights), np.array(Hi_W) diff --git a/statsmodels/tsa/statespace/dynamic_factor_mq.py b/statsmodels/tsa/statespace/dynamic_factor_mq.py index ad03001fe89..1a58e0a0152 100644 --- a/statsmodels/tsa/statespace/dynamic_factor_mq.py +++ b/statsmodels/tsa/statespace/dynamic_factor_mq.py @@ -3193,8 +3193,9 @@ def simulate(self, params, nsimulations, measurement_shocks=None, if use_pandas: # pd.Series (k_endog=1, replications=None) if len(shape) == 1: - sim *= self._endog_std.iloc[0] - sim += self._endog_mean.iloc[0] + std = self._endog_std.iloc[0] + mean = self._endog_mean.iloc[0] + sim = sim * std + mean # pd.DataFrame (k_endog > 1, replications=None) # [or] # pd.DataFrame with MultiIndex (replications > 0) @@ -3672,9 +3673,9 @@ def get_prediction(self, start=None, end=None, dynamic=False, def news(self, comparison, impact_date=None, impacted_variable=None, start=None, end=None, periods=None, exog=None, - comparison_type=None, state_index=None, return_raw=False, - tolerance=1e-10, endog_quarterly=None, original_scale=True, - **kwargs): + comparison_type=None, revisions_details_start=False, + state_index=None, return_raw=False, tolerance=1e-10, + endog_quarterly=None, original_scale=True, **kwargs): """ Compute impacts from updated data (news and revisions). @@ -3757,6 +3758,7 @@ def news(self, comparison, impact_date=None, impacted_variable=None, comparison, impact_date=impact_date, impacted_variable=impacted_variable, start=start, end=end, periods=periods, exog=exog, comparison_type=comparison_type, + revisions_details_start=revisions_details_start, state_index=state_index, return_raw=return_raw, tolerance=tolerance, endog_quarterly=endog_quarterly, **kwargs) @@ -3776,11 +3778,18 @@ def news(self, comparison, impact_date=None, impacted_variable=None, if news_results.revision_impacts is not None: news_results.revision_impacts = ( news_results.revision_impacts * endog_std) + if news_results.revision_detailed_impacts is not None: + news_results.revision_detailed_impacts = ( + news_results.revision_detailed_impacts * endog_std) + if news_results.revision_grouped_impacts is not None: + news_results.revision_grouped_impacts = ( + news_results.revision_grouped_impacts * endog_std) # Update forecasts for name in ['prev_impacted_forecasts', 'news', 'revisions', 'update_realized', 'update_forecasts', - 'revised', 'revised_prev', 'post_impacted_forecasts']: + 'revised', 'revised_prev', 'post_impacted_forecasts', + 'revisions_all', 'revised_all', 'revised_prev_all']: dta = getattr(news_results, name) # for pd.Series, dta.multiply(...) and (sometimes) dta.add(...) diff --git a/statsmodels/tsa/statespace/kalman_smoother.py b/statsmodels/tsa/statespace/kalman_smoother.py index 253a02cef9d..134662d2af1 100644 --- a/statsmodels/tsa/statespace/kalman_smoother.py +++ b/statsmodels/tsa/statespace/kalman_smoother.py @@ -1057,7 +1057,7 @@ def smoothed_state_autocovariance(self, lag=1, t=None, start=None, return acov def news(self, previous, t=None, start=None, end=None, - revised=None, design=None, state_index=None): + revisions_details_start=True, design=None, state_index=None): r""" Compute the news and impacts associated with a data release @@ -1080,6 +1080,15 @@ def news(self, previous, t=None, start=None, end=None, since it is an exclusive endpoint, the returned news do not include the value at this index. Cannot be used in combination with the `t` argument. + revisions_details_start : bool or int, optional + The period at which to beging computing the detailed impacts of + data revisions. Any revisions prior to this period will have their + impacts grouped together. If a negative integer, interpreted as + an offset from the end of the dataset. If set to True, detailed + impacts are computed for all revisions, while if set to False, all + revisions are grouped together. Default is False. Note that for + large models, setting this to be near the beginning of the sample + can cause this function to be slow. design : array, optional Design matrix for the period `t` in time-varying models. If this model has a time-varying design matrix, and the argument `t` is out @@ -1102,8 +1111,8 @@ def news(self, previous, t=None, start=None, end=None, the news. It is equivalent to E[y^i | post] - E[y^i | revision], where y^i are the variables of interest. In [1]_, this is described as "revision" in equation (17). - - `revision_impacts`: update to forecasts of variables impacted - variables from data revisions. It is + - `revision_detailed_impacts`: update to forecasts of variables + impacted variables from data revisions. It is E[y^i | revision] - E[y^i | previous], and does not have a specific notation in [1]_, since there for simplicity they assume that there are no revisions. @@ -1112,18 +1121,26 @@ def news(self, previous, t=None, start=None, end=None, were newly incorporated in a data release (but not including revisions to data points that already existed in the previous release). In [1]_, this is described as "news" in equation (17). - - `revisions` + - `revisions`: y^r(updated) - y^r(previous) for periods in + which detailed impacts were computed + - `revisions_all` : y^r(updated) - y^r(previous) for all revisions - `gain`: the gain matrix associated with the "Kalman-like" update from the news, E[y I'] E[I I']^{-1}. In [1]_, this can be found in the equation For E[y_{k,t_k} \mid I_{v+1}] in the middle of page 17. - - `revision_weights` + - `revision_weights` weights on observations for the smoothed + signal - `update_forecasts`: forecasts of the updated periods used to construct the news, E[y^u | previous]. - `update_realized`: realizations of the updated periods used to construct the news, y^u. - - `revised_prev` - - `revised` + - `revised`: revised observations of the periods that were revised + and for which detailed impacts were computed + - `revised`: revised observations of the periods that were revised + - `revised_prev`: previous observations of the periods that were + revised and for which detailed impacts were computed + - `revised_prev_all`: previous observations of the periods that + were revised and for which detailed impacts were computed - `prev_impacted_forecasts`: previous forecast of the periods of interest, E[y^i | previous]. - `post_impacted_forecasts`: forecast of the periods of interest @@ -1131,8 +1148,18 @@ def news(self, previous, t=None, start=None, end=None, E[y^i | post]. - `revision_results`: results object that updates the `previous` results to take into account data revisions. + - `revision_results`: results object associated with the revisions + - `revision_impacts`: total impacts from all revisions (both + grouped and detailed) - `revisions_ix`: list of `(t, i)` positions of revisions in endog + - `revisions_details`: list of `(t, i)` positions of revisions to + endog for which details of impacts were computed + - `revisions_grouped`: list of `(t, i)` positions of revisions to + endog for which impacts were grouped + - `revisions_details_start`: period in which revision details start + to be computed - `updates_ix`: list of `(t, i)` positions of updates to endog + - `state_index`: index of state variables used to compute impacts Notes ----- @@ -1233,21 +1260,67 @@ def news(self, previous, t=None, start=None, end=None, post_impacted_forecasts = self.predict( start=start, end=end).smoothed_forecasts - # Get revision weights, impacts, and forecasts + # Separate revisions into those with detailed impacts and those where + # impacts are grouped together + if revisions_details_start is True: + revisions_details_start = 0 + elif revisions_details_start is False: + revisions_details_start = previous.nobs + elif revisions_details_start < 0: + revisions_details_start = previous.nobs + revisions_details_start + + revisions_grouped = [] + revisions_details = [] + if revisions_details_start > 0: + for s, i in revisions_ix: + if s < revisions_details_start: + revisions_grouped.append((s, i)) + else: + revisions_details.append((s, i)) + else: + revisions_details = revisions_ix + + # Practically, don't compute impacts of revisions prior to first + # point that was actually revised if len(revisions_ix) > 0: + revisions_details_start = max(revisions_ix[0][0], + revisions_details_start) + + # Setup default (empty) output for revisions + revised_endog = None + revised_all = None + revised_prev_all = None + revisions_all = None + + revised = None + revised_prev = None + revisions = None + revision_weights = None + revision_detailed_impacts = None + revision_results = None + revision_impacts = None + + # Get revisions datapoints for all revisions (regardless of whether + # or not we are computing detailed impacts) + if len(revisions_ix) > 0: + # Indexes + revised_j, revised_p = zip(*revisions_ix) + compute_j = np.arange(revised_j[0], revised_j[-1] + 1) + + # Data from updated model revised_endog = self.endog[:, :previous.nobs].copy() + # ("revisions" are points where data was previously published and + # then changed, so we need to ignore "updates", which are points + # that were not previously published) revised_endog[previous.missing.astype(bool)] = np.nan + # subset to revision periods + revised_all = revised_endog.T[compute_j] - # Compute the revisions - revised_j, revised_p = zip(*revisions_ix) - compute_j = np.arange(revised_j[0], revised_j[-1] + 1) - revised_prev = previous.endog.T[compute_j] - revised = revised_endog.T[compute_j] - revisions = (revised - revised_prev) + # Data from original model + revised_prev_all = previous.endog.T[compute_j] - # Compute the weights of the smoothed state vector - compute_t = np.arange(start, end) - ix = np.ix_(compute_t, compute_j) + # revision = updated - original + revisions_all = (revised_all - revised_prev_all) # Construct a model from which we can create weights for impacts # through `end` @@ -1274,74 +1347,111 @@ def news(self, previous, t=None, start=None, end=None, rev_mod.initialize(init) revision_results = rev_mod.smooth() - smoothed_state_weights, _, _ = ( - tools._compute_smoothed_state_weights( - rev_mod, compute_t=compute_t, compute_j=compute_j, - compute_prior_weights=False, scale=previous.scale)) - smoothed_state_weights = smoothed_state_weights[ix] - - # Convert the weights in terms of smoothed forecasts - # t, j, m, p, i - ZT = rev_mod.design.T - if ZT.shape[0] > 1: - ZT = ZT[compute_t] - - # Subset the states used for the impacts if applicable - if state_index is not None: - ZT = ZT[:, state_index, :] - smoothed_state_weights = ( - smoothed_state_weights[:, :, state_index]) + # Get detailed revision weights, impacts, and forecasts + if len(revisions_details) > 0: + # Indexes for the subset of revisions for which we are + # computing detailed impacts + compute_j = np.arange(revisions_details_start, + revised_j[-1] + 1) + # Offset describing revisions for which we are not computing + # detailed impacts + offset = revisions_details_start - revised_j[0] + revised = revised_all[offset:] + revised_prev = revised_prev_all[offset:] + revisions = revisions_all[offset:] + + # Compute the weights of the smoothed state vector + compute_t = np.arange(start, end) + + smoothed_state_weights, _, _ = ( + tools._compute_smoothed_state_weights( + rev_mod, compute_t=compute_t, compute_j=compute_j, + compute_prior_weights=False, scale=previous.scale)) + + # Convert the weights in terms of smoothed forecasts + # t, j, m, p, i + ZT = rev_mod.design.T + if ZT.shape[0] > 1: + ZT = ZT[compute_t] + + # Subset the states used for the impacts if applicable + if state_index is not None: + ZT = ZT[:, state_index, :] + smoothed_state_weights = ( + smoothed_state_weights[:, :, state_index]) + + # Multiplication gives: t, j, m, p * t, j, m, p, k + # Sum along axis=2 gives: t, j, p, k + # Transpose to: t, j, k, p (i.e. like t, j, m, p but with k + # instead of m) + revision_weights = np.nansum( + smoothed_state_weights[..., None] + * ZT[:, None, :, None, :], axis=2).transpose(0, 1, 3, 2) + + # Multiplication gives: t, j, k, p * t, j, k, p + # Sum along axes 1, 3 gives: t, k + # This is also a valid way to compute impacts, but it employs + # unnecessary multiplications with zeros; it is better to use + # the below method that flattens the revision indices before + # computing the impacts + # revision_detailed_impacts = np.nansum( + # revision_weights * revisions[None, :, None, :], + # axis=(1, 3)) + + # Flatten the weights and revisions along the revised j, k + # dimensions so that we only retain the actual revision + # elements + revised_j, revised_p = zip(*[ + s for s in revisions_ix + if s[0] >= revisions_details_start]) + ix_j = revised_j - revised_j[0] + # Shape is: t, k, j * p + # Note: have to transpose first so that the two advanced + # indexes are next to each other, so that "the dimensions from + # the advanced indexing operations are inserted into the result + # array at the same spot as they were in the initial array" + # (see https://numpy.org/doc/stable/user/basics.indexing.html, + # "Combining advanced and basic indexing") + revision_weights = ( + revision_weights.transpose(0, 2, 1, 3)[:, :, + ix_j, revised_p]) + # Shape is j * k + revisions = revisions[ix_j, revised_p] + # Shape is t, k + revision_detailed_impacts = revision_weights @ revisions + + # Similarly, flatten the revised and revised_prev series + revised = revised[ix_j, revised_p] + revised_prev = revised_prev[ix_j, revised_p] + + # Squeeze if `t` argument used + if t is not None: + revision_weights = revision_weights[0] + revision_detailed_impacts = revision_detailed_impacts[0] + + # Get total revision impacts + revised_impact_forecasts = ( + revision_results.smoothed_forecasts[..., start:end]) + if end > revision_results.nobs: + predict_start = max(start, revision_results.nobs) + p = revision_results.predict( + start=predict_start, end=end, **extend_kwargs) + revised_impact_forecasts = np.concatenate( + (revised_impact_forecasts, p.forecasts), axis=1) + + revision_impacts = (revised_impact_forecasts - + prev_impacted_forecasts).T + if t is not None: + revision_impacts = revision_impacts[0] - # Multiplication gives: t, j, m, p * t, j, m, p, k - # Sum along axis=2 gives: t, j, p, k - # Transpose to: t, j, k, p (i.e. like t, j, m, p but with k instead - # of m) - revision_weights = np.nansum( - smoothed_state_weights[..., None] - * ZT[:, None, :, None, :], axis=2).transpose(0, 1, 3, 2) - - # Multiplication gives: t, j, k, p * t, j, k, p - # Sum along axes 1, 3 gives: t, k - # This is also a valid way to compute impacts, but it employes - # unnecessary multiplications with zeros; it is better to use the - # below method that flattens the revision indices before computing - # the impacts - # revision_impacts = np.nansum( - # revision_weights * revisions[None, :, None, :], axis=(1, 3)) - - # Flatten the weights and revisions along the revised j, k - # dimensions so that we only retain the actual revision elements + # Need to also flatten the revisions items that contain all revisions + if len(revisions_ix) > 0: + revised_j, revised_p = zip(*revisions_ix) ix_j = revised_j - revised_j[0] - # Shape is: t, k, j * p - # Note: have to transpose first so that the two advanced indexes - # are next to each other, so that "the dimensions from the - # advanced indexing operations are inserted into the result - # array at the same spot as they were in the initial array" - # (see https://numpy.org/doc/stable/user/basics.indexing.html, - # "Combining advanced and basic indexing") - revision_weights = ( - revision_weights.transpose(0, 2, 1, 3)[:, :, ix_j, revised_p]) - # Shape is j * k - revisions = revisions[ix_j, revised_p] - # Shape is t, k - revision_impacts = revision_weights @ revisions - - # Similarly, flatten the revised and revised_prev series - revised = revised[ix_j, revised_p] - revised_prev = revised_prev[ix_j, revised_p] - # Squeeze if `t` argument used - if t is not None: - revision_weights = revision_weights[0] - revision_impacts = revision_impacts[0] - else: - revised_endog = None - revised = None - revised_prev = None - revisions = None - revision_weights = None - revision_impacts = None - revision_results = None + revisions_all = revisions_all[ix_j, revised_p] + revised_all = revised_all[ix_j, revised_p] + revised_prev_all = revised_prev_all[ix_j, revised_p] # Now handle updates if len(updates_ix) > 0: @@ -1417,11 +1527,14 @@ def news(self, previous, t=None, start=None, end=None, update_impacts=update_impacts, # update to forecast of variables of interest from revisions # = E[y^i | revision] - E[y^i | previous] - revision_impacts=revision_impacts, + revision_detailed_impacts=revision_detailed_impacts, # news = A = y^u - E[y^u | previous] news=update_forecasts_error, - # revivions y^r(updated) - y^r(previous) + # revivions y^r(updated) - y^r(previous) for periods in which + # detailed impacts were computed revisions=revisions, + # revivions y^r(updated) - y^r(previous) + revisions_all=revisions_all, # gain matrix = E[y A'] E[A A']^{-1} gain=obs_gain, # weights on observations for the smoothed signal @@ -1432,20 +1545,38 @@ def news(self, previous, t=None, start=None, end=None, # realizations of the updated periods used to construct the news # = y^u update_realized=update_realized, - # revised observations of the periods that were revised + # revised observations of the periods that were revised and for + # which detailed impacts were computed # = y^r_{revised} revised=revised, - # previous observations of the periods that were revised + # revised observations of the periods that were revised + # = y^r_{revised} + revised_all=revised_all, + # previous observations of the periods that were revised and for + # which detailed impacts were computed # = y^r_{previous} revised_prev=revised_prev, + # previous observations of the periods that were revised + # = y^r_{previous} + revised_prev_all=revised_prev_all, # previous forecast of the periods of interest, E[y^i | previous] prev_impacted_forecasts=prev_impacted_forecasts, # post. forecast of the periods of interest, E[y^i | post] post_impacted_forecasts=post_impacted_forecasts, # results object associated with the revision revision_results=revision_results, + # total impacts from all revisions (both grouped and detailed) + revision_impacts=revision_impacts, # list of (x, y) positions of revisions to endog revisions_ix=revisions_ix, + # list of (x, y) positions of revisions to endog for which details + # of impacts were computed + revisions_details=revisions_details, + # list of (x, y) positions of revisions to endog for which impacts + # were grouped + revisions_grouped=revisions_grouped, + # period in which revision details start to be computed + revisions_details_start=revisions_details_start, # list of (x, y) positions of updates to endog updates_ix=updates_ix, # index of state variables used to compute impacts diff --git a/statsmodels/tsa/statespace/mlemodel.py b/statsmodels/tsa/statespace/mlemodel.py index cb4f1defaae..3910a55339a 100644 --- a/statsmodels/tsa/statespace/mlemodel.py +++ b/statsmodels/tsa/statespace/mlemodel.py @@ -3779,11 +3779,11 @@ def _apply(self, mod, refit=False, fit_kwargs=None): fit_kwargs['cov_kwds'] = { 'custom_cov_type': self.cov_type, 'custom_cov_params': self.cov_params_default, - 'custom_description': ('Parameters and standard errors' - ' were estimated using a different' - ' dataset and were then applied to' - ' this dataset. %s' - % self.cov_kwds['description'])} + 'custom_description': ( + 'Parameters and standard errors were estimated using a' + ' different dataset and were then applied to this' + ' dataset. %s' + % self.cov_kwds.get('description', 'Unknown.'))} if self.smoother_results is not None: func = mod.smooth @@ -3869,34 +3869,42 @@ def _get_previous_updated(self, comparison, exog=None, return previous, updated, comparison_dataset def _news_previous_results(self, previous, start, end, periods, + revisions_details_start=False, state_index=None): # Compute the news - out = self.smoother_results.news(previous.smoother_results, - start=start, end=end, - state_index=state_index) + out = self.smoother_results.news( + previous.smoother_results, start=start, end=end, + revisions_details_start=revisions_details_start, + state_index=state_index) return out def _news_updated_results(self, updated, start, end, periods, - state_index=None): - return updated._news_previous_results(self, start, end, periods, - state_index=state_index) + revisions_details_start=False, state_index=None): + return updated._news_previous_results( + self, start, end, periods, + revisions_details_start=revisions_details_start, + state_index=state_index) def _news_previous_data(self, endog, start, end, periods, exog, - state_index=None): + revisions_details_start=False, state_index=None): previous = self.apply(endog, exog=exog, copy_initialization=True) - return self._news_previous_results(previous, start, end, periods, - state_index=state_index) + return self._news_previous_results( + previous, start, end, periods, + revisions_details_start=revisions_details_start, + state_index=state_index) def _news_updated_data(self, endog, start, end, periods, exog, - state_index=None): + revisions_details_start=False, state_index=None): updated = self.apply(endog, exog=exog, copy_initialization=True) - return self._news_updated_results(updated, start, end, periods, - state_index=state_index) + return self._news_updated_results( + updated, start, end, periods, + revisions_details_start=revisions_details_start, + state_index=state_index) def news(self, comparison, impact_date=None, impacted_variable=None, start=None, end=None, periods=None, exog=None, - comparison_type=None, state_index=None, return_raw=False, - tolerance=1e-10, **kwargs): + comparison_type=None, revisions_details_start=False, + state_index=None, return_raw=False, tolerance=1e-10, **kwargs): """ Compute impacts from updated data (news and revisions) @@ -3936,6 +3944,15 @@ def news(self, comparison, impact_date=None, impacted_variable=None, *previous* results object or dataset or an *updated* results object or dataset. If not specified, then an attempt is made to determine the comparison type. + revisions_details_start : bool, int, str, or datetime, optional + The period at which to beging computing the detailed impacts of + data revisions. Any revisions prior to this period will have their + impacts grouped together. If a negative integer, interpreted as + an offset from the end of the dataset. If set to True, detailed + impacts are computed for all revisions, while if set to False, all + revisions are grouped together. Default is False. Note that for + large models, setting this to be near the beginning of the sample + can cause this function to be slow. state_index : array_like, optional An optional index specifying a subset of states to use when constructing the impacts of revisions and news. For example, if @@ -3984,6 +4001,11 @@ def news(self, comparison, impact_date=None, impacted_variable=None, ' large for the number of states in the model' f' ({self.model.k_states}).') + if not isinstance(revisions_details_start, (int, bool)): + revisions_details_start, _, _, _ = ( + self.model._get_prediction_index( + revisions_details_start, revisions_details_start)) + # Get the previous and updated results objects from `self` and # `comparison`: previous, updated, comparison_dataset = self._get_previous_updated( @@ -4045,9 +4067,10 @@ def news(self, comparison, impact_date=None, impacted_variable=None, updated = updated_orig.append(extra, exog=exog, **kwargs) # Compute the news - news_results = ( - updated._news_previous_results(previous, start, end + 1, periods, - state_index=state_index)) + news_results = updated._news_previous_results( + previous, start, end + 1, periods, + revisions_details_start=revisions_details_start, + state_index=state_index) if not return_raw: news_results = NewsResults( diff --git a/statsmodels/tsa/statespace/news.py b/statsmodels/tsa/statespace/news.py index 95e8bffe805..3f52ec34898 100644 --- a/statsmodels/tsa/statespace/news.py +++ b/statsmodels/tsa/statespace/news.py @@ -5,12 +5,13 @@ Author: Chad Fulton License: BSD-3 """ +from statsmodels.compat.pandas import FUTURE_STACK import numpy as np import pandas as pd -from statsmodels.iolib.table import SimpleTable from statsmodels.iolib.summary import Summary +from statsmodels.iolib.table import SimpleTable from statsmodels.iolib.tableformatting import fmt_params @@ -45,47 +46,76 @@ class NewsResults: Attributes ---------- - total_impacts : pd.Series + total_impacts : pd.DataFrame Updates to forecasts of impacted variables from both news and data revisions, E[y^i | post] - E[y^i | previous]. - update_impacts : pd.Series + update_impacts : pd.DataFrame Updates to forecasts of impacted variables from the news, E[y^i | post] - E[y^i | revisions] where y^i are the impacted variables of interest. - revision_impacts : pd.Series - Updates to forecasts of impacted variables from data revisions, + revision_impacts : pd.DataFrame + Updates to forecasts of impacted variables from all data revisions, E[y^i | revisions] - E[y^i | previous]. - news : pd.Series + news : pd.DataFrame The unexpected component of the updated data, E[y^u | post] - E[y^u | revisions] where y^u are the updated variables. - weights : pd.Series + weights : pd.DataFrame Weights describing the effect of news on variables of interest. - revisions : pd.Series - The revisions betwen the current and previously observed data + revisions : pd.DataFrame + The revisions between the current and previously observed data, for + revisions for which detailed impacts were computed. + revisions_all : pd.DataFrame + The revisions between the current and previously observed data, y^r_{revised} - y^r_{previous} where y^r are the revised variables. - revision_weights : pd.Series - Weights describing the effect of revisions on variables of interest. - update_forecasts : pd.Series + revision_weights : pd.DataFrame + Weights describing the effect of revisions on variables of interest, + for revisions for which detailed impacts were computed. + revision_weights_all : pd.DataFrame + Weights describing the effect of revisions on variables of interest, + with a new entry that includes NaNs for the revisions for which + detailed impacts were not computed. + update_forecasts : pd.DataFrame Forecasts based on the previous dataset of the variables that were updated, E[y^u | previous]. - update_realized : pd.Series + update_realized : pd.DataFrame Actual observed data associated with the variables that were updated, y^u - revised_prev : pd.Series + revisions_details_start : int + Integer index of first period in which detailed revision impacts were + computed. + revision_detailed_impacts : pd.DataFrame + Updates to forecasts of impacted variables from data revisions with + detailed impacts, E[y^i | revisions] - E[y^i | grouped revisions]. + revision_grouped_impacts : pd.DataFrame + Updates to forecasts of impacted variables from data revisions that + were grouped together, E[y^i | grouped revisions] - E[y^i | previous]. + revised_prev : pd.DataFrame + Previously observed data associated with the variables that were + revised, for revisions for which detailed impacts were computed. + revised_prev_all : pd.DataFrame Previously observed data associated with the variables that were revised, y^r_{previous} - revised : pd.Series + revised : pd.DataFrame + Currently observed data associated with the variables that were + revised, for revisions for which detailed impacts were computed. + revised_all : pd.DataFrame Currently observed data associated with the variables that were revised, y^r_{revised} - prev_impacted_forecasts : pd.Series + prev_impacted_forecasts : pd.DataFrame Previous forecast of the variables of interest, E[y^i | previous]. - post_impacted_forecasts : pd.Series + post_impacted_forecasts : pd.DataFrame Forecast of the variables of interest after taking into account both revisions and updates, E[y^i | post]. revisions_iloc : pd.DataFrame The integer locations of the data revisions in the dataset. revisions_ix : pd.DataFrame The label-based locations of the data revisions in the dataset. + revisions_iloc_detailed : pd.DataFrame + The integer locations of the data revisions in the dataset for which + detailed impacts were computed. + revisions_ix_detailed : pd.DataFrame + The label-based locations of the data revisions in the dataset for + which detailed impacts were computed. updates_iloc : pd.DataFrame The integer locations of the updated data points. updates_ix : pd.DataFrame @@ -126,30 +156,56 @@ def __init__(self, news_results, model, updated, previous, self.endog_names = self.updated.model.endog_names self.k_endog = len(self.endog_names) + self.n_revisions = len(self.news_results.revisions_ix) + self.n_revisions_detailed = len(self.news_results.revisions_details) + self.n_revisions_grouped = len(self.news_results.revisions_grouped) + index = self.updated.model._index columns = np.atleast_1d(self.endog_names) # E[y^i | post] self.post_impacted_forecasts = pd.DataFrame( news_results.post_impacted_forecasts.T, - index=self.row_labels, columns=columns) + index=self.row_labels, columns=columns).rename_axis( + index='impact date', columns='impacted variable') # E[y^i | previous] self.prev_impacted_forecasts = pd.DataFrame( news_results.prev_impacted_forecasts.T, - index=self.row_labels, columns=columns) + index=self.row_labels, columns=columns).rename_axis( + index='impact date', columns='impacted variable') # E[y^i | post] - E[y^i | revisions] self.update_impacts = pd.DataFrame( news_results.update_impacts, - index=self.row_labels, columns=columns) + index=self.row_labels, columns=columns).rename_axis( + index='impact date', columns='impacted variable') + # E[y^i | revisions] - E[y^i | grouped revisions] + self.revision_detailed_impacts = pd.DataFrame( + news_results.revision_detailed_impacts, + index=self.row_labels, + columns=columns, + dtype=float, + ).rename_axis(index="impact date", columns="impacted variable") # E[y^i | revisions] - E[y^i | previous] self.revision_impacts = pd.DataFrame( news_results.revision_impacts, - index=self.row_labels, columns=columns) + index=self.row_labels, + columns=columns, + dtype=float, + ).rename_axis(index="impact date", columns="impacted variable") + # E[y^i | grouped revisions] - E[y^i | previous] + self.revision_grouped_impacts = ( + self.revision_impacts + - self.revision_detailed_impacts.fillna(0)) + if self.n_revisions_grouped == 0: + self.revision_grouped_impacts.loc[:] = 0 + # E[y^i | post] - E[y^i | previous] self.total_impacts = (self.post_impacted_forecasts - self.prev_impacted_forecasts) # Indices of revisions and updates + self.revisions_details_start = news_results.revisions_details_start + self.revisions_iloc = pd.DataFrame( list(zip(*news_results.revisions_ix)), index=['revision date', 'revised variable']).T @@ -161,6 +217,10 @@ def __init__(self, news_results, model, updated, previous, else: self.revisions_ix = iloc.copy() + mask = iloc['revision date'] >= self.revisions_details_start + self.revisions_iloc_detailed = self.revisions_iloc[mask] + self.revisions_ix_detailed = self.revisions_ix[mask] + self.updates_iloc = pd.DataFrame( list(zip(*news_results.updates_ix)), index=['update date', 'updated variable']).T @@ -176,9 +236,12 @@ def __init__(self, news_results, model, updated, previous, self.state_index = news_results.state_index # Wrap forecasts and forecasts errors - r_ix = pd.MultiIndex.from_arrays([ + r_ix_all = pd.MultiIndex.from_arrays([ self.revisions_ix['revision date'], self.revisions_ix['revised variable']]) + r_ix = pd.MultiIndex.from_arrays([ + self.revisions_ix_detailed['revision date'], + self.revisions_ix_detailed['revised variable']]) u_ix = pd.MultiIndex.from_arrays([ self.updates_ix['update date'], self.updates_ix['updated variable']]) @@ -189,13 +252,21 @@ def __init__(self, news_results, model, updated, previous, dtype=model.params.dtype) else: self.news = pd.Series(news_results.news, index=u_ix, name='news') - # E[y^u | revisions] - E[y^u | previous] + # Revisions to data (y^r_{revised} - y^r_{previous}) + if news_results.revisions_all is None: + self.revisions_all = pd.Series([], index=r_ix_all, name='revision', + dtype=model.params.dtype) + else: + self.revisions_all = pd.Series(news_results.revisions_all, + index=r_ix_all, name='revision') + # Revisions to data (y^r_{revised} - y^r_{previous}) for which detailed + # impacts were computed if news_results.revisions is None: self.revisions = pd.Series([], index=r_ix, name='revision', dtype=model.params.dtype) else: - self.revisions = pd.Series(news_results.revisions, index=r_ix, - name='revision') + self.revisions = pd.Series(news_results.revisions, + index=r_ix, name='revision') # E[y^u | revised] if news_results.update_forecasts is None: self.update_forecasts = pd.Series([], index=u_ix, @@ -204,6 +275,14 @@ def __init__(self, news_results, model, updated, previous, self.update_forecasts = pd.Series( news_results.update_forecasts, index=u_ix) # y^r_{revised} + if news_results.revised_all is None: + self.revised_all = pd.Series([], index=r_ix_all, + dtype=model.params.dtype, + name='revised') + else: + self.revised_all = pd.Series(news_results.revised_all, + index=r_ix_all, name='revised') + # y^r_{revised} for which detailed impacts were computed if news_results.revised is None: self.revised = pd.Series([], index=r_ix, dtype=model.params.dtype, name='revised') @@ -211,6 +290,13 @@ def __init__(self, news_results, model, updated, previous, self.revised = pd.Series(news_results.revised, index=r_ix, name='revised') # y^r_{previous} + if news_results.revised_prev_all is None: + self.revised_prev_all = pd.Series([], index=r_ix_all, + dtype=model.params.dtype) + else: + self.revised_prev_all = pd.Series( + news_results.revised_prev_all, index=r_ix_all) + # y^r_{previous} for which detailed impacts were computed if news_results.revised_prev is None: self.revised_prev = pd.Series([], index=r_ix, dtype=model.params.dtype) @@ -235,7 +321,7 @@ def __init__(self, news_results, model, updated, previous, self.weights.columns.names = ['impact date', 'impacted variable'] # reshaped version of revision_weights - if len(self.revisions_iloc): + if self.n_revisions_detailed > 0: revision_weights = news_results.revision_weights.reshape( len(cols), len(r_ix)) else: @@ -245,6 +331,9 @@ def __init__(self, news_results, model, updated, previous, self.revision_weights.columns.names = [ 'impact date', 'impacted variable'] + self.revision_weights_all = self.revision_weights.reindex( + self.revised_all.index) + @property def impacted_variable(self): return self._impacted_variable @@ -276,6 +365,8 @@ def data_revisions(self): in the previous dataset. - `revised`: the revised value of the data, as it is observed in the new dataset + - `detailed impacts computed`: whether or not detailed impacts have + been computed in these NewsResults for this revision See also -------- @@ -283,9 +374,11 @@ def data_revisions(self): """ # Save revisions data data = pd.concat([ - self.revised.rename('revised'), - self.revised_prev.rename('observed (prev)') + self.revised_all.rename('revised'), + self.revised_prev_all.rename('observed (prev)') ], axis=1).sort_index() + data['detailed impacts computed'] = ( + self.revised_all.index.isin(self.revised.index)) return data @property @@ -375,7 +468,8 @@ def details_by_impact(self): revision_details_by_update impacts """ - df = self.weights.stack(level=[0, 1]).rename('weight').to_frame() + s = self.weights.stack(level=[0, 1], **FUTURE_STACK) + df = s.rename('weight').to_frame() if len(self.updates_iloc): df['forecast (prev)'] = self.update_forecasts df['observed'] = self.update_realized @@ -392,9 +486,23 @@ def details_by_impact(self): if self.impacted_variable is not None and len(df) > 0: df = df.loc[np.s_[:, self.impacted_variable], :] - mask = np.abs(df['weight']) > self.tolerance + mask = np.abs(df['impact']) > self.tolerance return df[mask] + @property + def _revision_grouped_impacts(self): + s = self.revision_grouped_impacts.stack(**FUTURE_STACK) + df = s.rename('impact').to_frame() + df = df.reindex(['revision date', 'revised variable', 'impact'], + axis=1) + if self.revisions_details_start > 0: + df['revision date'] = ( + self.updated.model._index[self.revisions_details_start - 1]) + df['revised variable'] = 'all prior revisions' + df = (df.set_index(['revision date', 'revised variable'], append=True) + .reorder_levels([2, 3, 0, 1])) + return df + @property def revision_details_by_impact(self): """ @@ -434,6 +542,11 @@ def revision_details_by_impact(self): new datapoints. That information can be found in the `impacts` or `details_by_impact` tables. + Grouped impacts are shown in this table, with a "revision date" equal + to the last period prior to which detailed revisions were computed and + with "revised variable" set to the string "all prior revisions". For + these rows, all columns except "impact" will be set to NaNs. + This form of the details table is organized so that the impacted dates / variables are first in the index. This is convenient for slicing by impacted variables / dates to view the details of data @@ -453,21 +566,28 @@ def revision_details_by_impact(self): details_by_impact impacts """ - weights = self.revision_weights.stack(level=[0, 1]) + weights = self.revision_weights.stack(level=[0, 1], **FUTURE_STACK) df = pd.concat([ self.revised.reindex(weights.index), self.revised_prev.rename('observed (prev)').reindex(weights.index), self.revisions.reindex(weights.index), weights.rename('weight'), - (self.revisions * weights).rename('impact'), + (self.revisions.reindex(weights.index) * weights).rename('impact'), ], axis=1) + if self.n_revisions_grouped > 0: + df = pd.concat([df, self._revision_grouped_impacts]) + # Explicitly set names for compatibility with pandas=1.2.5 + df.index = df.index.set_names( + ['revision date', 'revised variable', + 'impact date', 'impacted variable']) + df = df.reorder_levels([2, 3, 0, 1]).sort_index() if self.impacted_variable is not None and len(df) > 0: df = df.loc[np.s_[:, self.impacted_variable], :] - mask = np.abs(df['weight']) > self.tolerance + mask = np.abs(df['impact']) > self.tolerance return df[mask] @property @@ -527,7 +647,8 @@ def details_by_update(self): details_by_impact impacts """ - df = self.weights.stack(level=[0, 1]).rename('weight').to_frame() + s = self.weights.stack(level=[0, 1], **FUTURE_STACK) + df = s.rename('weight').to_frame() if len(self.updates_iloc): df['forecast (prev)'] = self.update_forecasts df['observed'] = self.update_realized @@ -550,7 +671,7 @@ def details_by_update(self): details = details.loc[ np.s_[:, :, :, :, :, self.impacted_variable], :] - mask = np.abs(details['weight']) > self.tolerance + mask = np.abs(details['impact']) > self.tolerance return details[mask] @property @@ -589,7 +710,12 @@ def revision_details_by_update(self): release. This table does not summarize the impacts or show the effect of - revisions. That information can be found in the `impacts` table. + new datapoints, see `details_by_update` instead. + + Grouped impacts are shown in this table, with a "revision date" equal + to the last period prior to which detailed revisions were computed and + with "revised variable" set to the string "all prior revisions". For + these rows, all columns except "impact" will be set to NaNs. This form of the details table is organized so that the revision dates / variables are first in the index, and in this table the index @@ -609,16 +735,23 @@ def revision_details_by_update(self): details_by_impact impacts """ - weights = self.revision_weights.stack(level=[0, 1]) + weights = self.revision_weights.stack(level=[0, 1], **FUTURE_STACK) df = pd.concat([ self.revised_prev.rename('observed (prev)').reindex(weights.index), self.revised.reindex(weights.index), self.revisions.reindex(weights.index), weights.rename('weight'), - (self.revisions * weights).rename('impact'), + (self.revisions.reindex(weights.index) * weights).rename('impact'), ], axis=1) + if self.n_revisions_grouped > 0: + df = pd.concat([df, self._revision_grouped_impacts]) + # Explicitly set names for compatibility with pandas=1.2.5 + df.index = df.index.set_names( + ['revision date', 'revised variable', + 'impact date', 'impacted variable']) + details = (df.set_index(['observed (prev)', 'revised'], append=True) .reorder_levels([ 'revision date', 'revised variable', 'revised', @@ -630,7 +763,7 @@ def revision_details_by_update(self): details = details.loc[ np.s_[:, :, :, :, :, self.impacted_variable], :] - mask = np.abs(details['weight']) > self.tolerance + mask = np.abs(details['impact']) > self.tolerance return details[mask] @property @@ -682,9 +815,9 @@ def impacts(self): self.post_impacted_forecasts.unstack().rename('estimate (new)')], axis=1) impacts['impact of revisions'] = ( - impacts['impact of revisions'].fillna(0)) + impacts['impact of revisions'].astype(float).fillna(0)) impacts['impact of news'] = ( - impacts['impact of news'].fillna(0)) + impacts['impact of news'].astype(float).fillna(0)) impacts['total impact'] = (impacts['impact of revisions'] + impacts['impact of news']) impacts = impacts.reorder_levels([1, 0]).sort_index() @@ -756,7 +889,7 @@ def summary_impacts(self, impact_date=None, impacted_variable=None, # Default is to only show the revisions columns if there were any # revisions (otherwise it would just be a column of zeros) if show_revisions_columns is None: - show_revisions_columns = len(self.revisions_iloc) > 0 + show_revisions_columns = self.n_revisions > 0 # Select only the variables / dates of interest s = list(np.s_[:, :]) @@ -959,7 +1092,7 @@ def summary_details(self, source='news', impact_date=None, } else: raise ValueError(f'Invalid `source`: {source}. Must be "news" or' - ' "impacts".') + ' "revisions".') # Make the first index level the groupby level groupby = groupby.lower().replace('_', ' ') @@ -1081,10 +1214,11 @@ def create_table(details, removed_levels): if multiple_tables: details_table = [] - for item in details[groupby].unique(): - mask = details[groupby] == item - item_details = details[mask].drop(groupby, axis=1) - item_removed_levels = [f'{groupby} = {item}'] + removed_levels + for item in details[columns[groupby]].unique(): + mask = details[columns[groupby]] == item + item_details = details[mask].drop(columns[groupby], axis=1) + item_removed_levels = ( + [f'{columns[groupby]} = {item}'] + removed_levels) details_table.append(create_table(item_details, item_removed_levels)) else: @@ -1112,13 +1246,17 @@ def summary_revisions(self, sparsify=True): - `observed (prev)` : the observed value prior to the revision - `revised` : the new value after the revision - `revision` : the new value after the revision + - `detailed impacts computed` : whether detailed impacts were + computed for this revision """ data = pd.merge( - self.data_revisions, self.revisions, left_index=True, + self.data_revisions, self.revisions_all, left_index=True, right_index=True).sort_index().reset_index() + data = data[['revision date', 'revised variable', 'observed (prev)', + 'revision', 'detailed impacts computed']] data[['revision date', 'revised variable']] = ( data[['revision date', 'revised variable']].applymap(str)) - data.iloc[:, 2:] = data.iloc[:, 2:].applymap( + data.iloc[:, 2:-1] = data.iloc[:, 2:-1].applymap( lambda num: '' if pd.isnull(num) else '%.2f' % num) # Sparsify the date column @@ -1370,7 +1508,7 @@ def get_sample(model): table_ix += 1 # Revisions - if include_revisions_tables and len(self.revisions_iloc) > 0: + if include_revisions_tables and self.n_revisions > 0: summary.tables.insert( table_ix, self.summary_revisions(sparsify=sparsify)) table_ix += 1 diff --git a/statsmodels/tsa/statespace/structural.py b/statsmodels/tsa/statespace/structural.py index 6896c81d500..76b6597c9a9 100644 --- a/statsmodels/tsa/statespace/structural.py +++ b/statsmodels/tsa/statespace/structural.py @@ -597,7 +597,7 @@ def __init__(self, endog, level=False, trend=False, seasonal=None, # of the data. if cycle_period_bounds is None: freq = self.data.freq[0] if self.data.freq is not None else '' - if freq == 'A': + if freq in ('A', 'Y'): cycle_period_bounds = (1.5, 12) elif freq == 'Q': cycle_period_bounds = (1.5*4, 12*4) diff --git a/statsmodels/tsa/statespace/tests/results/results_structural.py b/statsmodels/tsa/statespace/tests/results/results_structural.py index 61e35801a6c..53c8e2c306b 100644 --- a/statsmodels/tsa/statespace/tests/results/results_structural.py +++ b/statsmodels/tsa/statespace/tests/results/results_structural.py @@ -283,7 +283,7 @@ # Annual frequency dataset {'level': 'lltrend', 'autoregressive': 1, 'cycle': True, 'stochastic_cycle': True, 'seasonal': 4, 'autoregressive': 1, - 'exog': True, 'mle_regression': False, 'freq': 'AS'}, + 'exog': True, 'mle_regression': False, 'freq': 'YS'}, # Quarterly frequency dataset {'level': 'lltrend', 'autoregressive': 1, 'cycle': True, 'stochastic_cycle': True, 'seasonal': 4, 'autoregressive': 1, @@ -295,7 +295,7 @@ # Minutely frequency dataset {'level': 'lltrend', 'autoregressive': 1, 'cycle': True, 'stochastic_cycle': True, 'seasonal': 4, 'autoregressive': 1, - 'exog': True, 'mle_regression': False, 'freq': 'T', + 'exog': True, 'mle_regression': False, 'freq': 'min', 'cycle_period_bounds': (1.5*12, 12*12)}, ], 'params': [0.0001, 0.01, 0.06, 0.0001, 0.0001, 0.1, 2*pi / 10, 0.2], diff --git a/statsmodels/tsa/statespace/tests/test_chandrasekhar.py b/statsmodels/tsa/statespace/tests/test_chandrasekhar.py index 31d16c12591..48aecae2764 100644 --- a/statsmodels/tsa/statespace/tests/test_chandrasekhar.py +++ b/statsmodels/tsa/statespace/tests/test_chandrasekhar.py @@ -20,8 +20,12 @@ def check_output(res_chand, res_orig, memory_conserve=False): # Test loglike params = res_orig.params assert_allclose(res_chand.llf, res_orig.llf) - assert_allclose(res_chand.model.score_obs(params), - res_orig.model.score_obs(params), atol=1e-10) + assert_allclose( + res_chand.model.score_obs(params), + res_orig.model.score_obs(params), + rtol=5e-5, + atol=5e-6 + ) # Test state space representation matrices for name in res_chand.model.ssm.shapes: diff --git a/statsmodels/tsa/statespace/tests/test_dynamic_factor_mq.py b/statsmodels/tsa/statespace/tests/test_dynamic_factor_mq.py index 4a65944abe2..683ec310215 100644 --- a/statsmodels/tsa/statespace/tests/test_dynamic_factor_mq.py +++ b/statsmodels/tsa/statespace/tests/test_dynamic_factor_mq.py @@ -4,7 +4,12 @@ These test standard that the state space model is set up as desired, that expected exceptions are raised, etc. """ -from statsmodels.compat.pandas import assert_frame_equal, assert_series_equal +from statsmodels.compat.pandas import ( + MONTH_END, + QUARTER_END, + assert_frame_equal, + assert_series_equal, +) import numpy as np from numpy.testing import assert_, assert_allclose, assert_equal @@ -20,7 +25,6 @@ ) from statsmodels.tsa.statespace.tests import test_dynamic_factor_mq_monte_carlo - SKIP_MONTE_CARLO_TESTS = True @@ -730,7 +734,7 @@ def test_invalid_model_specification(): dta_period_M = pd.DataFrame( dta, index=pd.period_range(start='2000', periods=10, freq='M')) dta_date_M = pd.DataFrame( - dta, index=pd.date_range(start='2000', periods=10, freq='M')) + dta, index=pd.date_range(start='2000', periods=10, freq=MONTH_END)) dta_period_Q = pd.DataFrame( dta, index=pd.period_range(start='2000', periods=10, freq='Q')) @@ -838,9 +842,18 @@ def test_invalid_model_specification(): dta_date_M, endog_quarterly=dta_date_W) -@pytest.mark.parametrize('freq_Q', ['Q', 'Q-DEC', 'Q-JAN', 'QS', - 'QS-DEC', 'QS-APR']) -@pytest.mark.parametrize('freq_M', ['M', 'MS']) +@pytest.mark.parametrize( + "freq_Q", + [ + QUARTER_END, + f"{QUARTER_END}-DEC", + f"{QUARTER_END}-JAN", + "QS", + "QS-DEC", + "QS-APR", + ], +) +@pytest.mark.parametrize("freq_M", [MONTH_END, "MS"]) def test_date_indexes(reset_randomstate, freq_M, freq_Q): # Test that using either PeriodIndex or DatetimeIndex for monthly or # quarterly data, with a variety of DatetimeIndex frequencies, works diff --git a/statsmodels/tsa/statespace/tests/test_impulse_responses.py b/statsmodels/tsa/statespace/tests/test_impulse_responses.py index be901ac03c7..0a2987e36ba 100644 --- a/statsmodels/tsa/statespace/tests/test_impulse_responses.py +++ b/statsmodels/tsa/statespace/tests/test_impulse_responses.py @@ -4,17 +4,24 @@ Author: Chad Fulton License: Simplified-BSD """ +from statsmodels.compat.pandas import MONTH_END + import warnings import numpy as np +from numpy.testing import assert_, assert_allclose import pandas as pd -from scipy.stats import ortho_group import pytest -from numpy.testing import assert_, assert_allclose +from scipy.stats import ortho_group from statsmodels.tools.sm_exceptions import EstimationWarning -from statsmodels.tsa.statespace import (mlemodel, sarimax, structural, varmax, - dynamic_factor) +from statsmodels.tsa.statespace import ( + dynamic_factor, + mlemodel, + sarimax, + structural, + varmax, +) from statsmodels.tsa.vector_ar.tests.test_var import get_macrodata @@ -660,7 +667,7 @@ def test_pandas_univariate_rangeindex(): def test_pandas_univariate_dateindex(): # Impulse responses still have RangeIndex (i.e. aren't wrapped with dates) - ix = pd.date_range(start='2000', periods=1, freq='M') + ix = pd.date_range(start='2000', periods=1, freq=MONTH_END) endog = pd.Series(np.zeros(1), index=ix) mod = sarimax.SARIMAX(endog) res = mod.filter([0.5, 1.]) @@ -685,7 +692,7 @@ def test_pandas_multivariate_rangeindex(): def test_pandas_multivariate_dateindex(): # Impulse responses still have RangeIndex (i.e. aren't wrapped with dates) - ix = pd.date_range(start='2000', periods=1, freq='M') + ix = pd.date_range(start='2000', periods=1, freq=MONTH_END) endog = pd.DataFrame(np.zeros((1, 2)), index=ix) mod = varmax.VARMAX(endog, trend='n') res = mod.filter([0.5, 0., 0., 0.2, 1., 0., 1.]) @@ -698,7 +705,7 @@ def test_pandas_multivariate_dateindex(): def test_pandas_anchor(): # Test that anchor with dates works - ix = pd.date_range(start='2000', periods=10, freq='M') + ix = pd.date_range(start='2000', periods=10, freq=MONTH_END) endog = pd.DataFrame(np.zeros((10, 2)), index=ix) mod = TVSS(endog) res = mod.filter([]) diff --git a/statsmodels/tsa/statespace/tests/test_mlemodel.py b/statsmodels/tsa/statespace/tests/test_mlemodel.py index af6c70f9466..7842b7cfcd3 100644 --- a/statsmodels/tsa/statespace/tests/test_mlemodel.py +++ b/statsmodels/tsa/statespace/tests/test_mlemodel.py @@ -4,22 +4,35 @@ Author: Chad Fulton License: Simplified-BSD """ +from statsmodels.compat.pandas import MONTH_END + import os import re import warnings import numpy as np +from numpy.testing import ( + assert_, + assert_allclose, + assert_almost_equal, + assert_equal, + assert_raises, +) import pandas as pd import pytest -from statsmodels.tsa.statespace import (sarimax, varmax, kalman_filter, - kalman_smoother) -from statsmodels.tsa.statespace.mlemodel import MLEModel, MLEResultsWrapper from statsmodels.datasets import nile -from numpy.testing import ( - assert_, assert_almost_equal, assert_equal, assert_allclose, assert_raises) +from statsmodels.tsa.statespace import ( + kalman_filter, + kalman_smoother, + sarimax, + varmax, +) +from statsmodels.tsa.statespace.mlemodel import MLEModel, MLEResultsWrapper from statsmodels.tsa.statespace.tests.results import ( - results_sarimax, results_var_misc) + results_sarimax, + results_var_misc, +) current_path = os.path.dirname(os.path.abspath(__file__)) @@ -1173,13 +1186,13 @@ def test_states_index_periodindex(): def test_states_index_dateindex(): nobs = 10 - ix = pd.date_range(start='2000', periods=nobs, freq='M') + ix = pd.date_range(start='2000', periods=nobs, freq=MONTH_END) endog = pd.Series(np.zeros(nobs), index=ix) mod = sarimax.SARIMAX(endog, order=(2, 0, 0)) res = mod.smooth([0.5, 0.1, 1.0]) - predicted_ix = pd.date_range(start=ix[0], periods=nobs + 1, freq='M') + predicted_ix = pd.date_range(start=ix[0], periods=nobs + 1, freq=MONTH_END) cols = pd.Index(['state.0', 'state.1']) check_states_index(res.states, ix, predicted_ix, cols) diff --git a/statsmodels/tsa/statespace/tests/test_news.py b/statsmodels/tsa/statespace/tests/test_news.py index cfe47eb90c8..e03511752ce 100644 --- a/statsmodels/tsa/statespace/tests/test_news.py +++ b/statsmodels/tsa/statespace/tests/test_news.py @@ -5,6 +5,7 @@ License: BSD-3 """ from statsmodels.compat.pandas import ( + FUTURE_STACK, assert_frame_equal, assert_series_equal, ) @@ -157,8 +158,8 @@ def check_news(news, revisions, updates, impact_dates, impacted_variables, assert_(np.all(news.revision_impacts.isnull())) # Total impacts - total_impacts = (news.revision_impacts.fillna(0) + - news.update_impacts.fillna(0)) + total_impacts = (news.revision_impacts.astype(float).fillna(0) + + news.update_impacts.astype(float).fillna(0)) assert_allclose(news.total_impacts, total_impacts, atol=1e-12) # - Impacted variable forecasts ------------------------------------------ @@ -178,7 +179,7 @@ def check_news(news, revisions, updates, impact_dates, impacted_variables, # - Table: data revisions ------------------------------------------------ assert_equal(news.data_revisions.columns.tolist(), - ['revised', 'observed (prev)']) + ['revised', 'observed (prev)', 'detailed impacts computed']) assert_equal(news.data_revisions.index.names, ['revision date', 'revised variable']) assert_(news.data_revisions.index.equals(revisions_index)) @@ -255,23 +256,37 @@ def check_news(news, revisions, updates, impact_dates, impacted_variables, desired = ['impact date', 'impacted variable'] assert_equal(impacts.index.names, desired) - assert_allclose(impacts.loc[:, 'estimate (prev)'], - news.prev_impacted_forecasts.stack(), atol=1e-12) - assert_allclose(impacts.loc[:, 'impact of revisions'], - news.revision_impacts.fillna(0).stack(), atol=1e-12) - assert_allclose(impacts.loc[:, 'impact of news'], - news.update_impacts.fillna(0).stack(), atol=1e-12) - assert_allclose(impacts.loc[:, 'total impact'], - news.total_impacts.stack(), atol=1e-12) - assert_allclose(impacts.loc[:, 'estimate (new)'], - news.post_impacted_forecasts.stack(), atol=1e-12) - - -# @pytest.mark.parametrize('revisions', [True, False]) -# @pytest.mark.parametrize('updates', [True, False]) -@pytest.mark.parametrize('revisions', [True]) -@pytest.mark.parametrize('updates', [True]) -def test_sarimax_time_invariant(revisions, updates): + assert_allclose( + impacts.loc[:, "estimate (prev)"], + news.prev_impacted_forecasts.stack(**FUTURE_STACK), + atol=1e-12, + ) + assert_allclose( + impacts.loc[:, "impact of revisions"], + news.revision_impacts.astype(float).fillna(0).stack(**FUTURE_STACK), + atol=1e-12, + ) + assert_allclose( + impacts.loc[:, "impact of news"], + news.update_impacts.astype(float).fillna(0).stack(**FUTURE_STACK), + atol=1e-12, + ) + assert_allclose( + impacts.loc[:, "total impact"], + news.total_impacts.stack(**FUTURE_STACK), + atol=1e-12, + ) + assert_allclose( + impacts.loc[:, "estimate (new)"], + news.post_impacted_forecasts.stack(**FUTURE_STACK), + atol=1e-12, + ) + + +@pytest.mark.parametrize('revisions', [True, False]) +@pytest.mark.parametrize('updates', [True, False]) +@pytest.mark.parametrize('revisions_details_start', [True, False, -2]) +def test_sarimax_time_invariant(revisions, updates, revisions_details_start): # Construct previous and updated datasets endog = dta['infl'].copy() comparison_type = None @@ -291,7 +306,8 @@ def test_sarimax_time_invariant(revisions, updates): mod = sarimax.SARIMAX(endog1) res = mod.smooth([0.5, 1.0]) news = res.news(endog2, start='2009Q2', end='2010Q1', - comparison_type=comparison_type) + comparison_type=comparison_type, + revisions_details_start=revisions_details_start) # Compute the true values for each combination of (revsions, updates) impact_dates = pd.period_range(start='2009Q2', end='2010Q1', freq='Q') @@ -998,3 +1014,293 @@ def test_invalid(): msg = 'The index associated with the updated results is not a superset' with pytest.raises(ValueError, match=msg): res.news(endog.values) + + +@pytest.mark.parametrize('revisions_details_start', [True, -10, 200]) +def test_detailed_revisions(revisions_details_start): + # Construct original and revised datasets + y = np.log(dta[['realgdp', 'realcons', + 'realinv', 'cpi']]).diff().iloc[1:] * 100 + y.iloc[-1, 0] = np.nan + + y_revised = y.copy() + revisions = { + ('2009Q2', 'realgdp'): 1.1, + ('2009Q3', 'realcons'): 0.5, + ('2009Q2', 'realinv'): -0.3, + ('2009Q2', 'cpi'): 0.2, + ('2009Q3', 'cpi'): 0.2, + } + for key, diff in revisions.items(): + y_revised.loc[key] += diff + + # Create model and results + mod = varmax.VARMAX(y, trend='n') + ar_coeff = { + 'realgdp': 0.9, + 'realcons': 0.8, + 'realinv': 0.7, + 'cpi': 0.6 + } + params = np.r_[np.diag(list(ar_coeff.values())).flatten(), + [1, 0, 1, 0, 0, 1, 0, 0, 0, 1]] + res = mod.smooth(params) + res_revised = res.apply(y_revised) + news = res_revised.news(res, comparison_type='previous', tolerance=-1, + revisions_details_start=revisions_details_start) + + # Tests + data_revisions = news.data_revisions + revision_details = news.revision_details_by_update.reset_index([2, 3]) + + for key, diff in revisions.items(): + assert_allclose(data_revisions.loc[key, 'revised'], y_revised.loc[key]) + assert_allclose(data_revisions.loc[key, 'observed (prev)'], y.loc[key]) + assert_equal( + # Need to manually cast to numpy for compatibility with + # pandas==1.2.5 + np.array(data_revisions.loc[key, 'detailed impacts computed']), + True) + assert_allclose(revision_details.loc[key, 'revised'], + y_revised.loc[key]) + assert_allclose(revision_details.loc[key, 'observed (prev)'], + y.loc[key]) + assert_allclose(revision_details.loc[key, 'revision'], diff) + + # For revisions to the impact period, the own-weight is equal to 1. + key = ('2009Q3', 'realcons', '2009Q3', 'realcons') + assert_allclose(revision_details.loc[key, 'weight'], 1) + assert_allclose(revision_details.loc[key, 'impact'], + revisions[('2009Q3', 'realcons')]) + key = ('2009Q3', 'cpi', '2009Q3', 'cpi') + assert_allclose(revision_details.loc[key, 'weight'], 1) + assert_allclose(revision_details.loc[key, 'impact'], + revisions[('2009Q3', 'cpi')]) + + # For revisions just before the impact period, all weights are equal to + # zero unless there is a missing value in the impact period, in which case + # the weight is equal to the AR coefficient + key = ('2009Q2', 'realgdp', '2009Q3', 'realgdp') + assert_allclose(revision_details.loc[key, 'weight'], ar_coeff['realgdp']) + assert_allclose(revision_details.loc[key, 'impact'], + ar_coeff['realgdp'] * revisions[('2009Q2', 'realgdp')]) + key = ('2009Q2', 'realinv', '2009Q3', 'realinv') + assert_allclose(revision_details.loc[key, 'weight'], 0) + assert_allclose(revision_details.loc[key, 'impact'], 0) + key = ('2009Q2', 'cpi', '2009Q3', 'cpi') + assert_allclose(revision_details.loc[key, 'weight'], 0) + assert_allclose(revision_details.loc[key, 'impact'], 0) + + # Check the impacts table + + # Since we only have revisions, all impacts are due to revisions + assert_allclose(news.impacts['impact of news'], 0) + assert_allclose(news.impacts['total impact'], + news.impacts['impact of revisions']) + + # Check the values for estimates + for name in ['cpi', 'realcons', 'realinv']: + assert_allclose( + news.impacts.loc[('2009Q3', name), 'estimate (new)'], + y_revised.loc['2009Q3', name]) + assert_allclose( + news.impacts.loc[('2009Q3', name), 'estimate (prev)'], + y.loc['2009Q3', name]) + # Have to handle real GDP separately since the 2009Q3 value is missing + name = 'realgdp' + assert_allclose( + news.impacts.loc[('2009Q3', name), 'estimate (new)'], + y_revised.loc['2009Q2', name] * ar_coeff[name]) + assert_allclose( + news.impacts.loc[('2009Q3', name), 'estimate (prev)'], + y.loc['2009Q2', name] * ar_coeff[name]) + + # Check that the values of revision impacts sum up correctly + assert_allclose( + news.impacts['impact of revisions'], + revision_details.groupby(level=[2, 3]).sum()['impact'] + ) + + +@pytest.mark.parametrize('revisions_details_start', [False, 202]) +def test_grouped_revisions(revisions_details_start): + # Tests for computing revision impacts when all revisions are grouped + # together (i.e. no detailed impacts are computed ) + # Construct original and revised datasets + y = np.log(dta[['realgdp', 'realcons', + 'realinv', 'cpi']]).diff().iloc[1:] * 100 + y.iloc[-1, 0] = np.nan + + y_revised = y.copy() + revisions = { + ('2009Q2', 'realgdp'): 1.1, + ('2009Q3', 'realcons'): 0.5, + ('2009Q2', 'realinv'): -0.3, + ('2009Q2', 'cpi'): 0.2, + ('2009Q3', 'cpi'): 0.2, + } + for key, diff in revisions.items(): + y_revised.loc[key] += diff + + # Create model and results + mod = varmax.VARMAX(y, trend='n') + ar_coeff = { + 'realgdp': 0.9, + 'realcons': 0.8, + 'realinv': 0.7, + 'cpi': 0.6 + } + params = np.r_[np.diag(list(ar_coeff.values())).flatten(), + [1, 0, 1, 0, 0, 1, 0, 0, 0, 1]] + res = mod.smooth(params) + res_revised = res.apply(y_revised) + news = res_revised.news(res, comparison_type='previous', tolerance=-1, + revisions_details_start=revisions_details_start) + + # Tests + data_revisions = news.data_revisions + revision_details = news.revision_details_by_update.reset_index([2, 3]) + + for key, diff in revisions.items(): + assert_allclose(data_revisions.loc[key, 'revised'], y_revised.loc[key]) + assert_allclose(data_revisions.loc[key, 'observed (prev)'], y.loc[key]) + assert_equal( + # Need to manually cast to numpy for compatibility with + # pandas==1.2.5 + np.array(data_revisions.loc[key, 'detailed impacts computed']), + False) + + # For grouped data, should not have any of revised, observed (prev), + # revision, weight + key = ('2009Q3', 'all prior revisions', '2009Q3') + cols = ['revised', 'observed (prev)', 'revision', 'weight'] + assert np.all(revision_details.loc[key, cols].isnull()) + + # Expected grouped impacts are the sum of the detailed impacts from + # `test_detailed_revisions` + assert_allclose(revision_details.loc[key + ('realgdp',), 'impact'], + ar_coeff['realgdp'] * revisions[('2009Q2', 'realgdp')]) + assert_allclose(revision_details.loc[key + ('realcons',), 'impact'], + revisions[('2009Q3', 'realcons')]) + assert_allclose(revision_details.loc[key + ('realinv',), 'impact'], 0) + assert_allclose(revision_details.loc[key + ('cpi',), 'impact'], + revisions[('2009Q3', 'cpi')]) + + # Check the values for estimates + for name in ['cpi', 'realcons', 'realinv']: + assert_allclose( + news.impacts.loc[('2009Q3', name), 'estimate (new)'], + y_revised.loc['2009Q3', name]) + assert_allclose( + news.impacts.loc[('2009Q3', name), 'estimate (prev)'], + y.loc['2009Q3', name]) + # Have to handle real GDP separately since the 2009Q3 value is missing + name = 'realgdp' + assert_allclose( + news.impacts.loc[('2009Q3', name), 'estimate (new)'], + y_revised.loc['2009Q2', name] * ar_coeff[name]) + assert_allclose( + news.impacts.loc[('2009Q3', name), 'estimate (prev)'], + y.loc['2009Q2', name] * ar_coeff[name]) + + # Check that the values of revision impacts sum up correctly + assert_allclose( + news.impacts['impact of revisions'], + revision_details.groupby(level=[2, 3]).sum()['impact'] + ) + + +@pytest.mark.parametrize('revisions_details_start', [-1, 201]) +def test_mixed_revisions(revisions_details_start): + # Construct original and revised datasets + y = np.log(dta[['realgdp', 'realcons', + 'realinv', 'cpi']]).diff().iloc[1:] * 100 + y.iloc[-1, 0] = np.nan + + y_revised = y.copy() + revisions = { + ('2009Q2', 'realgdp'): 1.1, + ('2009Q3', 'realcons'): 0.5, + ('2009Q2', 'realinv'): -0.3, + ('2009Q2', 'cpi'): 0.2, + ('2009Q3', 'cpi'): 0.2, + } + for key, diff in revisions.items(): + y_revised.loc[key] += diff + + # Create model and results + mod = varmax.VARMAX(y, trend='n') + ar_coeff = { + 'realgdp': 0.9, + 'realcons': 0.8, + 'realinv': 0.7, + 'cpi': 0.6 + } + params = np.r_[np.diag(list(ar_coeff.values())).flatten(), + [1, 0, 1, 0, 0, 1, 0, 0, 0, 1]] + res = mod.smooth(params) + res_revised = res.apply(y_revised) + news = res_revised.news(res, comparison_type='previous', tolerance=-1, + revisions_details_start=revisions_details_start) + + # Tests + data_revisions = news.data_revisions + revision_details = news.revision_details_by_update.reset_index([2, 3]) + + for key, diff in revisions.items(): + assert_allclose(data_revisions.loc[key, 'revised'], y_revised.loc[key]) + assert_allclose(data_revisions.loc[key, 'observed (prev)'], y.loc[key]) + # Revisions to 2009Q2 are grouped (i.e. no details are computed), + # while revisions to 2009Q3 have detailed impacts computed + expected_details_computed = key[0] == '2009Q3' + assert_equal( + # Need to manually cast to numpy for compatibility with + # pandas==1.2.5 + np.array(data_revisions.loc[key, 'detailed impacts computed']), + expected_details_computed) + + # For grouped data, should not have any of revised, observed (prev), + # revision, weight + key = ('2009Q2', 'all prior revisions', '2009Q3') + cols = ['revised', 'observed (prev)', 'revision', 'weight'] + assert np.all(revision_details.loc[key, cols].isnull()) + + # Expected grouped impacts are the sum of the detailed impacts from + # `test_detailed_revisions` for revisions to 2009Q2 + assert_allclose(revision_details.loc[key + ('realgdp',), 'impact'], + ar_coeff['realgdp'] * revisions[('2009Q2', 'realgdp')]) + assert_allclose(revision_details.loc[key + ('realinv',), 'impact'], 0) + + # Expected detailed impacts are for revisions to 2009Q3 + # (for revisions to the impact period, the own-weight is equal to 1) + key = ('2009Q3', 'realcons', '2009Q3', 'realcons') + assert_allclose(revision_details.loc[key, 'weight'], 1) + assert_allclose(revision_details.loc[key, 'impact'], + revisions[('2009Q3', 'realcons')]) + key = ('2009Q3', 'cpi', '2009Q3', 'cpi') + assert_allclose(revision_details.loc[key, 'weight'], 1) + assert_allclose(revision_details.loc[key, 'impact'], + revisions[('2009Q3', 'cpi')]) + + # Check the values for estimates + for name in ['cpi', 'realcons', 'realinv']: + assert_allclose( + news.impacts.loc[('2009Q3', name), 'estimate (new)'], + y_revised.loc['2009Q3', name]) + assert_allclose( + news.impacts.loc[('2009Q3', name), 'estimate (prev)'], + y.loc['2009Q3', name]) + # Have to handle real GDP separately since the 2009Q3 value is missing + name = 'realgdp' + assert_allclose( + news.impacts.loc[('2009Q3', name), 'estimate (new)'], + y_revised.loc['2009Q2', name] * ar_coeff[name]) + assert_allclose( + news.impacts.loc[('2009Q3', name), 'estimate (prev)'], + y.loc['2009Q2', name] * ar_coeff[name]) + + # Check that the values of revision impacts sum up correctly + assert_allclose( + news.impacts['impact of revisions'], + revision_details.groupby(level=[2, 3]).sum()['impact'] + ) diff --git a/statsmodels/tsa/statespace/tests/test_simulate.py b/statsmodels/tsa/statespace/tests/test_simulate.py index ec6b3d147d1..4b6486193b0 100644 --- a/statsmodels/tsa/statespace/tests/test_simulate.py +++ b/statsmodels/tsa/statespace/tests/test_simulate.py @@ -5,17 +5,26 @@ License: Simplified-BSD """ +from statsmodels.compat.pandas import MONTH_END + import numpy as np -import pandas as pd from numpy.testing import assert_, assert_allclose, assert_equal +import pandas as pd import pytest from scipy.signal import lfilter +from statsmodels.tools.sm_exceptions import ( + EstimationWarning, + SpecificationWarning, +) +from statsmodels.tsa.statespace import ( + dynamic_factor, + sarimax, + structural, + varmax, +) + from .test_impulse_responses import TVSS -from statsmodels.tools.sm_exceptions import SpecificationWarning, \ - EstimationWarning -from statsmodels.tsa.statespace import (sarimax, structural, varmax, - dynamic_factor) def test_arma_lfilter(): @@ -1578,7 +1587,7 @@ def test_pandas_univariate_rangeindex_repetitions(): def test_pandas_univariate_dateindex(): # Simulation will maintain have date index - ix = pd.date_range(start='2000', periods=2, freq='M') + ix = pd.date_range(start='2000', periods=2, freq=MONTH_END) endog = pd.Series(np.zeros(2), index=ix) mod = sarimax.SARIMAX(endog) res = mod.filter([0.5, 1.]) @@ -1586,7 +1595,7 @@ def test_pandas_univariate_dateindex(): # Default simulate anchors to the start of the sample actual = res.simulate(2, state_shocks=np.zeros(2), initial_state=np.zeros(1)) - ix = pd.date_range(start='2000-01', periods=2, freq='M') + ix = pd.date_range(start='2000-01', periods=2, freq=MONTH_END) desired = pd.Series([0, 0], index=ix) assert_allclose(actual, desired) assert_(actual.index.equals(desired.index)) @@ -1594,14 +1603,14 @@ def test_pandas_univariate_dateindex(): # Alternative anchor changes the index actual = res.simulate(2, anchor=2, state_shocks=np.zeros(2), initial_state=np.zeros(1)) - ix = pd.date_range(start='2000-03', periods=2, freq='M') + ix = pd.date_range(start='2000-03', periods=2, freq=MONTH_END) desired = pd.Series([0, 0], index=ix) assert_allclose(actual, desired) def test_pandas_univariate_dateindex_repetitions(): # Simulation will maintain have date index - ix = pd.date_range(start='2000', periods=2, freq='M') + ix = pd.date_range(start='2000', periods=2, freq=MONTH_END) endog = pd.Series(np.zeros(2), index=ix) mod = sarimax.SARIMAX(endog) res = mod.filter([0.5, 1.]) @@ -1609,7 +1618,7 @@ def test_pandas_univariate_dateindex_repetitions(): # Default simulate anchors to the start of the sample actual = res.simulate(2, state_shocks=np.zeros(2), initial_state=np.zeros(1), repetitions=2) - ix = pd.date_range(start='2000-01', periods=2, freq='M') + ix = pd.date_range(start='2000-01', periods=2, freq=MONTH_END) columns = pd.MultiIndex.from_product([['y'], [0, 1]]) desired = pd.DataFrame(np.zeros((2, 2)), index=ix, columns=columns) assert_allclose(actual, desired) @@ -1618,7 +1627,7 @@ def test_pandas_univariate_dateindex_repetitions(): # Alternative anchor changes the index actual = res.simulate(2, anchor=2, state_shocks=np.zeros(2), initial_state=np.zeros(1), repetitions=2) - ix = pd.date_range(start='2000-03', periods=2, freq='M') + ix = pd.date_range(start='2000-03', periods=2, freq=MONTH_END) columns = pd.MultiIndex.from_product([['y'], [0, 1]]) desired = pd.DataFrame(np.zeros((2, 2)), index=ix, columns=columns) assert_allclose(actual, desired) @@ -1674,7 +1683,7 @@ def test_pandas_multivariate_rangeindex_repetitions(): def test_pandas_multivariate_dateindex(): # Simulate will also have RangeIndex - ix = pd.date_range(start='2000', periods=2, freq='M') + ix = pd.date_range(start='2000', periods=2, freq=MONTH_END) endog = pd.DataFrame(np.zeros((2, 2)), index=ix) mod = varmax.VARMAX(endog, trend='n') res = mod.filter([0.5, 0., 0., 0.2, 1., 0., 1.]) @@ -1688,7 +1697,7 @@ def test_pandas_multivariate_dateindex(): # Alternative anchor changes the index actual = res.simulate(2, anchor=2, state_shocks=np.zeros((2, 2)), initial_state=np.zeros(2)) - ix = pd.date_range(start='2000-03', periods=2, freq='M') + ix = pd.date_range(start='2000-03', periods=2, freq=MONTH_END) desired = pd.DataFrame(np.zeros((2, 2)), index=ix) assert_allclose(actual, desired) assert_(actual.index.equals(desired.index)) @@ -1696,7 +1705,7 @@ def test_pandas_multivariate_dateindex(): def test_pandas_multivariate_dateindex_repetitions(): # Simulate will also have RangeIndex - ix = pd.date_range(start='2000', periods=2, freq='M') + ix = pd.date_range(start='2000', periods=2, freq=MONTH_END) endog = pd.DataFrame(np.zeros((2, 2)), columns=['y1', 'y2'], index=ix) mod = varmax.VARMAX(endog, trend='n') res = mod.filter([0.5, 0., 0., 0.2, 1., 0., 1.]) @@ -1712,7 +1721,7 @@ def test_pandas_multivariate_dateindex_repetitions(): # Alternative anchor changes the index actual = res.simulate(2, anchor=2, state_shocks=np.zeros((2, 2)), initial_state=np.zeros(2), repetitions=2) - ix = pd.date_range(start='2000-03', periods=2, freq='M') + ix = pd.date_range(start='2000-03', periods=2, freq=MONTH_END) columns = pd.MultiIndex.from_product([['y1', 'y2'], [0, 1]]) desired = pd.DataFrame(np.zeros((2, 4)), index=ix, columns=columns) assert_allclose(actual, desired) @@ -1722,7 +1731,7 @@ def test_pandas_multivariate_dateindex_repetitions(): def test_pandas_anchor(): # Test that anchor with dates works - ix = pd.date_range(start='2000', periods=2, freq='M') + ix = pd.date_range(start='2000', periods=2, freq=MONTH_END) endog = pd.Series(np.zeros(2), index=ix) mod = sarimax.SARIMAX(endog) res = mod.filter([0.5, 1.]) diff --git a/statsmodels/tsa/statespace/tests/test_structural.py b/statsmodels/tsa/statespace/tests/test_structural.py index 76b8a101cda..e27102bad03 100644 --- a/statsmodels/tsa/statespace/tests/test_structural.py +++ b/statsmodels/tsa/statespace/tests/test_structural.py @@ -66,11 +66,11 @@ def run_ucm(name, use_exact_diffuse=False): freqstr = freq[0] if freq is not None else values.index.freqstr[0] if 'cycle_period_bounds' in kwargs: cycle_period_bounds = kwargs['cycle_period_bounds'] - elif freqstr == 'A': + elif freqstr in ('A', 'AS', 'Y', 'YS'): cycle_period_bounds = (1.5, 12) - elif freqstr == 'Q': + elif freqstr in ('Q', 'QS'): cycle_period_bounds = (1.5*4, 12*4) - elif freqstr == 'M': + elif freqstr in ('M', 'MS'): cycle_period_bounds = (1.5*12, 12*12) else: # If we have no information on data frequency, require the diff --git a/statsmodels/tsa/statespace/tests/test_weights.py b/statsmodels/tsa/statespace/tests/test_weights.py index d08bcb610f0..85cfce84e53 100644 --- a/statsmodels/tsa/statespace/tests/test_weights.py +++ b/statsmodels/tsa/statespace/tests/test_weights.py @@ -506,8 +506,10 @@ def test_compute_t_compute_j(compute_j, compute_t, reset_randomstate): actual, _, _ = tools.compute_smoothed_state_weights( res, compute_t=compute_t, compute_j=compute_j) - compute_t = np.atleast_1d(compute_t) - compute_j = np.atleast_1d(compute_j) + compute_t = np.unique(np.atleast_1d(compute_t)) + compute_t.sort() + compute_j = np.unique(np.atleast_1d(compute_j)) + compute_j.sort() for t in np.arange(10): if t not in compute_t: desired[t, :] = np.nan @@ -515,6 +517,10 @@ def test_compute_t_compute_j(compute_j, compute_t, reset_randomstate): if j not in compute_j: desired[:, j] = np.nan + # Subset to the actual required compute_t and compute_j + ix = np.ix_(compute_t, compute_j) + desired = desired[ix] + assert_allclose(actual, desired, atol=1e-7) diff --git a/statsmodels/tsa/statespace/tools.py b/statsmodels/tsa/statespace/tools.py index a45fd896410..43c708fe48b 100644 --- a/statsmodels/tsa/statespace/tools.py +++ b/statsmodels/tsa/statespace/tools.py @@ -1940,13 +1940,15 @@ def _compute_smoothed_state_weights(ssm, compute_t=None, compute_j=None, # Re-order missing entries correctly and transpose to the appropriate # shape - if np.any(_model.nmissing): + t0 = min(compute_t[0], compute_j[0]) + missing = np.isnan(ssm.endog[:, t0:]) + if np.any(missing): shape = weights.shape # Transpose m, p, t, j, -> t, m, p, j so that we can use the # `reorder_missing_matrix` function weights = np.asfortranarray(weights.transpose(2, 0, 1, 3).reshape( shape[2] * shape[0], shape[1], shape[3], order='C')) - missing = np.asfortranarray(np.isnan(ssm.endog).astype(np.int32)) + missing = np.asfortranarray(missing.astype(np.int32)) reorder_missing_matrix(weights, missing, reorder_cols=True, inplace=True) # Transpose t, m, p, j -> t, j, m, p, @@ -1962,6 +1964,13 @@ def _compute_smoothed_state_weights(ssm, compute_t=None, compute_j=None, # Transpose m, l, t -> t, m, l prior_weights = prior_weights.transpose(2, 0, 1) + # Subset to the actual computed t, j elements + ix_tj = np.ix_(compute_t - t0, compute_j - t0) + weights = weights[ix_tj] + state_intercept_weights = state_intercept_weights[ix_tj] + if compute_prior_weights: + prior_weights = prior_weights[compute_t - t0] + return weights, state_intercept_weights, prior_weights diff --git a/statsmodels/tsa/statespace/varmax.py b/statsmodels/tsa/statespace/varmax.py index 2aaa7d970a5..303044d264c 100644 --- a/statsmodels/tsa/statespace/varmax.py +++ b/statsmodels/tsa/statespace/varmax.py @@ -1062,7 +1062,15 @@ def simulate(self, nsimulations, measurement_shocks=None, return out def _news_previous_results(self, previous, start, end, periods, + revisions_details_start=False, state_index=None): + # TODO: tests for: + # - the model cloning used in `kalman_smoother.news` works when we + # have time-varying exog (i.e. or do we need to somehow explicitly + # call the _set_final_exog and _set_final_predicted_state methods + # on the rev_mod / revision_results) + # - in the case of revisions to `endog`, should the revised model use + # the `previous` exog? or the `revised` exog? # We need to figure out the out-of-sample exog, so that we can add back # in the last exog, predicted state exog = None @@ -1070,34 +1078,16 @@ def _news_previous_results(self, previous, start, end, periods, if self.model.k_exog > 0 and out_of_sample > 0: exog = self.model.exog[-out_of_sample:] - # We also need to manually compute the `revised` results, - rev_endog = self.model.endog[:previous.nobs].copy() - rev_endog[previous.filter_results.missing.astype(bool).T] = np.nan - has_revisions = not np.allclose(rev_endog, previous.model.endog, - equal_nan=True) - revised_results = None - if has_revisions: - rev_exog = None - if self.model.exog is not None: - rev_exog = self.model.exog[:previous.nobs].copy() - rev_mod = previous.model.clone(rev_endog, exog=rev_exog) - revised = rev_mod.smooth(self.params) - # Compute the news with contextlib.ExitStack() as stack: stack.enter_context(previous.model._set_final_exog(exog)) stack.enter_context(previous._set_final_predicted_state( exog, out_of_sample)) - if has_revisions: - stack.enter_context(revised.model._set_final_exog(exog)) - stack.enter_context(revised._set_final_predicted_state( - exog, out_of_sample)) - revised_results = revised.smoother_results - out = self.smoother_results.news( previous.smoother_results, start=start, end=end, - revised=revised_results, state_index=state_index) + revisions_details_start=revisions_details_start, + state_index=state_index) return out @Appender(MLEResults.summary.__doc__) diff --git a/statsmodels/tsa/stattools.py b/statsmodels/tsa/stattools.py index 6b30f217308..891c77634fd 100644 --- a/statsmodels/tsa/stattools.py +++ b/statsmodels/tsa/stattools.py @@ -8,7 +8,7 @@ from statsmodels.compat.python import Literal, lzip from statsmodels.compat.scipy import _next_regular -from typing import Tuple +from typing import Union, List import warnings import numpy as np @@ -40,6 +40,8 @@ from statsmodels.tsa.adfvalues import mackinnoncrit, mackinnonp from statsmodels.tsa.tsatools import add_trend, lagmat, lagmat2ds +ArrayLike1D = Union[np.ndarray, pd.Series, List[float]] + __all__ = [ "acovf", "acf", @@ -709,8 +711,11 @@ def acf( else: return acf, qstat, pvalue - -def pacf_yw(x, nlags=None, method="adjusted"): +def pacf_yw( + x: ArrayLike1D, + nlags: int | None = None, + method: Literal["adjusted", "mle"] = "adjusted", +) -> np.ndarray: """ Partial autocorrelation estimated with non-recursive yule_walker. @@ -747,8 +752,7 @@ def pacf_yw(x, nlags=None, method="adjusted"): nlags = int_like(nlags, "nlags", optional=True) nobs = x.shape[0] if nlags is None: - nlags = min(int(10 * np.log10(nobs)), nobs - 1) - + nlags = max(min(int(10 * np.log10(nobs)), nobs - 1), 1) method = string_like(method, "method", options=("adjusted", "mle")) pacf = [1.0] with warnings.catch_warnings(): @@ -758,7 +762,9 @@ def pacf_yw(x, nlags=None, method="adjusted"): return np.array(pacf) -def pacf_burg(x, nlags=None, demean=True): +def pacf_burg( + x: ArrayLike1D, nlags: int | None = None, demean: bool = True +) -> tuple[np.ndarray, np.ndarray]: """ Calculate Burg"s partial autocorrelation estimator. @@ -800,6 +806,7 @@ def pacf_burg(x, nlags=None, demean=True): x = x - x.mean() nobs = x.shape[0] p = nlags if nlags is not None else min(int(10 * np.log10(nobs)), nobs - 1) + p = max(p, 1) if p > nobs - 1: raise ValueError("nlags must be smaller than nobs - 1") d = np.zeros(p + 1) @@ -818,14 +825,19 @@ def pacf_burg(x, nlags=None, demean=True): v[1:] = last_v[1:] - pacf[i] * last_u[:-1] d[i + 1] = (1 - pacf[i] ** 2) * d[i] - v[i] ** 2 - u[-1] ** 2 pacf[i + 1] = 2 / d[i + 1] * v[i + 1 :].dot(u[i:-1]) - sigma2 = (1 - pacf ** 2) * d / (2.0 * (nobs - np.arange(0, p + 1))) + sigma2 = (1 - pacf**2) * d / (2.0 * (nobs - np.arange(0, p + 1))) pacf[0] = 1 # Insert the 0 lag partial autocorrel return pacf, sigma2 @deprecate_kwarg("unbiased", "adjusted") -def pacf_ols(x, nlags=None, efficient=True, adjusted=False): +def pacf_ols( + x: ArrayLike1D, + nlags: int | None = None, + efficient: bool = True, + adjusted: bool = False, +) -> np.ndarray: """ Calculate partial autocorrelations via OLS. @@ -885,8 +897,9 @@ def pacf_ols(x, nlags=None, efficient=True, adjusted=False): adjusted = bool_like(adjusted, "adjusted") nobs = x.shape[0] if nlags is None: - nlags = min(int(10 * np.log10(nobs)), nobs - 1) - + nlags = max(min(int(10 * np.log10(nobs)), nobs // 2), 1) + if nlags > nobs//2: + raise ValueError(f"nlags must be smaller than nobs // 2 ({nobs//2})") pacf = np.empty(nlags + 1) pacf[0] = 1.0 if efficient: @@ -903,14 +916,30 @@ def pacf_ols(x, nlags=None, efficient=True, adjusted=False): params = lstsq(xlags[:, :k], x0, rcond=None)[0] # Last coefficient corresponds to PACF value (see [1]) pacf[k] = np.squeeze(params[-1]) - if adjusted: pacf *= nobs / (nobs - np.arange(nlags + 1)) - return pacf -def pacf(x, nlags=None, method="ywadjusted", alpha=None): +def pacf( + x: ArrayLike1D, + nlags: int | None = None, + method: Literal[ + "yw", + "ywadjusted", + "ols", + "ols-inefficient", + "ols-adjusted", + "ywm", + "ywmle", + "ld", + "ldadjusted", + "ldb", + "ldbiased", + "burg", + ] = "ywadjusted", + alpha: float | None = None, +) -> np.ndarray | tuple[np.ndarray, np.ndarray]: """ Partial autocorrelation estimate. @@ -996,7 +1025,7 @@ def pacf(x, nlags=None, method="ywadjusted", alpha=None): "ldb", "ldbiased", "ld_biased", - "burg" + "burg", ) x = array_like(x, "x", maxdim=2) method = string_like(method, "method", options=methods) @@ -1005,13 +1034,13 @@ def pacf(x, nlags=None, method="ywadjusted", alpha=None): nobs = x.shape[0] if nlags is None: nlags = min(int(10 * np.log10(nobs)), nobs // 2 - 1) - if nlags >= x.shape[0] // 2: + nlags = max(nlags, 1) + if nlags > x.shape[0] // 2: raise ValueError( "Can only compute partial correlations for lags up to 50% of the " f"sample size. The requested nlags {nlags} must be < " f"{x.shape[0] // 2}." ) - if method in ("ols", "ols-inefficient", "ols-adjusted"): efficient = "inefficient" not in method adjusted = "adjusted" in method @@ -1031,7 +1060,6 @@ def pacf(x, nlags=None, method="ywadjusted", alpha=None): acv = acovf(x, adjusted=False, fft=False) ld_ = levinson_durbin(acv, nlags=nlags, isacov=True) ret = ld_[2] - if alpha is not None: varacf = 1.0 / len(x) # for all lags >=1 interval = stats.norm.ppf(1.0 - alpha / 2.0) * np.sqrt(varacf) @@ -1538,7 +1566,7 @@ def grangercausalitytests(x, maxlag, addconst=True, verbose=None): if x.shape[0] <= 3 * maxlag + int(addconst): raise ValueError( "Insufficient observations. Maximum allowable " - "lag is {0}".format(int((x.shape[0] - int(addconst)) / 3) - 1) + "lag is {}".format(int((x.shape[0] - int(addconst)) / 3) - 1) ) resli = {} @@ -1945,7 +1973,7 @@ def kpss( regression: Literal["c", "ct"] = "c", nlags: Literal["auto", "legacy"] | int = "auto", store: bool = False, -) -> Tuple[float, float, int, dict[str, float]]: +) -> tuple[float, float, int, dict[str, float]]: """ Kwiatkowski-Phillips-Schmidt-Shin test for stationarity. @@ -2032,7 +2060,7 @@ def kpss( # if m is not one, n != m * n if nobs != x.size: - raise ValueError("x of shape {0} not understood".format(x.shape)) + raise ValueError(f"x of shape {x.shape} not understood") if hypo == "ct": # p. 162 Kwiatkowski et al. (1992): y_t = beta * t + r_t + e_t, @@ -2106,8 +2134,8 @@ def kpss( rstore.nobs = nobs stationary_type = "level" if hypo == "c" else "trend" - rstore.H0 = "The series is {0} stationary".format(stationary_type) - rstore.HA = "The series is not {0} stationary".format(stationary_type) + rstore.H0 = f"The series is {stationary_type} stationary" + rstore.HA = f"The series is not {stationary_type} stationary" return kpss_stat, p_value, crit_dict, rstore else: @@ -2200,7 +2228,7 @@ def range_unit_root_test(x, store=False): # if m is not one, n != m * n if nobs != x.size: - raise ValueError("x of shape {0} not understood".format(x.shape)) + raise ValueError(f"x of shape {x.shape} not understood") # Table from [1] has been replicated using 200,000 samples # Critical values for new n_obs values have been identified diff --git a/statsmodels/tsa/stl/tests/test_stl.py b/statsmodels/tsa/stl/tests/test_stl.py index fc51365243d..5862dcfacdc 100644 --- a/statsmodels/tsa/stl/tests/test_stl.py +++ b/statsmodels/tsa/stl/tests/test_stl.py @@ -1,3 +1,5 @@ +from statsmodels.compat.pandas import MONTH_END + import os import pickle @@ -274,7 +276,7 @@ def test_period_detection(default_kwargs): del class_kwargs["period"] endog = class_kwargs["endog"] - index = pd.date_range("1-1-1959", periods=348, freq="M") + index = pd.date_range("1-1-1959", periods=348, freq=MONTH_END) class_kwargs["endog"] = pd.Series(endog, index=index) mod = STL(**class_kwargs) @@ -336,6 +338,6 @@ def test_pickle(default_kwargs): def test_squezable_to_1d(): data = co2.load().data - data = data.resample("M").mean().ffill() + data = data.resample(MONTH_END).mean().ffill() res = STL(data).fit() assert isinstance(res, DecomposeResult) diff --git a/statsmodels/tsa/tests/test_ar.py b/statsmodels/tsa/tests/test_ar.py index 161ce0c6522..f80b3ca05c5 100644 --- a/statsmodels/tsa/tests/test_ar.py +++ b/statsmodels/tsa/tests/test_ar.py @@ -1,11 +1,12 @@ """ Test AR Model """ +from statsmodels.compat.pandas import MONTH_END from statsmodels.compat.pytest import pytest_warns -from typing import NamedTuple, Union import datetime as dt from itertools import product +from typing import NamedTuple, Union import numpy as np from numpy.testing import assert_allclose, assert_almost_equal @@ -40,7 +41,7 @@ def gen_ar_data(nobs): rs = np.random.RandomState(982739) - idx = pd.date_range(dt.datetime(1900, 1, 1), freq="M", periods=nobs) + idx = pd.date_range(dt.datetime(1900, 1, 1), freq=MONTH_END, periods=nobs) return pd.Series(rs.standard_normal(nobs), index=idx), rs @@ -273,7 +274,7 @@ def gen_data(nobs, nexog, pandas, seed=92874765): exog = rs.standard_normal((nobs, nexog)) if nexog else None if pandas: index = pd.date_range( - dt.datetime(1999, 12, 31), periods=nobs, freq="M" + dt.datetime(1999, 12, 31), periods=nobs, freq=MONTH_END ) endog = pd.Series(endog, name="endog", index=index) if nexog: @@ -722,7 +723,9 @@ def test_ar_order_select(): y = arma_generate_sample([1, -0.75, 0.3], [1], 100) ts = Series( y, - index=date_range(start=dt.datetime(1990, 1, 1), periods=100, freq="M"), + index=date_range( + start=dt.datetime(1990, 1, 1), periods=100, freq=MONTH_END + ), ) res = ar_select_order(ts, maxlag=12, ic="aic") assert tuple(res.ar_lags) == (1, 2) @@ -808,7 +811,7 @@ def test_equiv_dynamic(reset_randomstate): res = mod.fit() pred0 = res.predict(500, 800, dynamic=0) pred1 = res.predict(500, 800, dynamic=True) - idx = pd.date_range(dt.datetime(2000, 1, 30), periods=1001, freq="M") + idx = pd.date_range(dt.datetime(2000, 1, 30), periods=1001, freq=MONTH_END) y = pd.Series(y, index=idx) mod = AutoReg(y, 1) res = mod.fit() @@ -853,8 +856,12 @@ def test_predict_seasonal(): for i in range(1, 1001): y[i] = 10 + 0.9 * y[i - 1] + e[i] + effects[i % 12] ys = pd.Series( - y, index=pd.date_range(dt.datetime(1950, 1, 1), periods=1001, freq="M") + y, + index=pd.date_range( + dt.datetime(1950, 1, 1), periods=1001, freq=MONTH_END + ), ) + mod = AutoReg(ys, 1, seasonal=True) res = mod.fit() c = res.params.iloc[0] @@ -867,14 +874,14 @@ def test_predict_seasonal(): for i in range(1, 201): direct[i] = direct[i - 1] * ar + c + seasons[(900 + i) % 12] direct = pd.Series( - direct, index=pd.date_range(ys.index[900], periods=201, freq="M") + direct, index=pd.date_range(ys.index[900], periods=201, freq=MONTH_END) ) assert_series_equal(pred, direct) pred = res.predict(900, dynamic=False) direct = y[899:-1] * ar + c + seasons[np.arange(900, 1001) % 12] direct = pd.Series( - direct, index=pd.date_range(ys.index[900], periods=101, freq="M") + direct, index=pd.date_range(ys.index[900], periods=101, freq=MONTH_END) ) assert_series_equal(pred, direct) @@ -888,7 +895,10 @@ def test_predict_exog(): for i in range(3, 1001): y[i] = 10 + 0.9 * y[i - 1] - 0.5 * y[i - 3] + e[i] + x[i].sum() ys = pd.Series( - y, index=pd.date_range(dt.datetime(1950, 1, 1), periods=1001, freq="M") + y, + index=pd.date_range( + dt.datetime(1950, 1, 1), periods=1001, freq=MONTH_END + ), ) xdf = pd.DataFrame(x, columns=["x0", "x1"], index=ys.index) mod = AutoReg(ys, [1, 3], trend="c", exog=xdf) @@ -903,7 +913,7 @@ def test_predict_exog(): phi_2 = ar.iloc[1] direct = c + phi_1 * y[899:-1] + phi_2 * y[897:-3] direct += ex[0] * x[900:, 0] + ex[1] * x[900:, 1] - idx = pd.date_range(ys.index[900], periods=101, freq="M") + idx = pd.date_range(ys.index[900], periods=101, freq=MONTH_END) direct = pd.Series(direct, index=idx) assert_series_equal(pred, direct) exog_oos = rs.standard_normal((100, 2)) @@ -923,7 +933,7 @@ def test_predict_exog(): direct[i] += exog_oos[i - 101] @ ex direct = pd.Series( - direct, index=pd.date_range(ys.index[900], periods=201, freq="M") + direct, index=pd.date_range(ys.index[900], periods=201, freq=MONTH_END) ) assert_series_equal(pred, direct) @@ -936,7 +946,10 @@ def test_predict_irregular_ar(): for i in range(3, 1001): y[i] = 10 + 0.9 * y[i - 1] - 0.5 * y[i - 3] + e[i] ys = pd.Series( - y, index=pd.date_range(dt.datetime(1950, 1, 1), periods=1001, freq="M") + y, + index=pd.date_range( + dt.datetime(1950, 1, 1), periods=1001, freq=MONTH_END + ) ) mod = AutoReg(ys, [1, 3], trend="ct") res = mod.fit() @@ -954,7 +967,7 @@ def test_predict_irregular_ar(): c + t * (901 + i) + ar[0] * direct[i - 1] + ar[1] * direct[i - 3] ) direct = pd.Series( - direct, index=pd.date_range(ys.index[900], periods=201, freq="M") + direct, index=pd.date_range(ys.index[900], periods=201, freq=MONTH_END) ) assert_series_equal(pred, direct) @@ -965,7 +978,7 @@ def test_predict_irregular_ar(): + ar[0] * y[899:-1] + ar[1] * y[897:-3] ) - idx = pd.date_range(ys.index[900], periods=101, freq="M") + idx = pd.date_range(ys.index[900], periods=101, freq=MONTH_END) direct = pd.Series(direct, index=idx) assert_series_equal(pred, direct) @@ -980,12 +993,20 @@ def test_forecast_start_end_equiv(dynamic): for i in range(1, 1001): y[i] = 10 + 0.9 * y[i - 1] + e[i] + effects[i % 12] ys = pd.Series( - y, index=pd.date_range(dt.datetime(1950, 1, 1), periods=1001, freq="M") + y, index=pd.date_range( + dt.datetime(1950, 1, 1), + periods=1001, + freq=MONTH_END + ) ) mod = AutoReg(ys, 1, seasonal=True) res = mod.fit() pred_int = res.predict(1000, 1020, dynamic=dynamic) - dates = pd.date_range(dt.datetime(1950, 1, 1), periods=1021, freq="M") + dates = pd.date_range( + dt.datetime(1950, 1, 1), + periods=1021, + freq=MONTH_END + ) pred_dates = res.predict(dates[1000], dates[1020], dynamic=dynamic) assert_series_equal(pred_int, pred_dates) @@ -1057,7 +1078,7 @@ def test_autoreg_plot_err(): def test_autoreg_resids(): - idx = pd.date_range(dt.datetime(1900, 1, 1), periods=250, freq="M") + idx = pd.date_range(dt.datetime(1900, 1, 1), periods=250, freq=MONTH_END) rs = np.random.RandomState(0) idx_dates = sorted(rs.choice(idx, size=100, replace=False)) e = rs.standard_normal(250) @@ -1259,7 +1280,7 @@ def append_data(): x_oos = rs.standard_normal((10, 3)) y_oos = rs.standard_normal(10) index = pd.date_range( - "2020-1-1", periods=y.shape[0] + y_oos.shape[0], freq="M" + "2020-1-1", periods=y.shape[0] + y_oos.shape[0], freq=MONTH_END ) y = pd.Series(y, index=index[: y.shape[0]], name="y") x = pd.DataFrame( diff --git a/statsmodels/tsa/tests/test_arima_process.py b/statsmodels/tsa/tests/test_arima_process.py index 3dd8bfb5dd6..af1b3f90a3c 100644 --- a/statsmodels/tsa/tests/test_arima_process.py +++ b/statsmodels/tsa/tests/test_arima_process.py @@ -1,3 +1,5 @@ +from statsmodels.compat.pandas import QUARTER_END + import datetime as dt import numpy as np @@ -474,7 +476,7 @@ def test_from_estimation(d, seasonal): ar = [0.8] if not seasonal else [0.8, 0, 0, 0.2, -0.16] ma = [0.4] if not seasonal else [0.4, 0, 0, 0.2, -0.08] ap = ArmaProcess.from_coeffs(ar, ma, 500) - idx = pd.date_range(dt.datetime(1900, 1, 1), periods=500, freq="Q") + idx = pd.date_range(dt.datetime(1900, 1, 1), periods=500, freq=QUARTER_END) data = ap.generate_sample(500) if d == 1: data = np.cumsum(data) diff --git a/statsmodels/tsa/tests/test_deterministic.py b/statsmodels/tsa/tests/test_deterministic.py index d63e58ac047..2c31cd7e41a 100644 --- a/statsmodels/tsa/tests/test_deterministic.py +++ b/statsmodels/tsa/tests/test_deterministic.py @@ -1,4 +1,10 @@ -from statsmodels.compat.pandas import PD_LT_1_0_0, is_int_index +from statsmodels.compat.pandas import ( + MONTH_END, + PD_LT_1_0_0, + QUARTER_END, + YEAR_END, + is_int_index, +) from statsmodels.compat.pytest import pytest_warns from typing import Hashable, Tuple @@ -30,7 +36,7 @@ def time_index(request): def index(request): param = request.param if param in ("period", "datetime"): - idx = pd.date_range("2000-01-01", periods=137, freq="M") + idx = pd.date_range("2000-01-01", periods=137, freq=MONTH_END) if param == "period": idx = idx.to_period("M") elif param == "range": @@ -145,7 +151,7 @@ def test_fourier_smoke(index, forecast_index): @pytest.mark.smoke def test_calendar_time_trend_smoke(time_index, forecast_index): - ct = CalendarTimeTrend("A", order=2) + ct = CalendarTimeTrend(YEAR_END, order=2) ct.in_sample(time_index) steps = 83 if forecast_index is None else len(forecast_index) ct.out_of_sample(steps, time_index, forecast_index) @@ -159,7 +165,7 @@ def test_calendar_time_trend_smoke(time_index, forecast_index): @pytest.mark.smoke def test_calendar_fourier_smoke(time_index, forecast_index): - cf = CalendarFourier("A", 2) + cf = CalendarFourier(YEAR_END, 2) cf.in_sample(time_index) steps = 83 if forecast_index is None else len(forecast_index) cf.out_of_sample(steps, time_index, forecast_index) @@ -192,14 +198,14 @@ def test_calendar_seasonality(time_index, forecast_index, freq_period): def test_forbidden_index(): index = pd.RangeIndex(0, 10) - ct = CalendarTimeTrend("A", order=2) + ct = CalendarTimeTrend(YEAR_END, order=2) with pytest.raises(TypeError, match="CalendarTimeTrend terms can only"): ct.in_sample(index) def test_calendar_time_trend_base(time_index): - ct = CalendarTimeTrend("M", True, order=3, base_period="1960-1-1") - ct2 = CalendarTimeTrend("M", True, order=3) + ct = CalendarTimeTrend(MONTH_END, True, order=3, base_period="1960-1-1") + ct2 = CalendarTimeTrend(MONTH_END, True, order=3) assert ct != ct2 str(ct) str(ct2) @@ -209,14 +215,14 @@ def test_calendar_time_trend_base(time_index): def test_invalid_freq_period(time_index): with pytest.raises(ValueError, match="The combination of freq="): - CalendarSeasonality("H", "A") + CalendarSeasonality("h", YEAR_END) cs = CalendarSeasonality("B", "W") with pytest.raises(ValueError, match="freq is B but index contains"): cs.in_sample(pd.date_range("2000-1-1", periods=10, freq="D")) def test_check_index_type(): - ct = CalendarTimeTrend("A", True, order=3) + ct = CalendarTimeTrend(YEAR_END, True, order=3) idx = pd.RangeIndex(0, 20) with pytest.raises(TypeError, match="CalendarTimeTrend terms can only"): ct._check_index_type(idx, pd.DatetimeIndex) @@ -429,8 +435,8 @@ def test_calendar_time_trend(reset_randomstate): def test_calendar_seasonal_period_w(): period = "W" - index = pd.date_range("2000-01-03", freq="H", periods=600) - cs = CalendarSeasonality("H", period=period) + index = pd.date_range("2000-01-03", freq="h", periods=600) + cs = CalendarSeasonality("h", period=period) terms = cs.in_sample(index) assert np.all(terms.sum(1) == 1.0) for i in range(index.shape[0]): @@ -453,8 +459,8 @@ def test_calendar_seasonal_period_w(): def test_calendar_seasonal_period_d(): period = "D" - index = pd.date_range("2000-01-03", freq="H", periods=600) - cs = CalendarSeasonality("H", period=period) + index = pd.date_range("2000-01-03", freq="h", periods=600) + cs = CalendarSeasonality("h", period=period) terms = cs.in_sample(index) assert np.all(terms.sum(1) == 1.0) for i in range(index.shape[0]): @@ -463,8 +469,8 @@ def test_calendar_seasonal_period_d(): def test_calendar_seasonal_period_q(): period = "Q" - index = pd.date_range("2000-01-01", freq="M", periods=600) - cs = CalendarSeasonality("M", period=period) + index = pd.date_range("2000-01-01", freq=MONTH_END, periods=600) + cs = CalendarSeasonality(MONTH_END, period=period) terms = cs.in_sample(index) assert np.all(terms.sum(1) == 1.0) for i in range(index.shape[0]): @@ -472,15 +478,15 @@ def test_calendar_seasonal_period_q(): def test_calendar_seasonal_period_a(): - period = "A" - index = pd.date_range("2000-01-01", freq="M", periods=600) - cs = CalendarSeasonality("M", period=period) + period = "Y" + index = pd.date_range("2000-01-01", freq=MONTH_END, periods=600) + cs = CalendarSeasonality(MONTH_END, period=period) terms = cs.in_sample(index) assert np.all(terms.sum(1) == 1.0) for i in range(index.shape[0]): assert terms.iloc[i, i % 12] == 1.0 - cs = CalendarSeasonality("Q", period=period) + cs = CalendarSeasonality(QUARTER_END, period=period) terms = cs.in_sample(index) assert np.all(terms.sum(1) == 1.0) for i in range(index.shape[0]): @@ -530,7 +536,7 @@ def test_range_error(): def test_range_index_basic(): - idx = pd.date_range("2000-1-1", freq="M", periods=120) + idx = pd.date_range("2000-1-1", freq=MONTH_END, periods=120) dp = DeterministicProcess(idx, constant=True, order=1, seasonal=True) dp.range("2001-1-1", "2008-1-1") dp.range("2001-1-1", "2015-1-1") diff --git a/statsmodels/tsa/tests/test_exponential_smoothing.py b/statsmodels/tsa/tests/test_exponential_smoothing.py index af3030b4544..dad5b53696e 100644 --- a/statsmodels/tsa/tests/test_exponential_smoothing.py +++ b/statsmodels/tsa/tests/test_exponential_smoothing.py @@ -1,6 +1,7 @@ """ Author: Samuel Scherrer """ +from statsmodels.compat.pandas import QUARTER_END from statsmodels.compat.platform import PLATFORM_LINUX32, PLATFORM_WIN from itertools import product @@ -220,7 +221,7 @@ def austourists(): 61.09776802, 66.05576122, ] - index = pd.date_range("1999-01-01", "2015-12-31", freq="Q") + index = pd.date_range("1999-01-01", "2015-12-31", freq=QUARTER_END) return pd.Series(data, index) diff --git a/statsmodels/tsa/tests/test_seasonal.py b/statsmodels/tsa/tests/test_seasonal.py index 3fab8d7caa2..2a1f0b630a7 100644 --- a/statsmodels/tsa/tests/test_seasonal.py +++ b/statsmodels/tsa/tests/test_seasonal.py @@ -1,3 +1,5 @@ +from statsmodels.compat.pandas import MONTH_END, QUARTER_END, YEAR_END + import numpy as np from numpy.testing import ( assert_allclose, @@ -54,7 +56,7 @@ def setup_class(cls): 232, 429, 3, 98, 43, -141, -77, -13, 125, 361, -45, 184] cls.data = pd.DataFrame(data, pd.date_range(start='1/1/1951', periods=len(data), - freq='Q')) + freq=QUARTER_END)) def test_ndarray(self): res_add = seasonal_decompose(self.data.values, period=4) @@ -93,7 +95,7 @@ def test_pandas(self): res_add = seasonal_decompose(self.data, period=4) freq_override_data = self.data.copy() freq_override_data.index = pd.date_range( - start='1/1/1951', periods=len(freq_override_data), freq='Y') + start='1/1/1951', periods=len(freq_override_data), freq=YEAR_END) res_add_override = seasonal_decompose(freq_override_data, period=4) assert_almost_equal(res_add.seasonal.values.squeeze(), SEASONAL, 2) @@ -270,14 +272,14 @@ def test_raises(self): def test_seasonal_decompose_too_short(reset_randomstate): - dates = pd.date_range('2000-01-31', periods=4, freq='Q') + dates = pd.date_range('2000-01-31', periods=4, freq=QUARTER_END) y = np.sin(np.arange(4) / 4 * 2 * np.pi) y += np.random.standard_normal(y.size) y = pd.Series(y, name='y', index=dates) with pytest.raises(ValueError): seasonal_decompose(y) - dates = pd.date_range('2000-01-31', periods=12, freq='M') + dates = pd.date_range('2000-01-31', periods=12, freq=MONTH_END) y = np.sin(np.arange(12) / 12 * 2 * np.pi) y += np.random.standard_normal(y.size) y = pd.Series(y, name='y', index=dates) @@ -296,7 +298,7 @@ def test_seasonal_decompose_smoke(): data = pd.DataFrame(x, pd.date_range(start='1/1/1951', periods=len(x), - freq='Q')) + freq=QUARTER_END)) seasonal_decompose(data) diff --git a/statsmodels/tsa/tests/test_stattools.py b/statsmodels/tsa/tests/test_stattools.py index ba53aa8d7b2..abbc071810f 100644 --- a/statsmodels/tsa/tests/test_stattools.py +++ b/statsmodels/tsa/tests/test_stattools.py @@ -1,5 +1,5 @@ from statsmodels.compat.numpy import lstsq -from statsmodels.compat.pandas import assert_index_equal +from statsmodels.compat.pandas import MONTH_END, YEAR_END, assert_index_equal from statsmodels.compat.platform import PLATFORM_WIN from statsmodels.compat.python import lrange @@ -17,8 +17,8 @@ import pandas as pd from pandas import DataFrame, Series, date_range import pytest -from scipy.interpolate import interp1d from scipy import stats +from scipy.interpolate import interp1d from statsmodels.datasets import macrodata, modechoice, nile, randhie, sunspots from statsmodels.tools.sm_exceptions import ( @@ -38,8 +38,8 @@ adfuller, arma_order_select_ic, breakvar_heteroskedasticity_test, - ccovf, ccf, + ccovf, coint, grangercausalitytests, innovations_algo, @@ -1034,7 +1034,7 @@ def test_pandasacovf(): def test_acovf2d(reset_randomstate): dta = sunspots.load_pandas().data - dta.index = date_range(start="1700", end="2009", freq="Y")[:309] + dta.index = date_range(start="1700", end="2009", freq=YEAR_END)[:309] del dta["YEAR"] res = acovf(dta, fft=False) assert_equal(res, acovf(dta.values, fft=False)) @@ -1119,7 +1119,7 @@ def test_arma_order_select_ic(): assert_(res.bic.index.equals(bic.index)) assert_(res.bic.columns.equals(bic.columns)) - index = pd.date_range("2000-1-1", freq="M", periods=len(y)) + index = pd.date_range("2000-1-1", freq=MONTH_END, periods=len(y)) y_series = pd.Series(y, index=index) res_pd = arma_order_select_ic( y_series, max_ar=2, max_ma=1, ic=["aic", "bic"], trend="n" @@ -1533,7 +1533,7 @@ def test_acf_conservate_nanops(reset_randomstate): def test_pacf_nlags_error(reset_randomstate): - e = np.random.standard_normal(100) + e = np.random.standard_normal(99) with pytest.raises(ValueError, match="Can only compute partial"): pacf(e, 50) @@ -1584,3 +1584,27 @@ def test_granger_causality_exception_maxlag(gc_data): def test_granger_causality_verbose(gc_data): with pytest.warns(FutureWarning, match="verbose"): grangercausalitytests(gc_data, 3, verbose=True) + +@pytest.mark.parametrize("size",[3,5,7,9]) +def test_pacf_small_sample(size,reset_randomstate): + y = np.random.standard_normal(size) + a = pacf(y) + assert isinstance(a, np.ndarray) + a, b = pacf_burg(y) + assert isinstance(a, np.ndarray) + assert isinstance(b, np.ndarray) + a = pacf_ols(y) + assert isinstance(a, np.ndarray) + a = pacf_yw(y) + assert isinstance(a, np.ndarray) + + +def test_pacf_1_obs(reset_randomstate): + y = np.random.standard_normal(1) + with pytest.raises(ValueError): + pacf(y) + with pytest.raises(ValueError): + pacf_burg(y) + with pytest.raises(ValueError): + pacf_ols(y) + pacf_yw(y) diff --git a/statsmodels/tsa/tests/test_tsa_tools.py b/statsmodels/tsa/tests/test_tsa_tools.py index e4808496551..3d32e60d630 100644 --- a/statsmodels/tsa/tests/test_tsa_tools.py +++ b/statsmodels/tsa/tests/test_tsa_tools.py @@ -1,7 +1,13 @@ """tests for some time series analysis functions """ -from statsmodels.compat.pandas import assert_frame_equal, assert_series_equal +from statsmodels.compat.pandas import ( + PD_LT_2_2_0, + QUARTER_END, + YEAR_END, + assert_frame_equal, + assert_series_equal, +) import numpy as np from numpy.testing import ( @@ -11,6 +17,7 @@ assert_raises, ) import pandas as pd +from pandas.tseries.frequencies import to_offset import pytest from statsmodels import regression @@ -465,14 +472,28 @@ def test_duplicate_column_names(self): stattools.lagmat(df, maxlag=2, use_pandas=True) -def test_freq_to_period(): - from pandas.tseries.frequencies import to_offset - - freqs = ["A", "AS-MAR", "Q", "QS", "QS-APR", "W", "W-MON", "B", "D", "H"] - expected = [1, 1, 4, 4, 4, 52, 52, 5, 7, 24] - for i, j in zip(freqs, expected): - assert_equal(tools.freq_to_period(i), j) - assert_equal(tools.freq_to_period(to_offset(i)), j) +ANNUAL = "A" if PD_LT_2_2_0 else YEAR_END +freqs = [ + YEAR_END, + f"{ANNUAL}-MAR", + QUARTER_END, + "QS", + "QS-APR", + "W", + "W-MON", + "B", + "D", + "h", +] +expected = [1, 1, 4, 4, 4, 52, 52, 5, 7, 24] +freq_expected = [(f, e) for f, e in zip(freqs, expected)] + + +@pytest.mark.parametrize("freq_expected", freq_expected) +def test_freq_to_period(freq_expected): + freq, expected = freq_expected + assert_equal(tools.freq_to_period(freq), expected) + assert_equal(tools.freq_to_period(to_offset(freq)), expected) class TestDetrend: diff --git a/statsmodels/tsa/tests/test_x13.py b/statsmodels/tsa/tests/test_x13.py index b6de96ad53a..357b7881c88 100644 --- a/statsmodels/tsa/tests/test_x13.py +++ b/statsmodels/tsa/tests/test_x13.py @@ -1,3 +1,5 @@ +from statsmodels.compat.pandas import MONTH_END + import pandas as pd import pytest @@ -20,7 +22,7 @@ dta = co2.load_pandas().data dta['co2'] = dta.co2.interpolate() -monthly_data = dta.resample('M') +monthly_data = dta.resample(MONTH_END) # change in pandas 0.18 resample is deferred object if not isinstance(monthly_data, (pd.DataFrame, pd.Series)): monthly_data = monthly_data.mean() diff --git a/statsmodels/tsa/tsatools.py b/statsmodels/tsa/tsatools.py index 5e3e9a293ab..048b56a2e05 100644 --- a/statsmodels/tsa/tsatools.py +++ b/statsmodels/tsa/tsatools.py @@ -1,6 +1,6 @@ from __future__ import annotations -from statsmodels.compat.python import lrange, Literal +from statsmodels.compat.python import Literal, lrange import warnings @@ -804,7 +804,8 @@ def freq_to_period(freq: str | offsets.DateOffset) -> int: assert isinstance(freq, offsets.DateOffset) freq = freq.rule_code.upper() - if freq in ("A", "Y") or freq.startswith(("A-", "AS-", "Y-", "YS-")): + yearly_freqs = ("A-", "AS-", "Y-", "YS-", "YE-") + if freq in ("A", "Y") or freq.startswith(yearly_freqs): return 1 elif freq == "Q" or freq.startswith(("Q-", "QS", "QE")): return 4 diff --git a/statsmodels/tsa/vector_ar/tests/test_var.py b/statsmodels/tsa/vector_ar/tests/test_var.py index 6fc0b39151c..aba11565a4e 100644 --- a/statsmodels/tsa/vector_ar/tests/test_var.py +++ b/statsmodels/tsa/vector_ar/tests/test_var.py @@ -2,7 +2,7 @@ """ Test VAR Model """ -from statsmodels.compat.pandas import assert_index_equal +from statsmodels.compat.pandas import QUARTER_END, assert_index_equal from statsmodels.compat.python import lrange from io import BytesIO, StringIO @@ -600,7 +600,7 @@ def setup_class(cls): adj_data = np.diff(np.log(data), axis=0) # est = VAR(adj_data, p=2, dates=dates[1:], names=names) - cls.model = VAR(adj_data[:-16], dates=dates[1:-16], freq="BQ-MAR") + cls.model = VAR(adj_data[:-16], dates=dates[1:-16], freq=f"B{QUARTER_END}-MAR") cls.res = cls.model.fit(maxlags=cls.p) cls.irf = cls.res.irf(10) cls.lut = E1_Results() diff --git a/tools/ci/azure/azure_template_posix.yml b/tools/ci/azure/azure_template_posix.yml index 5c383dfa756..73a37c7d61c 100644 --- a/tools/ci/azure/azure_template_posix.yml +++ b/tools/ci/azure/azure_template_posix.yml @@ -17,9 +17,14 @@ jobs: strategy: matrix: ${{ if eq(parameters.name, 'Linux') }}: + python_312_latest: + python.version: '3.12' + coverage: true python_311_latest: python.version: '3.11' lint: true + CYTHON: 3.0.6 + coverage: true python_310_latest_install: python.version: '3.10' lint: true @@ -35,6 +40,8 @@ jobs: SCIPY: 1.5.4 PANDAS: 1.2.5 NUMPY: 1.20.3 + CYTHON: 0.29.33 + coverage: true lint: true python38: python.version: '3.8' @@ -45,6 +52,7 @@ jobs: MATPLOTLIB: 3.2.2 coverage: true PYTEST_OPTIONS: '' + CYTHON: 0.29.36 lint: true python_38_legacy_blas: python.version: '3.8' @@ -70,8 +78,12 @@ jobs: pip.pre: true SM_TEST_COPY_ON_WRITE: 1 ${{ if eq(parameters.name, 'macOS') }}: + python310_macos_latest: + python.version: '3.10' python311_macos_latest: python.version: '3.11' + python312_macos_latest: + python.version: '3.12' maxParallel: 10 steps: @@ -109,7 +121,7 @@ jobs: condition: eq(variables['test.install'], 'true') - script: | - python -m pip install -e . -v --no-build-isolation + python -m pip install -e . -vv --no-build-isolation displayName: 'Install statsmodels (editable)' condition: ne(variables['test.install'], 'true') @@ -128,6 +140,9 @@ jobs: mkdir test_run_dir pushd test_run_dir python -c "import statsmodels; statsmodels.test(['-n', 'auto', '--junitxml=../junit/test-results.xml', '--skip-examples', '--dist', 'loadscope', '--durations=25'], exit=True)" + LAST_EXIT_CODE=$? + echo "Status code: ${LAST_EXIT_CODE}" + exit ${LAST_EXIT_CODE} popd displayName: 'Run tests (site-packages)' condition: eq(variables['test.install'], 'true') diff --git a/tools/ci/azure/azure_template_windows.yml b/tools/ci/azure/azure_template_windows.yml index 9062f03c3cf..d34132a9c56 100644 --- a/tools/ci/azure/azure_template_windows.yml +++ b/tools/ci/azure/azure_template_windows.yml @@ -28,6 +28,9 @@ jobs: python311_win_latest: python.version: '3.11' python.architecture: 'x64' + python312_win_latest: + python.version: '3.12' + python.architecture: 'x64' maxParallel: 10 steps: diff --git a/tools/ci/azure/install-posix.sh b/tools/ci/azure/install-posix.sh index da701ba44c6..e5ac78afa66 100644 --- a/tools/ci/azure/install-posix.sh +++ b/tools/ci/azure/install-posix.sh @@ -14,20 +14,27 @@ else CMD="python -m pip install numpy" fi +echo "Python location: $(where python)" python -m pip install --upgrade pip setuptools wheel build -python -m pip install "cython>=0.29.28,<3.0.0" "pytest~=7.0.1" pytest-xdist coverage pytest-cov ipython jupyter notebook nbconvert "property_cached>=1.6.3" black==20.8b1 isort flake8 nbconvert==5.6.1 coveralls setuptools_scm[toml]~=7.0.0 +python -m pip install -r requirements-dev.txt +python -m pip uninstall numpy scipy pandas cython -y if [[ -n ${NUMPY} ]]; then CMD="$CMD==${NUMPY}"; fi; CMD="$CMD scipy" if [[ -n ${SCIPY} ]]; then CMD="$CMD==${SCIPY}"; fi; CMD="$CMD pandas" if [[ -n ${PANDAS} ]]; then CMD="$CMD==${PANDAS}"; fi; +CMD="$CMD cython" +if [[ -n ${CYTHON} ]]; then CMD="$CMD==${CYTHON}"; fi; if [[ ${USE_MATPLOTLIB} == true ]]; then CMD="$CMD matplotlib" if [[ -n ${MATPLOTLIB} ]]; then CMD="$CMD==${MATPLOTLIB}"; fi +else + # Uninstall if not needed + python -m pip uninstall matplotlib -y || true fi CMD="${CMD} patsy ${BLAS}" diff --git a/tox.ini b/tox.ini deleted file mode 100644 index 72bbd0cdecd..00000000000 --- a/tox.ini +++ /dev/null @@ -1,67 +0,0 @@ -############################################################################## -# Intended to simplify local generation of coverage of Cython files -# -# tox -e cython-coverage -# -# will produce a HTML report showing line-by-line coverage -# -# can be used to run standard tests on Python 2.7, 3.5, 3.6, and 3.7 or a -# run with coverage but without coverage of Cython files using -# -# tox -e ENV -# -# where env is one of the values in envlist or all environments (slow) -# -# tox -# -############################################################################## -[tox] -envlist = - py38 - py39 - py310 - coverage - cython-coverage - -[testenv] -deps = - -rrequirements.txt - cython - matplotlib - joblib - pytest>=6 - pytest-xdist -setenv = - MKL_NUM_THREADS=1 - NUMEXPR_NUM_THREADS=1 - OMP_NUM_THREADS=1 - OPENBLAS_NUM_THREADS=1 -changedir = {homedir} -commands = - python -c "import statsmodels;statsmodels.test(['-n 2'], exit=True)" - -[testenv:coverage] -usedevelop = True -deps = - {[testenv]deps} - pytest-cov -setenv = - {[testenv]setenv} - SM_CYTHON_COVERAGE=false -changedir = {toxinidir} -commands = - coverage erase - pytest -n 2 --cov-report html --cov=statsmodels statsmodels - -[testenv:cython-coverage] -usedevelop = True -deps = - {[testenv]deps} - pytest-cov -setenv = - {[testenv]setenv} - SM_CYTHON_COVERAGE=true -changedir = {toxinidir} -commands = - coverage erase - pytest -n 2 --cov-report html --cov=statsmodels statsmodels