/
iterated_regression.py
129 lines (118 loc) · 5.27 KB
/
iterated_regression.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
# coding=utf-8
# Copyright 2020 The Google Research Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Regularized iterated regression for estimating AR parameters in ARMA models."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
from statsmodels.regression.linear_model import OLS
from statsmodels.tools.tools import add_constant
from statsmodels.tsa.tsatools import lagmat
def fit_arparams_iter(outputs, inputs, p, q, r, l2_reg=0.0):
  """Iterative regression for estimating AR params in ARMAX(p, q, r) model.

  The iterative AR regression process provides consistent estimates for the
  AR parameters of an ARMAX(p, q, r) model after q iterative steps. It first
  fits an ARMAX(p, 0, r) model with least squares regression, then
  ARMAX(p, 1, r), and so on, until ARMAX(p, q, r). At the i-th step, it fits
  an ARMAX(p, i, r) model, according to estimated error terms from the
  previous step.

  For a description of the iterative regression method, see Section 2 of
  `Consistent Estimates of Autoregressive Parameters and Extended Sample
  Autocorrelation Function for Stationary and Nonstationary ARMA Models` at
  https://www.jstor.org/stable/2288340.

  The implementation here is a generalization of the method mentioned in the
  paper. We adapt the method for multidimensional outputs, exogenous inputs,
  nan handling, and also add regularization on the MA parameters.

  Args:
    outputs: Array with the output values from the LDS, nans allowed.
    inputs: Array with exogenous inputs values, nans allowed. Could be None.
    p: AR order, i.e. max lag of the autoregressive part.
    q: MA order, i.e. max lag of the error terms.
    r: Max lag of the exogenous inputs.
    l2_reg: L2 regularization coefficient, to be applied on MA coefficients.

  Returns:
    Fitted AR coefficients.

  Raises:
    ValueError: If the sequence is too short for fitting, or if the fitted
      parameter vector has an unexpected length.
  """
  if outputs.shape[1] > 1:
    # If there are multiple output dimensions, fit autoregressive params on
    # each dimension separately and average.
    # NOTE: `xrange` (Python 2 only) replaced with `range` for Python 3.
    params_list = [
        fit_arparams_iter(outputs[:, j:j + 1], inputs, p, q, r, l2_reg=l2_reg)
        for j in range(outputs.shape[1])
    ]
    return np.mean(
        np.concatenate([a.reshape(1, -1) for a in params_list]), axis=0)
  # We include a constant term in regression.
  k_const = 1
  # Input dim. If inputs is None, then in_dim = 0.
  in_dim = 0
  if inputs is not None:
    in_dim = inputs.shape[1]
    # Lag the inputs to obtain [?, r], column j means series x_{t-j}.
    # Use trim to drop rows with unknown values both at beginning and end.
    lagged_in = np.concatenate(
        [lagmat(inputs[:, i], maxlag=r, trim='both') for i in range(in_dim)],
        axis=1)
    # Since we trim in beginning, the offset is r.
    lagged_in_offset = r
  # Lag the series itself to p-th order.
  lagged_out = lagmat(outputs, maxlag=p, trim='both')
  lagged_out_offset = p
  y = outputs
  y_offset = 0
  # Estimated residuals, initialized to 0.
  res = np.zeros_like(outputs)
  for i in range(q + 1):
    # Lag the residuals to i-th order in i-th iteration. The residuals start
    # at the previous iteration's y_offset, hence the extra offset below.
    lagged_res = lagmat(res, maxlag=i, trim='both')
    lagged_res_offset = y_offset + i
    # Compute offset in regression, since lagged_in, lagged_out, and
    # lagged_res have different offsets. Align them.
    if inputs is None:
      y_offset = max(lagged_out_offset, lagged_res_offset)
    else:
      y_offset = max(lagged_out_offset, lagged_res_offset, lagged_in_offset)
    y = outputs[y_offset:, :]
    # Concatenate all variables in regression.
    x = np.concatenate([
        lagged_out[y_offset - lagged_out_offset:, :],
        lagged_res[y_offset - lagged_res_offset:, :]
    ],
                       axis=1)
    if inputs is not None:
      x = np.concatenate([lagged_in[y_offset - lagged_in_offset:, :], x],
                         axis=1)
    # Add constant term as the first variable.
    x = add_constant(x, prepend=True)
    if x.shape[1] < k_const + in_dim * r + p + i:
      raise ValueError('Insufficient sequence length for model fitting.')
    # Drop rows with nans.
    arr = np.concatenate([y, x], axis=1)
    arr = arr[~np.isnan(arr).any(axis=1)]
    y_dropped_na = arr[:, 0:1]
    x_dropped_na = arr[:, 1:]
    # Only regularize the MA part: zeros for const/input/AR coefficients,
    # l2_reg for the i residual-lag coefficients.
    alpha = np.concatenate(
        [np.zeros(k_const + in_dim * r + p), l2_reg * np.ones(i)], axis=0)
    # When L1_wt = 0, fit_regularized performs ridge regression.
    olsfit = OLS(y_dropped_na, x_dropped_na).fit_regularized(
        alpha=alpha, L1_wt=0.0)
    # Update estimated residuals for the next iteration (nans propagate
    # through matmul and are dropped again above).
    res = y - np.matmul(x, olsfit.params.reshape(-1, 1))
  # After the final (i == q) iteration, params are ordered as
  # [const, inputs (in_dim * r), AR (p), MA (q)].
  if len(olsfit.params) != k_const + in_dim * r + p + q:
    raise ValueError('Expected param len %d, got %d.' %
                     (k_const + in_dim * r + p + q, len(olsfit.params)))
  if q == 0:
    return olsfit.params[-p:]
  return olsfit.params[-(p + q):-q]