ENH: vectorize loo function for local_constant estimator #9199

Open · wants to merge 3 commits into base: main
63 changes: 62 additions & 1 deletion statsmodels/nonparametric/_kernel_base.py
@@ -5,7 +5,7 @@
import copy

import numpy as np
from scipy import optimize
from scipy import optimize, spatial
from scipy.stats.mstats import mquantiles

try:
@@ -516,3 +516,64 @@ def gpke(bw, data, data_predict, var_type, ckertype='gaussian',
        return dens.sum(axis=0)
    else:
        return dens


def pairwise_product_kernel(bw, data, var_type, ckertype='gaussian',
                            okertype='wangryzin', ukertype='aitchisonaitken'):
    r"""
    Returns the pairwise product kernel for each pair of data points in
    `data` as a 2-D array. Summing row-wise or column-wise gives the value
    of `gpke` evaluated at each data point in `data`.

    Parameters
    ----------
    bw : 1-D ndarray, shape (k_vars,)
        Bandwidth parameters for the kernel functions.
    data : 2-D ndarray, shape (nobs, k_vars)
        Points at which the kernel function is evaluated.
    var_type : str
        The type of the variables, one character per variable:

        - c: continuous
        - u: unordered (discrete)
        - o: ordered (discrete)

    ckertype : str, optional
        The kernel used for the continuous variables. The default is
        'gaussian'.
    okertype : str, optional
        The kernel used for the ordered discrete variables. The default is
        'wangryzin'.
    ukertype : str, optional
        The kernel used for the unordered discrete variables. The default is
        'aitchisonaitken'.

    Returns
    -------
    K : 2-D ndarray, shape (nobs, nobs)
        A 2-D array where the (i, j) entry is K(x_i, x_j), with K the
        product kernel function.

    """
    nobs, k_vars = np.shape(data)
    # Full vectorization is possible only for the continuous Gaussian kernel.
    if (ckertype == 'gaussian') and (set(var_type) == set('c')):
        # With VI = diag(1 / bw**2), the squared Mahalanobis distance equals
        # sum_s ((x_is - x_js) / bw_s)**2, so the product Gaussian kernel for
        # all pairs follows directly from the pairwise distance matrix.
        VI = np.diag(1 / bw) ** 2
        D = spatial.distance.squareform(
            spatial.distance.pdist(data, metric='mahalanobis', VI=VI)
        ) ** 2
        K = np.exp(-0.5 * D) * (1. / np.sqrt(2 * np.pi)) ** k_vars
        K = K / np.prod(bw)
    else:
        # Fall back to one gpke call per observation for mixed or
        # non-Gaussian kernels.
        K = np.empty((nobs, nobs))
        for ii in range(nobs):
            ker_ii = gpke(bw, data=data, data_predict=data[ii, :],
                          var_type=var_type,
                          ckertype=ckertype,
                          ukertype=ukertype,
                          okertype=okertype,
                          tosum=False)

            K[:, ii] = ker_ii

    return K
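A quick way to sanity-check the new helper (a minimal sketch, assuming this branch is installed; the random data below is made up for illustration): since column ii of the returned matrix holds the per-observation kernel values that `gpke` would sum for observation ii, the column sums should reproduce `gpke` evaluated at each data point.

import numpy as np
from statsmodels.nonparametric._kernel_base import gpke, pairwise_product_kernel

rng = np.random.default_rng(0)
data = rng.standard_normal((50, 2))   # 50 observations, 2 continuous variables
bw = np.array([0.5, 0.8])

K = pairwise_product_kernel(bw, data, var_type='cc')

# Column sums of K should match gpke evaluated at each observation.
col_sums = K.sum(axis=0)
expected = np.array([gpke(bw, data=data, data_predict=data[i, :], var_type='cc')
                     for i in range(data.shape[0])])
assert np.allclose(col_sums, expected)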
38 changes: 36 additions & 2 deletions statsmodels/nonparametric/kernel_regression.py
@@ -36,7 +36,8 @@
from scipy.stats.mstats import mquantiles

from ._kernel_base import GenericKDE, EstimatorSettings, gpke, \
    LeaveOneOut, _get_type_pos, _adjust_shape, _compute_min_std_IQR, kernel_func
    LeaveOneOut, _get_type_pos, _adjust_shape, _compute_min_std_IQR, \
    kernel_func, pairwise_product_kernel


__all__ = ['KernelReg', 'KernelCensoredReg']
@@ -131,7 +132,8 @@ def _compute_reg_bw(self, bw):
        self._bw_method = bw
        # Workaround to avoid instance methods in __dict__
        if bw == 'cv_ls':
            res = self.cv_loo
            res = (
                self.cv_loo_fast if self.reg_type == 'lc' else self.cv_loo)
        else:  # bw == 'aic'
            res = self.aic_hurvich
        X = np.std(self.exog, axis=0)
@@ -336,6 +338,38 @@ def cv_loo(self, bw, func):
        # Note: There might be a way to vectorize this. See p.72 in [1]
        return L / self.nobs

    def cv_loo_fast(self, bw, func=None):
        r"""
        The cross-validation function with leave-one-out estimator.
        Vectorized version of `cv_loo` for the local constant estimator.

        Parameters
        ----------
        bw : array_like
            Vector of bandwidth values.
        func : callable, optional
            Unused; accepted only for signature compatibility with `cv_loo`.

        Returns
        -------
        L : float
            The value of the CV function.

        """
        endog = self.endog
        exog = self.exog
        K = pairwise_product_kernel(bw, data=exog,
                                    var_type=self.var_type,
                                    ckertype=self.ckertype,
                                    ukertype=self.ukertype,
                                    okertype=self.okertype)
        # Zeroing the diagonal drops the i == j term, turning the
        # Nadaraya-Watson weights into leave-one-out weights:
        #   g_{-i}(X_i) = sum_{j != i} K_h(X_i, X_j) Y_j
        #                 / sum_{j != i} K_h(X_i, X_j)
        Kzerodiag = K - np.diag(np.diag(K))
        k = np.sum(Kzerodiag, axis=1)
        Y = endog.flatten()
        G = np.dot(Kzerodiag, Y) / k
        # CV(h) = n^{-1} * sum_i (Y_i - g_{-i}(X_i))**2
        L = np.mean((G - Y) ** 2)
        return L

    def r_squared(self):
        r"""
        Returns the R-Squared for the nonparametric regression.
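With this change, least-squares cross-validation for a local constant fit (`reg_type='lc'`, `bw='cv_ls'`) goes through `cv_loo_fast` instead of the per-observation `cv_loo` loop. A minimal usage sketch (the synthetic data is made up for illustration, and `model.est`/`cv_loo` are internals used here only to show that the two criteria agree):

import numpy as np
from statsmodels.nonparametric.kernel_regression import KernelReg

rng = np.random.default_rng(123)
x = rng.uniform(0, 10, size=200)
y = np.sin(x) + 0.3 * rng.standard_normal(200)

# Bandwidth selection with reg_type='lc' now uses the vectorized LOO CV.
model = KernelReg(endog=y, exog=x, var_type='c', reg_type='lc', bw='cv_ls')
print(model.bw)            # selected bandwidth
mean, mfx = model.fit()    # fitted conditional mean and marginal effects

# The vectorized criterion should agree with the original loop-based cv_loo.
bw = np.asarray(model.bw)
assert np.allclose(model.cv_loo(bw, model.est['lc']), model.cv_loo_fast(bw))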