Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: Add CFA to factor analysis #9114

Open
wants to merge 6 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
4 changes: 2 additions & 2 deletions statsmodels/multivariate/api.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
__all__ = [
"PCA", "MANOVA", "Factor", "FactorResults", "CanCorr",
"PCA", "MANOVA", "Factor", "FactorResults", "CFABuilder", "CanCorr",
"factor_rotation"
]

from .pca import PCA
from .manova import MANOVA
from .factor import Factor, FactorResults
from .factor import Factor, FactorResults, CFABuilder
from .cancorr import CanCorr
from . import factor_rotation
190 changes: 166 additions & 24 deletions statsmodels/multivariate/factor.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@
Variables in columns, observations in rows. May be `None` if
`corr` is not `None`.
n_factor : int
The number of factors to extract
The number of factors to extract, not used if 'cfa' is provided.
corr : array_like
Directly specify the correlation matrix instead of estimating
it from `endog`. If provided, `endog` is not used for the
Expand All @@ -79,7 +79,13 @@
Missing value handling for endog, default is row-wise deletion 'drop'
If 'none', no nan checking is done. If 'drop', any observations with
nans are dropped. If 'raise', an error is raised.

cfa : None or array_like
Used to fit a confirmatory factor analysis model. If provided, 'cfa' is a
'(n_factor * k_endog) x q' array such that a q-dimensional vector 'u' determines
the loadings via the construction 'load = reshape(cfa * u)', with 'reshape'
converting the 'n_factor * k_endog' dimensional vector to a 'k_endog x n_factor'
matrix, reshaped column-wise. Use CFABuilder to construct cfa in a common
use-case.

Notes
-----
Expand All @@ -95,6 +101,13 @@
where L are the loadings and U are the uniquenesses. In addition,
L' * diag(U)^{-1} L must be diagonal.

If 'cfa' is provided, elements of the factor loading matrix can be
set to zero, or constrained to equality. For example, setting
'cfa = [1 0 0 0; 0 1 0 0; 0 0 0 0; 0 0 0 0; 0 0 0 0; 0 0 0 0;
0 0 1 0; 0 0 0 1]' defines a 2-factor model for 4-dimensional data
in which the first factor has structure [w x 0 0] and the second factor
has structure [0 0 y z].

References
----------
.. [*] Hofacker, C. (2004). Exploratory Factor Analysis, Mathematical
Expand All @@ -103,7 +116,8 @@
dimension. Annals of Statistics. https://arxiv.org/pdf/1205.6617.pdf
"""
def __init__(self, endog=None, n_factor=1, corr=None, method='pa',
smc=True, endog_names=None, nobs=None, missing='drop'):
smc=True, endog_names=None, nobs=None, missing='drop',
cfa=None):

_check_args_1(endog, n_factor, corr, nobs)

Expand Down Expand Up @@ -133,6 +147,14 @@
self.corr = corr
self.k_endog = k_endog

if cfa is not None and method == "ml":
self.n_factor = cfa.shape[0] // self.k_endog

Check warning on line 151 in statsmodels/multivariate/factor.py

View check run for this annotation

Codecov / codecov/patch

statsmodels/multivariate/factor.py#L151

Added line #L151 was not covered by tests
if cfa.shape[0] != self.n_factor * self.k_endog:
raise ValueError("Leading dimension of 'cfa' must be divisible by number of variables")
self.cfa = cfa

Check warning on line 154 in statsmodels/multivariate/factor.py

View check run for this annotation

Codecov / codecov/patch

statsmodels/multivariate/factor.py#L153-L154

Added lines #L153 - L154 were not covered by tests
elif cfa is not None:
raise ValueError("A cfa can only be used when fitting with method 'ml'")

Check warning on line 156 in statsmodels/multivariate/factor.py

View check run for this annotation

Codecov / codecov/patch

statsmodels/multivariate/factor.py#L156

Added line #L156 was not covered by tests

if endog_names is None:
if hasattr(corr, 'index'):
endog_names = corr.index
Expand Down Expand Up @@ -280,13 +302,22 @@
# roots of the uniquenesses. The remaining elements are the
# factor loadings, packed one factor at a time.
def _unpack(self, par):
return (par[0:self.k_endog]**2,
np.reshape(par[self.k_endog:], (-1, self.k_endog)).T)
if hasattr(self, "cfa"):
uq = par[0:self.k_endog]**2
ld = np.dot(self.cfa, par[self.k_endog:])
ld = np.reshape(ld, (-1, self.k_endog)).T
return (uq, ld)

Check warning on line 309 in statsmodels/multivariate/factor.py

View check run for this annotation

Codecov / codecov/patch

statsmodels/multivariate/factor.py#L306-L309

Added lines #L306 - L309 were not covered by tests
else:
return (par[0:self.k_endog]**2,
np.reshape(par[self.k_endog:], (-1, self.k_endog)).T)

# Packs the model parameters into a flat parameter, used for ML
# estimation.
def _pack(self, load, uniq):
return np.concatenate((np.sqrt(uniq), load.T.flat))
if hasattr(self, "cfa"):
return np.concatenate((np.sqrt(uniq), load))

Check warning on line 318 in statsmodels/multivariate/factor.py

View check run for this annotation

Codecov / codecov/patch

statsmodels/multivariate/factor.py#L318

Added line #L318 was not covered by tests
else:
return np.concatenate((np.sqrt(uniq), load.T.flat))

def loglike(self, par):
"""
Expand All @@ -310,6 +341,9 @@
uniq, load = self._unpack(par)
else:
load, uniq = par[0], par[1]
if hasattr(self, "cfa"):
load = np.dot(self.cfa, load)
load = np.reshape(load, (-1, self.k_endog)).T

Check warning on line 346 in statsmodels/multivariate/factor.py

View check run for this annotation

Codecov / codecov/patch

statsmodels/multivariate/factor.py#L345-L346

Added lines #L345 - L346 were not covered by tests

loadu = load / uniq[:, None]
lul = np.dot(load.T, loadu)
Expand Down Expand Up @@ -354,6 +388,9 @@
uniq, load = self._unpack(par)
else:
load, uniq = par[0], par[1]
if hasattr(self, "cfa"):
load = np.dot(self.cfa, load)
load = np.reshape(load, (-1, self.k_endog)).T

Check warning on line 393 in statsmodels/multivariate/factor.py

View check run for this annotation

Codecov / codecov/patch

statsmodels/multivariate/factor.py#L392-L393

Added lines #L392 - L393 were not covered by tests

# Center term of SMW
loadu = load / uniq[:, None]
Expand Down Expand Up @@ -381,33 +418,48 @@
dl += 2*luz
dl -= 2*np.dot(lud, luz)

# Cannot use _pack because we are working with the square root
# uniquenesses directly.
return -np.concatenate((du, dl.T.flat)) / (2*self.k_endog)
if hasattr(self, "cfa"):
# Apply the chain rule to get the derivative with respect to the parameters.
dp = np.dot(self.cfa.T, dl.T.flat)
return -np.concatenate((du, dp)) / (2*self.k_endog)

Check warning on line 424 in statsmodels/multivariate/factor.py

View check run for this annotation

Codecov / codecov/patch

statsmodels/multivariate/factor.py#L423-L424

Added lines #L423 - L424 were not covered by tests
else:
# Cannot use _pack because we are working with the square root
# uniquenesses directly.
return -np.concatenate((du, dl.T.flat)) / (2*self.k_endog)

# Maximum likelihood factor analysis.
def _fit_ml(self, start, em_iter, opt_method, opt):
"""estimate Factor model using Maximum Likelihood
"""

def nloglike(par):
return -self.loglike(par)

def nscore(par):
return -self.score(par)

# Starting values
if start is None:
load, uniq = self._fit_ml_em(em_iter)
start = self._pack(load, uniq)
if hasattr(self, "cfa"):
# Use gradient-free optimization for starting values when a cfa is present.
d = self.k_endog + self.cfa.shape[1]
start = np.zeros(d)
start[0:self.k_endog] = 1
r = minimize(nloglike, start, method="powell", options={"maxiter": 10})
start = r.x

Check warning on line 449 in statsmodels/multivariate/factor.py

View check run for this annotation

Codecov / codecov/patch

statsmodels/multivariate/factor.py#L445-L449

Added lines #L445 - L449 were not covered by tests
else:
# Use EM optimization for starting values when no cfa is present.
load, uniq = self._fit_ml_em(em_iter)
start = self._pack(load, uniq)
elif len(start) == 2:
# Starting values were provided
if len(start[1]) != start[0].shape[0]:
msg = "Starting values have incompatible dimensions"
raise ValueError(msg)
start = self._pack(start[0], start[1])
else:
raise ValueError("Invalid starting values")

def nloglike(par):
return -self.loglike(par)

def nscore(par):
return -self.score(par)

# Do the optimization
if opt is None:
opt = _opt_defaults
Expand All @@ -422,7 +474,8 @@
warnings.warn("Some uniquenesses are nearly zero")

# Rotate solution to satisfy IC3 of Bai and Li
load = self._rotate(load, uniq)
if not hasattr(self, "cfa"):
load = self._rotate(load, uniq)

self.uniqueness = uniq
self.communality = 1 - uniq
Expand Down Expand Up @@ -484,6 +537,10 @@
cm = np.dot(load.T, load / uniq[:, None]) / nobs
_, f = np.linalg.eig(cm)
load = np.dot(load, f)

for j in range(load.shape[1]):
if (load[:, j] < 0).mean() > 0.5:
load[:, j] *= -1
return load


Expand Down Expand Up @@ -987,7 +1044,7 @@
Notes
-----
If excess kurtosis is known, provide as `kurt`. Standard
errors are only available if the model was fit using maximum
errors are only available for EFA models fit using maximum
likelihood. If `endog` is not provided, `nobs` must be
provided to obtain standard errors.

Expand All @@ -998,8 +1055,8 @@
unrotated maximum likelihood solution.
"""

if self.fa_method.lower() != "ml":
msg = "Standard errors only available under ML estimation"
if self.fa_method.lower() != "ml" and not hasattr(self, "cfa"):
msg = "Standard errors are only available under ML estimation for EFA (cfa=None)"

Check warning on line 1059 in statsmodels/multivariate/factor.py

View check run for this annotation

Codecov / codecov/patch

statsmodels/multivariate/factor.py#L1059

Added line #L1059 was not covered by tests
raise ValueError(msg)

if self.nobs is None:
Expand All @@ -1014,7 +1071,7 @@
"""
The standard errors of the loadings.

Standard errors are only available if the model was fit using
Standard errors are only available for EFA models fit using
maximum likelihood. If `endog` is not provided, `nobs` must be
provided to obtain standard errors.

Expand All @@ -1025,8 +1082,8 @@
unrotated maximum likelihood solution.
"""

if self.fa_method.lower() != "ml":
msg = "Standard errors only available under ML estimation"
if self.fa_method.lower() != "ml" and not hasattr(self, "cfa"):
msg = "Standard errors are only available under ML estimation for EFA (cfa=None)"

Check warning on line 1086 in statsmodels/multivariate/factor.py

View check run for this annotation

Codecov / codecov/patch

statsmodels/multivariate/factor.py#L1086

Added line #L1086 was not covered by tests
raise ValueError(msg)

if self.nobs is None:
Expand All @@ -1035,3 +1092,88 @@

v = np.outer(self.uniqueness, np.ones(self.loadings.shape[1]))
return np.sqrt(v / self.nobs)

@cache_readonly
def srmr(self):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

shouldn't there be another s in there: srmsr

"""
The standardized root mean square residual (SRMR).

Returns the SRMR and the variable-wise means of the SRMR values.
"""
Fit = self.fitted_cov
Obs = self.model.corr
d = np.diag(Obs)
R = (Obs - Fit)**2 / np.outer(d, d)
ii, jj = np.tril_indices(Fit.shape[0])
return np.sqrt(np.mean(R[ii, jj])),np.sqrt(R.mean(0))

Check warning on line 1108 in statsmodels/multivariate/factor.py

View check run for this annotation

Codecov / codecov/patch

statsmodels/multivariate/factor.py#L1103-L1108

Added lines #L1103 - L1108 were not covered by tests

class CFABuilder:
"""
Specify a confirmatory factor analysis (CFA) model.
"""

@staticmethod
def _vpos(X, va):
# Convert variable names to positions if needed
if type(X) is pd.DataFrame:
vm = {v:k for (k,v) in enumerate(X.columns)}
va_ix = []

Check warning on line 1120 in statsmodels/multivariate/factor.py

View check run for this annotation

Codecov / codecov/patch

statsmodels/multivariate/factor.py#L1120

Added line #L1120 was not covered by tests
for v in va:
va_ix.append([vm[k] for k in v])
else:
va_ix = va
return va_ix

Check warning on line 1125 in statsmodels/multivariate/factor.py

View check run for this annotation

Codecov / codecov/patch

statsmodels/multivariate/factor.py#L1124-L1125

Added lines #L1124 - L1125 were not covered by tests


@staticmethod
def cfa(X, fmap):
"""
Specify a CFA by providing a dictionary that maps variable names or
positions to the list of factors (numbered 0, 1, ...) on which the
variable loads.
"""

k_endog = X.shape[1]

Check warning on line 1136 in statsmodels/multivariate/factor.py

View check run for this annotation

Codecov / codecov/patch

statsmodels/multivariate/factor.py#L1136

Added line #L1136 was not covered by tests
n_factor = max([max(x) for x in fmap.values()]) + 1
q = sum([len(v) for v in fmap.values()])
p = k_endog * n_factor
cfa = np.zeros((p, q))

Check warning on line 1140 in statsmodels/multivariate/factor.py

View check run for this annotation

Codecov / codecov/patch

statsmodels/multivariate/factor.py#L1139-L1140

Added lines #L1139 - L1140 were not covered by tests

if type(X) is pd.DataFrame:
vm = {v: i for i,v in enumerate(X.columns)}
else:
vm = {i: i for i in range(X.shape[1])}

jj = 0

Check warning on line 1147 in statsmodels/multivariate/factor.py

View check run for this annotation

Codecov / codecov/patch

statsmodels/multivariate/factor.py#L1147

Added line #L1147 was not covered by tests
for k,v in fmap.items():
for j in v:
ii = j * k_endog + vm[k]
cfa[ii, jj] = 1
jj += 1

Check warning on line 1152 in statsmodels/multivariate/factor.py

View check run for this annotation

Codecov / codecov/patch

statsmodels/multivariate/factor.py#L1150-L1152

Added lines #L1150 - L1152 were not covered by tests

return cfa

Check warning on line 1154 in statsmodels/multivariate/factor.py

View check run for this annotation

Codecov / codecov/patch

statsmodels/multivariate/factor.py#L1154

Added line #L1154 was not covered by tests

@staticmethod
def no_crossload(X, va):
"""
Specify a CFA with no crossloadings. 'va' is a list of lists, where
va[i] contains the variables that load on factor 'i'. If 'X' is a
dataframe then variables are provided by name, otherwise they are
provided by position.
"""

va_ix = CFABuilder._vpos(X, va)
k_endog = X.shape[1]
n_factor = len(va_ix)

Check warning on line 1167 in statsmodels/multivariate/factor.py

View check run for this annotation

Codecov / codecov/patch

statsmodels/multivariate/factor.py#L1165-L1167

Added lines #L1165 - L1167 were not covered by tests
q = sum([len(v) for v in va_ix])
p = k_endog * n_factor
cfa = np.zeros((p, q))

Check warning on line 1170 in statsmodels/multivariate/factor.py

View check run for this annotation

Codecov / codecov/patch

statsmodels/multivariate/factor.py#L1169-L1170

Added lines #L1169 - L1170 were not covered by tests

ii, jj = 0, 0

Check warning on line 1172 in statsmodels/multivariate/factor.py

View check run for this annotation

Codecov / codecov/patch

statsmodels/multivariate/factor.py#L1172

Added line #L1172 was not covered by tests
for v in va_ix:
for i in v:
cfa[ii+i, jj] = 1
jj += 1
ii += k_endog

Check warning on line 1177 in statsmodels/multivariate/factor.py

View check run for this annotation

Codecov / codecov/patch

statsmodels/multivariate/factor.py#L1175-L1177

Added lines #L1175 - L1177 were not covered by tests

return cfa

Check warning on line 1179 in statsmodels/multivariate/factor.py

View check run for this annotation

Codecov / codecov/patch

statsmodels/multivariate/factor.py#L1179

Added line #L1179 was not covered by tests