-
Notifications
You must be signed in to change notification settings - Fork 12
/
correlation.py
101 lines (75 loc) · 3.79 KB
/
correlation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
"""Plot distributions of correlation matrix eigenvalues."""
from __future__ import annotations
from typing import TYPE_CHECKING
import numpy as np
from matplotlib import pyplot as plt
if TYPE_CHECKING:
from numpy.typing import ArrayLike
def marchenko_pastur_pdf(x: float, gamma: float, sigma: float = 1) -> float:
"""Generate Marchenko-Pastur probability distribution which describes the density of
singular values of large rectangular random matrices.
See https://wikipedia.org/wiki/Marchenko-Pastur_distribution.
By comparing the eigenvalue distribution of a correlation matrix to this
PDF, one can gauge the significance of correlations.
Args:
x (float): Position at which to compute probability density.
gamma (float): Also referred to as lambda. The distribution's main parameter
that measures how well sampled the data is.
sigma (float, optional): Standard deviation of random variables assumed
to be independent identically distributed. Defaults to 1 as
appropriate for correlation matrices.
Returns:
float: Marchenko-Pastur density for given gamma at x
"""
lambda_m = (sigma * (1 - np.sqrt(1 / gamma))) ** 2 # Largest eigenvalue
lambda_p = (sigma * (1 + np.sqrt(1 / gamma))) ** 2 # Smallest eigenvalue
pre_fac = gamma / (2 * np.pi * sigma**2 * x)
root = np.sqrt((lambda_p - x) * (x - lambda_m))
unit_step = x > lambda_p or x < lambda_m
return pre_fac * root * (0 if unit_step else 1)
def marchenko_pastur(
matrix: ArrayLike,
gamma: float,
*,
sigma: float = 1,
filter_high_evals: bool = False,
ax: plt.Axes | None = None,
) -> plt.Axes:
"""Plot the eigenvalue distribution of a symmetric matrix (usually a correlation
matrix) against the Marchenko Pastur distribution.
The probability of a random matrix having eigenvalues >= (1 + sqrt(gamma))^2 in the
absence of any signal is vanishingly small. Thus, if eigenvalues larger than that
appear, they correspond to statistically significant signals.
Args:
matrix (ArrayLike): 2d array
gamma (float): The Marchenko-Pastur ratio of random variables to observation
count. E.g. for N=1000 variables and p=500 observations of each,
gamma = p/N = 1/2.
sigma (float, optional): Standard deviation of random variables. Defaults to 1.
filter_high_evals (bool, optional): Whether to filter out eigenvalues larger
than theoretical random maximum. Useful for focusing the plot on the area
of the MP PDF. Defaults to False.
ax (Axes, optional): matplotlib Axes on which to plot. Defaults to None.
Returns:
plt.Axes: matplotlib Axes object
"""
ax = ax or plt.gca()
# use eigvalsh (over eigvals) for speed since correlation matrix is symmetric
eigen_vals = np.linalg.eigvalsh(matrix)
lambda_m = (sigma * (1 - np.sqrt(1 / gamma))) ** 2 # Largest eigenvalue
lambda_p = (sigma * (1 + np.sqrt(1 / gamma))) ** 2 # Smallest eigenvalue
if filter_high_evals:
# Remove eigenvalues larger than those expected in a purely random matrix
eigen_vals = eigen_vals[eigen_vals <= lambda_p + 1]
ax.hist(eigen_vals, bins=50, edgecolor="black", density=True)
# Plot the theoretical density
mp_pdf = np.vectorize(lambda x: marchenko_pastur_pdf(x, gamma, sigma))
x = np.linspace(max(1e-4, lambda_m), lambda_p, 200)
ax.plot(x, mp_pdf(x), linewidth=5)
# Compute and display matrix rank
# A ratio less than one indicates an undersampled set of RVs
rank = np.linalg.matrix_rank(matrix)
n_rows = len(matrix)
rank_str = f"rank deficiency: {rank}/{n_rows} {'(None)' if n_rows == rank else ''}"
plt.text(*[0.95, 0.9], rank_str, transform=ax.transAxes, ha="right")
return ax