# BenchtopUtils.py
############################################################################################################
# From 01: Plotting data, uncertainty, curve fits
import numpy as np
import pandas as pd
import scipy.stats as stats
def classical_fit_intervals(func, p_opt, x, y, xpts):
    """Classical 95% confidence and prediction bands for a least-squares fit.

    Returns the half-widths (ci, pi) of the confidence and prediction
    intervals evaluated at xpts; plot the bands as y_fit +/- ci and y_fit +/- pi.
    """
    # If y holds replicates in its columns, tile x to match its shape
    if x.shape != y.shape:
        x = np.tile(x, [y.size//x.size, 1]).T
    n = y.size                                # Number of observations
    m = p_opt.size                            # Number of fit parameters
    dof = n - m                               # Degrees of freedom
    res = y - func(x, *p_opt)                 # Residuals
    t = stats.t.ppf((1. + 0.95)/2., dof)      # Two-sided 95% Student's t critical value
    chi2 = np.sum((res / func(x, *p_opt))**2) # Chi-squared relative to the fitted values (computed but not returned)
    chi2_red = chi2 / dof                     # Reduced chi-squared; measures goodness of fit (computed but not returned)
    s_err = np.sqrt(np.sum(res**2) / dof)     # Residual standard error of the fit
    ci = t * s_err * np.sqrt(1/n + (xpts - np.mean(x))**2 / np.sum((x - np.mean(x))**2))
    pi = t * s_err * np.sqrt(1 + 1/n + (xpts - np.mean(x))**2 / np.sum((x - np.mean(x))**2))
    return ci, pi
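# Minimal usage sketch for classical_fit_intervals. The linear model and the
# synthetic data below are hypothetical stand-ins, not part of this module;
# call _demo_classical_intervals() manually to try it.
def _demo_classical_intervals():
    import scipy.optimize as opt
    import matplotlib.pyplot as plt
    rng = np.random.default_rng(0)
    line = lambda x, a, b: a*x + b  # Hypothetical model
    x = np.linspace(1, 10, 25)
    y = line(x, 2.0, 1.0) + rng.normal(0, 0.5, x.size)
    p_opt, p_cov = opt.curve_fit(line, x, y)
    xpts = np.linspace(x.min(), x.max(), 200)
    ci, pi = classical_fit_intervals(line, p_opt, x, y, xpts)
    y_fit = line(xpts, *p_opt)
    plt.plot(x, y, 'o', label='data')
    plt.plot(xpts, y_fit, label='fit')
    plt.fill_between(xpts, y_fit - ci, y_fit + ci, alpha=0.25, label='95% CI')
    plt.fill_between(xpts, y_fit - pi, y_fit + pi, alpha=0.10, label='95% PI')
    plt.legend()
    plt.show()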
def classical_fit_param_summary(p_opt, p_cov, names = None):
    # Two-sided 95% critical value of the standard normal (about 1.96)
    nstd = stats.norm.ppf((1. + 0.95)/2.)
    p_std = np.sqrt(np.diag(p_cov))  # Parameter standard errors from the covariance diagonal
    p_ci_lower = p_opt - nstd * p_std
    p_ci_upper = p_opt + nstd * p_std
    summary = pd.DataFrame(data = [p_ci_lower, p_opt, p_ci_upper, p_std],
                           index = ('95% CI Lower Limit', 'Optimal Value', '95% CI Upper Limit', 'Standard Error'),
                           columns = names)
    return summary
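# Sketch of summarizing a curve_fit result with classical_fit_param_summary
# (the model and the parameter names are illustrative assumptions):
def _demo_param_summary():
    import scipy.optimize as opt
    rng = np.random.default_rng(0)
    line = lambda x, a, b: a*x + b  # Hypothetical model
    x = np.linspace(0, 10, 25)
    y = line(x, 2.0, 1.0) + rng.normal(0, 0.5, x.size)
    p_opt, p_cov = opt.curve_fit(line, x, y)
    print(classical_fit_param_summary(p_opt, p_cov, names=['slope', 'intercept']))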
############################################################################################################
# From 02: Bootstrapping confidence intervals
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as stats
import scipy.optimize as opt
import seaborn as sns
from tqdm import tqdm
# Perform the residual bootstrap algorithm
def bootstrap_fits(func, x, y, p_opt, n_straps = 1000, res = 100, xpts = None, guess_gen = None,
                   fit_kws = {}, conservative = True, piecewise = True):
    """Residual bootstrap of a curve fit.

    If y is a vector of length m, x must also be a vector of length m.
    If y is a matrix of shape m x n with replicates in different columns,
    x must be either a vector of length m or a matrix of shape m x n.
    """
    # Number of unique x values (np.unique also handles matrix-valued x)
    n = np.unique(x).size
    # Number of replicates
    m = y.size//n
    # Piecewise bootstrapping is nonsensical if there's only one y per x
    if y.ndim == 1: piecewise = False
    # Generate points at which to evaluate the curves
    if xpts is None: xpts = np.linspace(x.min(), x.max(), res)
    elif xpts.size == 2: xpts = np.linspace(xpts[0], xpts[1], res)
    # Predicted y values
    y_fit = func(x, *p_opt)
    # Tile the predicted y values if necessary so that they're the same shape as the original data
    if y_fit.shape != y.shape: y_fit = np.tile(y_fit, [y.size//x.size, 1]).T
    # Residuals of the original fit; these are resampled on each iteration
    resid = y - y_fit
    p_strapped = np.zeros([n_straps, p_opt.size])    # Parameters from each bootstrap iteration
    curve_strapped = np.zeros([n_straps, xpts.size]) # Predicted curve for each iteration
    for i in tqdm(range(n_straps)):
        # Choose new residuals based on the specified method
        if piecewise and conservative:
            # Resample residuals with replacement within each row, rejecting degenerate draws
            invalid_sample = True
            while invalid_sample:
                resid_resamples = np.array([np.random.choice(resid[row], size = m) for row in range(n)])
                if all(len(set(resid_resamples[row])) > 1 for row in range(n)): invalid_sample = False
        elif piecewise and not conservative:
            # Draw Gaussian residuals row by row, scaled by each row's residual spread
            sigma_resid = [resid[row].std() for row in range(n)]
            resid_resamples = np.array([np.random.normal(0, sigma_resid[row], size = m) for row in range(n)])
        elif not piecewise and not conservative:
            sigma_resid = resid.std()
            resid_resamples = np.random.normal(0, sigma_resid, size = resid.shape)
        elif not piecewise and conservative:
            resid_resamples = np.random.choice(resid.flat, size = resid.shape)
        # Generate a synthetic dataset from the sampled residuals
        new_y = y_fit + resid_resamples
        if guess_gen is not None:
            # Generate guesses for this dataset
            guesses = guess_gen(x, new_y)
        else:
            # Default guesses
            guesses = np.ones(len(p_opt))
        # Additional keyword arguments to curve_fit can be passed as a dictionary via fit_kws
        if y.ndim == 1:
            p_strapped[i], _ = opt.curve_fit(func, x, new_y,
                                             p0 = guesses,
                                             **fit_kws)
        else:
            p_strapped[i], _ = opt.curve_fit(func, x, new_y.mean(1),
                                             sigma = new_y.std(1), absolute_sigma = True,
                                             p0 = guesses,
                                             **fit_kws)
        curve_strapped[i] = func(xpts, *p_strapped[i])
    return p_strapped, curve_strapped
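# Minimal sketch of a residual bootstrap on replicate data. The exponential
# model, synthetic data, and guess generator are illustrative assumptions;
# call _demo_bootstrap_fits() manually to try it.
def _demo_bootstrap_fits():
    rng = np.random.default_rng(0)
    decay = lambda x, a, k: a*np.exp(-k*x)  # Hypothetical model
    x = np.linspace(0, 5, 20)
    y = decay(x, 3.0, 0.8)[:, None] + rng.normal(0, 0.1, (x.size, 4))  # 4 replicates per x
    p_opt, _ = opt.curve_fit(decay, x, y.mean(1), p0=[1, 1])
    xpts = np.linspace(x.min(), x.max(), 100)
    p_boot, c_boot = bootstrap_fits(decay, x, y, p_opt, n_straps=500, xpts=xpts,
                                    guess_gen=lambda x, y: [y.max(), 1.0])
    return xpts, p_boot, c_boot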
# Plot the bootstrapped curve and its confidence intervals
def bootstrap_plot(xpts, bootstrap_curves, CI = 95, line_kws = {}, fill_kws = {}):
    c_lower = np.percentile(bootstrap_curves, (100-CI)/2, axis = 0)
    c_median = np.percentile(bootstrap_curves, 50, axis = 0)
    c_upper = np.percentile(bootstrap_curves, (100+CI)/2, axis = 0)
    # Additional keyword arguments to plot or fill_between can be passed as a dictionary via line_kws and fill_kws, respectively
    med = plt.plot(xpts, c_median, **line_kws)
    ci = plt.fill_between(xpts, c_upper, c_lower, color = plt.getp(med[0], 'color'), alpha = 0.25, **fill_kws)
    return med, ci
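# Sketch: draw the bootstrap median curve and its 95% band, reusing the
# hypothetical demo dataset above.
def _demo_bootstrap_plot():
    xpts, p_boot, c_boot = _demo_bootstrap_fits()
    med, ci = bootstrap_plot(xpts, c_boot, CI=95, line_kws={'lw': 2})
    plt.show()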
# Summarize parameters and confidence intervals resulting from the bootstrap algorithm
def bootstrap_summary(bootstrap_params, CI = 95, names = None):
    p_lower = np.percentile(bootstrap_params, (100-CI)/2, axis = 0)
    p_median = np.percentile(bootstrap_params, 50, axis = 0)
    p_upper = np.percentile(bootstrap_params, (100+CI)/2, axis = 0)
    summary = pd.DataFrame(data = [p_lower, p_median, p_upper],
                           index = ('{:}% CI Lower Limit'.format(CI), 'Median Value', '{:}% CI Upper Limit'.format(CI)),
                           columns = names)
    return summary
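# Sketch: tabulate percentile CIs for each bootstrapped parameter ('a' and 'k'
# are the illustrative exponential model's parameters from the demo above).
def _demo_bootstrap_summary():
    _, p_boot, _ = _demo_bootstrap_fits()
    print(bootstrap_summary(p_boot, CI=95, names=['a', 'k']))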
# Plot the bootstrapped distribution of each parameter and label it with the modal value derived from its KDE
def bootstrap_dists(bootstrap_params, CI = 95, names = None, rug_kws = {}, kde_kws = {}):
    _, n_p = bootstrap_params.shape
    mode = np.zeros([n_p,])
    fig, axs = plt.subplots(1, n_p, figsize = (4*n_p, 3))
    for p in range(n_p):
        # Note: sns.distplot is deprecated in recent seaborn releases
        sns.distplot(bootstrap_params[:,p], ax = axs[p], rug_kws = rug_kws, kde_kws = kde_kws)
        axs[p].axvline(np.percentile(bootstrap_params[:,p], (100-CI)/2, axis = 0), ls = '--', color = 'k')
        axs[p].axvline(np.percentile(bootstrap_params[:,p], 50, axis = 0), ls = ':', color = 'k')
        axs[p].axvline(np.percentile(bootstrap_params[:,p], (100+CI)/2, axis = 0), ls = '--', color = 'k')
        # The first line artist on the axes is the KDE; its peak gives the mode
        KDE = axs[p].lines[0]
        mode[p] = KDE.get_xdata()[np.argmax(KDE.get_ydata())]
        axs[p].set_title(names[p] + '\n' + 'mode = {:.3f}'.format(mode[p]))
    return fig, axs, mode
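# Sketch: plot each parameter's bootstrap distribution with CI markers,
# reusing the hypothetical demo dataset above.
def _demo_bootstrap_dists():
    _, p_boot, _ = _demo_bootstrap_fits()
    fig, axs, mode = bootstrap_dists(p_boot, CI=95, names=['a', 'k'])
    plt.show()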
def bootstrap_bca(func, x, y, bootstrap_params, CI = 95, names = None, guess_gen = None, fit_kwargs = {}):
    """Bias-corrected and accelerated (BCa) bootstrap confidence intervals."""
    from scipy.special import erfinv
    from scipy.special import erf
    import warnings
    def norm_cdf(x):
        return 0.5*(1 + erf(x/2**0.5))
    def norm_ppf(x):
        return 2**0.5 * erfinv(2*x - 1)
    def jackknife(data):
        """
        Given data points data, where axis 0 is considered to delineate points, return
        a generator of arrays where each array is a set of jackknife indexes.
        For a given set of data Y, the jackknife sample J[i] is defined as the data set
        Y with the ith data point deleted.
        """
        base = np.arange(0, len(data))
        return (np.delete(base, i) for i in base)
    x = x.flatten()
    y = y.flatten()
    straps, n_p = bootstrap_params.shape
    if guess_gen is not None:
        guesses = guess_gen(x, y)
    else:
        guesses = None
    p_opt, _ = opt.curve_fit(func, x, y, p0 = guesses, **fit_kwargs)
    # Bias-correction factor: how often the bootstrap falls below the point estimate
    z0 = norm_ppf( 1.0*np.sum(bootstrap_params < p_opt, axis = 0) / straps )
    # Leave-one-out (jackknife) fits for the acceleration value
    idx_jack = jackknife(y)
    p_jack = np.array([opt.curve_fit(func, x[idx], y[idx], p0 = guesses, **fit_kwargs)[0] for idx in idx_jack])
    p_jmean = np.mean(p_jack, axis = 0)
    # Acceleration value
    a = np.sum((p_jmean - p_jack)**3, axis = 0) / (
        6.0 * np.sum((p_jmean - p_jack)**2, axis = 0)**1.5)
    # Adjusted percentile levels
    alphas = np.array([100-CI, 100+CI])/200.
    zs = z0 + norm_ppf(alphas)[:, np.newaxis]
    avals = norm_cdf(z0 + zs/(1 - a*zs))
    nvals = np.round((straps - 1)*avals)
    if np.any(np.isnan(avals)):
        warnings.warn("Some values were NaN; results are probably unstable " +
                      "(all values were probably equal)",
                      stacklevel = 2)
        return
    if np.any(nvals == 0) or np.any(nvals == straps - 1):
        warnings.warn("Some values used extremal samples; " +
                      "results are probably unstable.",
                      stacklevel = 2)
        return
    elif np.any(nvals < 10) or np.any(nvals >= straps - 10):
        warnings.warn("Some values used top 10 low/high samples; " +
                      "results may be unstable.",
                      stacklevel = 2)
    p_lower, p_median, p_upper = [np.zeros(n_p) for _ in range(3)]
    avals *= 100  # Convert to percentiles
    for p in range(n_p):
        p_lower[p] = np.percentile(bootstrap_params[:,p], avals[0,p])
        p_median[p] = np.percentile(bootstrap_params[:,p], 50)
        p_upper[p] = np.percentile(bootstrap_params[:,p], avals[1,p])
    summary = pd.DataFrame(data = [p_lower, p_median, p_upper],
                           index = ('{:}% CI Lower Limit'.format(CI),
                                    'Median Value',
                                    '{:}% CI Upper Limit'.format(CI)),
                           columns = names)
    return summary, avals
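# Sketch: BCa intervals from the same illustrative bootstrap as above. Note
# that bootstrap_bca flattens x and y, so x is tiled here to match the
# replicate matrix y; bootstrap_bca returns None when samples look unstable.
def _demo_bootstrap_bca():
    rng = np.random.default_rng(0)
    decay = lambda x, a, k: a*np.exp(-k*x)  # Hypothetical model
    x = np.linspace(0, 5, 20)
    y = decay(x, 3.0, 0.8)[:, None] + rng.normal(0, 0.1, (x.size, 4))
    p_opt, _ = opt.curve_fit(decay, x, y.mean(1), p0=[1, 1])
    guesses = lambda x, y: [y.max(), 1.0]
    p_boot, _ = bootstrap_fits(decay, x, y, p_opt, n_straps=500, guess_gen=guesses)
    X = np.tile(x, [y.shape[1], 1]).T  # Match x to y's shape before flattening
    result = bootstrap_bca(decay, X, y, p_boot, CI=95, names=['a', 'k'], guess_gen=guesses)
    if result is not None:
        summary, avals = result
        print(summary)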