# simulate_data.R (forked from 5harad/fasttt)
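# Note on dependencies (assumed, not loaded here; presumably attached elsewhere in the repo,
# e.g. by a shared setup script): dplyr (%>%, mutate, bind_rows, sample_frac), ggplot2, scales,
# and logit()/inv.logit() (e.g. from the boot or gtools package). Helper functions such as
# draw_from_signal_distribution(), signal_to_p(), run_threshold_test(),
# get_single_threshold_from_state(), and our_theme(), as well as the path/model variables
# base_input_dir, base_output_dir, new_frisk_model_name, and new_stop_model_name,
# are assumed to be defined in other files of this repository.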
simulate_stop_dataset = function(df, post, noise=0) {
# Simulates a new dataset under the stop model, perturbing each race-department threshold by a random error ~ N(0, sd = noise) on the logit scale.
message(sprintf("Simulating stop dataset with noise %2.3f", noise))
# number of race groups in the data
nraces = length(unique(df$driver_race))
n = integer(nrow(df))
# initialize vectors
searches_noisy = n
hits_noisy = n
thresh_noisy = n
# extract posterior means for parameters
phi = colMeans(post$phi)
delta = colMeans(post$delta)
t = df$thresholds
# loop over precincts. For each precinct, randomly sample stopped pedestrians from the base population until the observed number of searches is matched.
for(d in unique(df$location_variable)){
# get parameters for precinct.
idxs = df$location_variable == d
precinct_thresholds = t[idxs]
precinct_phi = phi[idxs]
precinct_delta = delta[idxs]
base_rates = df$base_population[idxs]
stopifnot(abs(sum(base_rates) - 1) < 1e-6)
total_searches = sum(df$num_searches[idxs])
# draw the races of a large batch of randomly sampled pedestrians from the base population.
searches = c()
r = c()
signal = NULL
n_draws = 1e4
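# rejection-sampling loop: draw batches of candidate pedestrians, growing the batch size
# tenfold each pass, until enough of them cross their (noisy) threshold to match the
# observed number of searches in this precinct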
while (sum(searches) < total_searches) {
multinomial_draw = rmultinom(n_draws, 1, base_rates)
new_r = integer(n_draws)
for(r_i in 1:nraces){ # race order is NOT necessarily White-Black-Hispanic here; it is whatever order the rows for this precinct appear in base_rates
new_r[multinomial_draw[r_i,] == 1] = r_i
}
t_noisy = inv.logit(logit(precinct_thresholds[new_r]) + rnorm(n_draws, mean=0, sd=noise))
signal_new = draw_from_signal_distribution(n_draws, phi = precinct_phi[new_r], delta = precinct_delta[new_r], sigma_g = 1) %>%
sample_frac() # shuffle
p_guilt = signal_to_p(signal_new$signal_samples, phi = precinct_phi[new_r], delta = precinct_delta[new_r], sigma_g = 1)
r = c(r, new_r)
signal = bind_rows(signal, signal_new)
searches = c(searches, p_guilt > t_noisy)
n_draws = n_draws*10
}
searches = which(searches)[1:total_searches]
r = factor(r, levels = 1:nraces)
searches_by_race = table(r[searches])
hits_by_race = table(r[searches][signal$guilty[searches] == 'guilty'])
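# (these assignments assume the rows for each precinct appear in race order 1..nraces,
# matching the factor levels used in the tables above)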
searches_noisy[idxs] = searches_by_race
hits_noisy[idxs] = hits_by_race
}
# return simulated dataset with the baseline thresholds and simulated search & hit counts
df = df %>%
mutate(t_baseline = t,
num_searches = searches_noisy,
num_hits = hits_noisy)
return(df)
}
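# Usage sketch (hypothetical, commented out): assuming `obs` and `post` have been loaded
# from a fitted stop-model .RData file, as done in run_on_simulated_data() below, a dataset
# with moderate threshold heterogeneity could be simulated with:
# sim_stops = simulate_stop_dataset(obs, post, noise = 0.1)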
simulate_search_dataset = function(df, post, noise=0) {
# Simulates a new dataset under the search model, perturbing each race-department threshold by a random error ~ N(0, sd = noise) on the logit scale.
# number of race groups in the data
nraces = length(unique(df$driver_race))
n = integer(nrow(df))
# initialize vectors
searches_noisy = n
hits_noisy = n
thresh_noisy = n
# extract posterior means for parameters
phi_r = colMeans(post$phi_r)
phi_d = colMeans(post$phi_d)
delta_r = colMeans(post$delta_r)
delta_d = colMeans(post$delta_d)
t = df$thresholds
for (i in 1:nrow(df)) {
r = as.integer(df$driver_race[i])
d = as.integer(df$location_variable[i])
nr_stops = df$num_stops[i]
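# reconstruct the signal-distribution parameters for this race-department pair from the
# additive race and department effects (phi on the logit scale, delta on the log scale)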
phi = inv.logit(phi_d[d] + phi_r[r])
delta = exp(delta_r[r] + delta_d[d])
# draw probability of possessing contraband for each stop in the race-department pair
signal = draw_from_signal_distribution(nr_stops, phi = phi, delta = delta, sigma_g = 1)
p_guilt = signal_to_p(signal$signal_samples, phi = phi, delta = delta, sigma_g = 1)
# perturb threshold on the logit scale: logit(t'_rd) = logit(t_rd) + N(mean = 0, sd = noise)
t_noisy = inv.logit(rep(logit(t[i]), nr_stops) + rnorm(nr_stops, mean=0, sd=noise))
# update results
searches_noisy[i] = sum(p_guilt > t_noisy)
hits_noisy[i] = sum((signal$guilty == 'guilty') & (p_guilt > t_noisy))
thresh_noisy[i] = mean(t_noisy)
}
# return simulated dataset along with per-row search & hit counts and rates
df = df %>%
mutate(t_baseline = t,
t_noisy = thresh_noisy,
num_searches = searches_noisy,
num_hits = hits_noisy,
search_rate = num_searches/num_stops,
hit_rate = num_hits/num_searches)
return(df)
}
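# Note: unlike simulate_stop_dataset(), which resamples who gets searched from each precinct's
# base population while holding the total number of searches fixed, this function keeps the
# observed number of stops per race-department row fixed and re-simulates each search decision.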
run_on_simulated_data = function(noise_levels, decision, rerun_models = TRUE){
model = gsub('.stan', '', ifelse(decision == 'search_decision', new_frisk_model_name, new_stop_model_name))
for(noise in noise_levels){
message(sprintf("Running simulation for noise level %2.3f, decision %s", noise, decision))
filestring = sprintf('simulated_stop_and_frisk_noise_%2.3f_%s', noise, decision)
output_filename = paste0(base_output_dir, sprintf('%s_%s.RData', filestring, model))
if(rerun_models){
if(decision == 'stop_decision'){
load(paste0(base_output_dir, 'nyc_stop_and_frisk_multinomial_stop_and_frisk.RData'))
stops = simulate_stop_dataset(obs, post, noise = noise)
}else{
load(paste0(base_output_dir, 'nyc_stop_and_frisk_search_decision_model_mixture.RData'))
stops = simulate_search_dataset(obs, post, noise = noise)
}
save(stops, file = paste0(base_input_dir, filestring, '.RData'))
run_threshold_test(filestring, paste0(model, '.stan'))
}
}
}
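# Example call (hypothetical noise levels, commented out): simulate stop data at a few levels
# of threshold heterogeneity and refit the threshold test on each:
# run_on_simulated_data(c(0, 0.1, 0.5), 'stop_decision')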
plot_all_noise_levels = function(decision, noise_levels = c(0, 0.05, 0.1, 0.2, 0.5, 1)) {
model = gsub('.stan', '', ifelse(decision == 'search_decision', new_frisk_model_name, new_stop_model_name))
df = list()
for(noise in noise_levels){
filestring = sprintf('simulated_stop_and_frisk_noise_%2.3f_%s', noise, decision)
output_filename = paste0(base_output_dir, sprintf('%s_%s.RData', filestring, model))
load(output_filename)
df[[length(df)+1]] = get_single_threshold_from_state(obs) %>%
mutate(noise = noise)
}
df = bind_rows(df)
if(decision == 'stop_decision'){
load(paste0(base_output_dir, 'nyc_stop_and_frisk_multinomial_stop_and_frisk.RData'))
}else{
load(paste0(base_output_dir, 'nyc_stop_and_frisk_search_decision_model_mixture.RData'))
}
true_thresholds = get_single_threshold_from_state(obs)
ggsave(sprintf('figures/%s_simulation.pdf', decision),
ggplot(df) + geom_line(aes(x=noise, y=weighted_threshold , color=driver_race)) +
geom_hline(data = true_thresholds, mapping = aes(yintercept = weighted_threshold, color = driver_race), linetype=2) +
our_theme() + theme(legend.position = c(0.8,0.65)) +
scale_x_continuous(expression('\nThreshold heterogeneity (' * sigma * ')')) +
scale_y_continuous('Inferred threshold', labels = scales::percent) +
scale_color_manual('', values = c('White' = 'blue', 'Black' = 'black', 'Hispanic' = 'red')),
width = 6, height = 3.5)
}
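# Example call (hypothetical, commented out): once run_on_simulated_data() has produced
# results for every noise level in the default grid, the summary figure can be drawn with:
# plot_all_noise_levels('stop_decision')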
# lines below are useful if you want to run several simulations in parallel using a bash script.
# noise_level = as.numeric(commandArgs(trailingOnly=TRUE)[1])
# decision = commandArgs(trailingOnly=TRUE)[2]
# run_on_simulated_data(c(noise_level), decision)