/
Function_sampling_algorithm.R
278 lines (192 loc) · 9.65 KB
/
Function_sampling_algorithm.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
##############################
# Sampling algorithm
###############################
library(lubridate)
library(stringr)
library(data.table)
library(tidyverse)
#' Sampling EMR data for SCALE-IT serosurveillance
#'
#' @param labs Lab test electronic medical record dataframe
#' @param encounters encounters/diagnoses electronic medical record dataframe
#' @param thresh threshold for maximum number of samples (n.b. NOT exact number
#' samples returned)
#' @param proportion_longitudinal proportion of data which are from patients
#' previously sampled
#' @param LISdat Laboratory information system dataframe
#' @param prev_sampled Dataframe of previously sampled data - first column is
#' date of sample collection, second column is MRN
#'
#' @return
#' @export
#'
#' @examples
gen_list <-
function(labs,
encounters,
thresh,
proportion_longitudinal,
LISdat,
LIS_startdate,
LIS_enddate,
prev_sampled) {
# internal function used later
substrRight <- function(x, n) {
substr(x, nchar(x) - n + 1, nchar(x))
}
age_bins <- c(0, 20,40, 60,80,100)
labs <- labs %>% mutate(
age_groups = case_when(
PAT_AGE >= 0 & PAT_AGE < 20 ~ "0-19",
PAT_AGE >= 20 &
PAT_AGE < 40 ~ '20-39',
PAT_AGE >= 40 &
PAT_AGE < 60 ~ '40-59',
PAT_AGE >= 60 &
PAT_AGE < 80 ~ '60-79',
PAT_AGE >= 80 &
PAT_AGE < 110 ~ '80+'
)
)
labs_merge<- merge(labs, encounters, by = "PAT_MRN_ID")
# Identify sentinel population in data
sentinel <-
labs_merge[grepl("Z34", labs_merge[["CURRENT_ICD10_LIST"]]),]
labs_sf <-
labs[labs$zcta >= 94100 & labs$zcta < 94199 | labs$zcta == 99997, ]
# Subset to chemistries
chemistries_sf <-
labs_sf[grepl("METABOLIC", labs_sf[["DESCRIPTION"]]) |
grepl("CREATININE", labs_sf[["DESCRIPTION"]]) |
grepl("ELECTROLYTES", labs_sf[["DESCRIPTION"]]) |
grepl("ANTIBODIES", labs_sf[["DESCRIPTION"]]) |
grepl("ANTIBODY", labs_sf[["DESCRIPTION"]]) |
grepl("SYPHILIS", labs_sf[["DESCRIPTION"]]) |
grepl("RUBELLA", labs_sf[["DESCRIPTION"]]) |
grepl("SERUM / PLASMA", labs_sf[["DESCRIPTION"]]) , ]
sentinel_sf <-
chemistries_sf[chemistries_sf$PAT_MRN_ID %in% sentinel$PAT_MRN_ID,]#sentinel pops living in SF
# select samples from emergency department
ed_chemistries_sf<- chemistries_sf[grepl("EMERGENCY", chemistries_sf[["DEPT"]]),]
# subset to outpatient
op_chemistries_sf <- chemistries_sf %>% filter(PAT_TYPE == "OP")
# Adult outpatient chemistries
adult_op_chemistries_sf <-
op_chemistries_sf %>% filter(PAT_AGE > 17)
# ALL under 18 chemistries (IP and OP)
u18_chemistries_sf <- chemistries_sf %>% filter(PAT_AGE < 18)
# ensure no repeated sentinel populations
sentinel_sf <- distinct(sentinel_sf, PAT_MRN_ID, .keep_all = TRUE)
if (length(unique(sentinel_sf$PAT_MRN_ID)) > thresh * proportion_sentinel) {
sentinel_sf <-
dplyr::sample_n(
sentinel_sf,
size = thresh * proportion_sentinel,
replace = F
)
}
# Generate sample list containing outpatient adults, under 18s, and sentinel
sample_list <-
rbind(adult_op_chemistries_sf, ed_chemistries_sf,u18_chemistries_sf, sentinel_sf)
# Identify MRNS sampled previously
prev_sampled_match<-prev_sampled[prev_sampled$PAT_MRN_ID %in% sample_list$PAT_MRN_ID == TRUE, c("PAT_MRN_ID", "SPEC_COLL_DATE", "SPEC_COLL_TIME")]
sample_match<-unique(sample_list[sample_list$PAT_MRN_ID %in% prev_sampled[, "PAT_MRN_ID"] == TRUE,], by = c("PAT_MRN_ID"))[, c("PAT_MRN_ID", "SPEC_COLL_DTTM")]
prev_MRNs<-merge(sample_match, prev_sampled_match, by = "PAT_MRN_ID")
prev_MRNs$TIME_DIFF<- as.Date(prev_MRNs$SPEC_COLL_DTTM) - as.Date(prev_MRNs$SPEC_COLL_DATE, format = "%m/%d/$Y")
# Identify MRNS tested within last 30 days and exclude - currently no longitudinal MRNs so commented out
recent_MRNS <- prev_MRNs %>% filter( TIME_DIFF < 30)
#longitudinal MRNs
longitudinal_MRNs <- prev_MRNs %>% filter( TIME_DIFF >= 30)
longitudinal_MRNs<-distinct(longitudinal_MRNs, PAT_MRN_ID, .keep_all = TRUE)
# Check how many longitudinal samples we have and, if it exceeds desired proportion, sample to that proportion
if (length(unique(longitudinal_MRNs$PAT_MRN_ID)) > thresh * proportion_longitudinal) {
longitudinal_MRNs <-
dplyr::sample_n(
longitudinal_MRNs,
size = thresh * proportion_longitudinal,
replace = F
)
}
longitudinal_MRNs<-sample_list %>% filter (PAT_MRN_ID %in% longitudinal_MRNs$PAT_MRN_ID ==TRUE)
longitudinal_MRNs<-distinct(longitudinal_MRNs, PAT_MRN_ID, .keep_all = TRUE)
sample_list <-
rbind(sample_list[sample_list$PAT_MRN_ID %in% recent_MRNS$PAT_MRN_ID == FALSE,], sentinel_sf, longitudinal_MRNs)
sample_list<-distinct(sample_list, PAT_MRN_ID, .keep_all = T)
##############################################
## Stratification and sampling by zip and age
##############################################
#if total sampled are below threshold, save all, if not sample from full dataset by age and zip
# count overall samples - remembering sentinel has already been merged with the sample_list above
overall_samples<-length(unique(sample_list$PAT_MRN_ID))
if (overall_samples <= thresh) {
} else{
#stratify by zip
zips <- as.data.frame(table(sample_list$zcta))
colnames(zips)<-c("zip", "Freq")
zip_info<-read.csv("C:/Users/ir515/Box/EMR data/zipcode_info.csv")
zip_info$target_zip <-as.integer(thresh*zip_info$proportion_population)
zip_info$zip<-zip_info$zcta
zips<-right_join(zip_info, zips, by ="zip")
zips$diff <- zips$Freq - zips$target_zip
zips[which(zips$zip == 99997),"diff"]<- thresh/length(unique(zips$zip))
#stratify by age
as.data.frame(table(sample_list$age_groups))
# set target sample size to have equal numbers in age bin
target_agebin <- as.integer(thresh / (length(age_bins)-1))
#see if any age groups are over the target
ages <- as.data.frame(table(sample_list$age_groups))
ages_thin <- ages[ages$Freq > target_agebin, "Var1"]
ages$diff <- ages$Freq - target_agebin
#if a group is over, sample the target number from that group, but only from
#over-represented zipcodes
zips_to_sample <- zips %>% filter(diff > 0)
ages_to_sample <- ages %>% filter(diff > 0)
sample_data <- function(samplezip, sampleage) {
tmp_age <- ages %>% filter(Var1 == sampleage)
age_weight <- tmp_age$diff / sum(ages_to_sample$diff)
tmp_df <-
sample_list %>% filter(zcta == samplezip, age_groups == sampleage)
tmp_zip <- zips %>% filter(zip == samplezip)
if (round((tmp_zip$diff) * age_weight) > nrow(tmp_df)) {
size <- nrow(tmp_df)
} else{
size <- round((tmp_zip$diff) * age_weight)
}
exclude <- dplyr::sample_n(tmp_df, size = size, replace = F)
return(exclude)
}
exclude<-list()
for(i in 1:length(ages_thin)){
res <-
lapply(zips_to_sample$zip, sample_data, sampleage = as.character(ages_thin[i]))
res <- bind_rows(res)
exclude[[i]]<-res
}
exclude<-bind_rows(exclude)
thinned_sample <-
sample_list[sample_list$PAT_MRN_ID %in% exclude$PAT_MRN_ID == FALSE,]
# check for numbers removed
nrow(sample_list)
nrow(thinned_sample)
#check distribution - note age weighting is still off
table(thinned_sample$zcta)
table(thinned_sample$age_groups)
####THE SAMPLES FROM THIS SHOULD THEN BE combined WITH sentinel_sf AND longitudinal_MRNs AS THEY ARE BELOW)#####
sample_list <- rbind(thinned_sample, sentinel_sf, longitudinal_MRNs)
}
#Remove any duplicate MRNs
sample_list <- distinct(sample_list, PAT_MRN_ID, .keep_all = TRUE)
sample_list$PAT_MRN_ID<-as.character(sample_list$PAT_MRN_ID)
#Restrict to UCSF labs
sample_list <- sample_list %>% filter(RESULTING_LAB == "UCSF LAB" |
RESULTING_LAB == "UCSF MOUNT ZION CLINICAL LABS" )
#Make sure that there are no indivduals also tested for COVID
LISdat<-LISdat %>% filter( as.Date(LISdat$DateTimeCollected) > LIS_startdate & as.Date(LISdat$DateTimeCollected) < LIS_enddate )
testedforcovid <- LISdat[grepl("COV19", LISdat[["TestName"]]) |
grepl("COVID-19", LISdat[["TestName"]]) , ]
testedforcovid_admit <- testedforcovid[grepl("PENDING", testedforcovid[["TestName"]]),]
testedforcovid_nonadmit <- testedforcovid[!(testedforcovid$PtNumber %in% testedforcovid_admit$PtNumber),]
sample_list$PAT_MRN_ID<-as.character(sample_list$PAT_MRN_ID)
sample_list <- sample_list[!(sample_list$PAT_MRN_ID %in% testedforcovid_nonadmit$PtNumber),]
return(sample_list)
}