/
07_ConditionalLogistic_AGE_Delta.R
305 lines (244 loc) · 14.9 KB
/
07_ConditionalLogistic_AGE_Delta.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
# Goal: Analyze Omicron data using the case-control approach used for Delta and VOC's
# Created 2021-12-15
# Updated 2022-03-28
# Author: Alexander "Sasha" Keyel <alexander.keyel@health.ny.gov
# Load required packages
library(survival)
setwd("C:/hri/DOH_COVID")
source("R/DOH_COVID_hlpr.R") # Requires dfmip package, devtools::install_github('akeyel/dfmip')
in.seed = 20220106 # Ensure repeatability of results
source("R/00_Delta_Prelim_Settings.R")
# Read in the input data file
DOH.data.file = sprintf("Data/cleaned_data%s.csv", figure.label)
if (!file.exists(DOH.data.file)){
stop("THIS ANALYSIS REQUIRES THE CLEANED DATA SET CREATED IN
05_Descriptive_Delta.Rmd or 05_Descriptive_Delta.R. If you have created
that file and are still seeing this message, please check the file paths.")
}
in.data = read.csv(DOH.data.file)
# Check data object to ensure the correct object loaded and there are no irregularities in the data set
View(in.data)
##### Match cases in flipped design & perform conditional logistic regression #####
# REMOVE AGE AS A MATCHING CRITERION
date.offset = 6
table.label = "_ConditionalLogistic_Age"
warning("Data are unsorted. Pre-sorting the data may increase the number of successful matches")
strata.df = match.focal(in.data, in.seed, date.offset, age.offset,
removal = TRUE, age.offset.override = NA,
exclude.age = TRUE)
# Add a time-since-vaccination binary variable
strata.df$TimePostVax = 0
strata.df$TimePostVax[strata.df$Days.between.collection.date.and.vaccination.complete.date < 90] = 1 # Does not affect NA values, so those remain zero
strata.df$TimePostVax[strata.df$Days.between.collection.date.and.vaccination.complete.date >= 90] = 2
strata.df$TimePostBoost = 0
strata.df$TimePostBoost[strata.df$Days.between.collection.date.and.booster.date < 90] = 1 # Does not affect NA values, so those remain zero
strata.df$TimePostBoost[strata.df$Days.between.collection.date.and.booster.date >= 90] = 2
# Make demographic summary table for the flipped design
# Updated 2022-04-29
#CL.file = sprintf("%s/08%s_stratadf%s.csv", out.folder, table.label, figure.label)
#strata.df = read.csv(CL.file)
make.flipped.demographic.table(strata.df, out.folder, figure.label, focal.label, table.label)
# Note: There will be a "'" before 10-19; this is to keep Excel from putting it into Date format, and that will need to be manually removed.
# I couldn't figure out how to get Excel to make it silently disappear the way it does when you enter it in Excel directly.
# Test for significant differences in age (paired) and sex (unpaired)
# Sort data to ensure a paired test
focal.cases = strata.df[strata.df$IS_FOCAL == 1, ]
other.cases = strata.df[strata.df$IS_FOCAL == 0, ]
focal.cases = focal.cases[order(focal.cases$stratum), ]
other.cases = other.cases[order(other.cases$stratum), ]
if (nrow(focal.cases) != nrow(other.cases)){stop("this assumes a 1:1 matched design. Need to change the t-test if not doing a 1:1 match")}
age.t.test = t.test(focal.cases$Age, other.cases$Age, paired = TRUE, alternative = 'two.sided')
age.t.test # Display results to screen
age.t.test.file = sprintf("%s/age_t_test_results%s%s.csv", out.folder, table.label, figure.label)
age.t.test.results = data.frame(t_statistic = age.t.test$statistic,
df = age.t.test$parameter, p_value = age.t.test$p.value,
mean_difference = age.t.test$estimate)
if (age.t.test$p.value < 0.05){
warning(sprintf("There is a significant difference in ages of %s years between
the case and controls (p = %s)", age.t.test$estimate, age.t.test$p.value))
}
write.table(age.t.test.results, file = age.t.test.file, sep = ',',
row.names = FALSE, col.names = TRUE,
append = FALSE)
# Note: assumption of normality is violated, but as long as cases are above 5 in each group, I think this is OK.
mf.t.test = t.test(as.numeric(as.factor(focal.cases$Sex)),as.numeric(as.factor(other.cases$Sex)),
alternative = 'two.sided', paired = FALSE)
mf.t.test
mf.t.test.file = sprintf("%s/sex_t_test_results%s%s.csv", out.folder, table.label, figure.label)
mf.t.test.results = data.frame(t_statistic = mf.t.test$statistic,
df = mf.t.test$parameter, p_value = mf.t.test$p.value,
group_mean_1 = mf.t.test$estimate[1], group_mean_2 = mf.t.test$estimate[2])
if (mf.t.test$p.value < 0.05){
warning(sprintf("There is a significant difference in distribution of sexes between
the case and controls (p = %s)", mf.t.test$p.value))
}
write.table(mf.t.test.results, file = mf.t.test.file, sep = ',',
row.names = FALSE, col.names = TRUE,
append = FALSE)
##### ANALYSIS 3: PERFORM CONDITIONAL LOGISTIC REGRESSION ON THE FLIPPED CASE/CONTROL DESIGN #####
# Add an indicator variable, to make 0, 'no vaccine' the reference state
strata.df$vaccine_type_code = 0
strata.df$vaccine_type_code[strata.df$vaccine_type == "Pfizer"] = 1
strata.df$vaccine_type_code[strata.df$vaccine_type == "Moderna"] = 2
strata.df$vaccine_type_code[strata.df$vaccine_type == "Janssen"] = 3
# Add a time since last dose category (coded on the indicator variable not on the days)
strata.df$TimePostLastDose = strata.df$TimePostVax # No boosters in this data set
# Add a vaccination indicator variable
strata.df$IS.VAX = as.factor(strata.df$case_control)
strata.df$IS.VAX = relevel(strata.df$IS.VAX, ref = "Control")
# Add indicator for Janssen vaccine, to try to make number of doses effect more interpretable
#strata.df$JANSSEN = 0
#strata.df$JANSSEN[strata.df$vaccine_type == "Janssen"] = 1
# This was not as interpretable - reframed below
# A variable similar to vaccine type, but with the MRNA vaccines merged
strata.df$MRNA_MERGED = 0
strata.df$MRNA_MERGED[strata.df$IS.VAX == "Case"] = 1
strata.df$MRNA_MERGED[strata.df$vaccine_type == "Janssen"] = 2
# Read in strata.df file to go back and edit analyses after the code was initially run
# strata.df = read.csv(CL.file)
# Add an age bin category. Start with oldest individuals, so the 90+ will be the reference category
strata.df$AGE.BIN = 0
#age.bins = sort(c(4, 11,17,29, 49, 69, 89, 110), decreasing = TRUE) # Assumes no one over 110
age.bins = sort(c(17,29, 49, 69, 89, 110), decreasing = TRUE) # Assumes no one over 110
count = 0
bin.upper = 110
for (i in 1:length(age.bins)){
bin.upper = age.bins[i]
if (i != length(age.bins)){
bin.lower = age.bins[i + 1] + 1
}else{
bin.lower = 0
}
strata.df$AGE.BIN[strata.df$Age >= bin.lower & strata.df$Age <= bin.upper] = count
count = count + 1
}
nrow(strata.df[strata.df$Age < 12, ]) # Only 8 under 12, but getting model convergence
nrow(strata.df[strata.df$Age < 5, ]) # 3 under 5, also leading to model convergence
# The low sample size in the lower bins is causing problems. Going to do an 'under 18' bin.
# Write matched data set for record-keeping purposes
CL.file = sprintf("%s/08%s_stratadf%s.csv", out.folder, table.label, figure.label)
write.table(strata.df, file = CL.file, append = FALSE, row.names = FALSE, col.names = TRUE,
sep = ',')
# Examine individual model with age
age.formula = as.formula(IS_FOCAL ~ Age + strata(stratum))
age.result = survival::clogit(age.formula, data=strata.df)
# Examine basic 2-way interactions for 5 models
vaxtype.x.age.formula = as.formula(IS_FOCAL ~ Age*as.factor(vaccine_type_code) + strata(stratum))
vaxtype.x.age = survival::clogit(vaxtype.x.age.formula, data=strata.df)
#tpv.x.age.formula = as.formula(IS_FOCAL ~ Age*as.factor(TimePostVax) + strata(stratum))
#timepostvax.x.age = survival::clogit(tpv.x.age.formula, data=strata.df)
#nd.x.age.formula = as.formula(IS_FOCAL ~ Age*VACCINE_DOSES + strata(stratum))
#ndoses.x.age = survival::clogit(nd.x.age.formula, data=strata.df)
tpd.x.age.formula = as.formula(IS_FOCAL ~ Age*as.factor(TimePostLastDose) + strata(stratum))
timepostdose.x.age = survival::clogit(tpd.x.age.formula, data=strata.df)
# Look at full model with no interactions
full.formula = as.formula(IS_FOCAL ~ as.factor(vaccine_type_code) + as.factor(TimePostLastDose) +
VACCINE_DOSES + Age +
strata(stratum))
full.model = survival::clogit(full.formula, data=strata.df)
# Removed number of doses. Not very interpretable, as it is a mix of vaccine type and vaccination status.
#nd.formula = as.formula(IS_FOCAL ~ VACCINE_DOSES + strata(stratum))
#ndoses = survival::clogit(nd.formula, data=strata.df)
#nd.plus.age.formula = as.formula(IS_FOCAL ~ Age + VACCINE_DOSES + strata(stratum))
#ndoses.plus.age = survival::clogit(nd.plus.age.formula, data=strata.df)
# Look at vaccination status or not, as a cleaner model
has.vax.formula = as.formula(IS_FOCAL ~ IS.VAX + strata(stratum)) # Here, case_control is not related to the case and control in IS_FOCAL, but is an indicator for vaccination status.
has.vax = survival::clogit(has.vax.formula, data = strata.df)
# Look at vaccination status or not, as a cleaner model
age.x.vax.formula = as.formula(IS_FOCAL ~ Age*IS.VAX + strata(stratum)) # Here, case_control is not related to the case and control in IS_FOCAL, but is an indicator for vaccination status.
age.x.vax = survival::clogit(age.x.vax.formula, data = strata.df)
# Vaccine Type alone
vaxtype.formula = as.formula(IS_FOCAL ~ as.factor(vaccine_type_code) + strata(stratum))
vaxtype = survival::clogit(vaxtype.formula, data=strata.df)
# Vaccine Type + Age
vaxtype.plus.age.formula = as.formula(IS_FOCAL ~ Age + as.factor(vaccine_type_code) + strata(stratum))
vaxtype.plus.age = survival::clogit(vaxtype.plus.age.formula, data=strata.df)
# Examine age bins plus vaccination status
age.bin.status.formula = as.formula(IS_FOCAL ~ as.factor(AGE.BIN) + IS.VAX + strata(stratum))
bin.status.model = survival::clogit(age.bin.status.formula, data = strata.df)
bin.status.AIC = extractAIC(bin.status.model)[2]
# BELOW TWO MODELS WERE REPLACED WITH A MERGED MRNA - WILL MAKE THE STATISTICS EASIER TO INTERPRET
# Neither these nor those were included in the final analysis.
# There was not much of an improvement, and vaccine type was cleaner to interpret.
# Add a Janssen vaccine indicator plus age to replace number of vaccine doses
#j.formula = as.formula(IS_FOCAL ~ JANSSEN + IS.VAX + strata(stratum))
#janssen = survival::clogit(j.formula, data=strata.df)
#janssen.AIC = extractAIC(janssen)[2]
# Add a Janssen vaccine indicator plus age to replace number of vaccine doses
#j.age.formula = as.formula(IS_FOCAL ~ JANSSEN + IS.VAX + Age + strata(stratum))
#janssen.age = survival::clogit(j.age.formula, data=strata.df)
#janssen.age.AIC = extractAIC(janssen.age)[2]
MRNA.formula = as.formula(IS_FOCAL ~ as.factor(MRNA_MERGED) + strata(stratum))
MRNA = survival::clogit(MRNA.formula, data=strata.df)
MRNA.AIC = extractAIC(MRNA)[2]
# Add a Janssen vaccine indicator plus age to replace number of vaccine doses
MRNA.age.formula = as.formula(IS_FOCAL ~ as.factor(MRNA_MERGED) + Age + strata(stratum))
MRNA.age = survival::clogit(MRNA.age.formula, data=strata.df)
MRNA.age.AIC = extractAIC(MRNA.age)[2]
# Also, no bin 0 individuals in this analysis, so it's using 70 - 89 as the reference group
#NOTE: Bin approach is not currently being included in the AIC comparison.
# It is not at all competitive based on AIC score
# Calculate AIC scores and identify the model with the lowest AIC
models = c("Age", "VaccineType x Age",
"TimePostDose x Age", "FullModel",
"HasVaccine", "HasVaccine x Age",
"VaccineType", "VaccineType + Age",
"VaccineType_mRNA_merged", "VaccineType_mRNA_merged + Age")
# "NumberVaccineDoses x Age", "NumberVaccineDoses", "NumberVaccineDoses + Age",
model.objects = list(age.result, vaxtype.x.age,
timepostdose.x.age, full.model,
has.vax, age.x.vax, vaxtype, vaxtype.plus.age,
MRNA, MRNA.age)
# ndoses.x.age, ndoses, ndoses.plus.age,
formula.objects = list(age.formula, vaxtype.x.age.formula,
tpd.x.age.formula, full.formula, has.vax.formula, age.x.vax.formula,
vaxtype.formula, vaxtype.plus.age.formula,
MRNA.formula, MRNA.age.formula)
# nd.x.age.formula, nd.formula, nd.plus.age.formula,
aic.results = data.frame(model = models, AIC = NA, K = NA, LogLikelihood = NA,
coefficients = NA,
coefficient.Odds.Ratios = NA,
coefficient.Odds.Ratios.lower = NA,
coefficient.Odds.Ratios.upper = NA,
p.value = NA, coefficient.p.values = NA)
for (i in 1:length(models)){
model = models[i]
model.object = model.objects[[i]]
model.summary = summary(model.object)
model.AIC = extractAIC(model.object)[2]
model.K = extractAIC(model.object)[1]
model.loglik = model.object$loglik[2]
aic.results$AIC[i] = model.AIC
aic.results$K[i] = model.K
aic.results$LogLikelihood[i] = model.loglik
aic.results$p.value[i] = model.summary$logtest[3] # 3rd element is the p-value, logtest corresponds to the Likelihood Ratio test
aic.results$coefficients[i] = paste(rownames(model.summary$coefficients), collapse = '; ')
aic.results$coefficient.p.values[i] = paste(round(model.summary$coefficients[ ,5], 4), collapse = '; ')
aic.results$coefficient.Odds.Ratios[i] = paste(round(model.summary$coefficients[ ,2],6), collapse = '; ')
aic.results$coefficient.Odds.Ratios.lower[i] = paste(round(model.summary$conf.int[ ,3],3), collapse = '; ')
aic.results$coefficient.Odds.Ratios.upper[i] = paste(round(model.summary$conf.int[ ,4],3), collapse = '; ')
}
aic.results$SORT.ORDER = seq(1:nrow(aic.results)) # Get an indicator for the R object associated with each model
aic.results$DeltaAIC = aic.results$AIC - min(aic.results$AIC)
aic.results = aic.results[order(aic.results$DeltaAIC), ]
aic.file = sprintf("%s/AIC_RESULTS%s%s.csv", out.folder, table.label, figure.label)
write.table(aic.results, aic.file, sep = ',', row.names = FALSE,
col.names = TRUE)
stop("Please examine the model results, and add additional models as desired based on the
results")
# Takes the model with the lowest delta AIC.
# This ignores ties - so it may be desirable to change this manually
obj.index = aic.results$SORT.ORDER[2] # Changed to take vaccine type
# Fewer parameters than the top model with only a marginal reduction in AIC
# Top model is likely overfit
model.object = model.objects[[obj.index]]
formula.object = formula.objects[[obj.index]]
make.leverage.plot(model.object, formula.object, out.folder, figure.label, table.label)
## Make a plot to examine the bin results (not included in final manuscript)
bin.OR = summary(bin.status.model)$coefficients[ ,2]
bin.OR.df = data.frame(BIN.OR = bin.OR[1:4], bin = seq(1,4))
bin.OR.df = bin.OR.df[order(bin.OR.df$bin, decreasing = TRUE), ]
plot(seq(1,4), bin.OR.df$BIN.OR, xaxt = 'n',
xlab = "Bin", ylab = "Odds Ratio relative to 70-89 age group",
ylim = c(0,4))
axis(side = 1, at = seq(1,4), labels = c("0-17","18-29","30-49","50-69"))