# 01_ConditionalLogistic_Omicron.R

# Goal: Analyze Omicron data using a matched case-control approach

# Created 2021-12-15
# Updated 2022-03-25

# Author: Alexander "Sasha" Keyel <alexander.keyel@health.ny.gov>

# Load required packages
library(survival)

# Basic set up: project root, helper functions, run settings, and random seed.
# The statement order is kept as-is because the sourced files may define the same names.
setwd("C:/hri/DOH_COVID")
source("R/DOH_COVID_hlpr.R") # Requires dfmip package, devtools::install_github('akeyel/dfmip')
source("R/00_Omicron_Prelim_Settings.R")
in.seed = 20220106 # Ensure repeatability of results

# Load the cleaned input data file generated by 00_Descriptive_Omicron.R.
# figure.label is not defined in this script (presumably set by the settings file sourced above).
DOH.data.file = sprintf("Data/cleaned_data%s.csv", figure.label)
if (file.exists(DOH.data.file)){
  in.data = read.csv(DOH.data.file)
} else {
  # Fail early with instructions rather than letting read.csv report a missing file
  stop("THIS ANALYSIS REQUIRES THE CLEANED DATA SET CREATED IN
       00_Descriptive_Omicron.Rmd or 00_Descriptive_Omicron.R. If you have created
       that file and are still seeing this message, please check the file paths.")
}

# Check data object to ensure the correct object loaded and there are no irregularities in the data set.
# View() opens a data viewer, so it is only called when a user is present;
# this lets the script run unattended (e.g. via Rscript) without depending on a viewer.
if (interactive()){
  View(in.data)
}
# Here, case_control is not related to the case and control in IS_FOCAL, but is an indicator for vaccination status.
# This is an artifact of a previous case-control design not used in this analysis

# 2022-05-16 Get lineage summary for manuscript
# These values are not used again in the code visible below; they are computed for reporting.
# Restrict to Omicron emergence period (DATE_INDEX 332 through 389, inclusive)
oep = in.data[in.data$DATE_INDEX >= 332 & in.data$DATE_INDEX <= 389, ]
nrow(oep[oep$IS_FOCAL == 1, ]) # Number of rows with IS_FOCAL == 1 in this window
nrow(oep[oep$IS_FOCAL == 0, ]) # Number of rows with IS_FOCAL == 0 in this window

# Restrict to Delta emergence period (DATE_INDEX 78 through 227, inclusive)
dep = in.data[in.data$DATE_INDEX >= 78 & in.data$DATE_INDEX <= 227, ]
table(dep$sequence_result) # Except this includes EVERYTHING. How do we reduce it appropriately?
dep.t = table(dep$sequence_result)
# NOTE(review): every index below is a hard-coded position within dep.t. The positions
# depend on which sequence_result values are present in this window, so they are only
# valid for the data set they were derived from - re-check against names(dep.t)
# whenever the input data change.
non.delta= sum(dep.t[45:96]) - dep.t[[78]] # Positions 45-96 with position 78 removed; presumably the non-Delta lineages - TODO confirm
delta = sum(dep.t) - non.delta # Everything not counted in non.delta
# The perc.* values below are proportions of the non.delta total (0-1 scale, not multiplied by 100)
perc.alpha = (dep.t[54] + dep.t[94]) / non.delta # Q is also Alpha
perc.iota = dep.t[71] / non.delta
perc.gamma = sum(dep.t[87:93]) /non.delta
perc.beta = sum(dep.t[62:63]) / non.delta
perc.B.1.2 = dep.t[57] / non.delta
perc.B.1.243 = dep.t[60] / non.delta
perc.B.1.637 = dep.t[83] / non.delta
# Share of non.delta not covered by any of the seven groups listed above
perc.remaining = 1 - (perc.alpha + perc.iota + perc.gamma + perc.beta + perc.B.1.2 + perc.B.1.243 + perc.B.1.637)
# Share of non.delta that is not alpha, iota, gamma, or beta
perc.remaining.nonVOC = 1 - (perc.alpha + perc.iota + perc.gamma + perc.beta)

##### Match cases in flipped design & perform conditional logistic regression #####
# Matching settings passed to the helper functions below
date.offset = 6
age.bins = c(4, 11,17,29,  49, 69, 89, 110) # Assumes no one over 110

# Per-record match counts (uses the N_EXACT, N_NO_MF, and Identifier columns below)
case.options = case.control.options.v4(in.data, in.seed, date.offset, age.bins)

nrow(case.options[case.options$N_EXACT > 0, ]) # Number with at least one exact match
# Sort so that those with the fewest matches are matched first.
# Primary key: number of exact matches; ties are broken by the number of
# Male/Female-mismatch matches. Remaining ties keep their original order.
case.options = case.options[order(case.options$N_EXACT, case.options$N_NO_MF), ]
# seq_len() keeps this valid even if case.options has zero rows
case.options$SORT = seq_len(nrow(case.options))

# Carry the sort rank over to the full data set; records without an entry in
# case.options get SORT = NA and are placed last by order()
in.data.2 = merge(in.data, case.options[ , c("Identifier", "SORT")],
                  by = "Identifier", all.x = TRUE)
in.data.2 = in.data.2[order(in.data.2$SORT), ]

# Build the matched data set used by all analyses below
strata.df = match.focal.v2(in.data.2, in.seed, date.offset, age.bins,
                        removal = TRUE)

# Add a time-since-vaccination category:
# 0 = days value is NA, 1 = fewer than 90 days, 2 = 90 days or more
days.post.vax = strata.df$Days.between.collection.date.and.vaccination.complete.date
strata.df$TimePostVax = 0
strata.df$TimePostVax[which(days.post.vax < 90)] = 1 # which() drops NA comparisons, so NA rows remain zero
strata.df$TimePostVax[which(days.post.vax >= 90)] = 2

# Same coding for time since the booster dose
days.post.boost = strata.df$Days.between.collection.date.and.booster.date
strata.df$TimePostBoost = 0
strata.df$TimePostBoost[which(days.post.boost < 90)] = 1 # NA rows remain zero
strata.df$TimePostBoost[which(days.post.boost >= 90)] = 2

# Add Booster status indicator (0 = not boosted, 1 = boosted)
three.doses = which(strata.df$VACCINE_DOSES == 3) # If 3 doses, it's boosted (unless an upstream subsetting criterion changes)
janssen.two.doses = which(strata.df$VACCINE_DOSES == 2 & strata.df$vaccine_type == "Janssen")
strata.df$IS.BOOST = 0
strata.df$IS.BOOST[c(three.doses, janssen.two.doses)] = 1
# Could have done booster_type != 'none' as an alternate way of subsetting.
# It does not look like there are any true Janssen boosters in this data set.

#### Patch: Error in make.flipped.demographic.table function was corrected on 2022-03-23 after final results were generated.
# Updated again on 2022-04-29 - changed make.flipped.demographic.table to match the revised age bins
# Read in strata.df object to an empty workspace with helper functions and labels loaded
# Then just re-ran the make.flipped.demographic.table lines.
#table.label = "_ConditionalLogistic"
#CL.file = sprintf("%s/01%s_stratadf%s.csv", out.folder, table.label, figure.label)
#strata.df = read.csv(CL.file)
#### END of patch code

# Demographic summary table for the flipped design.
# table.label is also reused in the output file names further down.
table.label = "_ConditionalLogistic"
make.flipped.demographic.table(
  strata.df, out.folder, figure.label, focal.label, table.label,
  incl.booster = 1
)
# Note: There will be a "'" before 10-19; this is to keep Excel from putting it into Date format, and that will need to be manually removed.

# Test for significant differences in age (paired) and sex (unpaired)
# Split the matched data by IS_FOCAL and sort both halves by stratum so that
# row i of each table belongs to the same matched pair.
focal.cases = strata.df[strata.df$IS_FOCAL == 1, ]
other.cases = strata.df[strata.df$IS_FOCAL == 0, ]

focal.cases = focal.cases[order(focal.cases$stratum), ]
other.cases = other.cases[order(other.cases$stratum), ]
if (nrow(focal.cases) != nrow(other.cases)){stop("this assumes a 1:1 matched design. Need to change the t-test if not doing a 1:1 match")}
# Equal row counts alone do not guarantee that the rows pair up: confirm the
# sorted strata agree row by row (an NA stratum also fails this check).
if (!isTRUE(all(focal.cases$stratum == other.cases$stratum))){
  stop("Strata do not line up one-to-one between focal and other cases, so the paired t-test would compare unmatched records")
}
age.t.test = t.test(focal.cases$Age, other.cases$Age, paired = TRUE, alternative = 'two.sided')
age.t.test # Display results to screen
age.t.test.file = sprintf("%s/age_t_test_results%s%s.csv", out.folder, table.label, figure.label)
# One-row summary: test statistic, degrees of freedom, p-value, and mean paired difference
age.t.test.results = data.frame(t_statistic = age.t.test$statistic,
                                df = age.t.test$parameter, p_value = age.t.test$p.value,
                                mean_difference = age.t.test$estimate)

# Flag (but do not stop on) a significant age imbalance between the matched groups
if (age.t.test$p.value < 0.05){
  warning(sprintf("There is a significant difference in ages of %s years between
                  the case and controls (p = %s)", age.t.test$estimate, age.t.test$p.value))
}

write.table(age.t.test.results, file = age.t.test.file, sep = ',',
            row.names = FALSE, col.names = TRUE,
            append = FALSE)

# Note: assumption of normality is violated, but as long as cases are above 5 in each group, this should be OK.
# Sex is converted to numeric codes for the unpaired t-test. The codes use one
# shared set of levels taken from the full matched data set, so a given category
# receives the same code in both groups even if one group lacks a category.
sex.levels = levels(as.factor(strata.df$Sex))
focal.sex.code = as.numeric(factor(focal.cases$Sex, levels = sex.levels))
other.sex.code = as.numeric(factor(other.cases$Sex, levels = sex.levels))
mf.t.test = t.test(focal.sex.code, other.sex.code,
                   alternative = 'two.sided', paired = FALSE)
mf.t.test # Display results to screen

mf.t.test.file = sprintf("%s/sex_t_test_results%s%s.csv", out.folder, table.label, figure.label)
# One-row summary: test statistic, degrees of freedom, p-value, and the two group means
mf.t.test.results = data.frame(t_statistic = mf.t.test$statistic,
                                df = mf.t.test$parameter, p_value = mf.t.test$p.value,
                                group_mean_1 = mf.t.test$estimate[1], group_mean_2 = mf.t.test$estimate[2])

# Flag (but do not stop on) a significant imbalance in sex between the groups
if (mf.t.test$p.value < 0.05){
  warning(sprintf("There is a significant difference in distribution of sexes between
                  the case and controls (p = %s)", mf.t.test$p.value))
}

write.table(mf.t.test.results, file = mf.t.test.file, sep = ',',
            row.names = FALSE, col.names = TRUE,
            append = FALSE)

##### ANALYSIS 1: PERFORM CONDITIONAL LOGISTIC REGRESSION ON THE FLIPPED CASE/CONTROL DESIGN #####
# Numeric vaccine-type indicator: 1 = Pfizer, 2 = Moderna, 3 = Janssen.
# Anything else (including NA) is coded 0, making 'no vaccine' the reference state.
strata.df$vaccine_type_code = as.numeric(
  match(strata.df$vaccine_type, c("Pfizer", "Moderna", "Janssen"), nomatch = 0)
)

# Time since last dose category (coded on the indicator variable not on the days):
# use the booster category when one is recorded (> 0), otherwise the vaccination category.
strata.df$TimePostLastDose = ifelse(strata.df$TimePostBoost > 0,
                                    strata.df$TimePostBoost,
                                    strata.df$TimePostVax)

# Vaccination status as a factor with "Control" as the reference level
strata.df$IS.VAX = relevel(as.factor(strata.df$case_control), ref = "Control")

# Add a combined variable that includes three categories:
# 0 = 'Unvaccinated', 1 = 'Vaccinated', and 2 = 'Vaccinated and Boosted'
strata.df$VAX.STATUS = 0
strata.df$VAX.STATUS[strata.df$case_control == "Case"] = 1
strata.df$VAX.STATUS[strata.df$VACCINE_DOSES == 3] = 2 # If 3 doses, it's boosted (unless an upstream subsetting criterion changes)
strata.df$VAX.STATUS[strata.df$VACCINE_DOSES == 2 & strata.df$vaccine_type == "Janssen"] = 2
# Use the full column name: the abbreviation `$VAX` only worked through partial
# matching and stops resolving once another column starting with "VAX" exists
# (e.g. VAX_TYPE_x_BOOST on a re-run or after reloading strata.df from file).
strata.df$VAX.STATUS = as.factor(strata.df$VAX.STATUS)

# Add a combined variable for vaccine type x booster status (boosters could be *ANY* booster)
# 0 = none of the below, 1 = Pfizer not boosted, 2 = Moderna not boosted,
# 3 = Janssen (boosted or not), 4 = Pfizer boosted, 5 = Moderna boosted
is.pfizer = strata.df$vaccine_type == "Pfizer"
is.moderna = strata.df$vaccine_type == "Moderna"
is.janssen = strata.df$vaccine_type == "Janssen"
is.boosted = strata.df$IS.BOOST == 1
not.boosted = strata.df$IS.BOOST == 0

strata.df$VAX_TYPE_x_BOOST = 0
strata.df$VAX_TYPE_x_BOOST[is.pfizer & not.boosted] = 1
strata.df$VAX_TYPE_x_BOOST[is.moderna & not.boosted] = 2
strata.df$VAX_TYPE_x_BOOST[is.janssen] = 3 # ONLY ONE Janssen with booster, so not separating it out
strata.df$VAX_TYPE_x_BOOST[is.pfizer & is.boosted] = 4
strata.df$VAX_TYPE_x_BOOST[is.moderna & is.boosted] = 5
#strata.df$VAX_TYPE_x_BOOST[strata.df$vaccine_type == "Janssen" & strata.df$IS.BOOST == 1] = 6


# Save the matched data set (with all derived variables) for record-keeping purposes
CL.file = sprintf("%s/01%s_stratadf%s.csv", out.folder, table.label, figure.label)
write.table(strata.df, file = CL.file, sep = ',',
            row.names = FALSE, col.names = TRUE, append = FALSE)

# Examine individual models
# Every model is a conditional logistic regression of IS_FOCAL on one or more
# vaccination covariates, conditioned on the matched stratum. Each formula is
# kept in its own object because it is reused for the leverage analysis below.

# Vaccinated vs. not
has.vax.formula = IS_FOCAL ~ IS.VAX + strata(stratum)
has.vax = survival::clogit(has.vax.formula, data = strata.df)

# Boosted vs. not (IS.BOOST enters as a 0/1 numeric)
has.boost.formula = IS_FOCAL ~ IS.BOOST + strata(stratum)
has.boost = survival::clogit(has.boost.formula, data = strata.df)

# Three-level vaccination status (unvaccinated / vaccinated / vaccinated and boosted)
#boost.plus.vax.formula = as.formula(IS_FOCAL ~ IS.VAX + IS.BOOST + strata(stratum))
boost.plus.vax.formula = IS_FOCAL ~ VAX.STATUS + strata(stratum)
boost.plus.vax = survival::clogit(boost.plus.vax.formula, data = strata.df)

#boost.x.vax.formula = as.formula(IS_FOCAL ~ IS.VAX*IS.BOOST + strata(stratum))
#boost.x.vax = survival::clogit(boost.x.vax.formula, data = strata.df)
# This is a nonsensical test - all boosted individuals are vaccinated, so there is no potential for an interaction.

# Vaccine type
vax.type.formula = IS_FOCAL ~ as.factor(vaccine_type_code) + strata(stratum)
vax.type = survival::clogit(vax.type.formula, data = strata.df)

# Time since vaccination category
tpv.formula = IS_FOCAL ~ as.factor(TimePostVax) + strata(stratum)
time.post.vax = survival::clogit(tpv.formula, data = strata.df)

# It was decided to exclude VACCINE_DOSES variable from the analysis,
# due to the difficulty in interpreting it, as it indirectly combines vaccine type and number of doses received.
#nd.formula = as.formula(IS_FOCAL ~ VACCINE_DOSES + strata(stratum))
#n.doses = survival::clogit(nd.formula, data=strata.df)

# Time since booster category
tpb.formula = IS_FOCAL ~ as.factor(TimePostBoost) + strata(stratum)
time.post.boost = survival::clogit(tpb.formula, data = strata.df)

# Time since last dose (booster if present, otherwise vaccination) category
tpd.formula = IS_FOCAL ~ as.factor(TimePostLastDose) + strata(stratum)
time.post.dose = survival::clogit(tpd.formula, data = strata.df)

# Add booster to vaccine type model
vtb.formula = IS_FOCAL ~ as.factor(vaccine_type_code) + as.factor(IS.BOOST) + strata(stratum)
vaccine.type.boost = survival::clogit(vtb.formula, data = strata.df)

# Look at booster x vaccine type model, coded to get individual factor effects
vt.x.b.formula = IS_FOCAL ~ as.factor(VAX_TYPE_x_BOOST) + strata(stratum)
vaccine.type.x.boost = survival::clogit(vt.x.b.formula, data = strata.df)

# Look at full model with no interactions (changed to drop number of doses in favor of a booster dummy variable on 2022-02-09)
full.formula = IS_FOCAL ~ as.factor(vaccine_type_code) + as.factor(TimePostVax) +
  VAX.STATUS + as.factor(TimePostBoost) +
  strata(stratum) #VACCINE_DOSES
full.model = survival::clogit(full.formula, data = strata.df)

# Calculate AIC scores and identify the model with the lowest AIC
# models, model.objects, and formula.objects are parallel: position i in each
# refers to the same model, so they must be edited together.
models = c("HasVaccine", "VaccineType", "TimePostVaccination",
           "TimePostBooster", "TimePostDose", "FullModel",
           "Booster", "BoosterPlusVax",
           "VaccineType + Booster",
           "VaccineType x Booster") # , "NumberVaccineDoses", "BoosterXVax" "VaccineTypeXNumberVaccineDoses",
model.objects = list(has.vax, vax.type, time.post.vax, time.post.boost,
                     time.post.dose, full.model,
                     has.boost, boost.plus.vax,
                     vaccine.type.boost, vaccine.type.x.boost) # n.doses, , boost.x.vax, , vaxtype.x.ndoses
formula.objects = list(has.vax.formula, vax.type.formula, tpv.formula, tpb.formula,
                       tpd.formula, full.formula, 
                       has.boost.formula, boost.plus.vax.formula,
                       vtb.formula, vt.x.b.formula) # nd.formula, , boost.x.vax.formula vaxtype.x.ndoses.formula,
# Guard against the parallel lists drifting out of step when models are added or removed
stopifnot(length(models) == length(model.objects),
          length(models) == length(formula.objects))

# One row per model; coefficient-level values are stored as '; '-separated strings
aic.results = data.frame(model = models, AIC = NA, K = NA, LogLikelihood = NA,
                         coefficients = NA,
                         coefficient.Odds.Ratios = NA,
                         coefficient.Odds.Ratios.lower = NA,
                         coefficient.Odds.Ratios.upper = NA,
                         p.value = NA, coefficient.p.values = NA)
for (i in seq_along(models)){
  model.object = model.objects[[i]]
  model.summary = summary(model.object)
  model.extract = extractAIC(model.object) # Element 1 is K, element 2 is the AIC
  model.AIC = model.extract[2]
  model.K = model.extract[1]
  model.loglik = model.object$loglik[2] # Second element of the fit's loglik vector
  aic.results$AIC[i] = model.AIC
  aic.results$K[i] = model.K
  aic.results$LogLikelihood[i] = model.loglik

  aic.results$p.value[i] = model.summary$logtest[3] # 3rd element is the p-value, logtest corresponds to the Likelihood Ratio test
  coef.table = model.summary$coefficients # Column 2 is used as the odds ratio, column 5 as the p-value
  conf.table = model.summary$conf.int # Columns 3 and 4 are used as the lower and upper bounds
  aic.results$coefficients[i] = paste(rownames(coef.table), collapse = '; ')
  aic.results$coefficient.p.values[i] = paste(round(coef.table[ ,5], 4), collapse = '; ')
  aic.results$coefficient.Odds.Ratios[i] = paste(round(coef.table[ ,2],6), collapse = '; ')
  aic.results$coefficient.Odds.Ratios.lower[i] = paste(round(conf.table[ ,3],3), collapse = '; ')
  aic.results$coefficient.Odds.Ratios.upper[i] = paste(round(conf.table[ ,4],3), collapse = '; ')
}
aic.results$SORT.ORDER = seq_len(nrow(aic.results)) # Get an indicator for the R object associated with each model
aic.results$DeltaAIC = aic.results$AIC - min(aic.results$AIC)
aic.results = aic.results[order(aic.results$DeltaAIC), ] # Best (lowest AIC) model first

aic.file = sprintf("%s/AIC_RESULTS%s%s.csv", out.folder, table.label, figure.label)
write.table(aic.results, aic.file, sep = ',', row.names = FALSE, 
            col.names = TRUE)

# Reminder emitted on every run about how the model set was arrived at
warning("Model results were examined, additional models were added as suggested
by the prior results, and the top model was selected for leverage analysis below")

# aic.results is sorted by DeltaAIC, so its first row is the lowest-AIC model.
# Ties are ignored - if two models tie, this may need to be changed manually.
# SORT.ORDER maps that row back to its position in model.objects / formula.objects.
obj.index = aic.results[1, "SORT.ORDER"]
model.object = model.objects[[obj.index]]
formula.object = formula.objects[[obj.index]]
make.leverage.plot(model.object, formula.object, out.folder, figure.label, table.label)

#### SENSITIVITY ANALYSIS: IS THE LACK OF A VACCINE TYPE EFFECT INFLUENCED
# BY INDIVIDUALS THAT ARE ONLY ABLE TO RECEIVE ONE VACCINE TYPE?

# Run sensitivity analysis without re-running all results
# strata.df = read.csv("C:/hri/DOH_COVID/Results/Omicron/01_ConditionalLogistic_stratadf_omicron.csv")

# Refit three of the models on adults only, excluding <18 year olds who could only
# receive one type of vaccine (or no vaccine in the case of <5's)
vaccine.sensitivity = strata.df[strata.df$Age >= 18, ]

# Three-level vaccination status
boost.plus.vax.formula.2 = IS_FOCAL ~ as.factor(VAX.STATUS) + strata(stratum)
boost.plus.vax.2 = survival::clogit(boost.plus.vax.formula.2, data = vaccine.sensitivity)
summary.bv = summary(boost.plus.vax.2)
bv.AIC = extractAIC(boost.plus.vax.2)[2]

# Vaccine type
vax.type.formula.2 = IS_FOCAL ~ as.factor(vaccine_type_code) + strata(stratum)
vax.type.2 = survival::clogit(vax.type.formula.2, data = vaccine.sensitivity)
summary.vt = summary(vax.type.2)
vt.AIC = extractAIC(vax.type.2)[2]

# Vaccine type plus booster
vax.type.formula.3 = IS_FOCAL ~ as.factor(vaccine_type_code) + as.factor(IS.BOOST) + strata(stratum)
vax.type.3 = survival::clogit(vax.type.formula.3, data = vaccine.sensitivity)
vtb.AIC = extractAIC(vax.type.3)[2]

# These results were not formally output
# Visual inspection suggests that the vaccine type result is NOT dependent
# on the inclusion of younger individuals with the Pfizer vaccine.