SARS-CoV-2_variants_analysis_script.Rmd

---
title: "SARS-CoV-2 ACE2 variant combinations"
output: github_document
---

```{r Set up the analysis script}
## Clear existing environment
## NOTE(review): this removes every non-hidden object in the current environment, so running
## this chunk interactively discards anything already loaded in the session.
rm(list = ls())

## Load basic useful packages
## Attach order matters: when two packages export the same name, the one attached later wins,
## so keep this order unless the masking consequences have been checked.
library(tidyverse)
library(ggrepel)
library(ggbeeswarm)
library(reshape)
library(sf)
library(here)
library(ggfortify)
library(googlesheets4)
library(rpart)
library(rpart.plot)
library(ggbreak)
library(ggVennDiagram)
library(network)
library(GGally)
library(ggdendro)

## Set the seed for immediate reproducibility
set.seed(1234567)

## Set the base theme to what I like
## Black-and-white base theme, then: no legend titles, no minor grid lines, 10 pt text everywhere.
theme_set(theme_bw())
theme_update(legend.title = element_blank(), panel.grid.minor = element_blank(), text = element_text(size = 10))

## Setting some universal thresholds for some of the subsequent analyses
## Neither value is referenced in the chunks visible here; presumably consumed further down - TODO confirm.
quantile_cutoff <- 0.99
minquant_fraction <- 0.2

## Translate one three-letter residue code (any letter case) into its one-letter code.
## arg1: a single three-letter code such as "Ala" or "GLH".
## Returns the one-letter code as a length-1 character vector; GLH maps to "E" like GLU and
## TER maps to "X". A code that is not in the table falls through and yields an invisible NULL.
to_single_notation <- function(arg1){
  three_to_one <- c(
    ALA = "A", CYS = "C", ASP = "D", GLU = "E", GLH = "E", PHE = "F",
    GLY = "G", HIS = "H", ILE = "I", LYS = "K", LEU = "L", MET = "M",
    ASN = "N", PRO = "P", GLN = "Q", ARG = "R", SER = "S", THR = "T",
    VAL = "V", TRP = "W", TYR = "Y", TER = "X"
  )
  residue <- toupper(arg1)
  ## Walk the table in order and stop at the first matching code
  for(three_letter in names(three_to_one)){
    if(residue == three_letter){
      return(three_to_one[[three_letter]])
    }
  }
}
```

```{r Writing a function capable of calculating the enrichments in the sequencing data for Nextseq 3 data where one index had to be 9nt bc of sequencing issues}
### Build the barcode -> sample key used below when scoring enrichment of the virus in hygromycin antibiotic (thus selecting for infected cells)
## The same index table is joined once per primer, so its primer column is exposed under both join names
index_key1 <- mutate(
  read.csv(file = "Keys/9ntR1_10ntR2.csv", header = TRUE, stringsAsFactors = FALSE),
  primer1 = primer, primer2 = primer
)
sample_key1 <- read.csv(file = "Keys/Barcode_receptor_samples.csv", header = TRUE, stringsAsFactors = FALSE)

## The primer2 index is attached first and the primer1 index second; the second merge renames them index.x and index.y
sample_index_key1 <- merge(
  sample_key1[, c("gene","ortholog","mutant","protease","kozak","plasmid_template","primer1","primer2","concat")],
  index_key1[, c("primer2","index")],
  by = "primer2", all.x = TRUE
)
sample_index_key1 <- merge(
  sample_index_key1,
  index_key1[, c("primer1","index")],
  by = "primer1", all.x = TRUE
)
## Expected read for each sample: the two indexes joined by a literal "N"
sample_index_key1$X <- paste0(sample_index_key1$index.x, "N", sample_index_key1$index.y)
sample_index_key1$sequence <- sample_index_key1$X

## Current list of NGS data (files are later addressed by their position in this listing)
myfiles <- list.files(path = "Data/Dbl_barcode_NS3", pattern = "*.tsv", full.names = TRUE)
myfiles_df <- data.frame("number" = seq(1, length(myfiles)), "index" = myfiles)

## Now defining the function to analyze the enrichment with the NGS data
list_of_indexes <- c(4,5,8,9) ## Delete whenever. Only for troubleshooting.

## Build the per-barcode enrichment table for one Nextseq 3 experiment.
##
## list_of_indexes: four positions into the global `myfiles` vector, in the order
##   unselected replicate 1, unselected replicate 2, selected sample 1, selected sample 2.
## Reads the globals `myfiles` and `sample_index_key1` at call time, so it must be called while
## `myfiles` still lists the Nextseq 3 directory.
## Returns a data frame with one row per barcode that survived the unselected-sample filters:
##   sequence, u_freq (unselected frequency), h_freq (selected frequency), h_enrichment (h_freq / u_freq),
##   followed by the sample annotation columns from `sample_index_key1`.
makeExperimentFrame1 <- function(list_of_indexes){
  ## Deal with the unselected data first
  rep1 <- read.delim(myfiles[list_of_indexes[1]], sep = "\t") %>% mutate(log10_count = log10(count))
  # First filter is to only look at actual designer barcodes
  rep1_filtered1 <- rep1 %>% filter(X %in% sample_index_key1$X)
  # Second filter will be to ignore barcodes with counts so low they are unlikely real
  # (more than two standard deviations below the mean log10 count)
  rep1_filtered2 <- rep1_filtered1 %>% filter(log10_count > (mean(log10_count) - sd(log10_count) * 2))
  
  ## Repeat the process for the second replicate
  rep2 <- read.delim(myfiles[list_of_indexes[2]], sep = "\t") %>% mutate(log10_count = log10(count))
  rep2_filtered1 <- rep2 %>% filter(X %in% sample_index_key1$X)
  rep2_filtered2 <- rep2_filtered1 %>% filter(log10_count > (mean(log10_count) - sd(log10_count) * 2))
  
  ## Outer join: a barcode that passed in only one replicate gets a count of 0 in the other
  unsel <- merge(rep1_filtered2, rep2_filtered2, by = "X", all = TRUE)
  unsel[is.na(unsel)] <- 0
  unsel$rep1_freq <- unsel$count.x / sum(unsel$count.x)
  unsel$rep2_freq <- unsel$count.y / sum(unsel$count.y)
  ## Geometric mean of the two replicate frequencies (0 in either replicate gives 0 overall)
  unsel$u_freq <- 10^((log10(unsel$rep1_freq) + log10(unsel$rep2_freq))/2)
  
  ## Deal with the HygroR data next
  hygro <- merge(read.delim(myfiles[list_of_indexes[3]], sep = "\t"), read.delim(myfiles[list_of_indexes[4]], sep = "\t"), by = "X", all = TRUE)
  ## Combine the data; this should also take care of the filtering, since the unsel was already filtered
  combined_frame <- merge(unsel[,c("X","u_freq")], hygro[,c("X","count.x","count.y")], by = "X", all.x = TRUE)
  ## Barcodes absent from a selected sample come through as NA counts; na.rm keeps them from
  ## turning the totals (and therefore every frequency) into NA, matching makeExperimentFrame2
  combined_frame$sel_freq1 <- combined_frame$count.x / sum(combined_frame$count.x, na.rm = TRUE)
  combined_frame$sel_freq2 <- combined_frame$count.y / sum(combined_frame$count.y, na.rm = TRUE)
  combined_frame$h_freq <- 10^((log10(combined_frame$sel_freq1) + log10(combined_frame$sel_freq2))/2)
  return_frame <- combined_frame[,c("X","u_freq","h_freq")]
  
  ## Do some additional analysis before returning the data frame
  return_frame$h_enrichment <- return_frame$h_freq / return_frame$u_freq
  colnames(return_frame) <- c("sequence","u_freq","h_freq","h_enrichment")
  ## Left join keeps only scored barcodes and attaches their sample annotations
  return_frame2 <- merge(return_frame, sample_index_key1[,c("gene","ortholog","mutant","protease","kozak","plasmid_template","concat","sequence")], by = "sequence", all.x = TRUE)
  return(return_frame2)
}
```

```{r Looking at correlations of the uninfected cells with library v1.0 in Nextseq3}
## Read the seventeen Nextseq 3 library count tables (I0192 to I0208)
i0192 <- read.delim(file = "Data/Dbl_barcode_NS3/I0192_lib.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
i0193 <- read.delim(file = "Data/Dbl_barcode_NS3/I0193_lib.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
i0194 <- read.delim(file = "Data/Dbl_barcode_NS3/I0194_lib.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
i0195 <- read.delim(file = "Data/Dbl_barcode_NS3/I0195_lib.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
i0196 <- read.delim(file = "Data/Dbl_barcode_NS3/I0196_lib.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
i0197 <- read.delim(file = "Data/Dbl_barcode_NS3/I0197_lib.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
i0198 <- read.delim(file = "Data/Dbl_barcode_NS3/I0198_lib.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
i0199 <- read.delim(file = "Data/Dbl_barcode_NS3/I0199_lib.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
i0200 <- read.delim(file = "Data/Dbl_barcode_NS3/I0200_lib.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
i0201 <- read.delim(file = "Data/Dbl_barcode_NS3/I0201_lib.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
i0202 <- read.delim(file = "Data/Dbl_barcode_NS3/I0202_lib.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
i0203 <- read.delim(file = "Data/Dbl_barcode_NS3/I0203_lib.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
i0204 <- read.delim(file = "Data/Dbl_barcode_NS3/I0204_lib.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
i0205 <- read.delim(file = "Data/Dbl_barcode_NS3/I0205_lib.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
i0206 <- read.delim(file = "Data/Dbl_barcode_NS3/I0206_lib.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
i0207 <- read.delim(file = "Data/Dbl_barcode_NS3/I0207_lib.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
i0208 <- read.delim(file = "Data/Dbl_barcode_NS3/I0208_lib.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)

## Keep only the barcodes that appear in the Nextseq 3 sample key
i0192_filtered <- filter(i0192, X %in% sample_index_key1$X)
i0193_filtered <- filter(i0193, X %in% sample_index_key1$X)
i0194_filtered <- filter(i0194, X %in% sample_index_key1$X)
i0195_filtered <- filter(i0195, X %in% sample_index_key1$X)
i0196_filtered <- filter(i0196, X %in% sample_index_key1$X)
i0197_filtered <- filter(i0197, X %in% sample_index_key1$X)
i0198_filtered <- filter(i0198, X %in% sample_index_key1$X)
i0199_filtered <- filter(i0199, X %in% sample_index_key1$X)
i0200_filtered <- filter(i0200, X %in% sample_index_key1$X)
i0201_filtered <- filter(i0201, X %in% sample_index_key1$X)
i0202_filtered <- filter(i0202, X %in% sample_index_key1$X)
i0203_filtered <- filter(i0203, X %in% sample_index_key1$X)
i0204_filtered <- filter(i0204, X %in% sample_index_key1$X)
i0205_filtered <- filter(i0205, X %in% sample_index_key1$X)
i0206_filtered <- filter(i0206, X %in% sample_index_key1$X)
i0207_filtered <- filter(i0207, X %in% sample_index_key1$X)
i0208_filtered <- filter(i0208, X %in% sample_index_key1$X)

## Outer-join the filtered tables on barcode, left to right in sample order
ns3_negs_filtered <- Reduce(
  function(left, right) merge(left, right, by = "X", all = TRUE),
  list(i0192_filtered, i0193_filtered, i0194_filtered, i0195_filtered, i0196_filtered, i0197_filtered,
       i0198_filtered, i0199_filtered, i0200_filtered, i0201_filtered, i0202_filtered, i0203_filtered,
       i0204_filtered, i0205_filtered, i0206_filtered, i0207_filtered, i0208_filtered)
)

## One column per sample, named ns3_u1 ... ns3_u17 in the join order above
colnames(ns3_negs_filtered) <- c("X", paste0("ns3_u", 1:17))

## Turn each sample's counts into frequencies; barcodes missing from a sample stay NA
for(ns3_col in paste0("ns3_u", 1:17)){
  ns3_negs_filtered[[ns3_col]] <- ns3_negs_filtered[[ns3_col]] / sum(ns3_negs_filtered[[ns3_col]], na.rm = TRUE)
}

## Quick log-log looks at the first three samples against each other
ggplot() +
  scale_x_log10() +
  scale_y_log10() +
  geom_point(data = ns3_negs_filtered, aes(x = ns3_u1, y = ns3_u2))

ggplot() +
  scale_x_log10() +
  scale_y_log10() +
  geom_point(data = ns3_negs_filtered, aes(x = ns3_u1, y = ns3_u3))

ggplot() +
  scale_x_log10() +
  scale_y_log10() +
  geom_point(data = ns3_negs_filtered, aes(x = ns3_u2, y = ns3_u3))

## Figure version of the first comparison: dashed guides at a frequency of 1e-5 on both axes plus a faint identity line
Neg_cntrl_count_correlations <- ggplot() +
  labs(
    x = "Variant frequency in\nreplicate 1",
    y = "Variant frequency in\nreplicate 2"
  ) +
  scale_x_log10() +
  scale_y_log10() +
  geom_hline(yintercept = 1e-5, linetype = 2) +
  geom_vline(xintercept = 1e-5, linetype = 2) +
  geom_abline(slope = 1, linetype = 2, alpha = 0.3) +
  geom_point(data = ns3_negs_filtered, aes(x = ns3_u1, y = ns3_u2), alpha = 0.5) +
  #geom_point(data = ns3_negs_filtered, aes(x = freq192, y = freq194), alpha = 0.5) +
  #geom_point(data = ns3_negs_filtered, aes(x = freq193, y = freq194), alpha = 0.5)
  NULL
Neg_cntrl_count_correlations
ggsave(
  file = "Plots/Neg_cntrl_count_correlations.pdf",
  Neg_cntrl_count_correlations,
  height = 1.75, width = 2
)

# The replicates track each other well, so the barcode frequencies look reproducibly quantifiable
# R^2 to report, computed over barcodes present in both samples
paste(
  "The R^2 for the unselected frequency counts shown:",
  round(cor(ns3_negs_filtered$ns3_u1, ns3_negs_filtered$ns3_u2, use = "complete")^2, 2)
)

```

```{r Calculating the enrichment scores in Nextseq 3}
## Post-processing shared by every Nextseq 3 experiment: score the four files, drop rows whose
## enrichment prints as "Inf", then order from most to least enriched
rank_ns3_enrichment <- function(file_indexes){
  scored <- makeExperimentFrame1(file_indexes)
  scored <- filter(scored, h_enrichment != "Inf")
  arrange(scored, desc(h_enrichment))
}

## Each vector is: unselected rep 1, unselected rep 2, selected sample (given twice)
ns3_g740d1 <- rank_ns3_enrichment(c(4,5,18,18)) ## 195, 196, 209, 210
ns3_g740d2 <- rank_ns3_enrichment(c(4,5,19,19)) ## 195, 196, 209, 210
ns3_alpha1 <- rank_ns3_enrichment(c(6,7,20,20))
ns3_alpha2 <- rank_ns3_enrichment(c(6,7,21,21))
ns3_beta1 <- rank_ns3_enrichment(c(8,9,22,22))
ns3_beta2 <- rank_ns3_enrichment(c(8,9,23,23))
ns3_delta1 <- rank_ns3_enrichment(c(10,11,24,24))
ns3_delta2 <- rank_ns3_enrichment(c(10,11,25,25))
ns3_gamma1 <- rank_ns3_enrichment(c(12,13,26,26)) ## 203, 204, 217, 218
ns3_gamma2 <- rank_ns3_enrichment(c(12,13,27,27)) ## 203, 204, 217, 218
ns3_ba1omicron1 <- rank_ns3_enrichment(c(14,15,28,28))
ns3_ba1omicron2 <- rank_ns3_enrichment(c(14,15,29,29))
```

```{r Importing the relevant data from our Nextseq4 and Nextseq7 submissions with intended 10nt indices}
### Build the barcode -> sample key for the runs with the intended 10nt indices, used below when scoring enrichment of the virus in hygromycin antibiotic (thus selecting for infected cells)
## The same index table is joined once per primer, so its primer column is exposed under both join names
index_key2 <- mutate(
  read.csv(file = "Keys/10ntR1_10ntR2.csv", header = TRUE, stringsAsFactors = FALSE),
  primer1 = primer, primer2 = primer
)
sample_key2 <- read.csv(file = "Keys/Barcode_receptor_samples.csv", header = TRUE, stringsAsFactors = FALSE)

## The primer2 index is attached first and the primer1 index second; the second merge renames them index.x and index.y
sample_index_key2 <- merge(
  sample_key2[, c("gene","ortholog","mutant","protease","kozak","plasmid_template","primer1","primer2","concat")],
  index_key2[, c("primer2","index")],
  by = "primer2", all.x = TRUE
)
sample_index_key2 <- merge(
  sample_index_key2,
  index_key2[, c("primer1","index")],
  by = "primer1", all.x = TRUE
)
## Expected read for each sample: the two indexes concatenated directly, with no separator
sample_index_key2$X <- paste0(sample_index_key2$index.x, sample_index_key2$index.y)
sample_index_key2$sequence <- sample_index_key2$X

## Current list of NGS data (files are later addressed by their position in this listing)
myfiles <- list.files(path = "Data/Dbl_barcode_NS4", pattern = "*.tsv", full.names = TRUE)
myfiles_df <- data.frame("number" = seq(1, length(myfiles)), "index" = myfiles)

## Now defining the function to analyze the enrichment with the NGS data
list_of_indexes <- c(3,3,4,4) ## Delete whenever. Only for troubleshooting.

## Build the per-barcode enrichment table for one experiment sequenced with 10nt indices.
##
## list_of_indexes: four positions into the global `myfiles` vector, in the order
##   unselected replicate 1, unselected replicate 2, selected sample 1, selected sample 2.
##   The callers visible in this file pass each file twice, e.g. c(1,1,2,2).
## Reads the globals `myfiles` and `sample_index_key2` at call time, so `myfiles` must list the
## directory of the run being analysed when this is called.
## Returns a data frame with one row per barcode that survived the unselected-sample filters:
##   sequence, u_freq (unselected frequency), h_freq (selected frequency), h_enrichment (h_freq / u_freq),
##   followed by the sample annotation columns from `sample_index_key2`.
makeExperimentFrame2 <- function(list_of_indexes){
  ## Deal with the unselected data first
  rep1 <- read.delim(myfiles[list_of_indexes[1]], sep = "\t") %>% mutate(log10_count = log10(count))
  # First filter is to only look at actual designer barcodes
  rep1_filtered1 <- rep1 %>% filter(X %in% sample_index_key2$X)
  # Second filter will be to ignore barcodes with counts so low they are unlikely real
  # (more than two standard deviations below the mean log10 count)
  rep1_filtered2 <- rep1_filtered1 %>% filter(log10_count > (mean(log10_count) - sd(log10_count) * 2))
  
  ## Repeat the process for the second replicate
  rep2 <- read.delim(myfiles[list_of_indexes[2]], sep = "\t") %>% mutate(log10_count = log10(count))
  rep2_filtered1 <- rep2 %>% filter(X %in% sample_index_key2$X)
  rep2_filtered2 <- rep2_filtered1 %>% filter(log10_count > (mean(log10_count) - sd(log10_count) * 2))
  
  ## Outer join: a barcode that passed in only one replicate gets a count of 0 in the other
  unsel <- merge(rep1_filtered2, rep2_filtered2, by = "X", all = TRUE)
  unsel[is.na(unsel)] <- 0
  unsel$rep1_freq <- unsel$count.x / sum(unsel$count.x)
  unsel$rep2_freq <- unsel$count.y / sum(unsel$count.y)
  ## Geometric mean of the two replicate frequencies (0 in either replicate gives 0 overall)
  unsel$u_freq <- 10^((log10(unsel$rep1_freq) + log10(unsel$rep2_freq))/2)
  
  ## Deal with the HygroR data next
  hygro <- merge(read.delim(myfiles[list_of_indexes[3]], sep = "\t"), read.delim(myfiles[list_of_indexes[4]], sep = "\t"), by = "X", all = TRUE)
  ## Combine the data; this should also take care of the filtering, since the unsel was already filtered
  combined_frame <- merge(unsel[,c("X","u_freq")], hygro[,c("X","count.x","count.y")], by = "X", all.x = TRUE)
  ## Barcodes absent from a selected sample come through as NA counts and are left out of the totals
  combined_frame$sel_freq1 <- combined_frame$count.x / sum(combined_frame$count.x, na.rm = TRUE)
  combined_frame$sel_freq2 <- combined_frame$count.y / sum(combined_frame$count.y, na.rm = TRUE)
  combined_frame$h_freq <- 10^((log10(combined_frame$sel_freq1) + log10(combined_frame$sel_freq2))/2)
  return_frame <- combined_frame[,c("X","u_freq","h_freq")]
  
  ## Do some additional analysis before returning the data frame
  return_frame$h_enrichment <- return_frame$h_freq / return_frame$u_freq
  colnames(return_frame) <- c("sequence","u_freq","h_freq","h_enrichment")
  ## Left join keeps only scored barcodes and attaches their sample annotations
  return_frame2 <- merge(return_frame, sample_index_key2[,c("gene","ortholog","mutant","protease","kozak","plasmid_template","concat","sequence")], by = "sequence", all.x = TRUE)
  return(return_frame2)
}
```

```{r Looking at correlations of the uninfected cells with library v1.0 in Nextseq4}
## Read the sixteen Nextseq 4 unselected library count tables
i0274 <- read.delim(file = "Data/Dbl_barcode_NS4/I0274_lib.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
i0276 <- read.delim(file = "Data/Dbl_barcode_NS4/I0276_lib.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
i0278 <- read.delim(file = "Data/Dbl_barcode_NS4/I0278_lib.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
i0280 <- read.delim(file = "Data/Dbl_barcode_NS4/I0280_lib.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
i0282 <- read.delim(file = "Data/Dbl_barcode_NS4/I0282_lib.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
i0284 <- read.delim(file = "Data/Dbl_barcode_NS4/I0284_lib.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
i0287 <- read.delim(file = "Data/Dbl_barcode_NS4/I0287_lib.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
i0289 <- read.delim(file = "Data/Dbl_barcode_NS4/I0289_lib.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
i0291 <- read.delim(file = "Data/Dbl_barcode_NS4/I0291_lib.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
i0293 <- read.delim(file = "Data/Dbl_barcode_NS4/I0293_lib.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
i0295 <- read.delim(file = "Data/Dbl_barcode_NS4/I0295_lib.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
i0297 <- read.delim(file = "Data/Dbl_barcode_NS4/I0297_lib.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
i0299 <- read.delim(file = "Data/Dbl_barcode_NS4/I0299_lib.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
i0305 <- read.delim(file = "Data/Dbl_barcode_NS4/I0305_lib.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
i0308 <- read.delim(file = "Data/Dbl_barcode_NS4/I0308_lib.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)
i0310 <- read.delim(file = "Data/Dbl_barcode_NS4/I0310_lib.tsv", sep = "\t", header = TRUE, stringsAsFactors = FALSE)

## Outer-join all sixteen tables on barcode, left to right in sample order
unsel_ns4 <- Reduce(
  function(left, right) merge(left, right, by = "X", all = TRUE),
  list(i0274, i0276, i0278, i0280, i0282, i0284, i0287, i0289,
       i0291, i0293, i0295, i0297, i0299, i0305, i0308, i0310)
)

## One column per sample, named u1 ... u16 in the join order above
colnames(unsel_ns4) <- c("X", paste0("u", 1:16))
## Keep only designed barcodes; a barcode missing from a sample counts as 0 there
unsel_ns4_filtered <- unsel_ns4 %>% filter(X %in% sample_index_key2$X)
unsel_ns4_filtered_freq <- unsel_ns4_filtered
unsel_ns4_filtered_freq[is.na(unsel_ns4_filtered)] <- 0

## Convert each sample's counts to frequencies. The whole column is divided by its total in one
## step: dividing row by row while re-summing the column would change the denominator after every
## row that had already been overwritten, so the frequencies would not sum to 1.
for(sample_col in 2:ncol(unsel_ns4_filtered_freq)){
  unsel_ns4_filtered_freq[,sample_col] <- unsel_ns4_filtered_freq[,sample_col] / sum(unsel_ns4_filtered_freq[,sample_col])
}

unsel_ns4_filtered_freq_melted <- melt(unsel_ns4_filtered_freq, id = "X")
## Mean frequency per barcode across the sixteen samples (columns 2 to 17)
unsel_mean <- data.frame("X" = unsel_ns4_filtered_freq$X, "freq" = rowMeans(unsel_ns4_filtered_freq[,2:17]))
unsel_mean_filtered <- unsel_mean %>% filter(freq > 1e-4)
## Scale the mean frequency into a pseudo-count so the table has the same X / count layout as the other .tsv inputs
unsel_mean_filtered$count <- unsel_mean_filtered$freq * 1e12
## NOTE(review): this file is written into the directory that list.files() scans for the Nextseq 4
## run, so once it exists it takes part in the positional file indexes used there.
write.table(file = "Data/Dbl_barcode_NS4/zCombined_unsel.tsv", unsel_mean_filtered[,c("X","count")], sep = "\t", quote = FALSE, row.names = FALSE)

### Combining ALL negative control sequencing
## Put both runs on one barcode key: the Nextseq 3 barcode is used as-is, while the Nextseq 4
## barcode keeps characters 1-5 and 7-10, then a literal "N", then characters 11-19
ns3_negs_filtered$seq9 <- ns3_negs_filtered$X
ns4_negs_filtered_seq9 <- unsel_ns4_filtered_freq
ns4_negs_filtered_seq9$seq9 <- with(
  ns4_negs_filtered_seq9,
  paste0(substr(X, 1, 5), substr(X, 7, 10), "N", substr(X, 11, 19))
)

## Outer join on the shared key, treat missing values as 0, and drop the two run-specific barcode columns
ns34_negs_filtered <- merge(ns3_negs_filtered, ns4_negs_filtered_seq9, by = "seq9", all = TRUE)
ns34_negs_filtered[is.na(ns34_negs_filtered)] <- 0
ns34_negs_filtered <- ns34_negs_filtered[, !(colnames(ns34_negs_filtered) %in% c("X.x","X.y"))]

## Pearson r for every ordered pair of the given columns (self-pairs included), first column varying slowest
pairwise_cor_values <- function(col_idx){
  unlist(lapply(col_idx, function(i){
    vapply(col_idx, function(j){
      cor(ns34_negs_filtered[,i], ns34_negs_filtered[,j])
    }, numeric(1))
  }))
}

## Do all pairwise correlations of unselected cells
unsel_correlation_vector <- pairwise_cor_values(2:ncol(ns34_negs_filtered))

## Do all pairwise correlations of unselected cells for ns3
ns3_unsel_correlation_vector <- pairwise_cor_values(2:18)

## Do all pairwise correlations of unselected cells for ns4
ns4_unsel_correlation_vector <- pairwise_cor_values(19:34)

combined_unsel_correlation_df <- rbind(
  data.frame("grouping" = "All_pairwise", "value" = unsel_correlation_vector),
  data.frame("grouping" = "Set_1_pairwise", "value" = ns3_unsel_correlation_vector),
  data.frame("grouping" = "Set_2_pairwise", "value" = ns4_unsel_correlation_vector)
)

## Reversed level order controls the facet order in the histogram
combined_unsel_correlation_df$grouping <- factor(
  combined_unsel_correlation_df$grouping,
  levels = rev(c("All_pairwise", "Set_1_pairwise", "Set_2_pairwise"))
)

## Histogram of squared correlations, one facet row per grouping
Comprehensive_neg_cntrl_correlations <- ggplot() +
  theme(legend.position = "top") +
  labs(x = "Pearson's r^2", y = "Count") +
  scale_x_continuous(limits = c(0.4,1.02)) +
  geom_histogram(
    data = combined_unsel_correlation_df,
    aes(x = value^2, fill = grouping),
    color = "black", binwidth = 0.02, alpha = 0.8
  ) +
  facet_grid(rows = vars(grouping), scales = "free_y") +
  NULL
Comprehensive_neg_cntrl_correlations
ggsave(
  file = "Plots/Comprehensive_neg_cntrl_correlations.pdf",
  Comprehensive_neg_cntrl_correlations,
  height = 3.75, width = 2.5
)
```

```{r Calculating the enrichment scores in Nextseq 4}
## Point the file listing at the Nextseq 4 run before scoring
myfiles <- list.files(path = "Data/Dbl_barcode_NS4", pattern = "*.tsv", full.names = TRUE)
myfiles_df <- data.frame("number" = seq(1, length(myfiles)), "index" = myfiles)

## Post-processing shared by every Nextseq 4 experiment: score the files, drop rows whose
## enrichment prints as "Inf", then order from most to least enriched
rank_ns4_enrichment <- function(file_indexes){
  scored <- makeExperimentFrame2(file_indexes)
  scored <- filter(scored, h_enrichment != "Inf")
  arrange(scored, desc(h_enrichment))
}

## Each vector is: unselected file (given twice), selected file (given twice)
ns4_g928a <- rank_ns4_enrichment(c(1,1,2,2))
ns4_g928b <- rank_ns4_enrichment(c(9,9,10,10))
ns4_g928c <- rank_ns4_enrichment(c(16,16,17,17))
ns4_g928d <- rank_ns4_enrichment(c(26,26,27,27))
```


```{r Calculating the enrichment scores in Nextseq 7}
## Point the file listing at the Nextseq 7 run before scoring
myfiles <- list.files(path = "Data/Dbl_barcode_NS7", pattern = "*.tsv", full.names = TRUE)
myfiles_df <- data.frame("number" = seq(1, length(myfiles)), "index" = myfiles)

## Post-processing shared by every Nextseq 7 experiment: score the files, drop rows whose
## enrichment prints as "Inf", then order from most to least enriched
rank_ns7_enrichment <- function(file_indexes){
  scored <- makeExperimentFrame2(file_indexes)
  scored <- filter(scored, h_enrichment != "Inf")
  arrange(scored, desc(h_enrichment))
}

## Each vector is: unselected file (given twice), selected file (given twice).
## Files 1, 8 and 14 serve as the unselected sample for replicate sets 1, 2 and 3.
ns7_g928a1 <- rank_ns7_enrichment(c(1,1,2,2))
ns7_g928a2 <- rank_ns7_enrichment(c(8,8,9,9))
ns7_g928a3 <- rank_ns7_enrichment(c(14,14,15,15))
ns7_alpha1 <- rank_ns7_enrichment(c(1,1,3,3))
ns7_alpha2 <- rank_ns7_enrichment(c(8,8,10,10))
ns7_alpha3 <- rank_ns7_enrichment(c(14,14,16,16))
ns7_delta1 <- rank_ns7_enrichment(c(1,1,4,4))
ns7_delta2 <- rank_ns7_enrichment(c(8,8,11,11))
ns7_delta3 <- rank_ns7_enrichment(c(14,14,17,17))
ns7_ba1omicron1 <- rank_ns7_enrichment(c(1,1,5,5))
ns7_ba1omicron2 <- rank_ns7_enrichment(c(8,8,12,12))

```

```{r Combine all data to make a processed file I can upload to GEO}
## Give every result table a `seq9` column, the barcode key shared across sequencing runs.
## Nextseq 3 barcodes already have that form, so each table's own `sequence` is copied over.
ns3_g740d1$seq9 <- ns3_g740d1$sequence
ns3_g740d2$seq9 <- ns3_g740d2$sequence
## Each table must take its own sequences: the tables are filtered and sorted independently,
## so another table's barcodes would be attached to the wrong rows
ns3_alpha1$seq9 <- ns3_alpha1$sequence
ns3_alpha2$seq9 <- ns3_alpha2$sequence
ns3_beta1$seq9 <- ns3_beta1$sequence
ns3_beta2$seq9 <- ns3_beta2$sequence
ns3_delta1$seq9 <- ns3_delta1$sequence
ns3_delta2$seq9 <- ns3_delta2$sequence
ns3_gamma1$seq9 <- ns3_gamma1$sequence
ns3_gamma2$seq9 <- ns3_gamma2$sequence
ns3_ba1omicron1$seq9 <- ns3_ba1omicron1$sequence
ns3_ba1omicron2$seq9 <- ns3_ba1omicron2$sequence

## Nextseq 4 / Nextseq 7 barcodes are converted to the shared key: keep characters 1-5 and 7-10,
## add a literal "N", then characters 11-19
to_seq9 <- function(barcode){
  paste0(substr(barcode,1,5), substr(barcode,7,10), "N", substr(barcode,11,19))
}
ns4_g928a$seq9 <- to_seq9(ns4_g928a$sequence)
ns4_g928b$seq9 <- to_seq9(ns4_g928b$sequence)
ns4_g928c$seq9 <- to_seq9(ns4_g928c$sequence)
ns4_g928d$seq9 <- to_seq9(ns4_g928d$sequence)
ns7_g928a1$seq9 <- to_seq9(ns7_g928a1$sequence)
ns7_g928a2$seq9 <- to_seq9(ns7_g928a2$sequence)
ns7_g928a3$seq9 <- to_seq9(ns7_g928a3$sequence)
ns7_alpha1$seq9 <- to_seq9(ns7_alpha1$sequence)
ns7_alpha2$seq9 <- to_seq9(ns7_alpha2$sequence)
ns7_alpha3$seq9 <- to_seq9(ns7_alpha3$sequence)
ns7_delta1$seq9 <- to_seq9(ns7_delta1$sequence)
ns7_delta2$seq9 <- to_seq9(ns7_delta2$sequence)
ns7_delta3$seq9 <- to_seq9(ns7_delta3$sequence)
ns7_ba1omicron1$seq9 <- to_seq9(ns7_ba1omicron1$sequence)
ns7_ba1omicron2$seq9 <- to_seq9(ns7_ba1omicron2$sequence)

## Start from the sample annotations plus the Nextseq 7 form of each barcode.
## merge() puts the key first, so the columns are: seq9, the annotations, then sequence.
for_geo <- merge(ns3_g740d1[,c("gene","ortholog","mutant","protease","kozak","plasmid_template","concat","seq9")], ns7_g928a1[,c("sequence","seq9")], by = "seq9", all = TRUE)

## Replicates in output-column order; each list name becomes the column name of that replicate's score
geo_replicates <- list(
  ## The 8 replicates for D614G
  d614g_1 = ns3_g740d1, d614g_2 = ns3_g740d2,
  d614g_3 = ns4_g928a, d614g_4 = ns4_g928b, d614g_5 = ns4_g928c,
  d614g_6 = ns7_g928a1, d614g_7 = ns7_g928a2, d614g_8 = ns7_g928a3,
  ## The 5 replicates for Alpha
  alpha_1 = ns3_alpha1, alpha_2 = ns3_alpha2,
  alpha_3 = ns7_alpha1, alpha_4 = ns7_alpha2, alpha_5 = ns7_alpha3,
  ## The 2 replicates for Beta
  beta_1 = ns3_beta1, beta_2 = ns3_beta2,
  ## The 2 replicates for Gamma
  gamma_1 = ns3_gamma1, gamma_2 = ns3_gamma2,
  ## The 5 replicates for Delta
  delta_1 = ns3_delta1, delta_2 = ns3_delta2,
  delta_3 = ns7_delta1, delta_4 = ns7_delta2, delta_5 = ns7_delta3,
  ## The 4 replicates for Omicron BA1
  ba1_1 = ns3_ba1omicron1, ba1_2 = ns3_ba1omicron2,
  ba1_3 = ns7_ba1omicron1, ba1_4 = ns7_ba1omicron2
)

## Outer-join one score column per replicate. Naming the column before the merge keeps every
## column uniquely named, so nothing depends on merge() suffixes or on column position.
for(replicate_name in names(geo_replicates)){
  replicate_scores <- geo_replicates[[replicate_name]][,c("seq9","h_enrichment")]
  colnames(replicate_scores)[2] <- replicate_name
  for_geo <- merge(for_geo, replicate_scores, by = "seq9", all = TRUE)
}

write.csv(file = "Output_tables/Processed_enrichment_scores.csv",for_geo,row.names = FALSE, quote = FALSE)
```


```{r Making an example scatter plot of how certain samples become enriched following selection with the D614G dataset}

## Inner-join five unselected and five selected D614G frequency columns on plasmid_template.
## Join order fixes the column positions: five u_freq columns first, then five h_freq columns,
## then the annotations carried by the last table.
d614g_enrichment_example <- Reduce(
  function(left, right) merge(left, right, by = "plasmid_template"),
  list(
    ns7_g928a1[,c("plasmid_template","u_freq")],
    ns7_g928a2[,c("plasmid_template","u_freq")],
    ns7_g928a3[,c("plasmid_template","u_freq")],
    ns4_g928a[,c("plasmid_template","u_freq")],
    ns4_g928b[,c("plasmid_template","u_freq")],
    ns4_g928b[,c("plasmid_template","h_freq")],
    ns4_g928a[,c("plasmid_template","h_freq")],
    ns7_g928a1[,c("plasmid_template","h_freq")],
    ns7_g928a2[,c("plasmid_template","h_freq")],
    ns7_g928a3[,c("plasmid_template","h_freq","ortholog","protease","mutant","kozak")]
  )
)

colnames(d614g_enrichment_example)[2:11] <- c("u1","u2","u3","u4","u5","h1","h2","h3","h4","h5")
## Geometric means across replicates. Columns are picked by name so that all five selected
## replicates (h1 to h5) are averaged; a positional range of 7 to 10 stops one short of h5.
d614g_enrichment_example$u_freq <- 10^rowMeans(log10(d614g_enrichment_example[,paste0("u", 1:5)]))
d614g_enrichment_example$h_freq <- 10^rowMeans(log10(d614g_enrichment_example[,paste0("h", 1:5)]))

## Background points: human / control constructs without protease, plus the two Rhinolophus orthologs
d614g_enrichment_example2 <- d614g_enrichment_example %>% filter((ortholog %in% c("H.sapiens","Control") & protease == "none") | ortholog %in% c("R.pearsonii", "R.alcyone"))
d614g_enrichment_example2$identifier <- paste0(d614g_enrichment_example2$ortholog," ",d614g_enrichment_example2$mutant," ",d614g_enrichment_example2$kozak)
## Highlighted subset that gets labels and a segment from the identity line
d614g_enrichment_example3 <- d614g_enrichment_example2 %>% filter(identifier %in% c("H.sapiens WT high","Control dEcto high","H.sapiens WT low", "R.pearsonii WT high", "R.alcyone WT high"))

example_of_enrichment_scatterplot <- ggplot() + 
  labs(x = "Frequency of plasmid before selection", y = "Frequency of plasmid\nafter selection") +
  scale_x_log10() + scale_y_log10() +
  geom_abline(slope = 1, linetype = 2, alpha = 0.3) +
  ## Vertical segment from the identity line (y = u_freq) to the observed selected frequency
  geom_segment(data = d614g_enrichment_example3, aes(x = u_freq, xend = u_freq, y = u_freq, yend = h_freq), linetype = 1, alpha = 0.3) + 
  geom_point(data = d614g_enrichment_example2, aes(x = u_freq, y = h_freq), alpha = 0.2) +
  geom_point(data = d614g_enrichment_example3, aes(x = u_freq, y = h_freq), alpha = 0.5) +
  geom_text_repel(data = d614g_enrichment_example3, aes(x = u_freq, y = h_freq, label = identifier), segment.color = "orange", min.segment.length = 0, color = "red", size = 2)
example_of_enrichment_scatterplot
ggsave(file = "Plots/example_of_enrichment_scatterplot.pdf", example_of_enrichment_scatterplot, height = 1.75, width = 3)
```

```{r Merging replicate scores for D614G, Alpha, Beta, Gamma, Delta, and Omicron BA1}
## Merge the per-replicate enrichment scores of one spike variant and summarise them.
##
## replicates: list of data frames, each holding "plasmid_template" and "h_enrichment" columns.
## Returns a data frame with one row per plasmid_template: one log10 enrichment column per
## replicate (in the order given), followed by mean_log10, sd_log10 and geomean (10^mean_log10).
summarize_replicate_enrichment <- function(replicates) {
  score_tables <- lapply(replicates, function(rep_df) rep_df[, c("plasmid_template", "h_enrichment")])
  ## Outer merges, left to right, so a plasmid missing from some replicates is still retained
  scores <- Reduce(function(left, right) merge(left, right, by = "plasmid_template", all = TRUE), score_tables)
  replicate_cols <- 2:ncol(scores)
  ## A plasmid absent from a replicate is given the lowest score observed in that replicate
  for (col in replicate_cols) {
    scores[is.na(scores[, col]), col] <- min(scores[, col], na.rm = TRUE)
  }
  scores[, replicate_cols] <- log10(scores[, replicate_cols])
  scores$mean_log10 <- rowMeans(scores[, replicate_cols], na.rm = TRUE)
  scores$sd_log10 <- apply(scores[, replicate_cols], 1, sd, na.rm = TRUE)
  scores$geomean <- 10^scores$mean_log10
  scores
}

s2d614g <- summarize_replicate_enrichment(list(ns3_g740d1, ns3_g740d2, ns4_g928a, ns4_g928b, ns4_g928c,
                                               ns4_g928d, ns7_g928a1, ns7_g928a2, ns7_g928a3))

s2alpha <- summarize_replicate_enrichment(list(ns3_alpha1, ns3_alpha2, ns7_alpha1, ns7_alpha2, ns7_alpha3))

s2beta <- summarize_replicate_enrichment(list(ns3_beta1, ns3_beta2))

s2gamma <- summarize_replicate_enrichment(list(ns3_gamma1, ns3_gamma2))

s2delta <- summarize_replicate_enrichment(list(ns3_delta1, ns3_delta2, ns7_delta1, ns7_delta2, ns7_delta3))

s2ba1omicron <- summarize_replicate_enrichment(list(ns3_ba1omicron1, ns3_ba1omicron2, ns7_ba1omicron1, ns7_ba1omicron2))



## Gather the geometric mean and the log10 standard deviation of every variant into one
## wide table keyed by plasmid_template (outer merges keep plasmids missing from a variant)
combined_summary_tables <- lapply(
  list(s2d614g, s2alpha, s2beta, s2gamma, s2delta, s2ba1omicron),
  function(variant_scores) variant_scores[, c("plasmid_template","geomean","sd_log10")]
)
combined <- Reduce(
  function(merged_so_far, next_variant) merge(merged_so_far, next_variant, by = "plasmid_template", all = T),
  combined_summary_tables
)

## Columns are renamed by position, in the order the variants were merged above
colnames(combined) <- c("plasmid_template",
                        "D614G","sd_d614g",
                        "Alpha","sd_alpha",
                        "Beta","sd_beta",
                        "Gamma","sd_gamma",
                        "Delta","sd_delta",
                        "BA1","sd_ba1")

write.csv(combined, file = "Output_tables/Supplementary_table_1.csv", row.names = F)

## Attach the sample annotations (columns 5-10 of ns3_alpha1) to the combined scores.
## Remove the E329K variant from the analysis because it was in the plasmid library at abnormally low levels
combined2 <- merge(combined, ns3_alpha1[,5:10], by = "plasmid_template", all = TRUE) %>%
  filter(plasmid_template != "G879B_AttB_[koz-mut]ACE2[E329K]-IRES-mCherry-H2A-P2A-PuroR")

## Label low-Kozak human ACE2 samples by their mutant name and every other sample by its
## ortholog name. `%in% TRUE` treats a missing ortholog/kozak annotation as "not low-Kozak
## human" instead of raising an error.
combined2$label <- ifelse(
  (combined2$ortholog == "H.sapiens" & combined2$kozak == "low") %in% TRUE,
  as.character(combined2$mutant),
  as.character(combined2$ortholog)
)

combined3 <- combined2
```

## Make an overall graph that shows the extent of the infection data we are considering here
```{r Showing how human ACE2 abundance level affects infection in the multiplex assay}
## Wild-type human ACE2 samples, including the two replicate constructs
kozak_protease <- combined2 %>%
  filter(ortholog %in% c("H.sapiens", "H.sapiens(rep1)", "H.sapiens(rep2)")) %>%
  filter(mutant == "WT")
## The identifier is the text before the first underscore of plasmid_template
kozak_protease$identifier <- vapply(
  strsplit(kozak_protease$plasmid_template, "_"),
  function(name_parts) name_parts[1],
  character(1)
)

## Compare high and low Kozak
lowhigh_reps <- filter(kozak_protease, protease == "none")
## Transpose so that each spike variant is a row and each plasmid identifier a column.
## t() turns everything into text, hence the as.numeric() conversions below.
lowhigh_reps_t <- lowhigh_reps[,c("identifier","D614G","Alpha","Beta","Gamma","Delta","BA1")] %>%
  t() %>%
  data.frame()
colnames(lowhigh_reps_t) <- lowhigh_reps_t[1,]
lowhigh_reps_t <- lowhigh_reps_t[-1,]
for (lowhigh_plasmid_id in c("G755A", "G752A")) {
  lowhigh_reps_t[[lowhigh_plasmid_id]] <- as.numeric(lowhigh_reps_t[[lowhigh_plasmid_id]])
}

## Per-variant ratio of the G752A score over the G755A score
lowhigh_reps_t$high_low_infection_ratio <- with(lowhigh_reps_t, G752A / G755A)
lowhigh_reps_t$label <- factor(rownames(lowhigh_reps_t), levels = c("D614G", "Alpha", "Beta","Gamma","Delta","BA1"))

## One point per variant, plus a red dash at the geometric mean of the ratios
ace2_high_low_plot <- ggplot(lowhigh_reps_t, aes(x = 0)) +
  geom_point(aes(y = high_low_infection_ratio, color = label), alpha = 0.4) +
  geom_point(aes(y = 10^mean(log10(high_low_infection_ratio))), alpha = 0.4, shape = 95, color = "red", size = 8) +
  scale_y_log10(limits = c(0.3, 20)) +
  labs(y = "Fold increase\nto infection", x = NULL) +
  theme(axis.text.x = element_blank(), panel.grid.major.x = element_blank(), axis.ticks.x = element_blank())
ace2_high_low_plot
ggsave(file = "Plots/ace2_high_low_plot.pdf", ace2_high_low_plot, height = 0.9, width = 2)
```


```{r Seeing how adding TMPRSS2 alters infection with human ACE2 at high abundance levels}
## Effect of TMPRSS2 on the high ACE2 expressor cells: compare G828A against G752A
high_protease <- filter(kozak_protease, identifier %in% c("G828A","G752A"))
## Transpose so that each spike variant is a row and each plasmid identifier a column.
## t() turns everything into text, hence the as.numeric() conversions below.
high_protease_t <- high_protease[,c("identifier","D614G","Alpha","Beta","Gamma","Delta","BA1")] %>%
  t() %>%
  data.frame()
colnames(high_protease_t) <- high_protease_t[1,]
high_protease_t <- high_protease_t[-1,]
for (protease_plasmid_id in c("G828A", "G752A")) {
  high_protease_t[[protease_plasmid_id]] <- as.numeric(high_protease_t[[protease_plasmid_id]])
}
## Per-variant ratio of the G828A score over the G752A score
high_protease_t$tmprss2_infection_ratio <- with(high_protease_t, G828A / G752A)
high_protease_t$label <- factor(rownames(high_protease_t), levels = c("D614G", "Alpha", "Beta","Gamma","Delta","BA1"))

## One point per variant, plus a red dash at the geometric mean of the ratios
ace2_tmprss2_plot <- ggplot(high_protease_t, aes(x = 0)) +
  geom_point(aes(y = tmprss2_infection_ratio, color = label), alpha = 0.4) +
  geom_point(aes(y = 10^mean(log10(tmprss2_infection_ratio))), alpha = 0.4, shape = 95, color = "red", size = 8) +
  scale_y_log10(limits = c(0.3, 20)) +
  labs(y = "Fold increase\nto infection", x = NULL) +
  theme(axis.text.x = element_blank(), panel.grid.major.x = element_blank(), axis.ticks.x = element_blank())
ace2_tmprss2_plot
ggsave(file = "Plots/ace2_tmprss2_plot.pdf", ace2_tmprss2_plot, height = 0.9, width = 2)
```

```{r Subsetted on the ortholog data at high abundance levels so they can be internally scaled}
## Samples kept: WT constructs with TMPRSS2, the high-Kozak dEcto control, and every
## high-Kozak construct without protease
tmprss2 <- combined2 %>%
  filter((protease == "TMPRSS2" & mutant == "WT") | (kozak == "high" & mutant == "dEcto") | (protease == "none" & kozak == "high"))

## Values left unscaled
ortholog1 <- tmprss2[, colnames(tmprss2) %in% c("ortholog","D614G","Alpha","Beta","Gamma","Delta","BA1")]
ortholog1_melt <- melt(ortholog1, id = "ortholog")

## Display order of the orthologs in the heatmaps
factor_levels_for_large_heatmap <- c("Control", "H.sapiens", "H.sapiens(rep1)", "H.sapiens(rep2)", "M.musculus", "S.scrofa", "M.javanica", "R.landeri", "R.alcyone", "R.ferrumequinum", "R.shameli", "R.affinis",  "R.sinicus_215", "R.sinicus_275", "R.sinicus_200", "R.sinicus_472", "R.pearsonii")

ortholog1_melt$ortholog <- factor(ortholog1_melt$ortholog, levels = factor_levels_for_large_heatmap)
ortholog1_melt$variable <- factor(ortholog1_melt$variable, levels = c("D614G","Alpha","Beta","Gamma","Delta","BA1"))

orthologs_unscaled <- ggplot(ortholog1_melt, aes(x = ortholog, y = variable, fill = value)) +
  geom_tile() +
  scale_fill_gradient(low = "white", high = "red") +
  labs(x = NULL, y = NULL) +
  theme(axis.text.x = element_text(hjust = 1, vjust = 1, angle = 45))
orthologs_unscaled
ggsave(file = "Plots/orthologs_unscaled.pdf", orthologs_unscaled, height = 3, width = 5)

## Scaled to dEcto and Human High Kozak with TMPRSS2:
## 0 = the dEcto control, 1 = the mean of the two human ACE2 + TMPRSS2 plasmids
tmprss2b <- tmprss2
tmprss2b_is_g828a <- tmprss2b$plasmid_template == "G828A_AttB_ACE2-2A-TMPRSS2_IRES_mCherry-H2A-P2A-PuroR"
tmprss2b_is_g852c <- tmprss2b$plasmid_template == "G852C_AttB_ACE2-2A-TMPRSS2_IRES_miRFP670-H2A-P2A-PuroR"
tmprss2b_is_decto <- tmprss2b$plasmid_template == "G758A_AttB_ACE2[dEcto]-IRES-mCherry-H2A-P2A-PuroR"
for (variant_column in colnames(tmprss2)[colnames(tmprss2) %in% c("D614G","Alpha","Beta","Gamma","Delta","BA1")]) {
  temp_high <- mean(c(tmprss2b[tmprss2b_is_g828a, variant_column], tmprss2b[tmprss2b_is_g852c, variant_column]))
  temp_low <- tmprss2b[tmprss2b_is_decto, variant_column]
  tmprss2b[, variant_column] <- (tmprss2b[, variant_column] - temp_low) / (temp_high - temp_low)
}
ortholog2 <- tmprss2b[, colnames(tmprss2b) %in% c("ortholog","D614G","Alpha","Beta","Gamma","Delta","BA1")]
ortholog2_melt <- melt(ortholog2, id = "ortholog")

ortholog2_melt$ortholog <- factor(ortholog2_melt$ortholog, levels = factor_levels_for_large_heatmap)
ortholog2_melt$variable <- factor(ortholog2_melt$variable, levels = c("D614G","Alpha","Beta","Gamma","Delta","BA1"))

orthologs2_s2variants <- ggplot(ortholog2_melt, aes(x = ortholog, y = variable, fill = value)) +
  geom_tile() +
  scale_fill_gradient(low = "white", high = "red") +
  labs(x = NULL, y = NULL) +
  theme(axis.text.x = element_text(hjust = 1, vjust = 1, angle = 45))
orthologs2_s2variants
ggsave(file = "Plots/orthologs2_s2variants.pdf", orthologs2_s2variants, height = 3, width = 5)
```

```{r Subsetting on the variants of human ACE2 with low abundance so these can be internally scaled}
## Low-Kozak samples plus the dEcto control
low_kozak <- filter(combined2, kozak == "low" | mutant == "dEcto")

## Display order of the ACE2 mutants in the heatmaps
low_kozak_mutant_levels <- c("WT", "dEcto", "I21N", "E23K", "K26E", "K31D", "E35K", "D38H", "G326E", "E329K", "G352V", "K353D", "D355N")

## Normal unscaled
low_kozak1 <- low_kozak[, colnames(low_kozak) %in% c("mutant","D614G","Alpha","Beta","Gamma","Delta","BA1")]
low_kozak1_melt <- melt(low_kozak1, id = "mutant")
low_kozak1_melt$mutant <- factor(low_kozak1_melt$mutant, levels = low_kozak_mutant_levels)

low_kozak1_ace2mutants <- ggplot(low_kozak1_melt, aes(x = mutant, y = variable, fill = value)) +
  geom_tile() +
  scale_fill_gradient(low = "white", high = "red") +
  labs(x = NULL, y = NULL) +
  theme(axis.text.x = element_text(hjust = 1, vjust = 1, angle = 45))
low_kozak1_ace2mutants
ggsave(file = "Plots/low_kozak1_ace2mutants.pdf", low_kozak1_ace2mutants, height = 1.25, width = 4)


## Scaled to dEcto and Human Low Kozak: 0 = the dEcto control, 1 = low-Kozak WT human ACE2
low_kozak2 <- low_kozak
low_kozak2_is_wt <- low_kozak2$plasmid_template == "G755A_AttB_[koz-mut]ACE2-IRES-mCherry-H2A-P2A-PuroR"
low_kozak2_is_decto <- low_kozak2$plasmid_template == "G758A_AttB_ACE2[dEcto]-IRES-mCherry-H2A-P2A-PuroR"
for (variant_column in colnames(low_kozak)[colnames(low_kozak) %in% c("D614G","Alpha","Beta","Gamma","Delta","BA1")]) {
  temp_high <- low_kozak2[low_kozak2_is_wt, variant_column]
  temp_low <- low_kozak2[low_kozak2_is_decto, variant_column]
  low_kozak2[, variant_column] <- (low_kozak2[, variant_column] - temp_low) / (temp_high - temp_low)
}

low_kozak2 <- low_kozak2[, colnames(low_kozak)[colnames(low_kozak) %in% c("mutant","D614G","Alpha","Beta","Gamma","Delta","BA1")]]
low_kozak2_melt <- melt(low_kozak2, id = "mutant")
low_kozak2_melt$variable <- factor(low_kozak2_melt$variable, levels = c("D614G","Alpha","Beta","Gamma","Delta","BA1"))
low_kozak2_melt$mutant <- factor(low_kozak2_melt$mutant, levels = low_kozak_mutant_levels)

low_kozak2_ace2mutants <- ggplot(low_kozak2_melt, aes(x = mutant, y = variable, fill = value)) +
  geom_tile() +
  scale_fill_gradient(low = "white", high = "red", limits = NULL) +
  labs(x = NULL, y = NULL) +
  theme(axis.text.x = element_text(hjust = 1, vjust = 1, angle = 45))
low_kozak2_ace2mutants
ggsave(file = "Plots/low_kozak2_ace2mutants.pdf", low_kozak2_ace2mutants, height = 1.1, width = 3.1)


## Same scaled data with the axes swapped; mutant order is reversed so WT is drawn at the top
low_kozak2_melt$mutant <- factor(low_kozak2_melt$mutant, levels = rev(low_kozak_mutant_levels))

low_kozak2_ace2mutants_s2variants_flip <- ggplot(low_kozak2_melt, aes(x = variable, y = mutant, fill = value)) +
  geom_tile() +
  scale_fill_gradient(low = "white", high = "red") +
  labs(x = NULL, y = NULL) +
  theme(axis.text.x = element_text(hjust = 1, vjust = 1, angle = 45), legend.position = "top")
low_kozak2_ace2mutants_s2variants_flip
ggsave(file = "Plots/low_kozak2_ace2mutants_s2variants_flip.pdf", low_kozak2_ace2mutants_s2variants_flip, height = 3, width = 2.5)

```

```{r REVISION - To show reproducibility of the low Kozak human mutant sample values}
## ---- D614G: spread of the nine replicate log10 enrichment scores per sample ----
## Attach the sample annotations; columns 2-10 are the replicate scores
s2d614g_hmuts <- merge(s2d614g, sample_key1[,c("ortholog","mutant","plasmid_template","kozak")])
colnames(s2d614g_hmuts)[2:10] <- paste0("r", 1:9)
s2d614g_hmuts2 <- filter(s2d614g_hmuts, ortholog %in% c("H.sapiens","Control"))
s2d614g_hmuts2$label <- paste0(s2d614g_hmuts2$kozak,"_",s2d614g_hmuts2$mutant)

## Long format, keeping only the replicate columns and dropping the high-Kozak D355N and WT samples
s2d614g_hmuts2_melt <- melt(s2d614g_hmuts2, id.vars = "label") %>%
  filter(variable %in% paste0("r", 1:9)) %>%
  filter(label != "high_D355N", label != "high_WT")
## melt() stacked text and numeric columns together, so convert the scores back to numbers
s2d614g_hmuts2_melt$value <- as.numeric(s2d614g_hmuts2_melt$value)

## Order the samples on the x-axis by decreasing median score
s2d614g_hmuts2_melt_median <- s2d614g_hmuts2_melt %>%
  group_by(label) %>%
  summarize(median = median(value)) %>%
  arrange(desc(median))
s2d614g_hmuts2_melt$label <- factor(s2d614g_hmuts2_melt$label, levels = s2d614g_hmuts2_melt_median$label)

ggplot(s2d614g_hmuts2_melt, aes(x = label, y = value)) +
  geom_boxplot(coef=0, outlier.alpha = 0) +
  geom_quasirandom(color = "red", alpha = 0.3) +
  geom_point(data = s2d614g_hmuts2_melt_median, aes(x = label, y = median), shape = 95, size = 8) +
  labs(x = NULL, y = "Log10 enrichment") +
  theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust = 0.5), panel.grid.major.x = element_blank())


## ---- Omicron BA1: same plot for its four replicates ----
s2ba1omicron_hmuts <- merge(s2ba1omicron, sample_key1[,c("ortholog","mutant","plasmid_template","kozak")])
colnames(s2ba1omicron_hmuts)[2:5] <- paste0("r", 1:4)
s2ba1omicron_hmuts2 <- filter(s2ba1omicron_hmuts, ortholog %in% c("H.sapiens","Control"))
s2ba1omicron_hmuts2$label <- paste0(s2ba1omicron_hmuts2$kozak,"_",s2ba1omicron_hmuts2$mutant)

## Here the low-Kozak E329K sample is dropped as well
s2ba1omicron_hmuts2_melt <- melt(s2ba1omicron_hmuts2, id.vars = "label") %>%
  filter(variable %in% paste0("r", 1:9)) %>%
  filter(label != "low_E329K", label != "high_D355N", label != "high_WT")
s2ba1omicron_hmuts2_melt$value <- as.numeric(s2ba1omicron_hmuts2_melt$value)

s2ba1omicron_hmuts2_melt_median <- s2ba1omicron_hmuts2_melt %>%
  group_by(label) %>%
  summarize(median = median(value)) %>%
  arrange(desc(median))
s2ba1omicron_hmuts2_melt$label <- factor(s2ba1omicron_hmuts2_melt$label, levels = s2ba1omicron_hmuts2_melt_median$label)

ggplot(s2ba1omicron_hmuts2_melt, aes(x = label, y = value)) +
  geom_boxplot(coef=0, outlier.alpha = 0) +
  geom_quasirandom(color = "red", alpha = 0.3) +
  geom_point(data = s2ba1omicron_hmuts2_melt_median, aes(x = label, y = median), shape = 95, size = 8) +
  labs(x = NULL, y = "Log10 enrichment") +
  theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust = 0.5), panel.grid.major.x = element_blank())

## Getting ratios

## Pair each replicate of one sample with the same replicate of low-Kozak WT ACE2.
## melted_scores: long table with label, variable (replicate) and value (log10 enrichment).
## Returns the merged table with `ratio` (sample minus WT, i.e. a log10 ratio) and `virus`.
log_ratio_vs_low_wt <- function(melted_scores, sample_label, virus_name) {
  sample_rows <- melted_scores %>% filter(label == sample_label)
  wt_rows <- melted_scores %>% filter(label == "low_WT")
  merge(sample_rows, wt_rows, by = "variable") %>%
    mutate(ratio = value.x - value.y) %>%
    mutate(virus = virus_name)
}
ratio_columns <- c("label.x","virus","variable","ratio")

d614g_dEcto_wt <- log_ratio_vs_low_wt(s2d614g_hmuts2_melt, "high_dEcto", "D614G")
ba1_dEcto_wt <- log_ratio_vs_low_wt(s2ba1omicron_hmuts2_melt, "high_dEcto", "Omicron BA1")
d614g_ba1_dEcto_wt <- rbind(d614g_dEcto_wt[, ratio_columns], ba1_dEcto_wt[, ratio_columns])

d614g_k31d_wt <- log_ratio_vs_low_wt(s2d614g_hmuts2_melt, "low_K31D", "D614G")
ba1_k31d_wt <- log_ratio_vs_low_wt(s2ba1omicron_hmuts2_melt, "low_K31D", "Omicron BA1")
d614g_ba1_k31d_wt <- rbind(d614g_k31d_wt[, ratio_columns], ba1_k31d_wt[, ratio_columns])

d614g_d38h_wt <- log_ratio_vs_low_wt(s2d614g_hmuts2_melt, "low_D38H", "D614G")
ba1_d38h_wt <- log_ratio_vs_low_wt(s2ba1omicron_hmuts2_melt, "low_D38H", "Omicron BA1")
d614g_ba1_d38h_wt <- rbind(d614g_d38h_wt[, ratio_columns], ba1_d38h_wt[, ratio_columns])

d614g_e35k_wt <- log_ratio_vs_low_wt(s2d614g_hmuts2_melt, "low_E35K", "D614G")
ba1_e35k_wt <- log_ratio_vs_low_wt(s2ba1omicron_hmuts2_melt, "low_E35K", "Omicron BA1")
d614g_ba1_e35k_wt <- rbind(d614g_e35k_wt[, ratio_columns], ba1_e35k_wt[, ratio_columns])

d614g_e329k_wt <- log_ratio_vs_low_wt(s2d614g_hmuts2_melt, "low_E329K", "D614G")
ba1_e329k_wt <- log_ratio_vs_low_wt(s2ba1omicron_hmuts2_melt, "low_E329K", "Omicron BA1")
d614g_ba1_e329k_wt <- rbind(d614g_e329k_wt[, ratio_columns], ba1_e329k_wt[, ratio_columns])


## Stack the four mutants and convert the log10 ratio back to a linear ratio
combined_ratios <- rbind(d614g_ba1_k31d_wt, d614g_ba1_d38h_wt, d614g_ba1_e35k_wt, d614g_ba1_e329k_wt) %>%
  mutate(linear_ratio = 10^ratio) #%>% filter(!(virus == "Omicron BA1" & variable == "r4" & label.x %in% c("low_E35K","low_D38H","low_E329K")))
combined_ratios$label.x <- factor(combined_ratios$label.x, levels = c("low_K31D","low_E35K","low_D38H","low_E329K"))

## Per mutant and virus: geometric mean and arithmetic mean of the linear ratios
combined_ratios_ave <- combined_ratios %>%
  group_by(label.x, virus) %>%
  summarize(geomean_ratio = 10^(mean(ratio)), mean_ratio = mean(linear_ratio))

## Individual replicates as points; the dash marks the arithmetic mean (mean_ratio)
Revision_variant_WT_ratios_plot <- ggplot(mapping = aes(x = label.x, color = virus)) +
  geom_point(data = combined_ratios, aes(y = linear_ratio), position = position_dodge(width = 0.5), alpha = 0.4) +
  geom_point(data = combined_ratios_ave, aes(y = mean_ratio), position = position_dodge(width = 0.5), shape = 95, size = 8) +
  labs(x = NULL, y = "Variant to WT ACE2\ninfection ratio") +
  theme(panel.grid.major.x = element_blank(), legend.position = "bottom", axis.text.x = element_text(hjust = 1, vjust = 1, angle = 45))
Revision_variant_WT_ratios_plot
ggsave(file = "Plots/Revision_variant_WT_ratios_plot.pdf", Revision_variant_WT_ratios_plot, height = 2.5, width = 3)

```


```{r REVISION - Bootstrapping of the K31D difference between D614G and Omicron}
## Bootstrap test: resample (with replacement) the per-replicate log10 K31D-to-WT ratios of
## each virus and record the mean of every resample.
bootstrap_n <- 1000
label_vector <- c("low_K31D")

## The observed ratios do not change between iterations, so extract them once per label
d614g_ratio_sets <- lapply(label_vector, function(lbl) (combined_ratios %>% filter(virus == "D614G" & label.x == lbl))[,"ratio"])
ba1_ratio_sets <- lapply(label_vector, function(lbl) (combined_ratios %>% filter(virus == "Omicron BA1" & label.x == lbl))[,"ratio"])

## Preallocate the results instead of growing the vectors on every draw
d614g_geomean_vector <- numeric(bootstrap_n * length(label_vector))
ba1_geomean_vector <- numeric(bootstrap_n * length(label_vector))
draw_index <- 0
for (x in seq_len(bootstrap_n)) {
  for (y in seq_along(label_vector)) {
    draw_index <- draw_index + 1
    temp_d614g_vector <- d614g_ratio_sets[[y]]
    temp_ba1_vector <- ba1_ratio_sets[[y]]
    ## D614G is sampled before BA1 within each draw so the random-number stream is consumed
    ## in a fixed order for a given seed
    d614g_geomean_vector[draw_index] <- mean(sample(temp_d614g_vector, length(temp_d614g_vector), replace = TRUE))
    ba1_geomean_vector[draw_index] <- mean(sample(temp_ba1_vector, length(temp_ba1_vector), replace = TRUE))
  }
}

bootstrap_results_df <- data.frame("d614g" = d614g_geomean_vector, "ba1" = ba1_geomean_vector)
bootstrap_results_df$ba1_larger <- (bootstrap_results_df$ba1 - bootstrap_results_df$d614g) > 0

## Fraction of bootstrap draws in which the BA1 mean exceeds the D614G mean.
## Only the logical ba1_larger column is counted; summing the whole data frame would also
## add up the d614g and ba1 values.
mean(bootstrap_results_df$ba1_larger)

bootstrap_results_df_melt <- melt(bootstrap_results_df[,c("d614g","ba1")])
bootstrap_results_df_melt$variable <- as.character(bootstrap_results_df_melt$variable)

## Use the same virus names as d614g_ba1_k31d_wt so both layers share the x-axis categories
bootstrap_results_df_melt[bootstrap_results_df_melt$variable == "d614g","variable"] <- "D614G"
bootstrap_results_df_melt[bootstrap_results_df_melt$variable == "ba1","variable"] <- "Omicron BA1"

## Violins: bootstrap distribution of the mean; points: the observed per-replicate values
Revision_D614G_BA1_K31D_plot <- ggplot() + theme(panel.grid.major.x = element_blank()) +
  geom_violin(data = bootstrap_results_df_melt, aes(x = variable, y = value)) +
  geom_point(data = d614g_ba1_k31d_wt, aes(x = virus, y = ratio), position = position_dodge(width = 0.5)) +
  labs(x = NULL, y = "K31D to WT ratio") +
  NULL
Revision_D614G_BA1_K31D_plot
ggsave(file = "Plots/Revision_D614G_BA1_K31D_plot.pdf", Revision_D614G_BA1_K31D_plot, height = 2.5, width = 2)

```


```{r REVISION - Comparing D614G and Omicron BA1 values for the low Kozak human mutant samples}
## Side-by-side view of the D614G and Omicron BA1 replicate scores of each sample
## (visual comparison only; no statistic is computed here)
s2d614g_hmuts2_melt <- s2d614g_hmuts2_melt %>% mutate(virus = "D614G")
s2ba1omicron_hmuts2_melt <- s2ba1omicron_hmuts2_melt %>% mutate(virus = "Omicron_BA1")

d614g_omicron_comparison <- rbind(s2d614g_hmuts2_melt, s2ba1omicron_hmuts2_melt)

ggplot(d614g_omicron_comparison, aes(x = label, y = value, color = virus)) +
  geom_quasirandom(alpha = 0.8) +
  labs(x = NULL, y = "Log10 enrichment") +
  theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust = 0.5), panel.grid.major.x = element_blank())

```

```{r REVISION - Looking at how variable some of the ortholog infection values are}

## Every sample whose ortholog is not exactly "H.sapiens"
s2d614g_orthologs <- filter(s2d614g_hmuts, !(ortholog %in% "H.sapiens"))

## Long format restricted to the nine replicate columns
s2d614g_orthologs_melt <- s2d614g_orthologs %>%
  melt() %>%
  filter(variable %in% paste0("r", 1:9))

## Mean of the replicate values per ortholog, drawn as a dash on the plot
s2d614g_orthologs_melt_ave <- s2d614g_orthologs_melt %>%
  group_by(ortholog) %>%
  summarize(meanvalue = mean(value))

Revision_D614G_orthologs_plot <- ggplot(mapping = aes(x = ortholog)) +
  geom_quasirandom(data = s2d614g_orthologs_melt, aes(y = value), alpha = 0.3) +
  geom_point(data = s2d614g_orthologs_melt_ave, aes(y = meanvalue), size = 8, shape = 95) +
  labs(x = NULL , y = "Log10 enrichment ratio") +
  theme(axis.text.x = element_text(hjust = 0, vjust = 0.5, angle = -90), panel.grid.major.x = element_blank())
Revision_D614G_orthologs_plot
ggsave(file = "Plots/Revision_D614G_orthologs_plot.pdf", Revision_D614G_orthologs_plot, height = 2.5, width = 4)

```


```{r Making a combined heatmap with both orthologs and variants to demonstrate the full scale of enrichment scores captured in this dataset}
not_ortholog_not_lowkozak <- filter(combined2, kozak == "high" & protease == "none")

## Stack the unscaled ortholog scores and the unscaled low-Kozak mutant scores into one
## long table with a shared `sample` column
combined_ortholog_variant <- rbind(
  ortholog1_melt %>% select(sample = ortholog, variable, value),
  low_kozak1_melt %>% select(sample = mutant, variable, value)
) %>%
  filter(variable %in% c("D614G","Alpha","Beta","Gamma","Delta","BA1"))

combined_ortholog_variant$variable <- factor(combined_ortholog_variant$variable, levels = c("D614G","Alpha","Beta","Gamma","Delta","BA1"))

## Orthologs first, then the human ACE2 mutants; reversed so the first entry is drawn at the top
combined_ortholog_variant$sample <- factor(
  combined_ortholog_variant$sample,
  levels = rev(c("Control", "H.sapiens", "H.sapiens(rep1)", "H.sapiens(rep2)", "M.musculus", "S.scrofa", "M.javanica", "R.landeri", "R.alcyone", "R.ferrumequinum", "R.shameli", "R.affinis",  "R.sinicus_215", "R.sinicus_275", "R.sinicus_200", "R.sinicus_472", "R.pearsonii",
               "WT", "dEcto", "I21N", "E23K", "K26E", "K31D", "E35K", "D38H", "G326E", "E329K", "G352V", "K353D", "D355N"))
)

combined_ortholog_variant_plot <- ggplot(combined_ortholog_variant, aes(x = variable, y = sample, fill = value)) +
  geom_tile() +
  scale_fill_gradient(low = "white", high = "red", na.value = "white") +  #limits=c(-0.25,0.75),
  labs(x = NULL, y = NULL) +
  theme(axis.text.x = element_text(hjust = 1, vjust = 1, angle = 45), legend.position = "top")
combined_ortholog_variant_plot
ggsave(file = "Plots/combined_ortholog_variant_plot.pdf", combined_ortholog_variant_plot, height = 4, width = 2)
```

```{r Comparing our multiplex assay scores to the traditional infection assay results from our 2021 Plos Pathogens article}
## Published flow-cytometry infection measurements for the human ACE2 variants
human_ace2_variants <- read.delim(file = "Data/Other_papers/2021_Shukla_PlosPathog/journal.ppat.1009715.s008.tsv", sep = "\t")
human_ace2_variants2 <- human_ace2_variants %>% group_by(cell_label) %>% summarize(mean_infection = mean(scaled_infection)) #10^mean(log_scaled_infection))

## The mutant name is the second "-"-separated field of cell_label.
## NOTE(review): the first two rows of the summary are assumed to be the dEcto and WT samples
## (as in the original positional code) -- confirm against the cell_label values in the file.
human_ace2_variants2$mutant <- vapply(
  strsplit(human_ace2_variants2$cell_label, "-"),
  function(label_parts) label_parts[2],
  character(1)
)
human_ace2_variants2$mutant[1] <- "dEcto"
human_ace2_variants2$mutant[2] <- "WT"

low_kozak1_s2variants <- low_kozak1_melt %>% filter(variable == "D614G")
human_ace2_variants3 <- merge(human_ace2_variants2,low_kozak1_s2variants[,c("mutant","value")])
## `value` is the D614G geometric-mean enrichment, which is already on a linear scale, so it is
## normalised to WT by plain division (as done for the ortholog comparison against H.sapiens).
## Raising it to a power of ten first would exponentiate an un-logged value.
human_ace2_variants3$value <- human_ace2_variants3$value / human_ace2_variants3[human_ace2_variants3$mutant == "WT","value"]

Variant_validation_graph <- ggplot() + labs(x = "Flow cytometry infectivity\nmeasurement from published study", y = "Sequencing-based\ninfectivity measurement\nfrom current study") +
  scale_x_log10(breaks = c(0.1,0.5,1,3)) +
  scale_y_log10(breaks = c(0.1,0.5,1,3)) +
  geom_point(data = human_ace2_variants3, aes(x = mean_infection, y = value), alpha = 0.5) +
  geom_text_repel(data = human_ace2_variants3, aes(x = mean_infection, y = value, label = mutant), color = "red", alpha = 0.8, size = 1.5, segment.color = "orange", segment.alpha = 0.5)
Variant_validation_graph
ggsave(file = "Plots/Variant_validation_graph.pdf", Variant_validation_graph, height = 1.75, width = 2)
```

```{r Comparing our multiplex assay scores to the somewhat traditional infection assay results from our 2022 Plos Biology article}
## Read the Fig5 table and split it into its flow cytometry and microscopy rows
ortholog_2color_data <- read.delim(file = "Data/Other_papers/2022_Roelle_PlosBiol/Fig5.csv", sep = ",")
ortholog_2color_data_flow <- ortholog_2color_data %>%
  filter(type == "flow_cytometry", virus_label == "SARS-CoV-2 RBD") %>%
  mutate(ortholog = cell_label)
## For the microscopy rows the "SARS-CoV-2 RBD" tag is matched in cell_label and the
## ortholog name is taken from virus_label (the reverse of the flow rows)
ortholog_2color_data_micro <- ortholog_2color_data %>%
  filter(type == "microscopy", cell_label == "SARS-CoV-2 RBD") %>%
  mutate(ortholog = virus_label)

## Express each assay's geomean relative to its own H.sapiens row
flow_human_geomean <- ortholog_2color_data_flow$geomean[ortholog_2color_data_flow$ortholog == "H.sapiens"]
ortholog_2color_data_flow$norm_geomean <- ortholog_2color_data_flow$geomean / flow_human_geomean
micro_human_geomean <- ortholog_2color_data_micro$geomean[ortholog_2color_data_micro$ortholog == "H.sapiens"]
ortholog_2color_data_micro$norm_geomean <- ortholog_2color_data_micro$geomean / micro_human_geomean

## Pair the two assays by ortholog and average their normalised values
norm_ortholog_2color_data <- merge(
  ortholog_2color_data_flow[, c("ortholog", "norm_geomean")],
  ortholog_2color_data_micro[, c("ortholog", "norm_geomean")],
  by = "ortholog"
)
norm_ortholog_2color_data$norm_geomean <- rowMeans(
  norm_ortholog_2color_data[, c("norm_geomean.x", "norm_geomean.y")]
)

## Relabel orthologs (old label -> new label) before merging with ortholog1
ortholog_label_map <- c(
  "R.sinicus200" = "R.sinicus_200",
  "R.sinicus215" = "R.sinicus_215",
  "R.sinicus472" = "R.sinicus_472",
  "NULL (fs)" = "Neg cntrl"
)
for (old_label in names(ortholog_label_map)) {
  norm_ortholog_2color_data[norm_ortholog_2color_data$ortholog == old_label, "ortholog"] <- ortholog_label_map[[old_label]]
}

## Full outer merge with the D614G column of ortholog1, then scale D614G to the H.sapiens row
ortholog1[ortholog1$ortholog == "Control", "ortholog"] <- "dEcto"
ortholog1_melt_s2_plosbiol <- merge(
  ortholog1[, c("ortholog", "D614G")],
  norm_ortholog_2color_data[, c("ortholog", "norm_geomean")],
  by = "ortholog", all = TRUE
)
human_d614g <- ortholog1_melt_s2_plosbiol$D614G[ortholog1_melt_s2_plosbiol$ortholog == "H.sapiens"]
ortholog1_melt_s2_plosbiol$D614G <- ortholog1_melt_s2_plosbiol$D614G / human_d614g

## Scatter of norm_geomean (x) against the H.sapiens-scaled D614G value (y) on log10 axes,
## labelled by ortholog
Variant_validation_graph2 <- ggplot(ortholog1_melt_s2_plosbiol, aes(x = norm_geomean, y = D614G)) +
  labs(
    x = "Flow cytometry infectivity\nmeasurement from published study",
    y = "Sequencing-based\ninfectivity measurement\nfrom current study"
  ) +
  scale_x_log10(breaks = c(0.1, 0.3, 1, 3)) +
  scale_y_log10(breaks = c(0.1, 0.3, 1, 3)) +
  geom_point(alpha = 0.5) +
  geom_text_repel(
    aes(label = ortholog),
    color = "red", alpha = 0.8, size = 1.5,
    segment.color = "orange", segment.alpha = 0.5
  )
Variant_validation_graph2
ggsave(filename = "Plots/Variant_validation_graph2.pdf", plot = Variant_validation_graph2, height = 1.75, width = 2)
```

```{r Comparing correlations with human ACE2 variants}
## Scatterplots of each variant column of low_kozak against its D614G column,
## with every point labelled by mutant. Only the BA1 comparison is saved to disk.
ggplot(low_kozak, aes(x = D614G, y = Alpha)) +
  geom_point() +
  geom_text_repel(aes(label = mutant), color = "red")

ggplot(low_kozak, aes(x = D614G, y = Beta)) +
  geom_point() +
  geom_text_repel(aes(label = mutant), color = "red")

ggplot(low_kozak, aes(x = D614G, y = Gamma)) +
  geom_point() +
  geom_text_repel(aes(label = mutant), color = "red")

ggplot(low_kozak, aes(x = D614G, y = Delta)) +
  geom_point() +
  geom_text_repel(aes(label = mutant), color = "red")

D614G_Omicron_scatterplot <- ggplot(low_kozak, aes(x = D614G, y = BA1)) +
  geom_point(alpha = 0.5) +
  geom_text_repel(
    aes(label = mutant),
    color = "red", size = 2,
    segment.color = "orange", segment.alpha = 0.5
  )
D614G_Omicron_scatterplot
ggsave(filename = "Plots/D614G_Omicron_scatterplot.pdf", plot = D614G_Omicron_scatterplot, height = 1.5, width = 1.7)

```


## The below section looks at all of the SARS-CoV-2 variant RBD co-structures with human ACE2

```{r D614G RBD-ACE2 contact maps}
## Load the distance matrices used for D614G (7sxy, 6lzg, 6m0j, 6m17).
## Every structure goes through the same steps:
##   1. read the CSV (file names say ACE2 in rows, RBD in columns) and store the row number as `position`
##   2. melt to long form: one row per (position, column) pair, distance in `value`
##   3. rbd_position  = number after the first character of the column name + a per-structure offset
##      (column names that hold no number there become NA, with a coercion warning)
##   4. ace2_position = row number + a per-structure offset
##   5. ace2_rbd      = "<ace2_position>_<rbd_position>", the key used to merge structures
## The offsets are hard-coded per structure; presumably they map matrix indices to residue numbers -- confirm.
## NOTE(review): each *_rbd_positionlist is built from `position` (the row number), not from
## `rbd_position`. It is left unchanged here; confirm that this is what later code expects.

## 7sxy: RBD offset 329, ACE2 offset 18
pdb_7sxy_dist_frame <- read.csv(file = "Data/PDB_contact_maps/7sxy_ace2row_rbdcol_dist_matrix_min.csv")
pdb_7sxy_dist_frame$position <- as.numeric(rownames(pdb_7sxy_dist_frame))
pdb_7sxy_dist_frame_melt <- melt(pdb_7sxy_dist_frame, id.vars = "position")
pdb_7sxy_dist_frame_melt$rbd_position <- as.numeric(substr(pdb_7sxy_dist_frame_melt$variable, 2, 5)) + 329
pdb_7sxy_dist_frame_melt$ace2_position <- as.numeric(pdb_7sxy_dist_frame_melt$position) + 18
pdb_7sxy_dist_ace2_positionlist <- as.numeric(unique(pdb_7sxy_dist_frame_melt$ace2_position)) ## Unique offset-adjusted ACE2 positions
pdb_7sxy_dist_rbd_positionlist <- as.numeric(unique(pdb_7sxy_dist_frame_melt$position)) ## Unique raw row numbers (see NOTE above)
pdb_7sxy_dist_frame_melt$ace2_rbd <- paste(pdb_7sxy_dist_frame_melt$ace2_position, pdb_7sxy_dist_frame_melt$rbd_position, sep = "_")

## 6lzg: RBD offset 332, ACE2 offset 18
pdb_6lzg_dist_frame <- read.csv(file = "Data/PDB_contact_maps/6lzg_ace2row_rbdcol_dist_matrix_min.csv")
pdb_6lzg_dist_frame$position <- as.numeric(rownames(pdb_6lzg_dist_frame))
pdb_6lzg_dist_frame_melt <- melt(pdb_6lzg_dist_frame, id.vars = "position")
pdb_6lzg_dist_frame_melt$rbd_position <- as.numeric(substr(pdb_6lzg_dist_frame_melt$variable, 2, 5)) + 332
pdb_6lzg_dist_frame_melt$ace2_position <- as.numeric(pdb_6lzg_dist_frame_melt$position) + 18
pdb_6lzg_dist_ace2_positionlist <- as.numeric(unique(pdb_6lzg_dist_frame_melt$ace2_position)) ## Unique offset-adjusted ACE2 positions
pdb_6lzg_dist_rbd_positionlist <- as.numeric(unique(pdb_6lzg_dist_frame_melt$position)) ## Unique raw row numbers (see NOTE above)
pdb_6lzg_dist_frame_melt$ace2_rbd <- paste(pdb_6lzg_dist_frame_melt$ace2_position, pdb_6lzg_dist_frame_melt$rbd_position, sep = "_")

## 6m0j: RBD offset 332, ACE2 offset 18; the distance is also copied into `dist_6m0j`
pdb_6m0j_dist_frame <- read.csv(file = "Data/PDB_contact_maps/6m0j_ace2row_rbdcol_dist_matrix_min.csv")
pdb_6m0j_dist_frame$position <- as.numeric(rownames(pdb_6m0j_dist_frame))
pdb_6m0j_dist_frame_melt <- melt(pdb_6m0j_dist_frame, id.vars = "position")
pdb_6m0j_dist_frame_melt$rbd_position <- as.numeric(substr(pdb_6m0j_dist_frame_melt$variable, 2, 5)) + 332
pdb_6m0j_dist_frame_melt$ace2_position <- as.numeric(pdb_6m0j_dist_frame_melt$position) + 18
pdb_6m0j_dist_ace2_positionlist <- as.numeric(unique(pdb_6m0j_dist_frame_melt$ace2_position)) ## Unique offset-adjusted ACE2 positions
pdb_6m0j_dist_rbd_positionlist <- as.numeric(unique(pdb_6m0j_dist_frame_melt$position)) ## Unique raw row numbers (see NOTE above)
pdb_6m0j_dist_frame_melt$ace2_rbd <- paste(pdb_6m0j_dist_frame_melt$ace2_position, pdb_6m0j_dist_frame_melt$rbd_position, sep = "_")
pdb_6m0j_dist_frame_melt$dist_6m0j <- pdb_6m0j_dist_frame_melt$value

## 6m17: RBD offset 335, ACE2 offset 20 (both differ from the other three structures);
## the distance is also copied into `dist_6m17`
pdb_6m17_dist_frame <- read.csv(file = "Data/PDB_contact_maps/6m17_ace2row_rbdcol_dist_matrix_min.csv")
pdb_6m17_dist_frame$position <- as.numeric(rownames(pdb_6m17_dist_frame))
pdb_6m17_dist_frame_melt <- melt(pdb_6m17_dist_frame, id.vars = "position")
pdb_6m17_dist_frame_melt$rbd_position <- as.numeric(substr(pdb_6m17_dist_frame_melt$variable, 2, 5)) + 335
pdb_6m17_dist_frame_melt$ace2_position <- as.numeric(pdb_6m17_dist_frame_melt$position) + 20
pdb_6m17_dist_ace2_positionlist <- as.numeric(unique(pdb_6m17_dist_frame_melt$ace2_position)) ## Unique offset-adjusted ACE2 positions
pdb_6m17_dist_rbd_positionlist <- as.numeric(unique(pdb_6m17_dist_frame_melt$position)) ## Unique raw row numbers (see NOTE above)
pdb_6m17_dist_frame_melt$ace2_rbd <- paste(pdb_6m17_dist_frame_melt$ace2_position, pdb_6m17_dist_frame_melt$rbd_position, sep = "_")
pdb_6m17_dist_frame_melt$dist_6m17 <- pdb_6m17_dist_frame_melt$value

## D614G
## Inner-join the four structures on the ace2_rbd key (positions come from 7sxy), then drop
## rows whose RBD or ACE2 position is NA
pdb_d614g_combined <- pdb_7sxy_dist_frame_melt[, c("ace2_rbd", "ace2_position", "rbd_position", "value")] %>%
  merge(pdb_6lzg_dist_frame_melt[, c("ace2_rbd", "value")], by = "ace2_rbd") %>%
  merge(pdb_6m0j_dist_frame_melt[, c("ace2_rbd", "value")], by = "ace2_rbd") %>%
  merge(pdb_6m17_dist_frame_melt[, c("ace2_rbd", "dist_6m17")], by = "ace2_rbd") %>%
  filter(!is.na(rbd_position) & !is.na(ace2_position))
d614g_dist_cols <- c("dist_7sxy", "dist_6lzg", "dist_6m0j", "dist_6m17")
colnames(pdb_d614g_combined) <- c("ace2_rbd", "ace2_position", "rbd_position", d614g_dist_cols)

## Mask distances above 50; which() keeps the assignment valid when a column already holds NA
for (dist_col in d614g_dist_cols) {
  beyond_cutoff <- which(pdb_d614g_combined[[dist_col]] > 50)
  pdb_d614g_combined[beyond_cutoff, dist_col] <- NA
}

## Per-pair mean distance and coefficient of variation (sd / mean) across structures, ignoring NAs
pdb_d614g_combined$dist_d614g <- rowMeans(pdb_d614g_combined[, d614g_dist_cols], na.rm = TRUE)
d614g_dist_sd <- unname(apply(as.matrix(pdb_d614g_combined[, d614g_dist_cols]), 1, sd, na.rm = TRUE))
pdb_d614g_combined$cv_d614g <- d614g_dist_sd / pdb_d614g_combined$dist_d614g

## Distances from ACE2 position 31 to each RBD position: one faint dot per structure,
## a cross for the across-structure mean, and a horizontal reference line at 10
d614g_ace2_31_combined <- filter(pdb_d614g_combined, ace2_position == 31)
ACE2_K31_D614G_distances <- ggplot(d614g_ace2_31_combined, aes(x = rbd_position)) +
  geom_hline(yintercept = 10) +
  labs(x = "D614G RBD residue", y = "Distance (angstroms)") +
  scale_y_continuous(limits = c(0, 50)) +
  geom_point(aes(y = dist_7sxy), color = "blue", shape = 16, alpha = 0.2, size = 1) +
  geom_point(aes(y = dist_6lzg), color = "blue", shape = 16, alpha = 0.2, size = 1) +
  geom_point(aes(y = dist_6m0j), color = "blue", shape = 16, alpha = 0.2, size = 1) +
  geom_point(aes(y = dist_6m17), color = "blue", shape = 16, alpha = 0.2, size = 1) +
  geom_point(aes(y = dist_d614g), color = "blue", alpha = 0.4, size = 1, shape = 4)
ACE2_K31_D614G_distances
ggsave(filename = "Plots/ACE2_K31_D614G_distances.pdf", plot = ACE2_K31_D614G_distances, height = 2, width = 2.5)

## Distances from ACE2 position 38 to each RBD position: one faint dot per structure,
## a cross for the across-structure mean, and a horizontal reference line at 10
d614g_ace2_38_combined <- filter(pdb_d614g_combined, ace2_position == 38)
ACE2_D38_D614G_distances <- ggplot(d614g_ace2_38_combined, aes(x = rbd_position)) +
  geom_hline(yintercept = 10) +
  labs(x = "D614G RBD residue", y = "Distance (angstroms)") +
  scale_y_continuous(limits = c(0, 50)) +
  geom_point(aes(y = dist_7sxy), color = "red", shape = 16, alpha = 0.2, size = 1) +
  geom_point(aes(y = dist_6lzg), color = "red", shape = 16, alpha = 0.2, size = 1) +
  geom_point(aes(y = dist_6m0j), color = "red", shape = 16, alpha = 0.2, size = 1) +
  geom_point(aes(y = dist_6m17), color = "red", shape = 16, alpha = 0.2, size = 1) +
  geom_point(aes(y = dist_d614g), color = "red", alpha = 0.4, size = 1, shape = 4)
ACE2_D38_D614G_distances
ggsave(filename = "Plots/ACE2_D38_D614G_distances.pdf", plot = ACE2_D38_D614G_distances, height = 2, width = 2.5)

## Overlay of the mean D614G distance profiles for ACE2 position 31 (blue) and 38 (red)
ACE2_K31_D38_D614G_distances <- ggplot(mapping = aes(x = rbd_position, y = dist_d614g)) +
  geom_hline(yintercept = 10) +
  labs(x = "D614G RBD residue", y = "Distance (angstroms)") +
  scale_y_continuous(limits = c(0, 50)) +
  geom_line(data = d614g_ace2_31_combined, color = "blue", alpha = 0.6) +
  geom_line(data = d614g_ace2_38_combined, color = "red", alpha = 0.6)
ACE2_K31_D38_D614G_distances
ggsave(filename = "Plots/ACE2_K31_D38_D614G_distances.pdf", plot = ACE2_K31_D38_D614G_distances, height = 2, width = 2.5)
```

```{r Alpha RBD-ACE2 contact maps}
## Load the distance matrices for the Alpha structures (8dlk, 7r0z, 7mjn, 7ekf).
## Every structure goes through the same steps:
##   1. read the CSV (file names say ACE2 in rows, RBD in columns) and store the row number as `position`
##   2. melt to long form: one row per (position, column) pair, distance in `value`
##   3. rbd_position  = number after the first character of the column name + a per-structure offset
##      (column names that hold no number there become NA, with a coercion warning)
##   4. ace2_position = row number + 18
##   5. ace2_rbd      = "<ace2_position>_<rbd_position>", the key used to merge structures
## The offsets are hard-coded per structure; presumably they map matrix indices to residue numbers -- confirm.
## NOTE(review): each *_rbd_positionlist is built from `position` (the row number), not from
## `rbd_position`. It is left unchanged here; confirm that this is what later code expects.

## 8dlk: RBD offset 329
pdb_8dlk_dist_frame <- read.csv(file = "Data/PDB_contact_maps/8dlk_ace2row_rbdcol_dist_matrix_min.csv")
pdb_8dlk_dist_frame$position <- as.numeric(rownames(pdb_8dlk_dist_frame))
pdb_8dlk_dist_frame_melt <- melt(pdb_8dlk_dist_frame, id.vars = "position")
pdb_8dlk_dist_frame_melt$rbd_position <- as.numeric(substr(pdb_8dlk_dist_frame_melt$variable, 2, 5)) + 329
pdb_8dlk_dist_frame_melt$ace2_position <- as.numeric(pdb_8dlk_dist_frame_melt$position) + 18
pdb_8dlk_dist_ace2_positionlist <- as.numeric(unique(pdb_8dlk_dist_frame_melt$ace2_position)) ## Unique offset-adjusted ACE2 positions
pdb_8dlk_dist_rbd_positionlist <- as.numeric(unique(pdb_8dlk_dist_frame_melt$position)) ## Unique raw row numbers (see NOTE above)
pdb_8dlk_dist_frame_melt$ace2_rbd <- paste(pdb_8dlk_dist_frame_melt$ace2_position, pdb_8dlk_dist_frame_melt$rbd_position, sep = "_")

## 7r0z: RBD offset 321
## NOTE(review): 7r0z is loaded here but is not part of the merge that builds pdb_alpha_combined -- confirm intent
pdb_7r0z_dist_frame <- read.csv(file = "Data/PDB_contact_maps/7r0z_ace2row_rbdcol_dist_matrix_min.csv")
pdb_7r0z_dist_frame$position <- as.numeric(rownames(pdb_7r0z_dist_frame))
pdb_7r0z_dist_frame_melt <- melt(pdb_7r0z_dist_frame, id.vars = "position")
pdb_7r0z_dist_frame_melt$rbd_position <- as.numeric(substr(pdb_7r0z_dist_frame_melt$variable, 2, 5)) + 321
pdb_7r0z_dist_frame_melt$ace2_position <- as.numeric(pdb_7r0z_dist_frame_melt$position) + 18
pdb_7r0z_dist_ace2_positionlist <- as.numeric(unique(pdb_7r0z_dist_frame_melt$ace2_position)) ## Unique offset-adjusted ACE2 positions
pdb_7r0z_dist_rbd_positionlist <- as.numeric(unique(pdb_7r0z_dist_frame_melt$position)) ## Unique raw row numbers (see NOTE above)
pdb_7r0z_dist_frame_melt$ace2_rbd <- paste(pdb_7r0z_dist_frame_melt$ace2_position, pdb_7r0z_dist_frame_melt$rbd_position, sep = "_")

## 7mjn: RBD offset 329
pdb_7mjn_dist_frame <- read.csv(file = "Data/PDB_contact_maps/7mjn_ace2row_rbdcol_dist_matrix_min.csv")
pdb_7mjn_dist_frame$position <- as.numeric(rownames(pdb_7mjn_dist_frame))
pdb_7mjn_dist_frame_melt <- melt(pdb_7mjn_dist_frame, id.vars = "position")
pdb_7mjn_dist_frame_melt$rbd_position <- as.numeric(substr(pdb_7mjn_dist_frame_melt$variable, 2, 5)) + 329
pdb_7mjn_dist_frame_melt$ace2_position <- as.numeric(pdb_7mjn_dist_frame_melt$position) + 18
pdb_7mjn_dist_ace2_positionlist <- as.numeric(unique(pdb_7mjn_dist_frame_melt$ace2_position)) ## Unique offset-adjusted ACE2 positions
pdb_7mjn_dist_rbd_positionlist <- as.numeric(unique(pdb_7mjn_dist_frame_melt$position)) ## Unique raw row numbers (see NOTE above)
pdb_7mjn_dist_frame_melt$ace2_rbd <- paste(pdb_7mjn_dist_frame_melt$ace2_position, pdb_7mjn_dist_frame_melt$rbd_position, sep = "_")

## 7ekf: RBD offset 332; the distance is also copied into `dist`
pdb_7ekf_dist_frame <- read.csv(file = "Data/PDB_contact_maps/7ekf_ace2row_rbdcol_dist_matrix_min.csv")
pdb_7ekf_dist_frame$position <- as.numeric(rownames(pdb_7ekf_dist_frame))
pdb_7ekf_dist_frame_melt <- melt(pdb_7ekf_dist_frame, id.vars = "position")
pdb_7ekf_dist_frame_melt$rbd_position <- as.numeric(substr(pdb_7ekf_dist_frame_melt$variable, 2, 5)) + 332
pdb_7ekf_dist_frame_melt$ace2_position <- as.numeric(pdb_7ekf_dist_frame_melt$position) + 18
pdb_7ekf_dist_ace2_positionlist <- as.numeric(unique(pdb_7ekf_dist_frame_melt$ace2_position)) ## Unique offset-adjusted ACE2 positions
pdb_7ekf_dist_rbd_positionlist <- as.numeric(unique(pdb_7ekf_dist_frame_melt$position)) ## Unique raw row numbers (see NOTE above)
pdb_7ekf_dist_frame_melt$ace2_rbd <- paste(pdb_7ekf_dist_frame_melt$ace2_position, pdb_7ekf_dist_frame_melt$rbd_position, sep = "_")
pdb_7ekf_dist_frame_melt$dist <- pdb_7ekf_dist_frame_melt$value

## Alpha
## Inner-join 8dlk, 7mjn and 7ekf on the ace2_rbd key (positions come from 8dlk), then drop
## rows whose RBD or ACE2 position is NA
pdb_alpha_combined <- pdb_8dlk_dist_frame_melt[, c("ace2_rbd", "ace2_position", "rbd_position", "value")] %>%
  merge(pdb_7mjn_dist_frame_melt[, c("ace2_rbd", "value")], by = "ace2_rbd") %>%
  merge(pdb_7ekf_dist_frame_melt[, c("ace2_rbd", "dist")], by = "ace2_rbd") %>%
  filter(!is.na(rbd_position) & !is.na(ace2_position))
alpha_dist_cols <- c("dist_8dlk", "dist_7mjn", "dist_7ekf")
colnames(pdb_alpha_combined) <- c("ace2_rbd", "ace2_position", "rbd_position", alpha_dist_cols)

## Mask distances above 50; which() keeps the assignment valid when a column already holds NA
for (dist_col in alpha_dist_cols) {
  beyond_cutoff <- which(pdb_alpha_combined[[dist_col]] > 50)
  pdb_alpha_combined[beyond_cutoff, dist_col] <- NA
}

## Per-pair mean distance and coefficient of variation (sd / mean) across structures, ignoring NAs
pdb_alpha_combined$dist_alpha <- rowMeans(pdb_alpha_combined[, alpha_dist_cols], na.rm = TRUE)
alpha_dist_sd <- unname(apply(as.matrix(pdb_alpha_combined[, alpha_dist_cols]), 1, sd, na.rm = TRUE))
pdb_alpha_combined$cv_alpha <- alpha_dist_sd / pdb_alpha_combined$dist_alpha

## Per-structure distance profiles from ACE2 position 31 across RBD positions,
## with a horizontal reference line at 10 (8dlk and 7mjn are both drawn in red)
alpha_ace2_31_combined <- filter(pdb_alpha_combined, ace2_position == 31)
ggplot(alpha_ace2_31_combined, aes(x = rbd_position)) +
  geom_hline(yintercept = 10) +
  geom_line(aes(y = dist_8dlk), color = "red") +
  geom_line(aes(y = dist_7mjn), color = "red") +
  geom_line(aes(y = dist_7ekf), color = "green")
```

```{r Beta RBD-ACE2 contact maps}
## Load the distance matrices used for Beta (7vx4, 8dln, 7v80, 7ekg, 7sy6).
## Every structure goes through the same steps:
##   1. read the CSV (file names say ACE2 in rows, RBD in columns) and store the row number as `position`
##   2. melt to long form: one row per (position, column) pair, distance in `value`
##   3. rbd_position  = number after the first character of the column name + a per-structure offset
##      (column names that hold no number there become NA, with a coercion warning)
##   4. ace2_position = row number + 18
##   5. ace2_rbd      = "<ace2_position>_<rbd_position>", the key used to merge structures
## The offsets are hard-coded per structure; presumably they map matrix indices to residue numbers -- confirm.
## NOTE(review): each *_rbd_positionlist is built from `position` (the row number), not from
## `rbd_position`. It is left unchanged here; confirm that this is what later code expects.

## 7vx4: RBD offset 332
pdb_7vx4_dist_frame <- read.csv(file = "Data/PDB_contact_maps/7vx4_ace2row_rbdcol_dist_matrix_min.csv")
pdb_7vx4_dist_frame$position <- as.numeric(rownames(pdb_7vx4_dist_frame))
pdb_7vx4_dist_frame_melt <- melt(pdb_7vx4_dist_frame, id.vars = "position")
pdb_7vx4_dist_frame_melt$rbd_position <- as.numeric(substr(pdb_7vx4_dist_frame_melt$variable, 2, 5)) + 332
pdb_7vx4_dist_frame_melt$ace2_position <- as.numeric(pdb_7vx4_dist_frame_melt$position) + 18
pdb_7vx4_dist_ace2_positionlist <- as.numeric(unique(pdb_7vx4_dist_frame_melt$ace2_position)) ## Unique offset-adjusted ACE2 positions
pdb_7vx4_dist_rbd_positionlist <- as.numeric(unique(pdb_7vx4_dist_frame_melt$position)) ## Unique raw row numbers (see NOTE above)
pdb_7vx4_dist_frame_melt$ace2_rbd <- paste(pdb_7vx4_dist_frame_melt$ace2_position, pdb_7vx4_dist_frame_melt$rbd_position, sep = "_")

## 8dln: RBD offset 329
pdb_8dln_dist_frame <- read.csv(file = "Data/PDB_contact_maps/8dln_ace2row_rbdcol_dist_matrix_min.csv")
pdb_8dln_dist_frame$position <- as.numeric(rownames(pdb_8dln_dist_frame))
pdb_8dln_dist_frame_melt <- melt(pdb_8dln_dist_frame, id.vars = "position")
pdb_8dln_dist_frame_melt$rbd_position <- as.numeric(substr(pdb_8dln_dist_frame_melt$variable, 2, 5)) + 329
pdb_8dln_dist_frame_melt$ace2_position <- as.numeric(pdb_8dln_dist_frame_melt$position) + 18
pdb_8dln_dist_ace2_positionlist <- as.numeric(unique(pdb_8dln_dist_frame_melt$ace2_position)) ## Unique offset-adjusted ACE2 positions
pdb_8dln_dist_rbd_positionlist <- as.numeric(unique(pdb_8dln_dist_frame_melt$position)) ## Unique raw row numbers (see NOTE above)
pdb_8dln_dist_frame_melt$ace2_rbd <- paste(pdb_8dln_dist_frame_melt$ace2_position, pdb_8dln_dist_frame_melt$rbd_position, sep = "_")

## 7v80: RBD offset 329
pdb_7v80_dist_frame <- read.csv(file = "Data/PDB_contact_maps/7v80_ace2row_rbdcol_dist_matrix_min.csv")
pdb_7v80_dist_frame$position <- as.numeric(rownames(pdb_7v80_dist_frame))
pdb_7v80_dist_frame_melt <- melt(pdb_7v80_dist_frame, id.vars = "position")
pdb_7v80_dist_frame_melt$rbd_position <- as.numeric(substr(pdb_7v80_dist_frame_melt$variable, 2, 5)) + 329
pdb_7v80_dist_frame_melt$ace2_position <- as.numeric(pdb_7v80_dist_frame_melt$position) + 18
pdb_7v80_dist_ace2_positionlist <- as.numeric(unique(pdb_7v80_dist_frame_melt$ace2_position)) ## Unique offset-adjusted ACE2 positions
pdb_7v80_dist_rbd_positionlist <- as.numeric(unique(pdb_7v80_dist_frame_melt$position)) ## Unique raw row numbers (see NOTE above)
pdb_7v80_dist_frame_melt$ace2_rbd <- paste(pdb_7v80_dist_frame_melt$ace2_position, pdb_7v80_dist_frame_melt$rbd_position, sep = "_")

## 7ekg: RBD offset 332; the distance is also copied into `dist_7ekg`
pdb_7ekg_dist_frame <- read.csv(file = "Data/PDB_contact_maps/7ekg_ace2row_rbdcol_dist_matrix_min.csv")
pdb_7ekg_dist_frame$position <- as.numeric(rownames(pdb_7ekg_dist_frame))
pdb_7ekg_dist_frame_melt <- melt(pdb_7ekg_dist_frame, id.vars = "position")
pdb_7ekg_dist_frame_melt$rbd_position <- as.numeric(substr(pdb_7ekg_dist_frame_melt$variable, 2, 5)) + 332
pdb_7ekg_dist_frame_melt$ace2_position <- as.numeric(pdb_7ekg_dist_frame_melt$position) + 18
pdb_7ekg_dist_ace2_positionlist <- as.numeric(unique(pdb_7ekg_dist_frame_melt$ace2_position)) ## Unique offset-adjusted ACE2 positions
pdb_7ekg_dist_rbd_positionlist <- as.numeric(unique(pdb_7ekg_dist_frame_melt$position)) ## Unique raw row numbers (see NOTE above)
pdb_7ekg_dist_frame_melt$ace2_rbd <- paste(pdb_7ekg_dist_frame_melt$ace2_position, pdb_7ekg_dist_frame_melt$rbd_position, sep = "_")
pdb_7ekg_dist_frame_melt$dist_7ekg <- pdb_7ekg_dist_frame_melt$value

## 7sy6: RBD offset 329; the distance is also copied into `dist_7sy6`
pdb_7sy6_dist_frame <- read.csv(file = "Data/PDB_contact_maps/7sy6_ace2row_rbdcol_dist_matrix_min.csv")
pdb_7sy6_dist_frame$position <- as.numeric(rownames(pdb_7sy6_dist_frame))
pdb_7sy6_dist_frame_melt <- melt(pdb_7sy6_dist_frame, id.vars = "position")
pdb_7sy6_dist_frame_melt$rbd_position <- as.numeric(substr(pdb_7sy6_dist_frame_melt$variable, 2, 5)) + 329
pdb_7sy6_dist_frame_melt$ace2_position <- as.numeric(pdb_7sy6_dist_frame_melt$position) + 18
pdb_7sy6_dist_ace2_positionlist <- as.numeric(unique(pdb_7sy6_dist_frame_melt$ace2_position)) ## Unique offset-adjusted ACE2 positions
pdb_7sy6_dist_rbd_positionlist <- as.numeric(unique(pdb_7sy6_dist_frame_melt$position)) ## Unique raw row numbers (see NOTE above)
pdb_7sy6_dist_frame_melt$ace2_rbd <- paste(pdb_7sy6_dist_frame_melt$ace2_position, pdb_7sy6_dist_frame_melt$rbd_position, sep = "_")
pdb_7sy6_dist_frame_melt$dist_7sy6 <- pdb_7sy6_dist_frame_melt$value


## Beta
## Inner-join the five structures on the ace2_rbd key (positions come from 7vx4), then drop
## rows whose RBD or ACE2 position is NA
pdb_beta_combined <- pdb_7vx4_dist_frame_melt[, c("ace2_rbd", "ace2_position", "rbd_position", "value")] %>%
  merge(pdb_8dln_dist_frame_melt[, c("ace2_rbd", "value")], by = "ace2_rbd") %>%
  merge(pdb_7v80_dist_frame_melt[, c("ace2_rbd", "value")], by = "ace2_rbd") %>%
  merge(pdb_7ekg_dist_frame_melt[, c("ace2_rbd", "dist_7ekg")], by = "ace2_rbd") %>%
  merge(pdb_7sy6_dist_frame_melt[, c("ace2_rbd", "dist_7sy6")], by = "ace2_rbd") %>%
  filter(!is.na(rbd_position) & !is.na(ace2_position))
beta_dist_cols <- c("dist_7vx4", "dist_8dln", "dist_7v80", "dist_7ekg", "dist_7sy6")
colnames(pdb_beta_combined) <- c("ace2_rbd", "ace2_position", "rbd_position", beta_dist_cols)

## Mask distances above 50; which() keeps the assignment valid when a column already holds NA
for (dist_col in beta_dist_cols) {
  beyond_cutoff <- which(pdb_beta_combined[[dist_col]] > 50)
  pdb_beta_combined[beyond_cutoff, dist_col] <- NA
}

## Per-pair mean distance and coefficient of variation (sd / mean) across structures.
## The sd now ignores NAs just like the mean does; without na.rm every pair with a masked
## distance ended up with cv_beta = NA even though dist_beta was defined.
pdb_beta_combined$dist_beta <- rowMeans(pdb_beta_combined[, beta_dist_cols], na.rm = TRUE)
beta_dist_sd <- unname(apply(as.matrix(pdb_beta_combined[, beta_dist_cols]), 1, sd, na.rm = TRUE))
pdb_beta_combined$cv_beta <- beta_dist_sd / pdb_beta_combined$dist_beta

## Per-structure distance profiles from ACE2 position 31 across RBD positions,
## with a horizontal reference line at 10 (dist_7sy6 is not drawn)
beta_ace2_31_combined <- filter(pdb_beta_combined, ace2_position == 31)
ggplot(beta_ace2_31_combined, aes(x = rbd_position)) +
  geom_hline(yintercept = 10) +
  geom_line(aes(y = dist_7vx4), color = "red") +
  geom_line(aes(y = dist_8dln), color = "blue") +
  geom_line(aes(y = dist_7v80), color = "green") +
  geom_line(aes(y = dist_7ekg), color = "black")
```

```{r Gamma RBD-ACE2 contact maps}
## Load the distance matrices used for Gamma (7ekc, 7v84, 8dlq, 7sy8).
## Every structure goes through the same steps:
##   1. read the CSV (file names say ACE2 in rows, RBD in columns) and store the row number as `position`
##   2. melt to long form: one row per (position, column) pair, distance in `value`
##   3. rbd_position  = number after the first character of the column name + a per-structure offset
##      (column names that hold no number there become NA, with a coercion warning)
##   4. ace2_position = row number + 18
##   5. ace2_rbd      = "<ace2_position>_<rbd_position>", the key used to merge structures
## The offsets are hard-coded per structure; presumably they map matrix indices to residue numbers -- confirm.
## NOTE(review): each *_rbd_positionlist is built from `position` (the row number), not from
## `rbd_position`. It is left unchanged here; confirm that this is what later code expects.

## 7ekc: RBD offset 332
pdb_7ekc_dist_frame <- read.csv(file = "Data/PDB_contact_maps/7ekc_ace2row_rbdcol_dist_matrix_min.csv")
pdb_7ekc_dist_frame$position <- as.numeric(rownames(pdb_7ekc_dist_frame))
pdb_7ekc_dist_frame_melt <- melt(pdb_7ekc_dist_frame, id.vars = "position")
pdb_7ekc_dist_frame_melt$rbd_position <- as.numeric(substr(pdb_7ekc_dist_frame_melt$variable, 2, 5)) + 332
pdb_7ekc_dist_frame_melt$ace2_position <- as.numeric(pdb_7ekc_dist_frame_melt$position) + 18
pdb_7ekc_dist_ace2_positionlist <- as.numeric(unique(pdb_7ekc_dist_frame_melt$ace2_position)) ## Unique offset-adjusted ACE2 positions
pdb_7ekc_dist_rbd_positionlist <- as.numeric(unique(pdb_7ekc_dist_frame_melt$position)) ## Unique raw row numbers (see NOTE above)
pdb_7ekc_dist_frame_melt$ace2_rbd <- paste(pdb_7ekc_dist_frame_melt$ace2_position, pdb_7ekc_dist_frame_melt$rbd_position, sep = "_")

## 7v84: RBD offset 329; rows melted from the column named "X" are dropped for this structure only
pdb_7v84_dist_frame <- read.csv(file = "Data/PDB_contact_maps/7v84_ace2row_rbdcol_dist_matrix_min.csv")
pdb_7v84_dist_frame$position <- as.numeric(rownames(pdb_7v84_dist_frame))
pdb_7v84_dist_frame_melt <- melt(pdb_7v84_dist_frame, id.vars = "position") %>% filter(variable != "X")
pdb_7v84_dist_frame_melt$rbd_position <- as.numeric(substr(pdb_7v84_dist_frame_melt$variable, 2, 5)) + 329
pdb_7v84_dist_frame_melt$ace2_position <- as.numeric(pdb_7v84_dist_frame_melt$position) + 18
pdb_7v84_dist_ace2_positionlist <- as.numeric(unique(pdb_7v84_dist_frame_melt$ace2_position)) ## Unique offset-adjusted ACE2 positions
pdb_7v84_dist_rbd_positionlist <- as.numeric(unique(pdb_7v84_dist_frame_melt$position)) ## Unique raw row numbers (see NOTE above)
pdb_7v84_dist_frame_melt$ace2_rbd <- paste(pdb_7v84_dist_frame_melt$ace2_position, pdb_7v84_dist_frame_melt$rbd_position, sep = "_")

## 8dlq: RBD offset 329
pdb_8dlq_dist_frame <- read.csv(file = "Data/PDB_contact_maps/8dlq_ace2row_rbdcol_dist_matrix_min.csv")
pdb_8dlq_dist_frame$position <- as.numeric(rownames(pdb_8dlq_dist_frame))
pdb_8dlq_dist_frame_melt <- melt(pdb_8dlq_dist_frame, id.vars = "position")
pdb_8dlq_dist_frame_melt$rbd_position <- as.numeric(substr(pdb_8dlq_dist_frame_melt$variable, 2, 5)) + 329
pdb_8dlq_dist_frame_melt$ace2_position <- as.numeric(pdb_8dlq_dist_frame_melt$position) + 18
pdb_8dlq_dist_ace2_positionlist <- as.numeric(unique(pdb_8dlq_dist_frame_melt$ace2_position)) ## Unique offset-adjusted ACE2 positions
pdb_8dlq_dist_rbd_positionlist <- as.numeric(unique(pdb_8dlq_dist_frame_melt$position)) ## Unique raw row numbers (see NOTE above)
pdb_8dlq_dist_frame_melt$ace2_rbd <- paste(pdb_8dlq_dist_frame_melt$ace2_position, pdb_8dlq_dist_frame_melt$rbd_position, sep = "_")

## 7sy8: RBD offset 329; the distance is also copied into `dist_7sy8`
pdb_7sy8_dist_frame <- read.csv(file = "Data/PDB_contact_maps/7sy8_ace2row_rbdcol_dist_matrix_min.csv")
pdb_7sy8_dist_frame$position <- as.numeric(rownames(pdb_7sy8_dist_frame))
pdb_7sy8_dist_frame_melt <- melt(pdb_7sy8_dist_frame, id.vars = "position")
pdb_7sy8_dist_frame_melt$rbd_position <- as.numeric(substr(pdb_7sy8_dist_frame_melt$variable, 2, 5)) + 329
pdb_7sy8_dist_frame_melt$ace2_position <- as.numeric(pdb_7sy8_dist_frame_melt$position) + 18
pdb_7sy8_dist_ace2_positionlist <- as.numeric(unique(pdb_7sy8_dist_frame_melt$ace2_position)) ## Unique offset-adjusted ACE2 positions
pdb_7sy8_dist_rbd_positionlist <- as.numeric(unique(pdb_7sy8_dist_frame_melt$position)) ## Unique raw row numbers (see NOTE above)
pdb_7sy8_dist_frame_melt$ace2_rbd <- paste(pdb_7sy8_dist_frame_melt$ace2_position, pdb_7sy8_dist_frame_melt$rbd_position, sep = "_")
pdb_7sy8_dist_frame_melt$dist_7sy8 <- pdb_7sy8_dist_frame_melt$value


## Gamma
## Inner-join the four structures on the ace2_rbd key (positions come from 7ekc), then drop
## rows whose RBD or ACE2 position is NA
pdb_gamma_combined <- pdb_7ekc_dist_frame_melt[, c("ace2_rbd", "ace2_position", "rbd_position", "value")] %>%
  merge(pdb_7v84_dist_frame_melt[, c("ace2_rbd", "value")], by = "ace2_rbd") %>%
  merge(pdb_8dlq_dist_frame_melt[, c("ace2_rbd", "value")], by = "ace2_rbd") %>%
  merge(pdb_7sy8_dist_frame_melt[, c("ace2_rbd", "dist_7sy8")], by = "ace2_rbd") %>%
  filter(!is.na(rbd_position) & !is.na(ace2_position))
gamma_dist_cols <- c("dist_7ekc", "dist_7v84", "dist_8dlq", "dist_7sy8")
colnames(pdb_gamma_combined) <- c("ace2_rbd", "ace2_position", "rbd_position", gamma_dist_cols)

## dist_7v84 is coerced to numeric before thresholding; entries that cannot be parsed become NA
pdb_gamma_combined$dist_7v84 <- as.numeric(pdb_gamma_combined$dist_7v84)

## Mask distances above 50; which() skips NA entries, so the same NA-safe step serves all
## four columns (including dist_7v84, which can hold NA after the coercion above)
for (dist_col in gamma_dist_cols) {
  beyond_cutoff <- which(pdb_gamma_combined[[dist_col]] > 50)
  pdb_gamma_combined[beyond_cutoff, dist_col] <- NA
}

## Per-pair mean distance and coefficient of variation (sd / mean) across structures, ignoring NAs
pdb_gamma_combined$dist_gamma <- rowMeans(pdb_gamma_combined[, gamma_dist_cols], na.rm = TRUE)
gamma_dist_sd <- unname(apply(as.matrix(pdb_gamma_combined[, gamma_dist_cols]), 1, sd, na.rm = TRUE))
pdb_gamma_combined$cv_gamma <- gamma_dist_sd / pdb_gamma_combined$dist_gamma

## Per-structure distance profiles from ACE2 position 31 across RBD positions,
## with a horizontal reference line at 10
gamma_ace2_31_combined <- pdb_gamma_combined %>% filter(ace2_position == 31)
ggplot() + geom_hline(yintercept = 10) +
  geom_line(data = gamma_ace2_31_combined, aes(x = rbd_position, y = dist_7ekc), color = "red") +
  geom_line(data = gamma_ace2_31_combined, aes(x = rbd_position, y = dist_7v84), color = "blue") +
  geom_line(data = gamma_ace2_31_combined, aes(x = rbd_position, y = dist_8dlq), color = "green") +
  ## The orange line shows the fourth structure (7sy8); it previously repeated dist_8dlq
  geom_line(data = gamma_ace2_31_combined, aes(x = rbd_position, y = dist_7sy8), color = "orange")
```


```{r Delta RBD-ACE2 contact maps}
## Load the distance matrices used for Delta (7w9i, 7v8b, 7tew, 7wbq).
## Every structure goes through the same steps:
##   1. read the CSV (file names say ACE2 in rows, RBD in columns) and store the row number as `position`
##   2. melt to long form: one row per (position, column) pair, distance in `value`
##   3. rbd_position  = number after the first character of the column name + a per-structure offset
##      (column names that hold no number there become NA, with a coercion warning)
##   4. ace2_position = row number + 18
##   5. ace2_rbd      = "<ace2_position>_<rbd_position>", the key used to merge structures
## The offsets are hard-coded per structure; presumably they map matrix indices to residue numbers -- confirm.
## NOTE(review): each *_rbd_positionlist is built from `position` (the row number), not from
## `rbd_position`. It is left unchanged here; confirm that this is what later code expects.

## 7w9i: RBD offset 332
pdb_7w9i_dist_frame <- read.csv(file = "Data/PDB_contact_maps/7w9i_ace2row_rbdcol_dist_matrix_min.csv")
pdb_7w9i_dist_frame$position <- as.numeric(rownames(pdb_7w9i_dist_frame))
pdb_7w9i_dist_frame_melt <- melt(pdb_7w9i_dist_frame, id.vars = "position")
pdb_7w9i_dist_frame_melt$rbd_position <- as.numeric(substr(pdb_7w9i_dist_frame_melt$variable, 2, 5)) + 332
pdb_7w9i_dist_frame_melt$ace2_position <- as.numeric(pdb_7w9i_dist_frame_melt$position) + 18
pdb_7w9i_dist_ace2_positionlist <- as.numeric(unique(pdb_7w9i_dist_frame_melt$ace2_position)) ## Unique offset-adjusted ACE2 positions
pdb_7w9i_dist_rbd_positionlist <- as.numeric(unique(pdb_7w9i_dist_frame_melt$position)) ## Unique raw row numbers (see NOTE above)
pdb_7w9i_dist_frame_melt$ace2_rbd <- paste(pdb_7w9i_dist_frame_melt$ace2_position, pdb_7w9i_dist_frame_melt$rbd_position, sep = "_")

## 7v8b: RBD offset 329
pdb_7v8b_dist_frame <- read.csv(file = "Data/PDB_contact_maps/7v8b_ace2row_rbdcol_dist_matrix_min.csv")
pdb_7v8b_dist_frame$position <- as.numeric(rownames(pdb_7v8b_dist_frame))
pdb_7v8b_dist_frame_melt <- melt(pdb_7v8b_dist_frame, id.vars = "position")
pdb_7v8b_dist_frame_melt$rbd_position <- as.numeric(substr(pdb_7v8b_dist_frame_melt$variable, 2, 5)) + 329
pdb_7v8b_dist_frame_melt$ace2_position <- as.numeric(pdb_7v8b_dist_frame_melt$position) + 18
pdb_7v8b_dist_ace2_positionlist <- as.numeric(unique(pdb_7v8b_dist_frame_melt$ace2_position)) ## Unique offset-adjusted ACE2 positions
pdb_7v8b_dist_rbd_positionlist <- as.numeric(unique(pdb_7v8b_dist_frame_melt$position)) ## Unique raw row numbers (see NOTE above)
pdb_7v8b_dist_frame_melt$ace2_rbd <- paste(pdb_7v8b_dist_frame_melt$ace2_position, pdb_7v8b_dist_frame_melt$rbd_position, sep = "_")

## 7tew: RBD offset 329
pdb_7tew_dist_frame <- read.csv(file = "Data/PDB_contact_maps/7tew_ace2row_rbdcol_dist_matrix_min.csv")
pdb_7tew_dist_frame$position <- as.numeric(rownames(pdb_7tew_dist_frame))
pdb_7tew_dist_frame_melt <- melt(pdb_7tew_dist_frame, id.vars = "position")
pdb_7tew_dist_frame_melt$rbd_position <- as.numeric(substr(pdb_7tew_dist_frame_melt$variable, 2, 5)) + 329
pdb_7tew_dist_frame_melt$ace2_position <- as.numeric(pdb_7tew_dist_frame_melt$position) + 18
pdb_7tew_dist_ace2_positionlist <- as.numeric(unique(pdb_7tew_dist_frame_melt$ace2_position)) ## Unique offset-adjusted ACE2 positions
pdb_7tew_dist_rbd_positionlist <- as.numeric(unique(pdb_7tew_dist_frame_melt$position)) ## Unique raw row numbers (see NOTE above)
pdb_7tew_dist_frame_melt$ace2_rbd <- paste(pdb_7tew_dist_frame_melt$ace2_position, pdb_7tew_dist_frame_melt$rbd_position, sep = "_")

## 7wbq: RBD offset 332; the distance is also copied into `dist_7wbq`
pdb_7wbq_dist_frame <- read.csv(file = "Data/PDB_contact_maps/7wbq_ace2row_rbdcol_dist_matrix_min.csv")
pdb_7wbq_dist_frame$position <- as.numeric(rownames(pdb_7wbq_dist_frame))
pdb_7wbq_dist_frame_melt <- melt(pdb_7wbq_dist_frame, id.vars = "position")
pdb_7wbq_dist_frame_melt$rbd_position <- as.numeric(substr(pdb_7wbq_dist_frame_melt$variable, 2, 5)) + 332
pdb_7wbq_dist_frame_melt$ace2_position <- as.numeric(pdb_7wbq_dist_frame_melt$position) + 18
pdb_7wbq_dist_ace2_positionlist <- as.numeric(unique(pdb_7wbq_dist_frame_melt$ace2_position)) ## Unique offset-adjusted ACE2 positions
pdb_7wbq_dist_rbd_positionlist <- as.numeric(unique(pdb_7wbq_dist_frame_melt$position)) ## Unique raw row numbers (see NOTE above)
pdb_7wbq_dist_frame_melt$ace2_rbd <- paste(pdb_7wbq_dist_frame_melt$ace2_position, pdb_7wbq_dist_frame_melt$rbd_position, sep = "_")
pdb_7wbq_dist_frame_melt$dist_7wbq <- pdb_7wbq_dist_frame_melt$value

## Delta
## Inner-join the four structures on the ace2_rbd key (positions come from 7w9i), then drop
## rows whose RBD or ACE2 position is NA
pdb_delta_combined <- pdb_7w9i_dist_frame_melt[, c("ace2_rbd", "ace2_position", "rbd_position", "value")] %>%
  merge(pdb_7v8b_dist_frame_melt[, c("ace2_rbd", "value")], by = "ace2_rbd") %>%
  merge(pdb_7tew_dist_frame_melt[, c("ace2_rbd", "value")], by = "ace2_rbd") %>%
  merge(pdb_7wbq_dist_frame_melt[, c("ace2_rbd", "dist_7wbq")], by = "ace2_rbd") %>%
  filter(!is.na(rbd_position) & !is.na(ace2_position))
delta_dist_cols <- c("dist_7w9i", "dist_7v8b", "dist_7tew", "dist_7wbq")
colnames(pdb_delta_combined) <- c("ace2_rbd", "ace2_position", "rbd_position", delta_dist_cols)

## Mask distances above 50; which() keeps the assignment valid when a column already holds NA
for (dist_col in delta_dist_cols) {
  beyond_cutoff <- which(pdb_delta_combined[[dist_col]] > 50)
  pdb_delta_combined[beyond_cutoff, dist_col] <- NA
}

## Per-pair mean distance and coefficient of variation (sd / mean) across structures, ignoring NAs
pdb_delta_combined$dist_delta <- rowMeans(pdb_delta_combined[, delta_dist_cols], na.rm = TRUE)
delta_dist_sd <- unname(apply(as.matrix(pdb_delta_combined[, delta_dist_cols]), 1, sd, na.rm = TRUE))
pdb_delta_combined$cv_delta <- delta_dist_sd / pdb_delta_combined$dist_delta

## Distance from ACE2 residue 31 to each RBD residue, one line per Delta structure (guide line at 10).
## All four structures that feed dist_delta are drawn, including 7wbq, so each structure's
## RBD numbering offset can be compared by eye.
delta_ace2_31_combined <- pdb_delta_combined %>% filter(ace2_position == 31)
ggplot(delta_ace2_31_combined, aes(x = rbd_position)) +
  geom_hline(yintercept = 10) +
  geom_line(aes(y = dist_7w9i), color = "red") +
  geom_line(aes(y = dist_7v8b), color = "blue") +
  geom_line(aes(y = dist_7tew), color = "green") +
  geom_line(aes(y = dist_7wbq), color = "purple") +
  NULL
```

```{r BA1 RBD-ACE2 contact maps}
## 7wk6 distance matrix (ACE2 in rows, RBD in columns, per the file name)
pdb_7wk6_dist_frame <- read.csv(file = "Data/PDB_contact_maps/7wk6_ace2row_rbdcol_dist_matrix_min.csv")
## Keep the row names as a numeric `position` column so they survive the melt
pdb_7wk6_dist_frame$position <- as.numeric(rownames(pdb_7wk6_dist_frame))
## Long format: one row per (position, matrix column) pair, with the distance in `value`
pdb_7wk6_dist_frame_melt <- melt(pdb_7wk6_dist_frame, id.vars = "position")
## Drop the first character of the column name (presumably the "X" prefix read.csv adds) and shift by this structure's RBD offset
pdb_7wk6_dist_frame_melt$rbd_position <- as.numeric(substr(pdb_7wk6_dist_frame_melt$variable, 2, 5)) + 332
## Shift by 18 to get the ACE2 residue number (numbering starts at residue 19)
pdb_7wk6_dist_frame_melt$ace2_position <- as.numeric(pdb_7wk6_dist_frame_melt$position) + 18
pdb_7wk6_dist_ace2_positionlist <- as.numeric(unique(pdb_7wk6_dist_frame_melt$ace2_position)) ## The +18 offset is already applied
## NOTE(review): despite its name this list is built from `position` (the un-offset row index), not from `rbd_position` -- confirm before relying on it
pdb_7wk6_dist_rbd_positionlist <- as.numeric(unique(pdb_7wk6_dist_frame_melt$position))
## Join key "<ace2>_<rbd>"; full column names are used so the lookup does not depend on `$` partial matching
pdb_7wk6_dist_frame_melt$ace2_rbd <- paste(pdb_7wk6_dist_frame_melt$ace2_position, pdb_7wk6_dist_frame_melt$rbd_position, sep = "_")

## 7t9l distance matrix (ACE2 in rows, RBD in columns, per the file name)
pdb_7t9l_dist_frame <- read.csv(file = "Data/PDB_contact_maps/7t9l_ace2row_rbdcol_dist_matrix_min.csv")
## Keep the row names as a numeric `position` column so they survive the melt
pdb_7t9l_dist_frame$position <- as.numeric(rownames(pdb_7t9l_dist_frame))
## Long format: one row per (position, matrix column) pair, with the distance in `value`
pdb_7t9l_dist_frame_melt <- melt(pdb_7t9l_dist_frame, id.vars = "position")
## Drop the first character of the column name (presumably the "X" prefix read.csv adds); this structure uses an RBD offset of 329
pdb_7t9l_dist_frame_melt$rbd_position <- as.numeric(substr(pdb_7t9l_dist_frame_melt$variable, 2, 5)) + 329
## Shift by 18 to get the ACE2 residue number (numbering starts at residue 19)
pdb_7t9l_dist_frame_melt$ace2_position <- as.numeric(pdb_7t9l_dist_frame_melt$position) + 18
pdb_7t9l_dist_ace2_positionlist <- as.numeric(unique(pdb_7t9l_dist_frame_melt$ace2_position)) ## The +18 offset is already applied
## NOTE(review): despite its name this list is built from `position` (the un-offset row index), not from `rbd_position` -- confirm before relying on it
pdb_7t9l_dist_rbd_positionlist <- as.numeric(unique(pdb_7t9l_dist_frame_melt$position))
## Join key "<ace2>_<rbd>"; full column names are used so the lookup does not depend on `$` partial matching
pdb_7t9l_dist_frame_melt$ace2_rbd <- paste(pdb_7t9l_dist_frame_melt$ace2_position, pdb_7t9l_dist_frame_melt$rbd_position, sep = "_")

## 7wpb distance matrix (ACE2 in rows, RBD in columns, per the file name)
pdb_7wpb_dist_frame <- read.csv(file = "Data/PDB_contact_maps/7wpb_ace2row_rbdcol_dist_matrix_min.csv")
## Keep the row names as a numeric `position` column so they survive the melt
pdb_7wpb_dist_frame$position <- as.numeric(rownames(pdb_7wpb_dist_frame))
## Long format: one row per (position, matrix column) pair, with the distance in `value`
pdb_7wpb_dist_frame_melt <- melt(pdb_7wpb_dist_frame, id.vars = "position")
## Drop the first character of the column name (presumably the "X" prefix read.csv adds); this structure uses an RBD offset of 330
pdb_7wpb_dist_frame_melt$rbd_position <- as.numeric(substr(pdb_7wpb_dist_frame_melt$variable, 2, 5)) + 330
## Shift by 18 to get the ACE2 residue number (numbering starts at residue 19)
pdb_7wpb_dist_frame_melt$ace2_position <- as.numeric(pdb_7wpb_dist_frame_melt$position) + 18
pdb_7wpb_dist_ace2_positionlist <- as.numeric(unique(pdb_7wpb_dist_frame_melt$ace2_position)) ## The +18 offset is already applied
## NOTE(review): despite its name this list is built from `position` (the un-offset row index), not from `rbd_position` -- confirm before relying on it
pdb_7wpb_dist_rbd_positionlist <- as.numeric(unique(pdb_7wpb_dist_frame_melt$position))
## Join key "<ace2>_<rbd>"; full column names are used so the lookup does not depend on `$` partial matching
pdb_7wpb_dist_frame_melt$ace2_rbd <- paste(pdb_7wpb_dist_frame_melt$ace2_position, pdb_7wpb_dist_frame_melt$rbd_position, sep = "_")

## 7wbp distance matrix (ACE2 in rows, RBD in columns, per the file name)
pdb_7wbp_dist_frame <- read.csv(file = "Data/PDB_contact_maps/7wbp_ace2row_rbdcol_dist_matrix_min.csv")
## Keep the row names as a numeric `position` column so they survive the melt
pdb_7wbp_dist_frame$position <- as.numeric(rownames(pdb_7wbp_dist_frame))
## Long format, discarding the rows that come from a column literally named "X"
## (presumably an index column stored in the CSV -- confirm)
pdb_7wbp_dist_frame_melt <- melt(pdb_7wbp_dist_frame, id.vars = "position") %>% filter(variable != "X")
## Drop the first character of the column name (presumably the "X" prefix read.csv adds) and shift by this structure's RBD offset
pdb_7wbp_dist_frame_melt$rbd_position <- as.numeric(substr(pdb_7wbp_dist_frame_melt$variable, 2, 5)) + 332
## Shift by 18 to get the ACE2 residue number (numbering starts at residue 19)
pdb_7wbp_dist_frame_melt$ace2_position <- as.numeric(pdb_7wbp_dist_frame_melt$position) + 18
pdb_7wbp_dist_ace2_positionlist <- as.numeric(unique(pdb_7wbp_dist_frame_melt$ace2_position)) ## The +18 offset is already applied
## NOTE(review): despite its name this list is built from `position` (the un-offset row index), not from `rbd_position` -- confirm before relying on it
pdb_7wbp_dist_rbd_positionlist <- as.numeric(unique(pdb_7wbp_dist_frame_melt$position))
## Join key "<ace2>_<rbd>"; full column names are used so the lookup does not depend on `$` partial matching
pdb_7wbp_dist_frame_melt$ace2_rbd <- paste(pdb_7wbp_dist_frame_melt$ace2_position, pdb_7wbp_dist_frame_melt$rbd_position, sep = "_")
## Uniquely named copy of the distance so it does not collide with other `value` columns when merged
pdb_7wbp_dist_frame_melt$dist_7wbp <- pdb_7wbp_dist_frame_melt$value

## 7whh distance matrix (ACE2 in rows, RBD in columns, per the file name)
pdb_7whh_dist_frame <- read.csv(file = "Data/PDB_contact_maps/7whh_ace2row_rbdcol_dist_matrix_min.csv")
## Keep the row names as a numeric `position` column so they survive the melt
pdb_7whh_dist_frame$position <- as.numeric(rownames(pdb_7whh_dist_frame))
## Long format, discarding the rows that come from a column literally named "X"
## (presumably an index column stored in the CSV -- confirm)
pdb_7whh_dist_frame_melt <- melt(pdb_7whh_dist_frame, id.vars = "position") %>% filter(variable != "X")
## Drop the first character of the column name (presumably the "X" prefix read.csv adds) and shift by this structure's RBD offset
pdb_7whh_dist_frame_melt$rbd_position <- as.numeric(substr(pdb_7whh_dist_frame_melt$variable, 2, 5)) + 332
## Shift by 18 to get the ACE2 residue number (numbering starts at residue 19)
pdb_7whh_dist_frame_melt$ace2_position <- as.numeric(pdb_7whh_dist_frame_melt$position) + 18
pdb_7whh_dist_ace2_positionlist <- as.numeric(unique(pdb_7whh_dist_frame_melt$ace2_position)) ## The +18 offset is already applied
## NOTE(review): despite its name this list is built from `position` (the un-offset row index), not from `rbd_position` -- confirm before relying on it
pdb_7whh_dist_rbd_positionlist <- as.numeric(unique(pdb_7whh_dist_frame_melt$position))
## Join key "<ace2>_<rbd>"; full column names are used so the lookup does not depend on `$` partial matching
pdb_7whh_dist_frame_melt$ace2_rbd <- paste(pdb_7whh_dist_frame_melt$ace2_position, pdb_7whh_dist_frame_melt$rbd_position, sep = "_")
## Uniquely named copy of the distance so it does not collide with other `value` columns when merged
pdb_7whh_dist_frame_melt$dist_7whh <- pdb_7whh_dist_frame_melt$value

## BA1
## Join the five BA1 structures on the "ace2_rbd" key. Each table is reduced to the key plus one
## uniquely named distance column before merging, so no positional renaming is needed afterwards;
## the first table also supplies the ACE2 / RBD residue numbers. ACE2 position 401 is excluded.
ba1_dist_cols <- c("dist_7wk6", "dist_7t9l", "dist_7wpb", "dist_7wbp", "dist_7whh")
pdb_ba1_combined <- Reduce(
  function(left, right) merge(left, right, by = "ace2_rbd"),
  list(
    setNames(
      pdb_7wk6_dist_frame_melt[, c("ace2_rbd", "ace2_position", "rbd_position", "value")],
      c("ace2_rbd", "ace2_position", "rbd_position", "dist_7wk6")
    ),
    setNames(pdb_7t9l_dist_frame_melt[, c("ace2_rbd", "value")], c("ace2_rbd", "dist_7t9l")),
    setNames(pdb_7wpb_dist_frame_melt[, c("ace2_rbd", "value")], c("ace2_rbd", "dist_7wpb")),
    pdb_7wbp_dist_frame_melt[, c("ace2_rbd", "dist_7wbp")],
    pdb_7whh_dist_frame_melt[, c("ace2_rbd", "dist_7whh")]
  )
) %>%
  filter(!is.na(rbd_position) & !is.na(ace2_position)) %>%
  filter(ace2_position != 401)

## Treat distances above 50 as missing; which() skips values that are already NA
for (dist_col in ba1_dist_cols) {
  structure_distances <- pdb_ba1_combined[[dist_col]]
  structure_distances[which(structure_distances > 50)] <- NA
  pdb_ba1_combined[[dist_col]] <- structure_distances
}

## Per residue pair: mean distance across structures and coefficient of variation (sd / mean),
## both ignoring missing structures. Computed row-wise without an index loop so a zero-row table is fine.
pdb_ba1_combined$dist_ba1 <- rowMeans(pdb_ba1_combined[, ba1_dist_cols], na.rm = TRUE)
pdb_ba1_combined$cv_ba1 <- unname(apply(as.matrix(pdb_ba1_combined[, ba1_dist_cols]), 1, sd, na.rm = TRUE)) / pdb_ba1_combined$dist_ba1

## Distance from ACE2 residue 31 to each RBD residue, one line per BA1 structure (guide line at 10)
ba1_ace2_31_combined <- pdb_ba1_combined %>% filter(ace2_position == 31)
ggplot(ba1_ace2_31_combined, aes(x = rbd_position)) +
  geom_hline(yintercept = 10) +
  geom_line(aes(y = dist_7wk6), color = "black") +
  geom_line(aes(y = dist_7t9l), color = "black") +
  geom_line(aes(y = dist_7wpb), color = "black") +
  geom_line(aes(y = dist_7wbp), color = "green") +
  geom_line(aes(y = dist_7whh), color = "red") +
  NULL
```

```{r Merging all of the structural distance data into a single dataframe}
## One mean-distance column per variant, joined on the shared "ace2_rbd" key.
## The D614G table also supplies the ACE2 / RBD residue numbers.
## (A pdb_muba1_combined table appears only in a commented-out fragment of the original call and is not merged.)
pdb_variants <- Reduce(
  function(left, right) merge(left, right, by = "ace2_rbd"),
  list(
    pdb_d614g_combined[, c("ace2_rbd", "ace2_position", "rbd_position", "dist_d614g")],
    pdb_alpha_combined[, c("ace2_rbd", "dist_alpha")],
    pdb_beta_combined[, c("ace2_rbd", "dist_beta")],
    pdb_gamma_combined[, c("ace2_rbd", "dist_gamma")],
    pdb_delta_combined[, c("ace2_rbd", "dist_delta")],
    pdb_ba1_combined[, c("ace2_rbd", "dist_ba1")]
  )
)

## Check to make sure all RBD numbers are aligned:
## distances from ACE2 residue 31 to every RBD residue, one point series per variant
ace2_31_combined <- pdb_variants %>% filter(ace2_position == 31)
ggplot(ace2_31_combined, aes(x = rbd_position)) +
  labs(x = "RBD residue", y = "Distance (angstroms)") +
  scale_y_continuous(limits = c(0, 50)) +
  geom_hline(yintercept = c(4, 10), alpha = 0.4) +
  geom_point(aes(y = dist_d614g), color = "green", alpha = 0.4, size = 2) +
  geom_point(aes(y = dist_alpha), color = "orange", alpha = 0.4, size = 2) +
  geom_point(aes(y = dist_beta), color = "red", alpha = 0.4, size = 2) +
  geom_point(aes(y = dist_gamma), color = "red", alpha = 0.4, size = 2) +
  geom_point(aes(y = dist_delta), color = "blue", alpha = 0.4, size = 2) +
  geom_point(aes(y = dist_ba1), color = "purple", alpha = 0.4, size = 2) +
  NULL

## Line plot of the distance from one ACE2 residue to every RBD residue, one line per variant.
## `residue_distances` is pdb_variants filtered to a single ACE2 position.
## Horizontal guides are drawn at 4 and 10 angstroms; the y axis is limited to 0-50.
plot_variant_rbd_distances <- function(residue_distances) {
  ggplot(residue_distances, aes(x = rbd_position)) +
    theme(panel.grid.major = element_blank()) +
    labs(x = "RBD residue", y = "Distance (angstroms)") +
    scale_y_continuous(limits = c(0, 50)) +
    geom_hline(yintercept = c(4, 10), alpha = 0.2) +
    geom_line(aes(y = dist_d614g), color = "red", alpha = 0.4) +
    geom_line(aes(y = dist_alpha), color = "orange", alpha = 0.4) +
    geom_line(aes(y = dist_beta), color = "green", alpha = 0.2) +
    geom_line(aes(y = dist_gamma), color = "cyan", alpha = 0.2) +
    geom_line(aes(y = dist_delta), color = "blue", alpha = 0.2) +
    geom_line(aes(y = dist_ba1), color = "purple", alpha = 0.2)
}

## ACE2 residue 31
ACE2_K31_variantsRBD_distances <- plot_variant_rbd_distances(ace2_31_combined)
ggsave(filename = "Plots/ACE2_K31_variantsRBD_distances.pdf", plot = ACE2_K31_variantsRBD_distances, height = 2, width = 2.5)

## ACE2 residue 38
ace2_38_combined <- pdb_variants %>% filter(ace2_position == 38)
ACE2_D38_variantsRBD_distances <- plot_variant_rbd_distances(ace2_38_combined)
ggsave(filename = "Plots/ACE2_D38_variantsRBD_distances.pdf", plot = ACE2_D38_variantsRBD_distances, height = 2, width = 2.5)
```

```{r Calculate CV by position}
## Variability by ACE2 position
## Mean, standard deviation and coefficient of variation (sd / mean) of each residue pair's distance
## across the six variants. Columns are selected by name so the result does not depend on column order.
variant_distance_cols <- c("dist_d614g", "dist_alpha", "dist_beta", "dist_gamma", "dist_delta", "dist_ba1")
pdb_variants$mean <- rowMeans(pdb_variants[, variant_distance_cols])
pdb_variants$sd <- apply(as.matrix(pdb_variants[, variant_distance_cols]), 1, sd)
pdb_variants$cv <- pdb_variants$sd / pdb_variants$mean

## Restrict to residue pairs whose mean distance is 20 or less
pdb_variants_20_or_less <- pdb_variants %>% filter(mean <= 20)

## Heat map of the CV over ACE2 x RBD positions
ggplot(pdb_variants_20_or_less, aes(x = ace2_position, y = rbd_position, fill = cv)) +
  scale_fill_continuous(low = "white", high = "black") +
  geom_tile()

## Distribution of the CV on a log scale, with the 0.2 threshold marked in red
Variable_contact_histogram <- ggplot(pdb_variants_20_or_less, aes(x = cv)) +
  scale_x_log10(limits = c(0.003, 1)) +
  labs(x = "Coefficient of variation", y = "Number of contacts") +
  geom_histogram(bins = 70) +
  geom_vline(xintercept = 0.2, color = "red", alpha = 0.4)
ggsave(filename = "Plots/Variable_contact_histogram.pdf", plot = Variable_contact_histogram, height = 2, width = 2.2)

## Residue pairs at or above the 0.2 CV threshold
pdb_variants_20_or_less_cv_pt2 <- pdb_variants_20_or_less %>% filter(cv >= 0.2)
```

```{r Filtering the data based on a max distance cutoff}

# Keep only residue pairs that are closer than 10 angstroms in every variant
cutoff_med <- 10
pdb_variants_med <- pdb_variants %>%
  filter(
    dist_d614g < cutoff_med,
    dist_alpha < cutoff_med,
    dist_beta < cutoff_med,
    dist_gamma < cutoff_med,
    dist_delta < cutoff_med,
    dist_ba1 < cutoff_med
  )

# Quick-look scatterplots comparing the distances between pairs of variants
ggplot(pdb_variants_med, aes(x = dist_d614g, y = dist_alpha)) + geom_point()
ggplot(pdb_variants_med, aes(x = dist_d614g, y = dist_beta)) + geom_point()
ggplot(pdb_variants_med, aes(x = dist_d614g, y = dist_gamma)) + geom_point()
ggplot(pdb_variants_med, aes(x = dist_d614g, y = dist_delta)) + geom_point()
ggplot(pdb_variants_med, aes(x = dist_d614g, y = dist_ba1)) + geom_point()
ggplot(pdb_variants_med, aes(x = dist_beta, y = dist_gamma)) + geom_point()
ggplot(pdb_variants_med, aes(x = dist_beta, y = dist_delta)) + geom_point()

## Saved variant-vs-variant distance scatterplots (each is printed, then written to Plots/)
Angstrom_scatterplot_D614G_Delta <- ggplot(pdb_variants_med, aes(x = dist_d614g, y = dist_delta)) +
  labs(x = "Angstroms (D614G)", y = "Angstroms (Delta)") +
  geom_point(alpha = 0.3)
Angstrom_scatterplot_D614G_Delta
ggsave(filename = "Plots/Angstrom_scatterplot_D614G_Delta.pdf", plot = Angstrom_scatterplot_D614G_Delta, height = 1.5, width = 1.5)

Angstrom_scatterplot_Beta_Gamma <- ggplot(pdb_variants_med, aes(x = dist_gamma, y = dist_beta)) +
  labs(x = "Angstroms (Gamma)", y = "Angstroms (Beta)") +
  geom_point(alpha = 0.3)
Angstrom_scatterplot_Beta_Gamma
ggsave(filename = "Plots/Angstrom_scatterplot_Beta_Gamma.pdf", plot = Angstrom_scatterplot_Beta_Gamma, height = 1.5, width = 1.5)

Angstrom_scatterplot_D614G_Beta <- ggplot(pdb_variants_med, aes(x = dist_d614g, y = dist_beta)) +
  labs(x = "Angstroms (D614G)", y = "Angstroms (Beta)") +
  geom_point(alpha = 0.3)
Angstrom_scatterplot_D614G_Beta
ggsave(filename = "Plots/Angstrom_scatterplot_D614G_Beta.pdf", plot = Angstrom_scatterplot_D614G_Beta, height = 1.5, width = 1.5)

Angstrom_scatterplot_D614G_BA1 <- ggplot(pdb_variants_med, aes(x = dist_d614g, y = dist_ba1)) +
  labs(x = "Angstroms (D614G)", y = "Angstroms (BA1)") +
  geom_point(alpha = 0.3)
Angstrom_scatterplot_D614G_BA1
ggsave(filename = "Plots/Angstrom_scatterplot_D614G_BA1.pdf", plot = Angstrom_scatterplot_D614G_BA1, height = 1.5, width = 1.5)

```

```{r Identifying interaction pairs that are outside of the normal distribution of natural variability}

## Number of standard deviations from the mean beyond which a difference is flagged as an outlier
critical_value <- 1.96

## Relative difference between two distance columns for every row of `distances`:
## (first - second) divided by the row-wise mean of the two columns.
relative_distance_difference <- function(distances, first_col, second_col) {
  (distances[[first_col]] - distances[[second_col]]) / rowMeans(distances[, c(first_col, second_col)])
}

## Logical vector marking the values that lie more than `n_sd` standard deviations
## below or above the mean of `values`.
flag_distribution_outliers <- function(values, n_sd) {
  center <- mean(values)
  margin <- sd(values) * n_sd
  values < (center - margin) | values > (center + margin)
}

## Now do pairwise differences between distances based on the SARS-CoV-2 variant.
## For every pair this adds two columns to pdb_variants_med: "<a>_<b>" (relative difference of
## dist_<a> and dist_<b>) and "<a>_<b>_outlier". Driving all 15 comparisons from one list keeps
## the formula identical across pairs.
variant_pairs <- list(
  c("d614g", "alpha"), c("d614g", "beta"), c("d614g", "gamma"), c("d614g", "delta"), c("d614g", "ba1"),
  c("alpha", "beta"), c("alpha", "gamma"), c("alpha", "delta"), c("alpha", "ba1"),
  c("beta", "gamma"), c("beta", "delta"), c("beta", "ba1"),
  c("gamma", "delta"), c("gamma", "ba1"),
  c("delta", "ba1")
)
for (variant_pair in variant_pairs) {
  difference_col <- paste(variant_pair, collapse = "_")
  distance_cols <- paste0("dist_", variant_pair)
  pdb_variants_med[[difference_col]] <- relative_distance_difference(pdb_variants_med, distance_cols[1], distance_cols[2])
  pdb_variants_med[[paste0(difference_col, "_outlier")]] <- flag_distribution_outliers(pdb_variants_med[[difference_col]], critical_value)
}

## Histograms of each relative difference (all pairs in grey/black, flagged outliers overlaid in red).
## Plots assigned to a name are written to Plots/; the others are only displayed.

## D614G
D614G_Alpha_Histogram <- ggplot(pdb_variants_med, aes(x = d614g_alpha)) +
  scale_x_continuous(limits = c(-0.5, 0.5), breaks = c(-0.5, 0, 0.5)) +
  geom_histogram(binwidth = 0.04, color = "black") +
  geom_histogram(data = pdb_variants_med %>% filter(d614g_alpha_outlier), binwidth = 0.04, fill = "red", color = "black")
ggsave(filename = "Plots/D614G_Alpha_Histogram.pdf", plot = D614G_Alpha_Histogram, height = 1.25, width = 1.4)

D614G_Beta_Histogram <- ggplot(pdb_variants_med, aes(x = d614g_beta)) +
  scale_x_continuous(limits = c(-0.5, 0.5), breaks = c(-0.5, 0, 0.5)) +
  geom_histogram(binwidth = 0.04, color = "black") +
  geom_histogram(data = pdb_variants_med %>% filter(d614g_beta_outlier), binwidth = 0.04, fill = "red", color = "black")
ggsave(filename = "Plots/D614G_Beta_Histogram.pdf", plot = D614G_Beta_Histogram, height = 1.25, width = 1.4)

ggplot(pdb_variants_med, aes(x = d614g_gamma)) +
  scale_x_continuous(limits = c(-0.5, 0.5)) +
  geom_histogram(binwidth = 0.04, color = "black") +
  geom_histogram(data = pdb_variants_med %>% filter(d614g_gamma_outlier), binwidth = 0.04, fill = "red", color = "black")

D614G_Delta_Histogram <- ggplot(pdb_variants_med, aes(x = d614g_delta)) +
  scale_x_continuous(limits = c(-0.5, 0.5), breaks = c(-0.5, 0, 0.5)) +
  geom_histogram(binwidth = 0.04, color = "black") +
  geom_histogram(data = pdb_variants_med %>% filter(d614g_delta_outlier), binwidth = 0.04, fill = "red", color = "black")
ggsave(filename = "Plots/D614G_Delta_Histogram.pdf", plot = D614G_Delta_Histogram, height = 1.25, width = 1.4)

D614G_BA1_Histogram <- ggplot(pdb_variants_med, aes(x = d614g_ba1)) +
  scale_x_continuous(limits = c(-0.5, 0.5), breaks = c(-0.5, 0, 0.5)) +
  geom_histogram(binwidth = 0.04, color = "black") +
  geom_histogram(data = pdb_variants_med %>% filter(d614g_ba1_outlier), binwidth = 0.04, fill = "red", color = "black")
ggsave(filename = "Plots/D614G_BA1_Histogram.pdf", plot = D614G_BA1_Histogram, height = 1.25, width = 1.4)

## Alpha
ggplot(pdb_variants_med, aes(x = alpha_beta)) +
  scale_x_continuous(limits = c(-0.5, 0.5)) +
  geom_histogram(binwidth = 0.04, color = "black") +
  geom_histogram(data = pdb_variants_med %>% filter(alpha_beta_outlier), binwidth = 0.04, fill = "red", color = "black")

ggplot(pdb_variants_med, aes(x = alpha_gamma)) +
  scale_x_continuous(limits = c(-0.5, 0.5)) +
  geom_histogram(binwidth = 0.04, color = "black") +
  geom_histogram(data = pdb_variants_med %>% filter(alpha_gamma_outlier), binwidth = 0.04, fill = "red", color = "black")

ggplot(pdb_variants_med, aes(x = alpha_delta)) +
  scale_x_continuous(limits = c(-0.5, 0.5)) +
  geom_histogram(binwidth = 0.04, color = "black") +
  geom_histogram(data = pdb_variants_med %>% filter(alpha_delta_outlier), binwidth = 0.04, fill = "red", color = "black")

ggplot(pdb_variants_med, aes(x = alpha_ba1)) +
  scale_x_continuous(limits = c(-0.5, 0.5)) +
  geom_histogram(binwidth = 0.04, color = "black") +
  geom_histogram(data = pdb_variants_med %>% filter(alpha_ba1_outlier), binwidth = 0.04, fill = "red", color = "black")

## Beta
ggplot(pdb_variants_med, aes(x = beta_gamma)) +
  scale_x_continuous(limits = c(-0.5, 0.5)) +
  geom_histogram(binwidth = 0.04, color = "black") +
  geom_histogram(data = pdb_variants_med %>% filter(beta_gamma_outlier), binwidth = 0.04, fill = "red", color = "black")

ggplot(pdb_variants_med, aes(x = beta_delta)) +
  scale_x_continuous(limits = c(-0.5, 0.5)) +
  geom_histogram(binwidth = 0.04, color = "black") +
  geom_histogram(data = pdb_variants_med %>% filter(beta_delta_outlier), binwidth = 0.04, fill = "red", color = "black")

Beta_BA1_Histogram <- ggplot(pdb_variants_med, aes(x = beta_ba1)) +
  scale_x_continuous(limits = c(-0.5, 0.5), breaks = c(-0.5, 0, 0.5)) +
  geom_histogram(binwidth = 0.04, color = "black") +
  geom_histogram(data = pdb_variants_med %>% filter(beta_ba1_outlier), binwidth = 0.04, fill = "red", color = "black")
ggsave(filename = "Plots/Beta_BA1_Histogram.pdf", plot = Beta_BA1_Histogram, height = 1.25, width = 1.4)

## Gamma
ggplot(pdb_variants_med, aes(x = gamma_delta)) +
  scale_x_continuous(limits = c(-0.5, 0.5)) +
  geom_histogram(binwidth = 0.04, color = "black") +
  geom_histogram(data = pdb_variants_med %>% filter(gamma_delta_outlier), binwidth = 0.04, fill = "red", color = "black")

ggplot(pdb_variants_med, aes(x = gamma_ba1)) +
  scale_x_continuous(limits = c(-0.5, 0.5)) +
  geom_histogram(binwidth = 0.04, color = "black") +
  geom_histogram(data = pdb_variants_med %>% filter(gamma_ba1_outlier), binwidth = 0.04, fill = "red", color = "black")

## Delta
ggplot(pdb_variants_med, aes(x = delta_ba1)) +
  scale_x_continuous(limits = c(-0.5, 0.5)) +
  geom_histogram(binwidth = 0.04, color = "black") +
  geom_histogram(data = pdb_variants_med %>% filter(delta_ba1_outlier), binwidth = 0.04, fill = "red", color = "black")

## Making the scatterplots again but this time coloring the outliers as red
## (each plot is printed, then written to Plots/)
Angstrom_scatterplot_D614G_Delta2 <- ggplot(pdb_variants_med, aes(x = dist_d614g, y = dist_delta)) +
  labs(x = "Angstroms (D614G)", y = "Angstroms (Delta)") +
  geom_point(alpha = 0.3) +
  geom_point(data = pdb_variants_med %>% filter(d614g_delta_outlier), color = "red", alpha = 0.4)
Angstrom_scatterplot_D614G_Delta2
ggsave(filename = "Plots/Angstrom_scatterplot_D614G_Delta2.pdf", plot = Angstrom_scatterplot_D614G_Delta2, height = 1.5, width = 1.5)

Angstrom_scatterplot_Beta_Gamma2 <- ggplot(pdb_variants_med, aes(x = dist_gamma, y = dist_beta)) +
  labs(x = "Angstroms (Gamma)", y = "Angstroms (Beta)") +
  geom_point(alpha = 0.3) +
  geom_point(data = pdb_variants_med %>% filter(beta_gamma_outlier), color = "red", alpha = 0.4)
Angstrom_scatterplot_Beta_Gamma2
ggsave(filename = "Plots/Angstrom_scatterplot_Beta_Gamma2.pdf", plot = Angstrom_scatterplot_Beta_Gamma2, height = 1.5, width = 1.5)

Angstrom_scatterplot_D614G_Beta2 <- ggplot(pdb_variants_med, aes(x = dist_d614g, y = dist_beta)) +
  labs(x = "Angstroms (D614G)", y = "Angstroms (Beta)") +
  geom_point(alpha = 0.3) +
  geom_point(data = pdb_variants_med %>% filter(d614g_beta_outlier), color = "red", alpha = 0.4)
Angstrom_scatterplot_D614G_Beta2
ggsave(filename = "Plots/Angstrom_scatterplot_D614G_Beta2.pdf", plot = Angstrom_scatterplot_D614G_Beta2, height = 1.5, width = 1.5)

Angstrom_scatterplot_D614G_BA1_2 <- ggplot(pdb_variants_med, aes(x = dist_d614g, y = dist_ba1)) +
  labs(x = "Angstroms (D614G)", y = "Angstroms (BA1)") +
  geom_point(alpha = 0.3) +
  geom_point(data = pdb_variants_med %>% filter(d614g_ba1_outlier), color = "red", alpha = 0.4)
Angstrom_scatterplot_D614G_BA1_2
ggsave(filename = "Plots/Angstrom_scatterplot_D614G_BA1_2.pdf", plot = Angstrom_scatterplot_D614G_BA1_2, height = 1.5, width = 1.5)

Angstrom_scatterplot_Beta_BA1_2 <- ggplot(pdb_variants_med, aes(x = dist_beta, y = dist_ba1)) +
  labs(x = "Angstroms (Beta)", y = "Angstroms (BA1)") +
  geom_point(alpha = 0.3) +
  geom_point(data = pdb_variants_med %>% filter(beta_ba1_outlier), color = "red", alpha = 0.4)
Angstrom_scatterplot_Beta_BA1_2
ggsave(filename = "Plots/Angstrom_scatterplot_Beta_BA1_2.pdf", plot = Angstrom_scatterplot_Beta_BA1_2, height = 1.5, width = 1.5)


## Now add everything together
## Stack all 15 pairwise comparisons into one long table: `distance` holds the relative
## differences and `outlier` the matching per-comparison flags. Both columns are built from the
## same list of names, so every comparison is included exactly once and the two stay in step
## (previously d614g_beta was stacked twice and d614g_delta was left out).
comparison_cols <- c(
  "d614g_alpha", "d614g_beta", "d614g_gamma", "d614g_delta", "d614g_ba1",
  "alpha_beta", "alpha_gamma", "alpha_delta", "alpha_ba1",
  "beta_gamma", "beta_delta", "beta_ba1",
  "gamma_delta", "gamma_ba1",
  "delta_ba1"
)
pdb_variants_med_combined <- data.frame(
  "distance" = unlist(pdb_variants_med[, comparison_cols], use.names = FALSE),
  "outlier" = unlist(pdb_variants_med[, paste0(comparison_cols, "_outlier")], use.names = FALSE)
)


## Histogram of the stacked pairwise differences with the per-comparison outliers overlaid in red;
## the y axis is broken between 100 and 200.
## NOTE(review): the x-axis label says "Z-score", but `distance` holds the relative differences --
## the z-scores are only computed afterwards. Confirm the intended label.
Combined_distance_difference_histogram <- ggplot() +
  scale_x_continuous(limits = c(-0.4, 0.6)) +
  scale_y_break(c(100, 200), scales = 0.5) +
  labs(x = "Z-score for difference\nin atomic distance", y = "Number of datapoints") +
  geom_histogram(data = pdb_variants_med_combined, aes(x = distance), binwidth = 0.04, fill = "grey80", color = "grey40") +
  geom_histogram(data = pdb_variants_med_combined %>% filter(outlier == TRUE), aes(x = distance), binwidth = 0.04, fill = "red", color = "red4")
Combined_distance_difference_histogram
ggsave(filename = "Plots/Combined_distance_difference_histogram.pdf", plot = Combined_distance_difference_histogram, height = 2, width = 2)

## Standardise the stacked differences and flag the points with |z| > 2.33.
## NOTE(review): for a normal distribution |z| > 2.33 leaves roughly 2% in the tails
## (about a 98% interval), not the 95% interval the original comment mentions -- confirm the intended cutoff.
pdb_variants_med_combined$zscore <- with(pdb_variants_med_combined, (distance - mean(distance)) / sd(distance))
pdb_variants_med_combined$zscore_outlier <- abs(pdb_variants_med_combined$zscore) > 2.33

## Histogram of the z-scores with the z-score outliers overlaid in red; the y axis is broken between 60 and 70
Combined_zscore_histogram <- ggplot() +
  scale_x_continuous() +
  scale_y_break(c(60, 70), scales = 0.5) +
  labs(x = "Z-score for difference\nin atomic distance", y = "Number of datapoints") +
  geom_histogram(data = pdb_variants_med_combined, aes(x = zscore), binwidth = 0.2, fill = "grey80", color = "grey40") +
  geom_histogram(data = pdb_variants_med_combined %>% filter(zscore_outlier == TRUE), aes(x = zscore), binwidth = 0.2, fill = "red", color = "red4") +
  NULL
Combined_zscore_histogram
ggsave(filename = "Plots/Combined_zscore_histogram.pdf", plot = Combined_zscore_histogram, height = 2, width = 2)


## Same z-score histogram without the y-axis break, saved at a larger size
Combined_zscore_histogram_continuous <- ggplot(pdb_variants_med_combined, aes(x = zscore)) +
  scale_x_continuous() +
  labs(x = "Z-score for difference\nin atomic distance", y = "Number of datapoints") +
  geom_histogram(binwidth = 0.2, fill = "grey80", color = "grey40") +
  geom_histogram(data = pdb_variants_med_combined %>% filter(zscore_outlier == TRUE), binwidth = 0.2, fill = "red", color = "red4") +
  NULL
Combined_zscore_histogram_continuous
ggsave(filename = "Plots/Combined_zscore_histogram_continuous.pdf", plot = Combined_zscore_histogram_continuous, height = 5, width = 4)
```

```{r Making a network of interacting residues}
## The 15 pairwise-comparison outlier flag columns (one per pair of variants).
## Defined once so the two row-sum calculations below cannot drift apart.
outlier_flag_columns <- c("d614g_alpha_outlier","d614g_beta_outlier","d614g_gamma_outlier","d614g_delta_outlier","d614g_ba1_outlier",
                          "alpha_beta_outlier","alpha_gamma_outlier","alpha_delta_outlier","alpha_ba1_outlier",
                          "beta_gamma_outlier","beta_delta_outlier","beta_ba1_outlier",
                          "gamma_delta_outlier","gamma_ba1_outlier","delta_ba1_outlier")

## Count, for every residue pair, in how many of the pairwise comparisons it was flagged as an outlier
network_pairs_orig <- pdb_variants_med
network_pairs_orig$n_outliers <- rowSums(network_pairs_orig[, outlier_flag_columns])

## If a cutoff of 5 or more, then it's looking at outliers that were observed in at least 33% of cases (5 of 15)
## n_outliers is dropped again so the columns match pdb_variants_low for the rbind() below
network_pairs <- network_pairs_orig %>% filter(n_outliers >= 5) %>% select(-n_outliers)

## Now make a network on only residues that come close in one structure
cutoff_low <- 3.4
pdb_variants_low <- pdb_variants_med %>%
  filter(dist_d614g < cutoff_low | dist_alpha < cutoff_low | dist_beta < cutoff_low |
           dist_gamma < cutoff_low | dist_delta < cutoff_low | dist_ba1 < cutoff_low)

## VennDiagram of overlap
## Membership table: change = 1 for pairs passing the outlier cutoff, close = 1 for pairs under the distance cutoff
network_pairs_venndiagram <- merge(network_pairs %>% select(ace2_rbd) %>% mutate(change = 1),
                                   pdb_variants_low %>% select(ace2_rbd) %>% mutate(close = 1),
                                   by = "ace2_rbd", all = TRUE)

## Pairs found in only one of the two sets come back as NA from the full outer merge; recode as 0
network_pairs_venndiagram[is.na(network_pairs_venndiagram)] <- 0

venn_data <- list(change = network_pairs$ace2_rbd, close = pdb_variants_low$ace2_rbd)
Pairs_of_interaction_Venn_diagram <- ggVennDiagram(venn_data, label_alpha = 0, category.names = c("Major shifts","< 3.4 Å"), set_size = 4, label_size = 4) +
  scale_fill_gradient(low = "white", high = "grey80") +
  theme(legend.position = "none")
print(Pairs_of_interaction_Venn_diagram)
ggsave(filename = "Plots/Pairs_of_interaction_Venn_diagram.pdf", plot = Pairs_of_interaction_Venn_diagram, height = 2, width = 3)

## Now finish making the final network pairs dataset (union of both sets, duplicates removed)
network_pairs2 <- rbind(network_pairs, pdb_variants_low) %>% distinct()
length(unique(network_pairs2$ace2_position))
unique(network_pairs2$ace2_position)
length(unique(network_pairs2$rbd_position))
unique(network_pairs2$rbd_position)

## Re-count the outlier flags, sort the pairs from most to fewest flags, and record that rank in `order`
network_pairs2$n_outliers <- rowSums(network_pairs2[, outlier_flag_columns])
network_pairs2 <- network_pairs2 %>% arrange(desc(n_outliers))
network_pairs2$order <- seq_len(nrow(network_pairs2))

Distance_outlier_histogram <- ggplot() + labs(x = "Number of times the datapoint\nwas outside 95% CI", y = "Number of datapoints") +
  scale_y_continuous(breaks = seq(0,8,2)) +
  geom_histogram(data = network_pairs2, aes(x = n_outliers), binwidth = 1, fill = "grey80", color = "black")
Distance_outlier_histogram
ggsave(filename = "Plots/Distance_outlier_histogram.pdf", plot = Distance_outlier_histogram, height = 1.75, width = 2)
```

```{r Listing which positions are now being considered and displaying these in Pymol}
## Print the PyMOL commands that highlight the residue positions kept in network_pairs2.
## The unique positions of each partner are joined with "+" to form the `resi` selection.
ace2_resi_selection <- paste(unique(network_pairs2$ace2_position), collapse = "+")
rbd_resi_selection <- paste(unique(network_pairs2$rbd_position), collapse = "+")

print("fetch 6m17")
print("create d614g_ace2, chain B")
print("create d614g_rbd, chain E")
paste("select ace2_intxn, d614g_ace2 and resi", ace2_resi_selection, "and not name c+n+o+ca")
paste("select rbd_intxn, d614g_rbd and resi", rbd_resi_selection, "and not name c+n+o+ca")
print("set cartoon_transparency, 0.5")
print("rotate y, 90")
```

```{r Pairwise Pearson's r^2 scores for ensemble structure differences}
## Make a dataframe of pairwise Pearson's r^2 values between the ensemble distances of each pair of variants
cor(network_pairs2$dist_d614g, network_pairs2$dist_alpha, method = "pearson")^2
struct_cor_matrix_names <- c("dist_d614g","dist_alpha","dist_beta","dist_gamma","dist_delta","dist_ba1")

## One row per ordered pair of variants (virus1 cycles fastest, virus2 slowest)
n_structures <- length(struct_cor_matrix_names)
struct_cor_matrix_df <- data.frame(
  index = seq_len(n_structures * n_structures),
  virus1 = rep(struct_cor_matrix_names, times = n_structures),
  virus2 = rep(struct_cor_matrix_names, each = n_structures),
  stringsAsFactors = FALSE
)

## Squared Pearson correlation for each pair; `[[` extracts a plain vector from a data.frame or a tibble
struct_cor_matrix_df$pearson_rsquared <- vapply(
  seq_len(nrow(struct_cor_matrix_df)),
  function(i) {
    cor(network_pairs2[[struct_cor_matrix_df$virus1[i]]],
        network_pairs2[[struct_cor_matrix_df$virus2[i]]],
        method = "pearson")^2
  },
  numeric(1)
)

## Swap the column names for display labels and fix the plotting order
struct_display_names <- c(dist_d614g = "D614G", dist_alpha = "Alpha", dist_beta = "Beta",
                          dist_gamma = "Gamma", dist_delta = "Delta", dist_ba1 = "BA1")
struct_cor_matrix_df$virus1 <- factor(unname(struct_display_names[struct_cor_matrix_df$virus1]), levels = unname(struct_display_names))
struct_cor_matrix_df$virus2 <- factor(unname(struct_display_names[struct_cor_matrix_df$virus2]), levels = unname(struct_display_names))

## Heatmap of the r^2 values. Labels are white on the tiles with r^2 >= 0.9 and black otherwise;
## the two label layers together cover every non-missing value, including exactly 0.9.
Structure_pairwise_correlations <- ggplot() + 
  theme(panel.grid.major = element_blank(), panel.border = element_blank(), panel.background = element_blank(), axis.ticks = element_blank(),
  legend.position = "right", axis.text.x.top = element_text(angle = 90, hjust = 0, vjust = 0.5)) + 
  labs(x = NULL, y = NULL) +
  scale_fill_gradient(low  = "white", high = "purple", limits = c(0.5,1)) +
  scale_x_discrete(position = "top") +
  geom_tile(data = struct_cor_matrix_df, aes(x = virus1, y = virus2, fill = pearson_rsquared), color = "grey50") +
  geom_text(data = struct_cor_matrix_df %>% filter(pearson_rsquared >= 0.9), aes(x = virus1, y = virus2, label = round(pearson_rsquared, 2)), angle= 45, size = 2, color = "white") +
  geom_text(data = struct_cor_matrix_df %>% filter(pearson_rsquared < 0.9), aes(x = virus1, y = virus2, label = round(pearson_rsquared, 2)), angle= 45, size = 2)
Structure_pairwise_correlations
ggsave(filename = "Plots/Structure_pairwise_correlations.pdf", plot = Structure_pairwise_correlations, height = 1.9, width = 2.8)

```


```{r pca based on atomic coordinates of key interacting residues in ensemble distances}
# PCA of the variants, using the residue pairs in network_pairs2 as the features.
# Columns 4-10 are transposed and the first six resulting rows kept, one row per variant.
# NOTE(review): this relies on column positions; presumably columns 4-9 are the six dist_* columns - confirm.
pdb_variants_for_pca <- data.frame(t(network_pairs2[4:10]))[1:6, ]
pca_for_pdb <- prcomp(pdb_variants_for_pca, scale. = TRUE)

# Data frame of PC scores, labelled in the same order as the rows above
scores <- as.data.frame(pca_for_pdb$x)
scores$labels <- c("D614G", "Alpha", "Beta","Gamma","Delta","BA1")

# Row 2 of the importance table is the proportion of variance; columns 1 and 2 are PC1 and PC2
pca_for_pdb_importance <- summary(pca_for_pdb)$importance
pc1_axis_title <- paste0("Principal component 1\n(", round(pca_for_pdb_importance[2, 1] * 100, 0), "% of variance)")
pc2_axis_title <- paste0("Principal component 2\n(", round(pca_for_pdb_importance[2, 2] * 100, 0), "% of variance)")

s2_variant_average_pca_plot <- ggplot() +
  theme(panel.grid = element_blank()) + 
  xlab(pc1_axis_title) + 
  ylab(pc2_axis_title) + 
  geom_text_repel(
    data = scores, mapping = aes(x = PC1, y = PC2, label = labels),
    segment.color = 'grey90', point.padding = 0.05, size = 2, segment.alpha = 0.5, color = "red"
  ) +
  geom_point(data = scores, mapping = aes(x = PC1, y = PC2), size = 1)
s2_variant_average_pca_plot
ggsave(filename = "Plots/s2_variant_average_pca_plot.pdf", plot = s2_variant_average_pca_plot, height = 1.8, width = 1.75)
```


```{r pca based on atomic coordinates of key interacting residues for individual structures}
### Now getting all of the individual structure data points in here
## Inner-join the per-structure distance columns of every variant on the residue-pair key
all_pdb <- Reduce(
  function(left, right) merge(left, right, by = "ace2_rbd"),
  list(
    pdb_d614g_combined[,c("ace2_rbd","dist_7sxy","dist_6lzg","dist_6m0j","dist_6m17")],
    pdb_alpha_combined[,c("ace2_rbd","dist_8dlk","dist_7mjn","dist_7ekf")],
    pdb_beta_combined[,c("ace2_rbd","dist_7vx4","dist_8dln","dist_7v80","dist_7ekg","dist_7sy6")],
    pdb_gamma_combined[,c("ace2_rbd","dist_7ekc","dist_7v84","dist_8dlq","dist_7sy8")],
    pdb_delta_combined[,c("ace2_rbd","dist_7w9i","dist_7v8b","dist_7tew","dist_7wbq")],
    pdb_ba1_combined[,c("ace2_rbd","dist_7wk6","dist_7t9l","dist_7wpb","dist_7wbp","dist_7whh")]
  )
)

## Keep only the residue pairs in network_pairs2. After the merge the first three columns are
## ace2_rbd, ace2_position and rbd_position; everything from column 4 onwards is a per-structure distance.
all_pdb_intxn <- merge(network_pairs2[,c("ace2_rbd","ace2_position","rbd_position")], all_pdb, by = "ace2_rbd")
pdb_variants_for_pca2 <- data.frame(t(all_pdb_intxn[seq(4, ncol(all_pdb_intxn))]))

## Drop residue pairs with a missing distance in any structure, then run the PCA (one row per structure)
pdb_variants_for_pca3 <- pdb_variants_for_pca2[ , colSums(is.na(pdb_variants_for_pca2)) == 0]
pca_for_pdb2 <- prcomp(pdb_variants_for_pca3, scale. = TRUE)
scores2 <- as.data.frame(pca_for_pdb2$x)
## Row names are the distance column names; characters 6 onwards drop the "dist_" prefix
scores2$pdb <- substr(rownames(scores2),6,12)

## NOTE(review): this path is lower-case while other reads in this file use "Data/PDB_contact_maps/";
## on a case-sensitive filesystem only one spelling can exist - confirm.
pdb_contact_maps_key <- read.csv(file = "Data/pdb_contact_maps/pdb_contact_maps_key.csv", header = TRUE)
scores3 <- merge(scores2, pdb_contact_maps_key, by = "pdb")
scores4 <- scores3[,c("pdb","group","PC1","PC2")]

scores3$group <- factor(scores3$group, levels = c("d614g", "alpha","beta","gamma","delta","ba1"))

## Row 2 of the importance table is the proportion of variance; columns 1 and 2 are PC1 and PC2
pca_for_pdb2_importance <- summary(pca_for_pdb2)$importance

s2_variant_all_structures_pca_plot <- ggplot() + theme(panel.grid.major = element_blank()) + 
  xlab(paste0("Principal component 1 (", round(pca_for_pdb2_importance[2, 1] * 100, 0), "% of variance explained)")) + 
  ylab(paste0("Principal component 2 (", round(pca_for_pdb2_importance[2, 2] * 100, 0), "% of variance explained)")) + 
  geom_text_repel(data = scores3, aes(x = PC1, y = PC2, label = pdb), segment.color = 'grey90', point.padding = 0.05, size = 3, segment.alpha = 0.5, color = "red") +
  geom_point(data = scores3, aes(x = PC1, y = PC2, color = group, shape = type), size = 1)
s2_variant_all_structures_pca_plot
ggsave(filename = "Plots/S2_variant_all_structures_pca_plot.pdf", plot = s2_variant_all_structures_pca_plot, height = 3, width = 5)
ggsave(filename = "Plots/S2_variant_all_structures_pca_plot.png", plot = s2_variant_all_structures_pca_plot, height = 3, width = 5)

### Not getting all of the individual structure datapoints in here
## Same PCA as above on a reduced set of structures
## (the object names suggest this is the cryo-EM subset - TODO confirm)
all_cryo <- Reduce(
  function(left, right) merge(left, right, by = "ace2_rbd"),
  list(
    pdb_d614g_combined[,c("ace2_rbd","dist_7sxy","dist_6m17")],
    pdb_alpha_combined[,c("ace2_rbd","dist_8dlk","dist_7mjn")],
    pdb_beta_combined[,c("ace2_rbd","dist_7vx4","dist_8dln","dist_7v80","dist_7sy6")],
    pdb_gamma_combined[,c("ace2_rbd","dist_7v84","dist_8dlq","dist_7sy8")],
    pdb_delta_combined[,c("ace2_rbd","dist_7w9i","dist_7v8b","dist_7tew")],
    pdb_ba1_combined[,c("ace2_rbd","dist_7wk6","dist_7t9l","dist_7wpb")]
  )
)

all_cryo_intxn <- merge(network_pairs2[,c("ace2_rbd","ace2_position","rbd_position")], all_cryo, by = "ace2_rbd")
## Use every per-structure distance column (column 4 to the last one). A fixed 4..9 range would
## keep only the first six of the 17 merged structures and leave gamma, delta and BA1 out of the PCA.
cryo_variants_for_pca2 <- data.frame(t(all_cryo_intxn[seq(4, ncol(all_cryo_intxn))]))
## Drop residue pairs with a missing distance in any structure
cryo_variants_for_pca3 <- cryo_variants_for_pca2[ , colSums(is.na(cryo_variants_for_pca2)) == 0]
pca_for_cryo2 <- prcomp(cryo_variants_for_pca3, scale. = TRUE)
all_cryo_scores <- as.data.frame(pca_for_cryo2$x)
## Row names are the distance column names; characters 6 onwards drop the "dist_" prefix
all_cryo_scores$pdb <- substr(rownames(all_cryo_scores),6,12)

## NOTE(review): cryo_contact_maps_key is read here but the merge below uses pdb_contact_maps_key - confirm which is intended
cryo_contact_maps_key <- read.csv(file = "Data/PDB_contact_maps/PDB_contact_maps_key.csv", header = TRUE)
all_cryo_scores2 <- merge(all_cryo_scores, pdb_contact_maps_key, by = "pdb") #%>% filter(type == "cryo")
all_cryo_scores3 <- all_cryo_scores2[,c("pdb","group","type","PC1","PC2")]

## Row 2 of the importance table is the proportion of variance; columns 1 and 2 are PC1 and PC2
pca_for_cryo2_importance <- summary(pca_for_cryo2)$importance

s2_variant_pca_plot <- ggplot() + 
  xlab(paste0("Principal component 1 (", round(pca_for_cryo2_importance[2, 1] * 100, 0), "% of variance explained)")) + 
  ylab(paste0("Principal component 2 (", round(pca_for_cryo2_importance[2, 2] * 100, 0), "% of variance explained)")) + 
  geom_text_repel(data = all_cryo_scores3, aes(x = PC1, y = PC2, label = pdb), segment.color = 'grey90', point.padding = 0.05, size = 3, segment.alpha = 0.5, color = "red") +
  geom_point(data = all_cryo_scores3, aes(x = PC1, y = PC2, color = group, shape = type), size = 1)
s2_variant_pca_plot
```

```{r Importing ACE2 and D614G RBD residue identities for the network plots}
## Reference for fixing node positions in ggnet2:
#https://stackoverflow.com/questions/53179442/how-to-set-fix-node-positions-with-ggnet2
set.seed(1234567)
frac_cutoff <- 0.2

## Reimport D614G atomic positions (PDB 6lzg) and split the atoms by chain:
## chain A goes into the ACE2 table, chain B into the RBD table
pdb_6lzg_atoms <- read.table(file = "Data/PDB_contact_maps/parsed_6lzg_atoms.tsv", header = TRUE, sep = "\t")
pdb_6lzg_ace2 <- pdb_6lzg_atoms %>% filter(chain == "A")
pdb_6lzg_ace2$position <- as.numeric(pdb_6lzg_ace2$position)
pdb_6lzg_rbd <- pdb_6lzg_atoms %>% filter(chain == "B")
pdb_6lzg_rbd$position <- as.numeric(pdb_6lzg_rbd$position)

## Unique residue positions present in each chain
## NOTE(review): the original comments say 18 (ACE2) and 316 (RBD) must be added later for
## interpretation; no offset is applied in this chunk - confirm whether that still holds.
pdb_6lzg_ace2_positionlist <- unique(pdb_6lzg_ace2$position)
pdb_6lzg_rbd_positionlist <- unique(pdb_6lzg_rbd$position)

## ACE2 SEQUENCE
## One row per residue (taken from its "C" atom), labelled as one-letter code + position
ace2_residues <- pdb_6lzg_ace2 %>%
  filter(name == "C") %>%
  transmute(ace2_position = position, aa3 = aa)
ace2_residues$orig <- lapply(ace2_residues$aa3, to_single_notation)
ace2_residues$ace2_residue <- paste0(ace2_residues$orig, ace2_residues$ace2_position)

## D614G SEQUENCE
d614g_residues <- pdb_6lzg_rbd %>%
  filter(name == "C") %>%
  transmute(rbd_position = position, aa3 = aa)
d614g_residues$orig <- lapply(d614g_residues$aa3, to_single_notation)
d614g_residues$d614g_residue <- paste0(d614g_residues$orig, d614g_residues$rbd_position)

```


```{r Making a network of interacting residues for D614G with actual distances}
frac_cutoff <- 0
## D614G only
## Keep the D614G distance wherever its magnitude exceeds frac_cutoff; everything else (including missing values) becomes NA
d614g_distance <- network_pairs2$dist_d614g
network_pairs2$d614g_network <- ifelse(d614g_distance > frac_cutoff | d614g_distance < -frac_cutoff, d614g_distance, NA_real_)

## Attach the residue labels (one-letter code + position) for both partners of every pair
network_pairs2_temp <- network_pairs2[,c("order","ace2_position","rbd_position")]
network_pairs2_temp2 <- merge(network_pairs2_temp,ace2_residues[,c("ace2_position","ace2_residue")],by = "ace2_position", all.x = TRUE)
network_pairs2_temp3 <- merge(network_pairs2_temp2,d614g_residues[,c("rbd_position","d614g_residue")],by = "rbd_position", all.x = TRUE)
## The D614G table is merged a second time, which yields the suffixed columns d614g_residue.x / .y;
## this is redundant but kept so the column names of network_pairs2_temp4 stay as they were
network_pairs2_temp4 <- merge(network_pairs2_temp3,d614g_residues[,c("rbd_position","d614g_residue")],by = "rbd_position", all.x = TRUE)
network_pairs2_temp4$rbd_residue <- network_pairs2_temp4$d614g_residue.x
## Sort back by `order` so the rows line up with network_pairs2 when building the edge list
network_pairs2_temp4 <- network_pairs2_temp4 %>% arrange(order)

## Edge list: weight = D614G distance, weight2 = d614g_alpha value (blanked below when its magnitude is <= 0.2)
simple_edge_df <- data.frame(
  from = network_pairs2_temp4$ace2_residue,
  to = network_pairs2_temp4$rbd_residue,
  weight = round(network_pairs2$d614g_network,3),
  weight2 = round(network_pairs2$d614g_alpha,3),
  stringsAsFactors = FALSE
)
simple_edge_df$weight2 <- ifelse(abs(simple_edge_df$weight2) > 0.2, simple_edge_df$weight2, NA_real_)
simple_edge_network <- as.network(simple_edge_df)
# https://briatte.github.io/ggnet/
## Lay the network out once; the resulting coordinate matrix is passed as `mode` to the ggnet2 calls
temp_network <- as.matrix(plot.network(simple_edge_network, mode = "fruchtermanreingold", layout.par = list(repulse.rad = 1000, cell.jitter = 0)))
rs <- 1

## Tag each vertex as ACE2 or RBD depending on which side of the edge list its label came from
d614g_vertex_names <- network.vertex.names(simple_edge_network)
simple_edge_network %v% "type" <- ifelse(d614g_vertex_names %in% unique(network_pairs2_temp4$ace2_residue), "ACE2", "RBD")

## Shade edges by distance: darker = closer; anything outside (0, 10] is drawn white
d614g_edge_weight <- simple_edge_network %e% "weight"
set.edge.attribute(simple_edge_network, "color",
                   ifelse(d614g_edge_weight > 0 & d614g_edge_weight <= 3, "black",
                          ifelse(d614g_edge_weight > 3 & d614g_edge_weight <= 4, "grey50",
                                 ifelse(d614g_edge_weight > 4 & d614g_edge_weight <= 6, "grey70",
                                        ifelse(d614g_edge_weight > 6 & d614g_edge_weight <= 10, "grey90", "white")))))


D614G_networkplot <- ggnet2(simple_edge_network, mode = temp_network, label = TRUE, size = 3, label.size = 1.6,
                            arrow.size = 0, edge.size = 1, arrow.type = "closed",
                            edge.label = "", edge.label.size = 1.5, edge.label.color = "black", edge.label.fill = NA,
                            node.alpha = 0.9, edge.color = "color", edge.alpha = 0.75,
                            label.color = "purple", label.alpha = 0.9, arrow.gap = 0.01,
                            layout.par = list(repulse.rad = 10, area = 10),
                            color = "type", palette = c("ACE2" = "pink", "RBD" = "lightblue")) +
  ggtitle("RBD scaffold associations with RBMs") +
  theme(panel.background = element_rect(color = "grey"), legend.title = element_text(face = "bold", size = rel(rs/2)), plot.title = element_text(size = rel(rs)), legend.position = "right", legend.text = element_text(size = rel(rs/2)))
print(D614G_networkplot)
ggsave(filename = "Plots/D614G_networkplot.pdf", plot = D614G_networkplot, height = 2.25, width = 3)
```

```{r Making a network of interacting residues for alpha}

## Read the parsed atoms for PDB 8dlk and split them by chain:
## chain E goes into the ACE2 table, chain B into the RBD table
pdb_8dlk_atoms <- read.table(file = "Data/PDB_contact_maps/parsed_8dlk_atoms.tsv", header = TRUE, sep = "\t")
pdb_8dlk_ace2 <- pdb_8dlk_atoms %>% filter(chain == "E")
pdb_8dlk_ace2$position <- as.numeric(pdb_8dlk_ace2$position)
pdb_8dlk_rbd <- pdb_8dlk_atoms %>% filter(chain == "B")
pdb_8dlk_rbd$position <- as.numeric(pdb_8dlk_rbd$position)

## Unique residue positions present in each chain
## NOTE(review): the original comments mention adding 18 (ACE2) / 316 (RBD) later for interpretation - confirm
pdb_8dlk_ace2_positionlist <- unique(pdb_8dlk_ace2$position)
pdb_8dlk_rbd_positionlist <- unique(pdb_8dlk_rbd$position)

## ALPHA SEQUENCE
## One row per RBD residue (taken from its "C" atom), labelled as one-letter code + position
alpha_residues <- pdb_8dlk_rbd %>%
  filter(name == "C") %>%
  transmute(rbd_position = position, aa3 = aa)
alpha_residues$orig <- lapply(alpha_residues$aa3, to_single_notation)
alpha_residues$alpha_residue <- paste0(alpha_residues$orig, alpha_residues$rbd_position)

## D614G and alpha
## Signed distance change (D614G minus Alpha) for pairs flagged as outliers in that comparison, 0 otherwise.
## A missing outlier flag stops the script here rather than producing a silent NA.
stopifnot(!anyNA(network_pairs2$d614g_alpha_outlier))
network_pairs2$d614g_alpha_network <- ifelse(network_pairs2$d614g_alpha_outlier == TRUE,
                                             network_pairs2$dist_d614g - network_pairs2$dist_alpha,
                                             0)

## Attach the ACE2, D614G RBD and Alpha RBD residue labels to every pair
network_pairs2_alpha_temp <- network_pairs2[,c("order","ace2_position","rbd_position")]
network_pairs2_alpha_temp2 <- merge(network_pairs2_alpha_temp,ace2_residues[,c("ace2_position","ace2_residue")],by = "ace2_position", all.x = TRUE)
network_pairs2_alpha_temp3 <- merge(network_pairs2_alpha_temp2,d614g_residues[,c("rbd_position","d614g_residue")],by = "rbd_position", all.x = TRUE)
network_pairs2_alpha_temp4 <- merge(network_pairs2_alpha_temp3,alpha_residues[,c("rbd_position","alpha_residue")],by = "rbd_position", all.x = TRUE)

## A position absent from either structure comes back as NA from the left joins and cannot be compared
stopifnot(!anyNA(network_pairs2_alpha_temp4$d614g_residue), !anyNA(network_pairs2_alpha_temp4$alpha_residue))

## An RBD residue counts as mutated when its label differs between D614G and Alpha;
## mutated residues are labelled D614G label + new one-letter code
network_pairs2_alpha_temp4$rbd_mutant <- network_pairs2_alpha_temp4$d614g_residue != network_pairs2_alpha_temp4$alpha_residue
network_pairs2_alpha_temp4$rbd_residue <- ifelse(network_pairs2_alpha_temp4$rbd_mutant,
                                                 paste0(network_pairs2_alpha_temp4$d614g_residue, substr(network_pairs2_alpha_temp4$alpha_residue, 1, 1)),
                                                 network_pairs2_alpha_temp4$d614g_residue)
## Sort back by `order` so the rows line up with network_pairs2 when building the edge list
network_pairs2_alpha_temp4 <- network_pairs2_alpha_temp4 %>% arrange(order)

## Edge list for Alpha: weight = Alpha distance, weight2 = distance change vs D614G,
## weight3 = the same change but NA where it is 0 (used later as an edge label)
simple_edge_df_alpha <- data.frame(
  from = network_pairs2_alpha_temp4$ace2_residue,
  to = network_pairs2_alpha_temp4$rbd_residue,
  weight = round(network_pairs2$dist_alpha,2),
  weight2 = round(network_pairs2$d614g_alpha_network,1),
  weight3 = round(network_pairs2$d614g_alpha_network,1),
  stringsAsFactors = FALSE
)
simple_edge_df_alpha[simple_edge_df_alpha$weight2 == 0,"weight3"] <- NA
simple_edge_network_alpha <- as.network(simple_edge_df_alpha)

## Tag each vertex as ACE2 or RBD depending on which side of the edge list its label came from
alpha_vertex_names <- network.vertex.names(simple_edge_network_alpha)
simple_edge_network_alpha %v% "type" <- ifelse(alpha_vertex_names %in% unique(network_pairs2_alpha_temp4$ace2_residue), "ACE2", "RBD")
## rbd_mut is a per-vertex attribute, so it is computed per vertex: TRUE when the vertex label is
## one of the mutated RBD labels. rbd_mutant itself has one entry per edge, not per vertex.
alpha_mutant_rbd_labels <- network_pairs2_alpha_temp4$rbd_residue[network_pairs2_alpha_temp4$rbd_mutant %in% TRUE]
simple_edge_network_alpha %v% "rbd_mut" <- alpha_vertex_names %in% alpha_mutant_rbd_labels

## Distances for Alpha by itself
## Shade edges by distance: darker = closer; anything outside (0, 10] is drawn white
alpha_edge_weight <- simple_edge_network_alpha %e% "weight"
set.edge.attribute(simple_edge_network_alpha, "color",
                   ifelse(alpha_edge_weight > 0 & alpha_edge_weight <= 3, "black",
                          ifelse(alpha_edge_weight > 3 & alpha_edge_weight <= 4, "grey40",
                                 ifelse(alpha_edge_weight > 4 & alpha_edge_weight <= 6, "grey60",
                                        ifelse(alpha_edge_weight > 6 & alpha_edge_weight <= 10, "grey80", "white")))))


Alpha_networkplot <- ggnet2(simple_edge_network_alpha, mode = temp_network, label = TRUE, size = 3, label.size = 1.6,
                            arrow.size = 0, edge.size = 1, arrow.type = "closed",
                            edge.label = "", edge.label.size = 1.5, edge.label.color = "black", edge.label.fill = NA,
                            node.alpha = 0.5, edge.color = "color", edge.alpha = 0.3,
                            label.color = "purple", label.alpha = 0.4, arrow.gap = 0.01,
                            layout.par = list(repulse.rad = 10, area = 10),
                            color = "type", palette = c("ACE2" = "pink", "RBD" = "lightblue")) +
  ggtitle("RBD scaffold associations with RBMs") +
  theme(panel.background = element_rect(color = "grey"), legend.title = element_text(face = "bold", size = rel(rs/2)), plot.title = element_text(size = rel(rs)), legend.position = "right", legend.text = element_text(size = rel(rs/2)))
print(Alpha_networkplot)
ggsave(filename = "Plots/Alpha_networkplot.pdf", plot = Alpha_networkplot, height = 2.25, width = 3)

## Changes in distance as compared to D614G
simple_edge_network_alpha2 <- simple_edge_network_alpha

## Colour edges by the signed change stored in weight2:
##   > 2 red, (1, 2] orange, < -2 blue, [-2, -1) cyan, everything else grey75.
## weight2 is rounded to one decimal, so a change of exactly 2.0 does occur; it is coloured
## orange, mirroring how -2.0 is coloured cyan.
alpha_distance_shift <- simple_edge_network_alpha2 %e% "weight2"
set.edge.attribute(simple_edge_network_alpha2, "color", 
                   ifelse(alpha_distance_shift > 2, "red",
                          ifelse(alpha_distance_shift > 1, "orange",
                                 ifelse(alpha_distance_shift < -2, "blue",
                                        ifelse(alpha_distance_shift < -1, "cyan", "grey75")))))

D614G_alpha_networkplot <- ggnet2(simple_edge_network_alpha2, mode = temp_network, label = TRUE, size = 3, label.size = 1.6,
                                  arrow.size = 0, edge.size = 1, arrow.type = "closed",
                                  edge.label = "weight3", edge.label.size = 1.5, edge.label.color = "black", edge.label.fill = NA,
                                  node.alpha = 0.5, edge.color = "color", edge.alpha = 0.3,
                                  label.color = "purple", label.alpha = 0.5, arrow.gap = 0.01,
                                  layout.par = list(repulse.rad = 10, area = 10),
                                  color = "type", palette = c("ACE2" = "pink", "RBD" = "lightblue")) +
  ggtitle("RBD scaffold associations with RBMs") +
  theme(panel.background = element_rect(color = "grey"), legend.title = element_text(face = "bold", size = rel(rs/2)), plot.title = element_text(size = rel(rs)), legend.position = "right", legend.text = element_text(size = rel(rs/2)))
print(D614G_alpha_networkplot)
ggsave(filename = "Plots/D614G_alpha_networkplot.pdf", plot = D614G_alpha_networkplot, height = 2.25, width = 3)
```

```{r Making a network of interacting residues for beta}

## Read the parsed atoms for PDB 7vx4 and split them by chain:
## chain A goes into the ACE2 table, chain E into the RBD table
pdb_7vx4_atoms <- read.table(file = "Data/PDB_contact_maps/parsed_7vx4_atoms.tsv", header = TRUE, sep = "\t")
pdb_7vx4_ace2 <- pdb_7vx4_atoms %>% filter(chain == "A")
pdb_7vx4_ace2$position <- as.numeric(pdb_7vx4_ace2$position)
pdb_7vx4_rbd <- pdb_7vx4_atoms %>% filter(chain == "E")
pdb_7vx4_rbd$position <- as.numeric(pdb_7vx4_rbd$position)

## Unique residue positions present in each chain
## NOTE(review): the original comments mention adding 18 (ACE2) / 316 (RBD) later for interpretation - confirm
pdb_7vx4_ace2_positionlist <- unique(pdb_7vx4_ace2$position)
pdb_7vx4_rbd_positionlist <- unique(pdb_7vx4_rbd$position)

## beta SEQUENCE
## One row per RBD residue (taken from its "C" atom), labelled as one-letter code + position
beta_residues <- pdb_7vx4_rbd %>%
  filter(name == "C") %>%
  transmute(rbd_position = position, aa3 = aa)
beta_residues$orig <- lapply(beta_residues$aa3, to_single_notation)
beta_residues$beta_residue <- paste0(beta_residues$orig, beta_residues$rbd_position)

## D614G and beta
## Signed distance change (D614G minus Beta) for pairs flagged as outliers in that comparison, 0 otherwise.
## A missing outlier flag stops the script here rather than producing a silent NA.
stopifnot(!anyNA(network_pairs2$d614g_beta_outlier))
network_pairs2$d614g_beta_network <- ifelse(network_pairs2$d614g_beta_outlier == TRUE,
                                            network_pairs2$dist_d614g - network_pairs2$dist_beta,
                                            0)

## Attach the ACE2, D614G RBD and Beta RBD residue labels to every pair
network_pairs2_beta_temp <- network_pairs2[,c("order","ace2_position","rbd_position")]
network_pairs2_beta_temp2 <- merge(network_pairs2_beta_temp,ace2_residues[,c("ace2_position","ace2_residue")],by = "ace2_position", all.x = TRUE)
network_pairs2_beta_temp3 <- merge(network_pairs2_beta_temp2,d614g_residues[,c("rbd_position","d614g_residue")],by = "rbd_position", all.x = TRUE)
network_pairs2_beta_temp4 <- merge(network_pairs2_beta_temp3,beta_residues[,c("rbd_position","beta_residue")],by = "rbd_position", all.x = TRUE)

## A position absent from either structure comes back as NA from the left joins and cannot be compared
stopifnot(!anyNA(network_pairs2_beta_temp4$d614g_residue), !anyNA(network_pairs2_beta_temp4$beta_residue))

## An RBD residue counts as mutated when its label differs between D614G and Beta;
## mutated residues are labelled D614G label + new one-letter code
network_pairs2_beta_temp4$rbd_mutant <- network_pairs2_beta_temp4$d614g_residue != network_pairs2_beta_temp4$beta_residue
network_pairs2_beta_temp4$rbd_residue <- ifelse(network_pairs2_beta_temp4$rbd_mutant,
                                                paste0(network_pairs2_beta_temp4$d614g_residue, substr(network_pairs2_beta_temp4$beta_residue, 1, 1)),
                                                network_pairs2_beta_temp4$d614g_residue)
## Sort back by `order` so the rows line up with network_pairs2 when building the edge list
network_pairs2_beta_temp4 <- network_pairs2_beta_temp4 %>% arrange(order)

## Edge list for Beta: weight = Beta distance, weight2 = distance change vs D614G,
## weight3 = the same change but NA where it is 0 (used later as an edge label)
simple_edge_df_beta <- data.frame(
  from = network_pairs2_beta_temp4$ace2_residue,
  to = network_pairs2_beta_temp4$rbd_residue,
  weight = round(network_pairs2$dist_beta,2),
  weight2 = round(network_pairs2$d614g_beta_network,1),
  weight3 = round(network_pairs2$d614g_beta_network,1),
  stringsAsFactors = FALSE
)
simple_edge_df_beta[simple_edge_df_beta$weight2 == 0,"weight3"] <- NA
simple_edge_network_beta <- as.network(simple_edge_df_beta)

## Tag each vertex as ACE2 or RBD depending on which side of the edge list its label came from
beta_vertex_names <- network.vertex.names(simple_edge_network_beta)
simple_edge_network_beta %v% "type" <- ifelse(beta_vertex_names %in% unique(network_pairs2_beta_temp4$ace2_residue), "ACE2", "RBD")
## rbd_mut is a per-vertex attribute, so it is computed per vertex: TRUE when the vertex label is
## one of the mutated RBD labels. rbd_mutant itself has one entry per edge, not per vertex.
beta_mutant_rbd_labels <- network_pairs2_beta_temp4$rbd_residue[network_pairs2_beta_temp4$rbd_mutant %in% TRUE]
simple_edge_network_beta %v% "rbd_mut" <- beta_vertex_names %in% beta_mutant_rbd_labels

## Distances for beta by itself
## Shade edges by distance: darker = closer; anything outside (0, 10] is drawn white
beta_edge_weight <- simple_edge_network_beta %e% "weight"
set.edge.attribute(simple_edge_network_beta, "color",
                   ifelse(beta_edge_weight > 0 & beta_edge_weight <= 3, "black",
                          ifelse(beta_edge_weight > 3 & beta_edge_weight <= 4, "grey30",
                                 ifelse(beta_edge_weight > 4 & beta_edge_weight <= 6, "grey50",
                                        ifelse(beta_edge_weight > 6 & beta_edge_weight <= 8, "grey70",
                                               ifelse(beta_edge_weight > 8 & beta_edge_weight <= 10, "grey90", "white"))))))


## The label transparency argument is `label.alpha`, matching the Alpha plot;
## a misspelt argument name is silently ignored by ggnet2.
beta_networkplot <- ggnet2(simple_edge_network_beta, mode = temp_network, label = TRUE, size = 3, label.size = 1.6,
                           arrow.size = 0, edge.size = 1, arrow.type = "closed",
                           edge.label = "", edge.label.size = 1.5, edge.label.color = "black", edge.label.fill = NA,
                           node.alpha = 0.5, edge.color = "color", edge.alpha = 0.3,
                           label.color = "purple", label.alpha = 0.4, arrow.gap = 0.01,
                           layout.par = list(repulse.rad = 10, area = 10),
                           color = "type", palette = c("ACE2" = "pink", "RBD" = "lightblue")) +
  ggtitle("RBD scaffold associations with RBMs") +
  theme(panel.background = element_rect(color = "grey"), legend.title = element_text(face = "bold", size = rel(rs/2)), plot.title = element_text(size = rel(rs)), legend.position = "right", legend.text = element_text(size = rel(rs/2)))
print(beta_networkplot)
ggsave(filename = "Plots/beta_networkplot.pdf", plot = beta_networkplot, height = 2.25, width = 3)

## Changes in distance as compared to D614G
simple_edge_network_beta2 <- simple_edge_network_beta

## Colour edges by the signed change stored in weight2:
##   > 2 red, (1, 2] orange, < -2 blue, [-2, -1) cyan, everything else grey75.
## weight2 is rounded to one decimal, so a change of exactly 2.0 does occur; it is coloured
## orange, mirroring how -2.0 is coloured cyan.
beta_distance_shift <- simple_edge_network_beta2 %e% "weight2"
set.edge.attribute(simple_edge_network_beta2, "color", 
                   ifelse(beta_distance_shift > 2, "red",
                          ifelse(beta_distance_shift > 1, "orange",
                                 ifelse(beta_distance_shift < -2, "blue",
                                        ifelse(beta_distance_shift < -1, "cyan", "grey75")))))

## The label transparency argument is `label.alpha`, matching the Alpha plot;
## a misspelt argument name is silently ignored by ggnet2.
D614G_beta_networkplot <- ggnet2(simple_edge_network_beta2, mode = temp_network, label = TRUE, size = 3, label.size = 1.6,
                                 arrow.size = 0, edge.size = 1, arrow.type = "closed",
                                 edge.label = "weight3", edge.label.size = 1.5, edge.label.color = "black", edge.label.fill = NA,
                                 node.alpha = 0.5, edge.color = "color", edge.alpha = 0.3,
                                 label.color = "purple", label.alpha = 0.5, arrow.gap = 0.01,
                                 layout.par = list(repulse.rad = 10, area = 10),
                                 color = "type", palette = c("ACE2" = "pink", "RBD" = "lightblue")) +
  ggtitle("RBD scaffold associations with RBMs") +
  theme(panel.background = element_rect(color = "grey"), legend.title = element_text(face = "bold", size = rel(rs/2)), plot.title = element_text(size = rel(rs)), legend.position = "right", legend.text = element_text(size = rel(rs/2)))
print(D614G_beta_networkplot)
ggsave(filename = "Plots/D614G_beta_networkplot.pdf", plot = D614G_beta_networkplot, height = 2.25, width = 3)
```

```{r Making a network of interacting residues for gamma}

## Read the parsed atoms for PDB 7ekc and split them by chain:
## chain A goes into the ACE2 table, chain B into the RBD table
pdb_7ekc_atoms <- read.table(file = "Data/PDB_contact_maps/parsed_7ekc_atoms.tsv", header = TRUE, sep = "\t")
pdb_7ekc_ace2 <- pdb_7ekc_atoms %>% filter(chain == "A")
pdb_7ekc_ace2$position <- as.numeric(pdb_7ekc_ace2$position)
pdb_7ekc_rbd <- pdb_7ekc_atoms %>% filter(chain == "B")
pdb_7ekc_rbd$position <- as.numeric(pdb_7ekc_rbd$position)

## Unique residue positions present in each chain
## NOTE(review): the original comments mention adding 18 (ACE2) / 316 (RBD) later for interpretation - confirm
pdb_7ekc_ace2_positionlist <- unique(pdb_7ekc_ace2$position)
pdb_7ekc_rbd_positionlist <- unique(pdb_7ekc_rbd$position)

## gamma SEQUENCE
## One row per RBD residue (taken from its "C" atom), labelled as one-letter code + position
gamma_residues <- pdb_7ekc_rbd %>%
  filter(name == "C") %>%
  transmute(rbd_position = position, aa3 = aa)
gamma_residues$orig <- lapply(gamma_residues$aa3, to_single_notation)
gamma_residues$gamma_residue <- paste0(gamma_residues$orig, gamma_residues$rbd_position)

## D614G and gamma
## Signed distance change (D614G minus Gamma) for pairs flagged as outliers in that comparison, 0 otherwise.
## A missing outlier flag stops the script here rather than producing a silent NA.
stopifnot(!anyNA(network_pairs2$d614g_gamma_outlier))
network_pairs2$d614g_gamma_network <- ifelse(network_pairs2$d614g_gamma_outlier == TRUE,
                                             network_pairs2$dist_d614g - network_pairs2$dist_gamma,
                                             0)

## Attach the ACE2, D614G RBD and Gamma RBD residue labels to every pair
network_pairs2_gamma_temp <- network_pairs2[,c("order","ace2_position","rbd_position")]
network_pairs2_gamma_temp2 <- merge(network_pairs2_gamma_temp,ace2_residues[,c("ace2_position","ace2_residue")],by = "ace2_position", all.x = TRUE)
network_pairs2_gamma_temp3 <- merge(network_pairs2_gamma_temp2,d614g_residues[,c("rbd_position","d614g_residue")],by = "rbd_position", all.x = TRUE)
network_pairs2_gamma_temp4 <- merge(network_pairs2_gamma_temp3,gamma_residues[,c("rbd_position","gamma_residue")],by = "rbd_position", all.x = TRUE)

## A position absent from either structure comes back as NA from the left joins and cannot be compared
stopifnot(!anyNA(network_pairs2_gamma_temp4$d614g_residue), !anyNA(network_pairs2_gamma_temp4$gamma_residue))

## An RBD residue counts as mutated when its label differs between D614G and Gamma;
## mutated residues are labelled D614G label + new one-letter code
network_pairs2_gamma_temp4$rbd_mutant <- network_pairs2_gamma_temp4$d614g_residue != network_pairs2_gamma_temp4$gamma_residue
network_pairs2_gamma_temp4$rbd_residue <- ifelse(network_pairs2_gamma_temp4$rbd_mutant,
                                                 paste0(network_pairs2_gamma_temp4$d614g_residue, substr(network_pairs2_gamma_temp4$gamma_residue, 1, 1)),
                                                 network_pairs2_gamma_temp4$d614g_residue)
## Sort back by `order` so the rows line up with network_pairs2 when building the edge list
network_pairs2_gamma_temp4 <- network_pairs2_gamma_temp4 %>% arrange(order)

simple_edge_df_gamma <- data.frame(
  from = network_pairs2_gamma_temp4$ace2_residue,
  to = network_pairs2_gamma_temp4$rbd_residue,
  weight = round(network_pairs2$dist_gamma,2),
  weight2 = round(network_pairs2$d614g_gamma_network,1),
  weight3 = round(network_pairs2$d614g_gamma_network,1),
  stringsAsFactors = FALSE
)
simple_edge_df_gamma[simple_edge_df_gamma$weight2 == 0,"weight3"] <- NA
simple_edge_network_gamma <- as.network(simple_edge_df_gamma)

x = network.vertex.names(simple_edge_network_gamma)
x = ifelse(x %in% unique(network_pairs2_gamma_temp4$ace2_residue), "ACE2", "RBD")
simple_edge_network_gamma %v% "type" = x
simple_edge_network_gamma %v% "rbd_mut" = network_pairs2_gamma_temp4$rbd_mutant

## Distances for gamma by itself
set.edge.attribute(simple_edge_network_gamma, "color", ifelse(simple_edge_network_gamma %e% "weight" > 0 & simple_edge_network_gamma %e% "weight" <= 3, "black",
                                                              ifelse(simple_edge_network_gamma %e% "weight" > 3 & simple_edge_network_gamma %e% "weight" <= 4, "grey30",
                                                                     ifelse(simple_edge_network_gamma %e% "weight" > 4 & simple_edge_network_gamma %e% "weight" <= 6, "grey50",
                                                                            ifelse(simple_edge_network_gamma %e% "weight" > 6 & simple_edge_network_gamma %e% "weight" <= 8, "grey70",
                                                                                   ifelse(simple_edge_network_gamma %e% "weight" > 8 & simple_edge_network_gamma %e% "weight" <= 10, "grey90","white"))))))


gamma_networkplot <- ggnet2(simple_edge_network_gamma, mode = temp_network, label = TRUE, size = 3, label.size = 1.6, arrow.size = 0, edge.size = 1, arrow.type = "closed", edge.label = "", edge.label.size = 1.5, edge.label.color = "black", edge.label.fill = NA, node.alpha = 0.5, edge.color = "color", edge.alpha= 0.3, label.color = "purple", label.gamma = 0.4, arrow.gap = 0.01, layout.par = list(repulse.rad = 10, area = 10), color = "type", palette = c("ACE2" = "pink", "RBD" = "lightblue")) +
  ggtitle("RBD scaffold associations with RBMs") +
  theme(panel.background = element_rect(color = "grey"), legend.title = element_text(face = "bold", size = rel(rs/2)), plot.title = element_text(size = rel(rs)), legend.position = "right", legend.text = element_text(size = rel(rs/2)))
print(gamma_networkplot)
ggsave(file = "Plots/gamma_networkplot.pdf", gamma_networkplot, height = 2.25, width = 3)

## Changes in distance as compared to D614G
simple_edge_network_gamma2 <- simple_edge_network_gamma

set.edge.attribute(simple_edge_network_gamma2, "color", 
                   ifelse(simple_edge_network_gamma2 %e% "weight2" > 1 & simple_edge_network_gamma2 %e% "weight2" < 2, "orange",
                          ifelse(simple_edge_network_gamma2 %e% "weight2" > 2, "red",
                                 ifelse(simple_edge_network_gamma2 %e% "weight2" < -2, "blue",
                                 ifelse(simple_edge_network_gamma2 %e% "weight2" < -1, "cyan","grey75")))))

D614G_gamma_networkplot <- ggnet2(simple_edge_network_gamma2, mode = temp_network, label = TRUE, size = 3, label.size = 1.6, arrow.size = 0, edge.size = 1, arrow.type = "closed", edge.label = "weight3", edge.label.size = 1.5, edge.label.color = "black", edge.label.fill = NA, node.alpha = 0.5, edge.color = "color", edge.alpha = 0.3, label.color = "purple", label.gamma = 0.5, arrow.gap = 0.01, layout.par = list(repulse.rad = 10, area = 10), color = "type", palette = c("ACE2" = "pink", "RBD" = "lightblue")) +
  ggtitle("RBD scaffold associations with RBMs") +
  theme(panel.background = element_rect(color = "grey"), legend.title = element_text(face = "bold", size = rel(rs/2)), plot.title = element_text(size = rel(rs)), legend.position = "right", legend.text = element_text(size = rel(rs/2)))
print(D614G_gamma_networkplot)
ggsave(file = "Plots/D614G_gamma_networkplot.pdf", D614G_gamma_networkplot, height = 2.25, width = 3)
```

```{r Making a network of interacting residues for delta}

## Build and plot the ACE2-RBD contact network for the delta structure (PDB 7W9I).
## Relies on objects created earlier in the script: network_pairs2, ace2_residues,
## d614g_residues, temp_network, rs and to_single_notation().

## Read the parsed atom table and split it by chain (chain A -> ACE2 object, chain E -> RBD object)
pdb_7w9i_atoms <- read.table(file = "Data/PDB_contact_maps/parsed_7w9i_atoms.tsv", header = TRUE, sep = "\t")
pdb_7w9i_ace2 <- pdb_7w9i_atoms %>% filter(chain == "A") %>% mutate(position = as.numeric(position))
pdb_7w9i_rbd <- pdb_7w9i_atoms %>% filter(chain == "E") %>% mutate(position = as.numeric(position))
pdb_7w9i_ace2_positionlist <- as.numeric(unique(pdb_7w9i_ace2$position)) ## For interpretation, need to add 18 later since actually starts at residue 19
pdb_7w9i_rbd_positionlist <- as.numeric(unique(pdb_7w9i_rbd$position)) ## For interpretation, need to add 316 later since actually starts at residue 331
## NOTE(review): an offset of 316 does not line up with a start at residue 331 the way 18 lines up with 19 - confirm the intended number.

## delta SEQUENCE
## Keep the atom rows named "C" (presumably one per residue) and label each as <one-letter code><position>
delta_residues <- pdb_7w9i_rbd %>% filter(name == "C") %>% mutate(aa3 = aa, rbd_position = position) %>% select(rbd_position, aa3)
delta_residues$orig <- lapply(delta_residues$aa3, to_single_notation)
delta_residues$delta_residue <- paste0(delta_residues$orig, delta_residues$rbd_position)

## D614G and delta
## Distance change (D614G minus delta) for pairs flagged as outliers, 0 for every other pair
network_pairs2$d614g_delta_network <- ifelse(
  network_pairs2$d614g_delta_outlier == TRUE,
  network_pairs2$dist_d614g - network_pairs2$dist_delta,
  0
)

## Attach residue labels (ACE2, D614G RBD, delta RBD) to every ACE2/RBD position pair
network_pairs2_delta_temp <- network_pairs2[, c("order", "ace2_position", "rbd_position")]
network_pairs2_delta_temp2 <- merge(network_pairs2_delta_temp, ace2_residues[, c("ace2_position", "ace2_residue")], by = "ace2_position", all.x = TRUE)
network_pairs2_delta_temp3 <- merge(network_pairs2_delta_temp2, d614g_residues[, c("rbd_position", "d614g_residue")], by = "rbd_position", all.x = TRUE)
network_pairs2_delta_temp4 <- merge(network_pairs2_delta_temp3, delta_residues[, c("rbd_position", "delta_residue")], by = "rbd_position", all.x = TRUE)

## A pair is "mutant" when the D614G and delta labels differ; mutant RBD nodes are
## named <D614G label><delta one-letter code>, unchanged ones keep the D614G label.
delta_wt_label <- as.character(network_pairs2_delta_temp4$d614g_residue)
delta_var_label <- as.character(network_pairs2_delta_temp4$delta_residue)
delta_is_mutant <- delta_wt_label != delta_var_label
network_pairs2_delta_temp4$rbd_mutant <- delta_is_mutant
network_pairs2_delta_temp4$rbd_residue <- ifelse(delta_is_mutant, paste0(delta_wt_label, substr(delta_var_label, 1, 1)), delta_wt_label)

## merge() reorders rows; sort back by `order` because the edge table below pairs these labels
## with network_pairs2 columns row by row (assumes network_pairs2 itself is in `order` order - TODO confirm)
network_pairs2_delta_temp4 <- network_pairs2_delta_temp4 %>% arrange(order)

## Edge table: weight = delta distance, weight2 = change vs D614G, weight3 = same change but NA when 0 (used as edge label)
simple_edge_df_delta <- data.frame(
  from = network_pairs2_delta_temp4$ace2_residue,
  to = network_pairs2_delta_temp4$rbd_residue,
  weight = round(network_pairs2$dist_delta, 2),
  weight2 = round(network_pairs2$d614g_delta_network, 1),
  weight3 = round(network_pairs2$d614g_delta_network, 1),
  stringsAsFactors = FALSE
)
simple_edge_df_delta[simple_edge_df_delta$weight2 == 0, "weight3"] <- NA
simple_edge_network_delta <- as.network(simple_edge_df_delta)

## Tag every vertex as ACE2 or RBD depending on whether its name is an ACE2 residue label
delta_vertex_names <- network.vertex.names(simple_edge_network_delta)
simple_edge_network_delta %v% "type" <- ifelse(delta_vertex_names %in% unique(network_pairs2_delta_temp4$ace2_residue), "ACE2", "RBD")
## NOTE(review): rbd_mutant has one value per edge row, not per vertex - confirm this vertex attribute is what was intended
simple_edge_network_delta %v% "rbd_mut" <- network_pairs2_delta_temp4$rbd_mutant

## Distances for delta by itself
## Darker edges = shorter distances; anything outside (0, 10] is drawn white
delta_edge_dist <- simple_edge_network_delta %e% "weight"
set.edge.attribute(simple_edge_network_delta, "color",
                   ifelse(delta_edge_dist > 0 & delta_edge_dist <= 3, "black",
                   ifelse(delta_edge_dist > 3 & delta_edge_dist <= 4, "grey30",
                   ifelse(delta_edge_dist > 4 & delta_edge_dist <= 6, "grey50",
                   ifelse(delta_edge_dist > 6 & delta_edge_dist <= 8, "grey70",
                   ifelse(delta_edge_dist > 8 & delta_edge_dist <= 10, "grey90", "white"))))))

## label.alpha is the ggnet2 argument controlling label transparency
## (the previous `label.delta` is not a ggnet2 argument, so it was not applied)
delta_networkplot <- ggnet2(
  simple_edge_network_delta, mode = temp_network, label = TRUE, size = 3, label.size = 1.6,
  arrow.size = 0, edge.size = 1, arrow.type = "closed",
  edge.label = "", edge.label.size = 1.5, edge.label.color = "black", edge.label.fill = NA,
  node.alpha = 0.5, edge.color = "color", edge.alpha = 0.3,
  label.color = "purple", label.alpha = 0.4, arrow.gap = 0.01,
  layout.par = list(repulse.rad = 10, area = 10),
  color = "type", palette = c("ACE2" = "pink", "RBD" = "lightblue")
) +
  ggtitle("RBD scaffold associations with RBMs") +
  theme(panel.background = element_rect(color = "grey"), legend.title = element_text(face = "bold", size = rel(rs/2)), plot.title = element_text(size = rel(rs)), legend.position = "right", legend.text = element_text(size = rel(rs/2)))
print(delta_networkplot)
ggsave(filename = "Plots/delta_networkplot.pdf", plot = delta_networkplot, height = 2.25, width = 3)

## Changes in distance as compared to D614G
simple_edge_network_delta2 <- simple_edge_network_delta

## Colour edges by the size and sign of the change. weight2 is rounded to one decimal,
## so exactly +/-2.0 can occur: the bins are closed at 2 so that +2.0 is red (not grey)
## and -2.0 is blue, mirroring the positive side.
delta_edge_shift <- simple_edge_network_delta2 %e% "weight2"
set.edge.attribute(simple_edge_network_delta2, "color",
                   ifelse(delta_edge_shift > 1 & delta_edge_shift < 2, "orange",
                   ifelse(delta_edge_shift >= 2, "red",
                   ifelse(delta_edge_shift <= -2, "blue",
                   ifelse(delta_edge_shift < -1, "cyan", "grey75")))))

D614G_delta_networkplot <- ggnet2(
  simple_edge_network_delta2, mode = temp_network, label = TRUE, size = 3, label.size = 1.6,
  arrow.size = 0, edge.size = 1, arrow.type = "closed",
  edge.label = "weight3", edge.label.size = 1.5, edge.label.color = "black", edge.label.fill = NA,
  node.alpha = 0.5, edge.color = "color", edge.alpha = 0.3,
  label.color = "purple", label.alpha = 0.5, arrow.gap = 0.01,
  layout.par = list(repulse.rad = 10, area = 10),
  color = "type", palette = c("ACE2" = "pink", "RBD" = "lightblue")
) +
  ggtitle("RBD scaffold associations with RBMs") +
  theme(panel.background = element_rect(color = "grey"), legend.title = element_text(face = "bold", size = rel(rs/2)), plot.title = element_text(size = rel(rs)), legend.position = "right", legend.text = element_text(size = rel(rs/2)))
print(D614G_delta_networkplot)
ggsave(filename = "Plots/D614G_delta_networkplot.pdf", plot = D614G_delta_networkplot, height = 2.25, width = 3)
```

```{r Making a network of interacting residues for ba1}

## Build and plot the ACE2-RBD contact network for the ba1 structure (PDB 7WK6).
## Relies on objects created earlier in the script: network_pairs2, ace2_residues,
## d614g_residues, temp_network, rs and to_single_notation().

## Read the parsed atom table and split it by chain (chain A -> ACE2 object, chain E -> RBD object)
pdb_7wk6_atoms <- read.table(file = "Data/PDB_contact_maps/parsed_7wk6_atoms.tsv", header = TRUE, sep = "\t")
pdb_7wk6_ace2 <- pdb_7wk6_atoms %>% filter(chain == "A") %>% mutate(position = as.numeric(position))
pdb_7wk6_rbd <- pdb_7wk6_atoms %>% filter(chain == "E") %>% mutate(position = as.numeric(position))
pdb_7wk6_ace2_positionlist <- as.numeric(unique(pdb_7wk6_ace2$position)) ## For interpretation, need to add 18 later since actually starts at residue 19
pdb_7wk6_rbd_positionlist <- as.numeric(unique(pdb_7wk6_rbd$position)) ## For interpretation, need to add 316 later since actually starts at residue 331
## NOTE(review): an offset of 316 does not line up with a start at residue 331 the way 18 lines up with 19 - confirm the intended number.

## ba1 SEQUENCE
## Keep the atom rows named "C" (presumably one per residue) and label each as <one-letter code><position>
ba1_residues <- pdb_7wk6_rbd %>% filter(name == "C") %>% mutate(aa3 = aa, rbd_position = position) %>% select(rbd_position, aa3)
ba1_residues$orig <- lapply(ba1_residues$aa3, to_single_notation)
ba1_residues$ba1_residue <- paste0(ba1_residues$orig, ba1_residues$rbd_position)

## D614G and ba1
## Distance change (D614G minus ba1) for pairs flagged as outliers, 0 for every other pair
network_pairs2$d614g_ba1_network <- ifelse(
  network_pairs2$d614g_ba1_outlier == TRUE,
  network_pairs2$dist_d614g - network_pairs2$dist_ba1,
  0
)

## Attach residue labels (ACE2, D614G RBD, ba1 RBD) to every ACE2/RBD position pair
network_pairs2_ba1_temp <- network_pairs2[, c("order", "ace2_position", "rbd_position")]
network_pairs2_ba1_temp2 <- merge(network_pairs2_ba1_temp, ace2_residues[, c("ace2_position", "ace2_residue")], by = "ace2_position", all.x = TRUE)
network_pairs2_ba1_temp3 <- merge(network_pairs2_ba1_temp2, d614g_residues[, c("rbd_position", "d614g_residue")], by = "rbd_position", all.x = TRUE)
network_pairs2_ba1_temp4 <- merge(network_pairs2_ba1_temp3, ba1_residues[, c("rbd_position", "ba1_residue")], by = "rbd_position", all.x = TRUE)

## A pair is "mutant" when the D614G and ba1 labels differ; mutant RBD nodes are
## named <D614G label><ba1 one-letter code>, unchanged ones keep the D614G label.
ba1_wt_label <- as.character(network_pairs2_ba1_temp4$d614g_residue)
ba1_var_label <- as.character(network_pairs2_ba1_temp4$ba1_residue)
ba1_is_mutant <- ba1_wt_label != ba1_var_label
network_pairs2_ba1_temp4$rbd_mutant <- ba1_is_mutant
network_pairs2_ba1_temp4$rbd_residue <- ifelse(ba1_is_mutant, paste0(ba1_wt_label, substr(ba1_var_label, 1, 1)), ba1_wt_label)

## merge() reorders rows; sort back by `order` because the edge table below pairs these labels
## with network_pairs2 columns row by row (assumes network_pairs2 itself is in `order` order - TODO confirm)
network_pairs2_ba1_temp4 <- network_pairs2_ba1_temp4 %>% arrange(order)

## Edge table: weight = ba1 distance, weight2 = change vs D614G, weight3 = same change but NA when 0 (used as edge label)
simple_edge_df_ba1 <- data.frame(
  from = network_pairs2_ba1_temp4$ace2_residue,
  to = network_pairs2_ba1_temp4$rbd_residue,
  weight = round(network_pairs2$dist_ba1, 2),
  weight2 = round(network_pairs2$d614g_ba1_network, 1),
  weight3 = round(network_pairs2$d614g_ba1_network, 1),
  stringsAsFactors = FALSE
)
simple_edge_df_ba1[simple_edge_df_ba1$weight2 == 0, "weight3"] <- NA
simple_edge_network_ba1 <- as.network(simple_edge_df_ba1)

## Tag every vertex as ACE2 or RBD depending on whether its name is an ACE2 residue label
ba1_vertex_names <- network.vertex.names(simple_edge_network_ba1)
simple_edge_network_ba1 %v% "type" <- ifelse(ba1_vertex_names %in% unique(network_pairs2_ba1_temp4$ace2_residue), "ACE2", "RBD")
## NOTE(review): rbd_mutant has one value per edge row, not per vertex - confirm this vertex attribute is what was intended
simple_edge_network_ba1 %v% "rbd_mut" <- network_pairs2_ba1_temp4$rbd_mutant

## Distances for ba1 by itself
## Darker edges = shorter distances; anything outside (0, 10] is drawn white
ba1_edge_dist <- simple_edge_network_ba1 %e% "weight"
set.edge.attribute(simple_edge_network_ba1, "color",
                   ifelse(ba1_edge_dist > 0 & ba1_edge_dist <= 3, "black",
                   ifelse(ba1_edge_dist > 3 & ba1_edge_dist <= 4, "grey30",
                   ifelse(ba1_edge_dist > 4 & ba1_edge_dist <= 6, "grey50",
                   ifelse(ba1_edge_dist > 6 & ba1_edge_dist <= 8, "grey70",
                   ifelse(ba1_edge_dist > 8 & ba1_edge_dist <= 10, "grey90", "white"))))))

## label.alpha is the ggnet2 argument controlling label transparency
## (the previous `label.ba1` is not a ggnet2 argument, so it was not applied)
ba1_networkplot <- ggnet2(
  simple_edge_network_ba1, mode = temp_network, label = TRUE, size = 3, label.size = 1.6,
  arrow.size = 0, edge.size = 1, arrow.type = "closed",
  edge.label = "", edge.label.size = 1.5, edge.label.color = "black", edge.label.fill = NA,
  node.alpha = 0.5, edge.color = "color", edge.alpha = 0.3,
  label.color = "purple", label.alpha = 0.4, arrow.gap = 0.01,
  layout.par = list(repulse.rad = 10, area = 10),
  color = "type", palette = c("ACE2" = "pink", "RBD" = "lightblue")
) +
  ggtitle("RBD scaffold associations with RBMs") +
  theme(panel.background = element_rect(color = "grey"), legend.title = element_text(face = "bold", size = rel(rs/2)), plot.title = element_text(size = rel(rs)), legend.position = "right", legend.text = element_text(size = rel(rs/2)))
print(ba1_networkplot)
ggsave(filename = "Plots/ba1_networkplot.pdf", plot = ba1_networkplot, height = 2.25, width = 3)

## Changes in distance as compared to D614G
simple_edge_network_ba12 <- simple_edge_network_ba1

## Colour edges by the size and sign of the change. weight2 is rounded to one decimal,
## so exactly +/-2.0 can occur: the bins are closed at 2 so that +2.0 is red (not grey)
## and -2.0 is blue, mirroring the positive side.
ba1_edge_shift <- simple_edge_network_ba12 %e% "weight2"
set.edge.attribute(simple_edge_network_ba12, "color",
                   ifelse(ba1_edge_shift > 1 & ba1_edge_shift < 2, "orange",
                   ifelse(ba1_edge_shift >= 2, "red",
                   ifelse(ba1_edge_shift <= -2, "blue",
                   ifelse(ba1_edge_shift < -1, "cyan", "grey75")))))

D614G_ba1_networkplot <- ggnet2(
  simple_edge_network_ba12, mode = temp_network, label = TRUE, size = 3, label.size = 1.6,
  arrow.size = 0, edge.size = 1, arrow.type = "closed",
  edge.label = "weight3", edge.label.size = 1.5, edge.label.color = "black", edge.label.fill = NA,
  node.alpha = 0.5, edge.color = "color", edge.alpha = 0.3,
  label.color = "purple", label.alpha = 0.5, arrow.gap = 0.01,
  layout.par = list(repulse.rad = 10, area = 10),
  color = "type", palette = c("ACE2" = "pink", "RBD" = "lightblue")
) +
  ggtitle("RBD scaffold associations with RBMs") +
  theme(panel.background = element_rect(color = "grey"), legend.title = element_text(face = "bold", size = rel(rs/2)), plot.title = element_text(size = rel(rs)), legend.position = "right", legend.text = element_text(size = rel(rs/2)))
print(D614G_ba1_networkplot)
ggsave(filename = "Plots/D614G_ba1_networkplot.pdf", plot = D614G_ba1_networkplot, height = 2.25, width = 3)
```

```{r For plotting via pymol}
## Print PyMOL `select` commands for each variant: the mutated RBD positions, then the
## RBD positions of outlier pairs that moved closer to / further from ACE2.
## Each command is auto-printed so it can be copied into PyMOL.
## NOTE(review): alpha/beta filter on d614g_<variant>_network beyond +/-1, whereas
## gamma/delta/ba1 filter on the sign of d614g_<variant> - confirm the difference is deliberate.
paste0("select alpha_muts, alpha_rbd and resi 501")
paste0("select alpha_closer_to_ace2, alpha_rbd and resi ", paste(unique(subset(network_pairs2, d614g_alpha_outlier == TRUE & d614g_alpha_network > 1)$rbd_position), collapse = "+"))
paste0("select alpha_further_from_ace2, alpha_rbd and resi ", paste(unique(subset(network_pairs2, d614g_alpha_outlier == TRUE & d614g_alpha_network < -1)$rbd_position), collapse = "+"))

paste0("select beta_muts, beta_rbd and resi 417+484+501")
paste0("select beta_closer_to_ace2, beta_rbd and resi ", paste(unique(subset(network_pairs2, d614g_beta_outlier == TRUE & d614g_beta_network > 1)$rbd_position), collapse = "+"))
paste0("select beta_further_from_ace2, beta_rbd and resi ", paste(unique(subset(network_pairs2, d614g_beta_outlier == TRUE & d614g_beta_network < -1)$rbd_position), collapse = "+"))

paste0("select gamma_muts, gamma_rbd and resi 417+484+501")
paste0("select gamma_closer_to_ace2, gamma_rbd and resi ", paste(unique(subset(network_pairs2, d614g_gamma_outlier == TRUE & d614g_gamma > 0)$rbd_position), collapse = "+"))
paste0("select gamma_further_from_ace2, gamma_rbd and resi ", paste(unique(subset(network_pairs2, d614g_gamma_outlier == TRUE & d614g_gamma < 0)$rbd_position), collapse = "+"))

paste0("select delta_muts, delta_rbd and resi 452+478")
paste0("select delta_closer_to_ace2, delta_rbd and resi ", paste(unique(subset(network_pairs2, d614g_delta_outlier == TRUE & d614g_delta > 0)$rbd_position), collapse = "+"))
paste0("select delta_further_from_ace2, delta_rbd and resi ", paste(unique(subset(network_pairs2, d614g_delta_outlier == TRUE & d614g_delta < 0)$rbd_position), collapse = "+"))

## BA.1 RBD substitutions (G339D, S371L, S373P, S375F, K417N, N440K, G446S, S477N, T478K,
## E484A, Q493R, G496S, Q498R, N501Y, Y505H); the previous 452+478 was the Delta list.
paste0("select ba1_muts, ba1_rbd and resi 339+371+373+375+417+440+446+477+478+484+493+496+498+501+505")
paste0("select ba1_closer_to_ace2, ba1_rbd and resi ", paste(unique(subset(network_pairs2, d614g_ba1_outlier == TRUE & d614g_ba1 > 0)$rbd_position), collapse = "+"))
paste0("select ba1_further_from_ace2, ba1_rbd and resi ", paste(unique(subset(network_pairs2, d614g_ba1_outlier == TRUE & d614g_ba1 < 0)$rbd_position), collapse = "+"))

```

## N501Y mutation in Alpha, Beta, and Gamma
N501Y (RBD) mutation causes Y501 (RBD) to move much closer to D38 (ACE2).
D38 (ACE2) moves further from Q498 (RBD) and G496 (RBD).
K353 (ACE2) also moves further from Q498 (RBD), G496 (RBD), and G447 (RBD), but moves closer to Y453 (RBD).
Q498 (RBD) moves closer to L45 (ACE2) and N49 (ACE2).

## K417 and E484K mutations
N501Y already seems primed to make more distant changes:
K417 (RBD) moving away from D30 (ACE2) and Q493 (RBD) moving away from E35 (ACE2)

But this is further accentuated by two additional mutations in Beta and Gamma.
K417N and K417T mutations both increase the shift of the N417/T417 movement away from D30 (ACE2), but also move it away from H34 (ACE2).
E484K mutation increases the shift of Q493 (RBD) moving away from E35 (ACE2), but K484 (RBD) also moves drastically away from K31 (ACE2).

BA1 also has K417N, and also exhibits that shift away from D30 (ACE2) and H34 (ACE2).

## E484A in BA1
The E484A mutation is found alongside Q493R. A484 moves away from K31 (ACE2), much like K484 (E484K) does in Beta/Gamma. Furthermore, K31 also moves away from L492 and F490.


## Alpha -       
Q498 (RBD) also moves away from K353 (ACE2), while moving closer to L45 (ACE2) and N49 (ACE2). 
As K353 (ACE2) moves further from Q498 (RBD), it also moves further from G496 (RBD), F497 (RBD), and G447 (RBD), while moving closer to Y453 (RBD).

## Beta - N501Y (RBD) mutation causes Y501 (RBD) to move much closer to D38 (ACE2) and slightly closer to E37 (ACE2).
[D38 (ACE2) moves further from Q498 (RBD).]
K353 (ACE2) moves further from G496 (RBD) and Q498 (RBD).


```{r quantitating relationships between mutations and structural impacts}
## Assemble one table per variant pairing each ACE2/RBD residue label with its distance
## and its difference from D614G, flag pairs whose RBD label carries a substitution,
## then plot the differences per variant.

## Helper: build the per-variant table.
##   pair_labels   - data frame holding the ace2_residue and rbd_residue label columns
##   distance      - distances for the variant (rounded to 2 decimals here)
##   difference    - differences from D614G (rounded to 2 decimals here)
##   variant_label - value written into the `variant` column
## `mutated` is the number of non-digit characters in the RBD label minus one,
## i.e. 0 for a label like "N501" and 1 for a label like "N501Y".
make_variant_distance_df <- function(pair_labels, distance, difference, variant_label) {
  out <- data.frame(
    from = pair_labels$ace2_residue,
    to = pair_labels$rbd_residue,
    distance = round(distance, 2),
    difference = round(difference, 2),
    stringsAsFactors = FALSE
  )
  out$mutated <- nchar(gsub("[0-9]", "", out$to)) - 1
  out$variant <- variant_label
  out
}

df_alpha <- make_variant_distance_df(network_pairs2_alpha_temp4, network_pairs2$dist_alpha, network_pairs2$d614g_alpha, "Alpha")
ggplot(df_alpha, aes(x = mutated, y = difference)) + geom_quasirandom()

df_beta <- make_variant_distance_df(network_pairs2_beta_temp4, network_pairs2$dist_beta, network_pairs2$d614g_beta, "Beta")
ggplot(df_beta, aes(x = mutated, y = difference)) + geom_quasirandom()

df_gamma <- make_variant_distance_df(network_pairs2_gamma_temp4, network_pairs2$dist_gamma, network_pairs2$d614g_gamma, "Gamma")
ggplot(df_gamma, aes(x = mutated, y = difference)) + geom_quasirandom()

df_delta <- make_variant_distance_df(network_pairs2_delta_temp4, network_pairs2$dist_delta, network_pairs2$d614g_delta, "Delta")
ggplot(df_delta, aes(x = mutated, y = difference)) + geom_quasirandom()

df_ba1 <- make_variant_distance_df(network_pairs2_ba1_temp4, network_pairs2$dist_ba1, network_pairs2$d614g_ba1, "BA1")
ggplot(df_ba1, aes(x = mutated, y = difference)) + geom_quasirandom()

## Stack all variants; fix the variant order on the x axis and treat `mutated` as a category
combined_var_mut_diff <- rbind(df_alpha, df_beta, df_gamma, df_delta, df_ba1)
combined_var_mut_diff$variant <- factor(
  combined_var_mut_diff$variant,
  levels = c("D614G", "Alpha", "Beta", "Gamma", "Delta", "BA1")
)
combined_var_mut_diff$mutated <- factor(combined_var_mut_diff$mutated)
combined_var_mut_diff_colorvector <- c("0" = "grey80", "1" = "darkred")

## Beeswarm of differences per variant, coloured by whether the RBD residue is mutated
Distance_diffs_by_type <- ggplot(combined_var_mut_diff, aes(x = variant, y = difference, color = mutated)) +
  geom_quasirandom(alpha = 0.6) +
  scale_color_manual(values = combined_var_mut_diff_colorvector) +
  scale_y_continuous(limits = c(-0.9, 0.45)) +
  labs(x = NULL, y = "Relative change in distance") +
  theme(
    panel.grid.major = element_blank(),
    legend.position = "none",
    axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1)
  )
Distance_diffs_by_type
ggsave(filename = "Plots/Distance_diffs_by_type.pdf", plot = Distance_diffs_by_type, height = 1.8, width = 2.3)
```

```{r Subsetting for variants we've tested at contact sites}
## Relate the infection effect of tested human ACE2 mutants to the structural distance
## changes at the same ACE2 positions. Relies on low_kozak2 and network_pairs from earlier chunks.

## Express the first six columns of low_kozak2 relative to its first column, then melt to long format
low_kozak3 <- low_kozak2[, 1:6] - low_kozak2[, 1]
low_kozak3$mutant <- low_kozak2$mutant
low_kozak3_melt <- melt(low_kozak3)
low_kozak3_melt2 <- low_kozak3_melt %>% arrange(desc(value)) %>% filter(variable != "D614G")

ggplot() + geom_histogram(data = low_kozak3_melt2, aes(x = value, fill = variable)) + facet_grid(rows = vars(variable))

## Helper: pull one variant's difference column out of a pairs table, tag it with the
## variant label and rename the difference column to "distance" so the tables can be stacked.
variant_distance_diff <- function(pairs, diff_column, variant_label) {
  out <- pairs[, c("ace2_rbd", "ace2_position", "rbd_position", diff_column)]
  out$variable <- variant_label
  colnames(out)[4] <- "distance"
  out
}

d614g_alpha <- variant_distance_diff(network_pairs, "d614g_alpha", "Alpha")
d614g_beta <- variant_distance_diff(network_pairs, "d614g_beta", "Beta")
d614g_gamma <- variant_distance_diff(network_pairs, "d614g_gamma", "Gamma")
d614g_delta <- variant_distance_diff(network_pairs, "d614g_delta", "Delta")
d614g_ba1 <- variant_distance_diff(network_pairs, "d614g_ba1", "BA1")

d614g_distance_diffs <- rbind(d614g_alpha, d614g_beta, d614g_gamma, d614g_delta, d614g_ba1)

## Join the mutant key to the infection values, then to the distance changes by ACE2 position and variant
human_muts <- read.csv(file = "Keys/Human_mutants.csv", header = TRUE, stringsAsFactors = FALSE)
human_muts2 <- merge(human_muts, low_kozak3_melt2, by = "mutant")
human_muts3 <- merge(human_muts2, d614g_distance_diffs, by = c("ace2_position", "variable"))

## Average per variant and mutant. `variable` is a grouping column and is kept automatically,
## so it is not re-defined inside summarize(); ungroup so no grouping lingers on the result.
human_muts3_grouped <- human_muts3 %>%
  group_by(variable, mutant) %>%
  summarize(value = mean(value), distance = mean(distance)) %>%
  ungroup()

Distance_vs_effect <- ggplot() + theme(legend.position = "right") +
  geom_hline(yintercept = 0, linetype = 2, alpha = 0.5) + geom_vline(xintercept = 0, linetype = 2, alpha = 0.5) +
  geom_point(data = human_muts3_grouped, aes(x = distance, y = value), alpha = 0.4) +
  geom_text_repel(data = human_muts3_grouped, aes(x = distance, y = value, label = mutant, color = variable), alpha = 0.8, size = 2) +
  labs(x = "Angstroms closer", y = "Improved infection")
Distance_vs_effect
ggsave(filename = "Plots/Distance_vs_effect.pdf", plot = Distance_vs_effect, height = 2.4, width = 3.8)

## Average per mutant across all variants
human_muts3_grouped2 <- human_muts3 %>% group_by(mutant) %>% summarize(value = mean(value), distance = mean(distance))

## Per-variant points (faint) overlaid with the per-mutant means (crosses, labelled)
Distance_vs_effect_mean <- ggplot() + theme(legend.position = "right") +
  geom_hline(yintercept = 0, linetype = 2, alpha = 0.5) + geom_vline(xintercept = 0, linetype = 2, alpha = 0.5) +
  geom_point(data = human_muts3_grouped, aes(x = distance, y = value, color = mutant), alpha = 0.2) +
  geom_point(data = human_muts3_grouped2, aes(x = distance, y = value, color = mutant), alpha = 0.4, shape = 4, size = 3) +
  geom_text_repel(data = human_muts3_grouped2, aes(x = distance, y = value, label = mutant, color = mutant), alpha = 0.8, size = 2) +
  labs(x = "Angstroms closer", y = "Improved infection")
Distance_vs_effect_mean
ggsave(filename = "Plots/Distance_vs_effect_mean.pdf", plot = Distance_vs_effect_mean, height = 2, width = 3.5)
```

```{r Looking specifically at key ACE2 residues}
## Per-site views of human_muts3: distance change vs infection effect for selected ACE2 positions.

## Helper: scatter of distance vs value for one ACE2 site, points coloured by variant
## and labelled with the partner RBD position.
plot_ace2_site <- function(site_df) {
  ggplot(site_df, aes(x = distance, y = value)) +
    geom_point(aes(color = variable)) +
    geom_text_repel(aes(label = rbd_position))
}

## Helper: mean value and mean distance per RBD position
mean_by_rbd_position <- function(site_df) {
  site_df %>%
    group_by(rbd_position) %>%
    summarize(value = mean(value), distance = mean(distance))
}

## ACE2 position 38
d38 <- filter(human_muts3, ace2_position == 38)
plot_ace2_site(d38)

## Position 38: BA1 versus the mean of all other variants, joined per RBD position
d38_ba1 <- mean_by_rbd_position(filter(d38, variable == "BA1"))
d38_not_ba1 <- mean_by_rbd_position(filter(d38, variable != "BA1"))
d38_comb <- merge(d38_ba1, d38_not_ba1, by = "rbd_position")

## Dashed segments connect each RBD position's BA1 point (purple) to its non-BA1 point (black)
ggplot() +
  geom_vline(xintercept = 0) +
  geom_segment(
    data = d38_comb,
    aes(x = distance.x, y = value.x, xend = distance.y, yend = value.y),
    linetype = 2, alpha = 0.4
  ) +
  geom_point(data = d38_not_ba1, aes(x = distance, y = value)) +
  geom_text_repel(data = d38_not_ba1, aes(x = distance, y = value, label = rbd_position)) +
  geom_point(data = d38_ba1, aes(x = distance, y = value), color = "purple") +
  geom_text_repel(data = d38_ba1, aes(x = distance, y = value, label = rbd_position), color = "purple")

## ACE2 position 35
e35 <- filter(human_muts3, ace2_position == 35)
plot_ace2_site(e35)

## ACE2 position 31
k31 <- filter(human_muts3, ace2_position == 31)
plot_ace2_site(k31)

## ACE2 position 353
k353 <- filter(human_muts3, ace2_position == 353)
plot_ace2_site(k353)
```


## Some additional comparisons between sequence, structure, and function
```{r Identity distance matrix for the tested RBDs}
## Pairwise RBD identity matrix: heatmap of the values, then a PCA of the matrix.

## Read the matrix; the first column (X) holds the row labels
rbd_identity <- read.csv(file = "Data/Grantham/RBD_identity_matrix.csv", header = TRUE, stringsAsFactors = FALSE)
rownames(rbd_identity) <- rbd_identity$X
rbd_identity2 <- rbd_identity[, 2:ncol(rbd_identity)]

## Labels applied to the matrix columns and rows, in this order
variant_list <- c("D614G", "Alpha", "Beta", "Delta", "Gamma", "BA1")
rbd_identity3 <- setNames(rbd_identity2, variant_list)
rbd_identity3$virus1 <- variant_list
rbd_identity3_melt <- melt(rbd_identity3, id.vars = "virus1")

## Display order on both axes of the heatmap
rbd_identity_axis_order <- c("D614G", "Alpha", "Beta", "Gamma", "Delta", "BA1")
rbd_identity3_melt$virus1 <- factor(rbd_identity3_melt$virus1, levels = rbd_identity_axis_order)
rbd_identity3_melt$variable <- factor(rbd_identity3_melt$variable, levels = rbd_identity_axis_order)

## Heatmap: tiles filled purple (low) to white (high); values below 2 are printed in white text
RBD_identity_pairwise_correlations <- ggplot(rbd_identity3_melt, aes(x = virus1, y = variable)) +
  geom_tile(aes(fill = value), color = "grey50") +
  geom_text(data = filter(rbd_identity3_melt, value >= 2), aes(label = value), angle = 45, size = 2) +
  geom_text(data = filter(rbd_identity3_melt, value < 2), aes(label = value), angle = 45, size = 2, color = "white") +
  scale_fill_gradient(low = "purple", high = "white", limits = c(0, 5), na.value = "white") +
  scale_x_discrete(position = "top") +
  labs(x = NULL, y = NULL) +
  theme(
    panel.grid.major = element_blank(), panel.border = element_blank(),
    panel.background = element_blank(), axis.ticks = element_blank(),
    legend.position = "right",
    axis.text.x.top = element_text(angle = 90, hjust = 0, vjust = 0.5)
  )
RBD_identity_pairwise_correlations
ggsave(filename = "Plots/RBD_identity_pairwise_correlations.pdf", plot = RBD_identity_pairwise_correlations, height = 1.7, width = 2.4)


## PCA on RBD identity matrix
rbd_identity_pca <- prcomp(rbd_identity2, scale. = TRUE)

rbd_identity_scores <- as.data.frame(rbd_identity_pca$x)
rbd_identity_scores$label <- variant_list

## Row 2 of the importance table is the proportion of variance; columns 1 and 2 are PC1 and PC2
rbd_identity_importance <- summary(rbd_identity_pca)$importance
rbd_identity_pc1_pct <- round(rbd_identity_importance[2, 1] * 100, 0)
rbd_identity_pc2_pct <- round(rbd_identity_importance[2, 2] * 100, 0)

rbd_identity_PCA_plot <- ggplot(rbd_identity_scores, aes(x = PC1, y = PC2)) +
  geom_point() +
  geom_text_repel(aes(label = label), color = "red", size = 2) +
  xlab(paste0("Principal component 1\n(", rbd_identity_pc1_pct, "% of variance)")) +
  ylab(paste0("Principal component 2\n(", rbd_identity_pc2_pct, "% of variance)"))
rbd_identity_PCA_plot
ggsave(filename = "Plots/rbd_identity_PCA_plot.pdf", plot = rbd_identity_PCA_plot, height = 1.8, width = 1.75)

```

```{r Grantham Distance Matrix for the tested RBDs}
## PCA of the RBD Grantham distance matrix.

## Read the matrix; the first column (X) holds the row labels
grantham <- read.csv(file = "Data/Grantham/RBD_Grantham_matrix.csv", header = TRUE, stringsAsFactors = FALSE)
rownames(grantham) <- grantham$X
grantham2 <- grantham[, 2:ncol(grantham)]
grantham_pca <- prcomp(grantham2, scale. = TRUE)

## Scores per row of the matrix, labelled in this fixed order
grantham_scores <- as.data.frame(grantham_pca$x)
grantham_scores$label <- c("D614G", "Alpha", "Beta", "Delta", "Gamma", "BA1")

## Row 2 of the importance table is the proportion of variance; columns 1 and 2 are PC1 and PC2
grantham_importance <- summary(grantham_pca)$importance
grantham_pc1_pct <- round(grantham_importance[2, 1] * 100, 0)
grantham_pc2_pct <- round(grantham_importance[2, 2] * 100, 0)

Grantham_PCA_plot <- ggplot(grantham_scores, aes(x = PC1, y = PC2)) +
  geom_point() +
  geom_text_repel(aes(label = label), color = "red", size = 2) +
  xlab(paste0("Principal component 1\n(", grantham_pc1_pct, "% of variance)")) +
  ylab(paste0("Principal component 2\n(", grantham_pc2_pct, "% of variance)"))
Grantham_PCA_plot
ggsave(filename = "Plots/Grantham_PCA_plot.pdf", plot = Grantham_PCA_plot, height = 1.8, width = 1.75)
```

```{r Human ACE2 mutant impacts on variant infectivity pairwise comparison}
# Stack the ortholog table and the mutant table (low_kozak1) into one data
# frame keyed by a shared trailing `sample` column. Columns are selected by
# name from the frame that actually carries `sample`, instead of relying on a
# logical index (built from the pre-mutate column names) being recycled.
combined_ortholog_variant_unmelt <- rbind(
  (ortholog1 %>% mutate(sample = ortholog))[
    , c(setdiff(colnames(ortholog1), c("ortholog", "sample")), "sample")
  ],
  (low_kozak1 %>% mutate(sample = mutant))[
    , c(setdiff(colnames(low_kozak1), c("mutant", "sample")), "sample")
  ]
)

## Make a dataframe of pairwise Pearson's correlations
mut_inf_cor_matrix_names <- c("Alpha", "Beta", "Delta", "Gamma", "BA1", "D614G")

# One row per ordered (virus1, virus2) pair; virus1 cycles fastest.
mut_inf_cor_matrix_df <- data.frame(
  index = seq_len(length(mut_inf_cor_matrix_names)^2)
)
mut_inf_cor_matrix_df$virus1 <- rep(mut_inf_cor_matrix_names, times = length(mut_inf_cor_matrix_names))
mut_inf_cor_matrix_df$virus2 <- rep(mut_inf_cor_matrix_names, each = length(mut_inf_cor_matrix_names))

# Squared Pearson correlation for each pair, computed on rows 18:30 only.
# NOTE(review): rows 18:30 are presumably the human ACE2 mutant rows appended
# from low_kozak1 -- confirm the row range if either input table changes.
mut_inf_cor_matrix_df$pearson_rsquared <- vapply(
  seq_len(nrow(mut_inf_cor_matrix_df)),
  function(i) {
    cor(
      combined_ortholog_variant_unmelt[18:30, mut_inf_cor_matrix_df$virus1[i]],
      combined_ortholog_variant_unmelt[18:30, mut_inf_cor_matrix_df$virus2[i]],
      method = "pearson"
    )^2
  },
  numeric(1)
)

# Display order of the variants on both axes.
mut_inf_cor_matrix_df$virus1 <- factor(
  mut_inf_cor_matrix_df$virus1,
  levels = c("D614G", "Alpha", "Beta", "Gamma", "Delta", "BA1")
)
mut_inf_cor_matrix_df$virus2 <- factor(
  mut_inf_cor_matrix_df$virus2,
  levels = c("D614G", "Alpha", "Beta", "Gamma", "Delta", "BA1")
)

# Heatmap of r^2 with each tile annotated. Tiles with r^2 >= 0.8 get white
# text and the rest get default-coloured text; using >= / < means a value of
# exactly 0.8 is still labelled.
Mut_inf_pairwise_correlations <- ggplot() +
  theme(
    panel.grid.major = element_blank(),
    panel.border = element_blank(),
    panel.background = element_blank(),
    axis.ticks = element_blank(),
    legend.position = "right",
    axis.text.x.top = element_text(angle = 90, hjust = 0, vjust = 0.5)
  ) +
  labs(x = NULL, y = NULL) +
  scale_fill_gradient(low = "white", high = "purple", limits = c(0, 1)) +
  scale_x_discrete(position = "top") +
  geom_tile(
    data = mut_inf_cor_matrix_df,
    aes(x = virus1, y = virus2, fill = pearson_rsquared),
    color = "grey50"
  ) +
  geom_text(
    data = mut_inf_cor_matrix_df %>% filter(pearson_rsquared >= 0.8),
    aes(x = virus1, y = virus2, label = round(pearson_rsquared, 2)),
    angle = 45, size = 2, color = "white"
  ) +
  geom_text(
    data = mut_inf_cor_matrix_df %>% filter(pearson_rsquared < 0.8),
    aes(x = virus1, y = virus2, label = round(pearson_rsquared, 2)),
    angle = 45, size = 2
  )
Mut_inf_pairwise_correlations
ggsave(
  filename = "Plots/Mut_inf_pairwise_correlations.pdf",
  plot = Mut_inf_pairwise_correlations,
  height = 1.9, width = 2.8
)
```

```{r PCA on human mutant infection patterns}
## PCA on human mutant infection patterns
# Transpose the first six columns so that each row is one of those columns
# (a variant) and each column is a sample, then keep sample columns 18:30.
for_variant_infection_pca <- data.frame(t(combined_ortholog_variant_unmelt[, 1:6]))
colnames(for_variant_infection_pca) <- combined_ortholog_variant_unmelt$sample
for_variant_infection_pca <- for_variant_infection_pca[, 18:30]

pca_for_variant_infection <- prcomp(for_variant_infection_pca, scale. = TRUE)
pca_variant_infection_scores <- as.data.frame(pca_for_variant_infection$x)
# Print the loadings when the chunk runs.
summary(pca_for_variant_infection)$rotation
pca_variant_infection_scores$labels <- rownames(pca_variant_infection_scores)

# Scatter of the first two principal components; axis titles report the
# proportion of variance each component explains.
variant_infection_pca <- ggplot() +
  theme(panel.grid = element_blank()) +
  xlab(paste0(
    "Principal component 1\n(",
    round(summary(pca_for_variant_infection)$importance["Proportion of Variance", "PC1"] * 100, 0),
    "% of variance)"
  )) +
  ylab(paste0(
    "Principal Component 2\n(",
    round(summary(pca_for_variant_infection)$importance["Proportion of Variance", "PC2"] * 100, 0),
    "% of variance)"
  )) +
  geom_text_repel(
    data = pca_variant_infection_scores,
    aes(x = PC1, y = PC2, label = labels),
    segment.color = 'grey90', point.padding = 0.05, size = 2,
    segment.alpha = 0.5, color = "red"
  ) +
  geom_point(data = pca_variant_infection_scores, aes(x = PC1, y = PC2), size = 1)
variant_infection_pca
# Alternative dimensions tried previously: height = 1.6, width = 1.75
ggsave(
  filename = "Plots/variant_infection_pca.pdf",
  plot = variant_infection_pca,
  height = 1.8, width = 1.65
)

```

## Now shifting to the ortholog-focused parts of the manuscript
```{r ACE2 ortholog first 400 residues identity matrix PCA}
# Read the ortholog identity matrix; the first column (X) holds the row
# identifiers and is moved into the row names before being dropped.
ortholog_identity <- read.csv(
  file = "Data/Grantham/Ortholog_identity_matrix.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)
rownames(ortholog_identity) <- ortholog_identity$X
ortholog_identity2 <- ortholog_identity[, 2:ncol(ortholog_identity)]
ortholog_identity_pca <- prcomp(ortholog_identity2, scale. = TRUE)

ortholog_identity_scores <- as.data.frame(ortholog_identity_pca$x)
# Display names come from a separate file and are attached by position, so the
# two files are assumed to list the orthologs in the same order -- TODO confirm.
ortholog_identity_labels <- read.csv(
  file = "Data/Grantham/Updated_ortholog_identity_file.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)
ortholog_identity_scores$label <- ortholog_identity_labels$Names

# Scatter of the first two principal components; axis titles report the
# proportion of variance each component explains.
ortholog_identity_PCA_plot <- ggplot() +
  xlab(paste0(
    "Principal component 1\n(",
    round(summary(ortholog_identity_pca)$importance["Proportion of Variance", "PC1"] * 100, 0),
    "% of variance)"
  )) +
  ylab(paste0(
    "Principal component 2\n(",
    round(summary(ortholog_identity_pca)$importance["Proportion of Variance", "PC2"] * 100, 0),
    "% of variance)"
  )) +
  geom_point(data = ortholog_identity_scores, aes(x = PC1, y = PC2), alpha = 0.5) +
  geom_text_repel(
    data = ortholog_identity_scores,
    aes(x = PC1, y = PC2, label = label),
    color = "red", size = 2, segment.color = "orange"
  )
ortholog_identity_PCA_plot
ggsave(
  filename = "Plots/Ortholog_identity_PCA_plot.pdf",
  plot = ortholog_identity_PCA_plot,
  height = 2.25, width = 2.5
)
```

```{r Showing ACE2 ortholog first 400 residues identity matrix heatmap}
# Relabel the matrix with the display names: columns first, then rows on the
# copy that is used for clustering.
ortholog_identity3 <- ortholog_identity2
colnames(ortholog_identity3) <- ortholog_identity_labels$Names
ortholog_identity3_matrix <- ortholog_identity3
rownames(ortholog_identity3_matrix) <- ortholog_identity_labels$Names

# Hierarchical clustering with the matrix values used directly as distances;
# the resulting leaf order drives both heatmap axes.
ortholog_identity3_matrix_hclust <- hclust(as.dist(ortholog_identity3_matrix, diag = TRUE))
ortholog_identity3_matrix_hclust_order <- with(
  ortholog_identity3_matrix_hclust,
  labels[order]
)

# Long format: one row per (species, variable) pair, both ordered by the
# clustering.
ortholog_identity3_matrix2 <- ortholog_identity3_matrix
ortholog_identity3_matrix2$species <- rownames(ortholog_identity3_matrix)
ortholog_identity3_matrix2_melted <- melt(ortholog_identity3_matrix2, id.vars = "species")
ortholog_identity3_matrix2_melted$species <- factor(
  ortholog_identity3_matrix2_melted$species,
  levels = ortholog_identity3_matrix_hclust_order
)
ortholog_identity3_matrix2_melted$variable <- factor(
  as.character(ortholog_identity3_matrix2_melted$variable),
  levels = ortholog_identity3_matrix_hclust_order
)

# Heatmap with every tile annotated with its value.
Ortholog_identity_heatmap_plot <- ggplot() +
  theme(
    panel.grid.major = element_blank(),
    panel.border = element_blank(),
    panel.background = element_blank(),
    axis.ticks = element_blank(),
    legend.position = "none",
    axis.text.x.top = element_text(angle = 90, hjust = 0, vjust = 0)
  ) +
  labs(x = NULL, y = NULL) +
  scale_fill_gradient(low = "white", high = "lightblue") +
  scale_x_discrete(position = "top") +
  geom_tile(
    data = ortholog_identity3_matrix2_melted,
    aes(x = species, y = variable, fill = value),
    color = "grey60"
  ) +
  geom_text(
    data = ortholog_identity3_matrix2_melted,
    aes(x = species, y = variable, label = value),
    angle = 45, size = 2
  )
Ortholog_identity_heatmap_plot
ggsave(
  filename = "Plots/Ortholog_identity_heatmap_plot.pdf",
  plot = Ortholog_identity_heatmap_plot,
  height = 3, width = 3
)
```

```{r Show ortholog heatmap again this time scaled and by itself without the human ACE2 mutants}
# Drop the H.sapiens rows and reverse the ortholog factor levels, which flips
# the order in which orthologs appear on the y-axis.
ortholog2_melt2 <- filter(ortholog2_melt, ortholog != "H.sapiens")
ortholog2_melt2$ortholog <- factor(
  ortholog2_melt2$ortholog,
  levels = rev(levels(ortholog2_melt2$ortholog))
)

# Ortholog x variant heatmap, white (low) to red (high).
orthologs3_s2variants <- ggplot() +
  labs(x = NULL, y = NULL) +
  scale_fill_gradient(low = "white", high = "red") +
  theme(
    axis.text.x = element_text(hjust = 1, vjust = 1, angle = 45),
    legend.position = "top"
  ) +
  geom_tile(data = ortholog2_melt2, aes(y = ortholog, x = variable, fill = value))
orthologs3_s2variants
ggsave(
  filename = "Plots/orthologs3_s2variants.pdf",
  plot = orthologs3_s2variants,
  height = 3.2, width = 2
)
```

### Showing the residues that differ in sequence between orthologs
```{r Difference between orthologs that we can plot with Pymol}
# Per-position residue diversity across the ortholog sequences, using the
# first sequence in the file (stored as human_ace2_seq) to define the number
# of positions.
ortholog_seqs <- read.csv(
  file = "Data/Grantham/Updated_ortholog_identity_file.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)
human_ace2_seq <- ortholog_seqs$Sequences[1]

# One row per position. Columns are sized explicitly and kept as character /
# numeric regardless of the session's stringsAsFactors default.
ortholog_residue_diff_frame <- data.frame(
  position = seq_len(nchar(human_ace2_seq)),
  residues = character(nchar(human_ace2_seq)),
  diffnum = numeric(nchar(human_ace2_seq)),
  stringsAsFactors = FALSE
)

# For each position, take that character from every sequence at once (substr
# is vectorised over the sequences) and concatenate the distinct residues in
# order of first appearance; diffnum is the number of characters in that
# string.
ortholog_residue_diff_frame$residues <- vapply(
  ortholog_residue_diff_frame$position,
  function(pos) paste(unique(substr(ortholog_seqs$Sequences, pos, pos)), collapse = ""),
  character(1)
)
ortholog_residue_diff_frame$diffnum <- as.numeric(nchar(ortholog_residue_diff_frame$residues))

# Positions with at least four distinct residues, printed as a PyMOL
# selection command.
ortholog_residue_diff_frame_most <- ortholog_residue_diff_frame %>% filter(diffnum >= 4)
paste(
  "select ortholog_variable, d614g_ace2 and resi",
  paste(ortholog_residue_diff_frame_most$position, collapse = "+"),
  "and not name c+n+o"
)

## Plot looking at overlap

# Flag positions listed in network_pairs2$ace2_position with interaction = 1.
# The full outer merge leaves NA for unmatched rows; every NA in the frame
# (in any column) is then replaced by 0.
ortholog_residue_diff_frame2 <- merge(
  ortholog_residue_diff_frame,
  data.frame("position" = unique(network_pairs2$ace2_position), "interaction" = 1),
  by = "position",
  all = TRUE
) %>%
  arrange(desc(diffnum))
ortholog_residue_diff_frame2[is.na(ortholog_residue_diff_frame2)] <- 0

ggplot() +
  geom_quasirandom(
    data = ortholog_residue_diff_frame2,
    aes(x = interaction, y = diffnum),
    varwidth = TRUE, nbins = 100
  )

# Histogram of residue diversity, one facet per interaction flag; bins at or
# above the diffnum >= 4 cutoff (red line at 3.5) are overdrawn in pink.
Interaction_ortholog_histogram <- ggplot() +
  theme(panel.grid = element_blank()) +
  labs(x = "Number of different\namino acids in dataset", y = "Number of positions\nin first 400 residues") +
  geom_vline(xintercept = 3.5, color = "red") +
  geom_hline(yintercept = 0) +
  geom_histogram(
    data = ortholog_residue_diff_frame2,
    aes(x = diffnum),
    binwidth = 1, color = "black", fill = "grey80"
  ) +
  geom_histogram(
    data = ortholog_residue_diff_frame2 %>% filter(diffnum >= 4),
    aes(x = diffnum),
    binwidth = 1, color = "black", fill = "pink"
  ) +
  facet_grid(rows = vars(interaction), scales = "free_y")
Interaction_ortholog_histogram
ggsave(
  filename = "Plots/Interaction_ortholog_histogram.pdf",
  plot = Interaction_ortholog_histogram,
  height = 2, width = 1.4
)

# Print the flagged positions that are also highly variable.
ortholog_residue_diff_frame2 %>% filter(interaction == 1 & diffnum >= 4)

```

```{r Looking at Geomean infectivities across the dataset}
# Rebuild the stacked ortholog + mutant table with a shared trailing `sample`
# column. Columns are selected by name from the frame that actually carries
# `sample`, instead of relying on a logical index (built from the pre-mutate
# column names) being recycled.
combined_ortholog_variant_unmelt <- rbind(
  (ortholog1 %>% mutate(sample = ortholog))[
    , c(setdiff(colnames(ortholog1), c("ortholog", "sample")), "sample")
  ],
  (low_kozak1 %>% mutate(sample = mutant))[
    , c(setdiff(colnames(low_kozak1), c("mutant", "sample")), "sample")
  ]
)

# log10-transform every column except the last one (`sample`), a whole column
# at a time rather than cell by cell.
log_cols <- seq_len(ncol(combined_ortholog_variant_unmelt) - 1)
combined_ortholog_variant_unmelt[log_cols] <- lapply(
  combined_ortholog_variant_unmelt[log_cols],
  log10
)

# Long format plus the per-sample mean of the log10 values; samples are
# ordered on the x-axis by increasing mean.
combined_ortholog_variant_melt <- melt(combined_ortholog_variant_unmelt, id.vars = "sample")
combined_ortholog_variant_melt_summary <- combined_ortholog_variant_melt %>%
  group_by(sample) %>%
  summarize(mean = mean(value)) %>%
  arrange(mean)
combined_ortholog_variant_melt_summary$sample <- factor(
  combined_ortholog_variant_melt_summary$sample,
  levels = combined_ortholog_variant_melt_summary$sample
)
combined_ortholog_variant_melt$sample <- factor(
  combined_ortholog_variant_melt$sample,
  levels = combined_ortholog_variant_melt_summary$sample
)

# Individual values coloured by variant, with the per-sample mean drawn as a
# horizontal dash (shape 95).
Infection_step_plot <- ggplot() +
  theme(
    panel.grid.major.x = element_blank(),
    axis.text.x = element_text(angle = -90, hjust = 0, vjust = 0.5)
  ) +
  labs(x = NULL, y = "Log10 enrichment") +
  geom_point(
    data = combined_ortholog_variant_melt,
    aes(x = sample, y = value, color = variable),
    alpha = 0.4
  ) +
  geom_point(
    data = combined_ortholog_variant_melt_summary,
    aes(x = sample, y = mean),
    size = 6, shape = 95
  )
Infection_step_plot
ggsave(
  filename = "Plots/Infection_step_plot.pdf",
  plot = Infection_step_plot,
  height = 1.75, width = 5
)
```

## Below is an analysis method for comparing the differences in infection between the various SARS-CoV-2 RBD variants
```{r Label samples as ACE2 orthologs or human ACE2 mutants}

# Sample names that belong to the "humutant" set; every other sample is
# labelled "ortholog".
humutant_labels <- c(
  "WT", "K31D", "K353D", "D38H", "E23K", "K26E",
  "I21N", "E329K", "G352V", "E35K", "D355N", "G326E"
)

# Tag each row with its set in one vectorised step.
combined_ortholog_variant_unmelt$set_label <- ifelse(
  combined_ortholog_variant_unmelt$sample %in% humutant_labels,
  "humutant",
  "ortholog"
)

```


```{r Create scatterplots for pairwise comparisons between SARS2 variants with ACE2 orthologs}
# D614G (x) vs Alpha (y). The reference line is fit through two anchor points:
# the mean H.sapiens value and the mean dEcto value for each variant.
d614g_alpha_df <- data.frame(
  virus1 = c(
    mean(filter(combined_ortholog_variant_unmelt, sample == "H.sapiens")[, "D614G"]),
    mean(filter(combined_ortholog_variant_unmelt, sample == "dEcto")[, "D614G"])
  ),
  virus2 = c(
    mean(filter(combined_ortholog_variant_unmelt, sample == "H.sapiens")[, "Alpha"]),
    mean(filter(combined_ortholog_variant_unmelt, sample == "dEcto")[, "Alpha"])
  )
)
lm_d614g_alpha <- lm(d614g_alpha_df[, "virus2"] ~ d614g_alpha_df[, "virus1"])

# Layers, bottom to top: reference line, orthologs, human mutants (small,
# grey), the H.sapiens/dEcto/Control samples (blue), and labels for a
# hand-picked set of orthologs.
D614G_Alpha_scatterplot <- ggplot() +
  geom_abline(
    intercept = coef(lm_d614g_alpha)[1], slope = coef(lm_d614g_alpha)[2],
    alpha = 0.1, size = 4
  ) +
  geom_point(
    data = filter(combined_ortholog_variant_unmelt, set_label == "ortholog"),
    aes(x = D614G, y = Alpha), alpha = 0.4
  ) +
  geom_point(
    data = filter(combined_ortholog_variant_unmelt, set_label == "humutant"),
    aes(x = D614G, y = Alpha), alpha = 0.4, size = 1, color = "grey50"
  ) +
  geom_point(
    data = filter(combined_ortholog_variant_unmelt, sample %in% c("H.sapiens","dEcto","Control")),
    aes(x = D614G, y = Alpha), color = "blue", size = 2, alpha = 0.4
  ) +
  geom_text_repel(
    data = filter(
      combined_ortholog_variant_unmelt,
      sample %in% c("M.musculus","R.shameli","R.sinicus_200","R.landeri","R.ferrumequinum","R.sinicus_215")
    ),
    aes(x = D614G, y = Alpha, label = sample),
    color = "red", size = 1.5, segment.color = "orange", segment.alpha = 0.4, force = 0.5
  )
D614G_Alpha_scatterplot
ggsave(
  filename = "Plots/D614G_Alpha_scatterplot.pdf",
  plot = D614G_Alpha_scatterplot,
  height = 1.5, width = 2.25
)

# D614G (x) vs Beta (y). The reference line is fit through two anchor points:
# the mean H.sapiens value and the mean dEcto value for each variant.
d614g_beta_df <- data.frame(
  virus1 = c(
    mean(filter(combined_ortholog_variant_unmelt, sample == "H.sapiens")[, "D614G"]),
    mean(filter(combined_ortholog_variant_unmelt, sample == "dEcto")[, "D614G"])
  ),
  virus2 = c(
    mean(filter(combined_ortholog_variant_unmelt, sample == "H.sapiens")[, "Beta"]),
    mean(filter(combined_ortholog_variant_unmelt, sample == "dEcto")[, "Beta"])
  )
)
lm_d614g_beta <- lm(d614g_beta_df[, "virus2"] ~ d614g_beta_df[, "virus1"])

# Layers, bottom to top: reference line, orthologs, human mutants (small,
# grey), the H.sapiens/dEcto/Control samples (blue), and a label for every
# sample.
D614G_Beta_scatterplot <- ggplot() +
  geom_abline(
    intercept = coef(lm_d614g_beta)[1], slope = coef(lm_d614g_beta)[2],
    alpha = 0.1, size = 4
  ) +
  geom_point(
    data = filter(combined_ortholog_variant_unmelt, set_label == "ortholog"),
    aes(x = D614G, y = Beta), alpha = 0.4
  ) +
  geom_point(
    data = filter(combined_ortholog_variant_unmelt, set_label == "humutant"),
    aes(x = D614G, y = Beta), alpha = 0.4, size = 1, color = "grey50"
  ) +
  geom_point(
    data = filter(combined_ortholog_variant_unmelt, sample %in% c("H.sapiens","dEcto","Control")),
    aes(x = D614G, y = Beta), color = "blue", size = 2, alpha = 0.4
  ) +
  geom_text_repel(
    data = combined_ortholog_variant_unmelt,
    aes(x = D614G, y = Beta, label = sample),
    color = "red", size = 1.5, segment.color = "orange", segment.alpha = 0.4, force = 0.5
  )
D614G_Beta_scatterplot
ggsave(
  filename = "Plots/D614G_Beta_scatterplot.pdf",
  plot = D614G_Beta_scatterplot,
  height = 1.5, width = 2.25
)

# D614G (x) vs Delta (y). The reference line is fit through two anchor points:
# the mean H.sapiens value and the mean dEcto value for each variant.
d614g_delta_df <- data.frame(
  virus1 = c(
    mean(filter(combined_ortholog_variant_unmelt, sample == "H.sapiens")[, "D614G"]),
    mean(filter(combined_ortholog_variant_unmelt, sample == "dEcto")[, "D614G"])
  ),
  virus2 = c(
    mean(filter(combined_ortholog_variant_unmelt, sample == "H.sapiens")[, "Delta"]),
    mean(filter(combined_ortholog_variant_unmelt, sample == "dEcto")[, "Delta"])
  )
)
lm_d614g_delta <- lm(d614g_delta_df[, "virus2"] ~ d614g_delta_df[, "virus1"])

# Layers, bottom to top: reference line, orthologs, human mutants (small,
# grey), the H.sapiens/dEcto/Control samples (blue), and a label for every
# sample.
D614G_Delta_scatterplot <- ggplot() +
  geom_abline(
    intercept = coef(lm_d614g_delta)[1], slope = coef(lm_d614g_delta)[2],
    alpha = 0.1, size = 4
  ) +
  geom_point(
    data = filter(combined_ortholog_variant_unmelt, set_label == "ortholog"),
    aes(x = D614G, y = Delta), alpha = 0.4
  ) +
  geom_point(
    data = filter(combined_ortholog_variant_unmelt, set_label == "humutant"),
    aes(x = D614G, y = Delta), alpha = 0.4, size = 1, color = "grey50"
  ) +
  geom_point(
    data = filter(combined_ortholog_variant_unmelt, sample %in% c("H.sapiens","dEcto","Control")),
    aes(x = D614G, y = Delta), color = "blue", size = 2, alpha = 0.4
  ) +
  geom_text_repel(
    data = combined_ortholog_variant_unmelt,
    aes(x = D614G, y = Delta, label = sample),
    color = "red", size = 1.5, segment.color = "orange", segment.alpha = 0.4, force = 0.5
  )
# Print the plot so it appears in the rendered document like the other
# pairwise scatterplots in this chunk (previously it was only saved to disk).
D614G_Delta_scatterplot
ggsave(
  filename = "Plots/D614G_Delta_scatterplot.pdf",
  plot = D614G_Delta_scatterplot,
  height = 1.5, width = 2.25
)


# D614G (x) vs Gamma (y). The reference line is fit through two anchor points:
# the mean H.sapiens value and the mean dEcto value for each variant.
d614g_gamma_df <- data.frame(
  virus1 = c(
    mean(filter(combined_ortholog_variant_unmelt, sample == "H.sapiens")[, "D614G"]),
    mean(filter(combined_ortholog_variant_unmelt, sample == "dEcto")[, "D614G"])
  ),
  virus2 = c(
    mean(filter(combined_ortholog_variant_unmelt, sample == "H.sapiens")[, "Gamma"]),
    mean(filter(combined_ortholog_variant_unmelt, sample == "dEcto")[, "Gamma"])
  )
)
lm_d614g_gamma <- lm(d614g_gamma_df[, "virus2"] ~ d614g_gamma_df[, "virus1"])

# Layers, bottom to top: reference line, orthologs, human mutants (small,
# grey), the H.sapiens/dEcto/Control samples (blue), and a label for every
# sample.
D614G_gamma_scatterplot <- ggplot() +
  geom_abline(
    intercept = coef(lm_d614g_gamma)[1], slope = coef(lm_d614g_gamma)[2],
    alpha = 0.1, size = 4
  ) +
  geom_point(
    data = filter(combined_ortholog_variant_unmelt, set_label == "ortholog"),
    aes(x = D614G, y = Gamma), alpha = 0.4
  ) +
  geom_point(
    data = filter(combined_ortholog_variant_unmelt, set_label == "humutant"),
    aes(x = D614G, y = Gamma), alpha = 0.4, size = 1, color = "grey50"
  ) +
  geom_point(
    data = filter(combined_ortholog_variant_unmelt, sample %in% c("H.sapiens","dEcto","Control")),
    aes(x = D614G, y = Gamma), color = "blue", size = 2, alpha = 0.4
  ) +
  geom_text_repel(
    data = combined_ortholog_variant_unmelt,
    aes(x = D614G, y = Gamma, label = sample),
    color = "red", size = 1.5, segment.color = "orange", segment.alpha = 0.4, force = 0.5
  )
D614G_gamma_scatterplot
ggsave(
  filename = "Plots/D614G_Gamma_scatterplot.pdf",
  plot = D614G_gamma_scatterplot,
  height = 1.5, width = 2.25
)


# D614G (x) vs BA1 (y). The reference line is fit through two anchor points:
# the mean H.sapiens value and the mean dEcto value for each variant.
d614g_ba1_df <- data.frame(
  virus1 = c(
    mean(filter(combined_ortholog_variant_unmelt, sample == "H.sapiens")[, "D614G"]),
    mean(filter(combined_ortholog_variant_unmelt, sample == "dEcto")[, "D614G"])
  ),
  virus2 = c(
    mean(filter(combined_ortholog_variant_unmelt, sample == "H.sapiens")[, "BA1"]),
    mean(filter(combined_ortholog_variant_unmelt, sample == "dEcto")[, "BA1"])
  )
)
lm_d614g_ba1 <- lm(d614g_ba1_df[, "virus2"] ~ d614g_ba1_df[, "virus1"])

# Layers, bottom to top: reference line, orthologs, human mutants (small,
# grey), the H.sapiens/dEcto/Control samples (blue), and a label for every
# sample.
D614G_BA1_scatterplot <- ggplot() +
  geom_abline(
    intercept = coef(lm_d614g_ba1)[1], slope = coef(lm_d614g_ba1)[2],
    alpha = 0.1, size = 4
  ) +
  geom_point(
    data = filter(combined_ortholog_variant_unmelt, set_label == "ortholog"),
    aes(x = D614G, y = BA1), alpha = 0.4
  ) +
  geom_point(
    data = filter(combined_ortholog_variant_unmelt, set_label == "humutant"),
    aes(x = D614G, y = BA1), alpha = 0.4, size = 1, color = "grey50"
  ) +
  geom_point(
    data = filter(combined_ortholog_variant_unmelt, sample %in% c("H.sapiens","dEcto","Control")),
    aes(x = D614G, y = BA1), color = "blue", size = 2, alpha = 0.4
  ) +
  geom_text_repel(
    data = combined_ortholog_variant_unmelt,
    aes(x = D614G, y = BA1, label = sample),
    color = "red", size = 1.5, segment.color = "orange", segment.alpha = 0.4, force = 0.5
  )
D614G_BA1_scatterplot
ggsave(
  filename = "Plots/D614G_BA1_scatterplot.pdf",
  plot = D614G_BA1_scatterplot,
  height = 1.5, width = 2.25
)


# Alpha (x) vs Beta (y). The reference line is fit through two anchor points:
# the mean H.sapiens value and the mean dEcto value for each variant.
alpha_beta_df <- data.frame(
  virus1 = c(
    mean(filter(combined_ortholog_variant_unmelt, sample == "H.sapiens")[, "Alpha"]),
    mean(filter(combined_ortholog_variant_unmelt, sample == "dEcto")[, "Alpha"])
  ),
  virus2 = c(
    mean(filter(combined_ortholog_variant_unmelt, sample == "H.sapiens")[, "Beta"]),
    mean(filter(combined_ortholog_variant_unmelt, sample == "dEcto")[, "Beta"])
  )
)
lm_alpha_beta <- lm(alpha_beta_df[, "virus2"] ~ alpha_beta_df[, "virus1"])

# Layers, bottom to top: reference line, orthologs, human mutants (small,
# grey), the H.sapiens/dEcto/Control samples (blue), and a label for every
# sample.
Alpha_Beta_scatterplot <- ggplot() +
  geom_abline(
    intercept = coef(lm_alpha_beta)[1], slope = coef(lm_alpha_beta)[2],
    alpha = 0.1, size = 4
  ) +
  geom_point(
    data = filter(combined_ortholog_variant_unmelt, set_label == "ortholog"),
    aes(x = Alpha, y = Beta), alpha = 0.4
  ) +
  geom_point(
    data = filter(combined_ortholog_variant_unmelt, set_label == "humutant"),
    aes(x = Alpha, y = Beta), alpha = 0.4, size = 1, color = "grey50"
  ) +
  geom_point(
    data = filter(combined_ortholog_variant_unmelt, sample %in% c("H.sapiens","dEcto","Control")),
    aes(x = Alpha, y = Beta), color = "blue", size = 2, alpha = 0.4
  ) +
  geom_text_repel(
    data = combined_ortholog_variant_unmelt,
    aes(x = Alpha, y = Beta, label = sample),
    color = "red", size = 1.5, segment.color = "orange", segment.alpha = 0.4, force = 0.5
  )
Alpha_Beta_scatterplot
ggsave(
  filename = "Plots/Alpha_Beta_scatterplot.pdf",
  plot = Alpha_Beta_scatterplot,
  height = 1.5, width = 2.25
)


# Alpha (x) vs Gamma (y). The reference line is fit through two anchor points:
# the mean H.sapiens value and the mean dEcto value for each variant.
alpha_gamma_df <- data.frame(
  virus1 = c(
    mean(filter(combined_ortholog_variant_unmelt, sample == "H.sapiens")[, "Alpha"]),
    mean(filter(combined_ortholog_variant_unmelt, sample == "dEcto")[, "Alpha"])
  ),
  virus2 = c(
    mean(filter(combined_ortholog_variant_unmelt, sample == "H.sapiens")[, "Gamma"]),
    mean(filter(combined_ortholog_variant_unmelt, sample == "dEcto")[, "Gamma"])
  )
)
lm_alpha_gamma <- lm(alpha_gamma_df[, "virus2"] ~ alpha_gamma_df[, "virus1"])

# Layers, bottom to top: reference line, orthologs, human mutants (small,
# grey), the H.sapiens/dEcto/Control samples (blue), and a label for every
# sample.
Alpha_Gamma_scatterplot <- ggplot() +
  geom_abline(
    intercept = coef(lm_alpha_gamma)[1], slope = coef(lm_alpha_gamma)[2],
    alpha = 0.1, size = 4
  ) +
  geom_point(
    data = filter(combined_ortholog_variant_unmelt, set_label == "ortholog"),
    aes(x = Alpha, y = Gamma), alpha = 0.4
  ) +
  geom_point(
    data = filter(combined_ortholog_variant_unmelt, set_label == "humutant"),
    aes(x = Alpha, y = Gamma), alpha = 0.4, size = 1, color = "grey50"
  ) +
  geom_point(
    data = filter(combined_ortholog_variant_unmelt, sample %in% c("H.sapiens","dEcto","Control")),
    aes(x = Alpha, y = Gamma), color = "blue", size = 2, alpha = 0.4
  ) +
  geom_text_repel(
    data = combined_ortholog_variant_unmelt,
    aes(x = Alpha, y = Gamma, label = sample),
    color = "red", size = 1.5, segment.color = "orange", segment.alpha = 0.4, force = 0.5
  )
Alpha_Gamma_scatterplot
ggsave(
  filename = "Plots/Alpha_Gamma_scatterplot.pdf",
  plot = Alpha_Gamma_scatterplot,
  height = 1.5, width = 2.25
)


# Gamma (x) vs Beta (y). The reference line is fit through two anchor points:
# the mean H.sapiens value and the mean dEcto value for each variant.
gamma_beta_df <- data.frame(
  virus1 = c(
    mean(filter(combined_ortholog_variant_unmelt, sample == "H.sapiens")[, "Gamma"]),
    mean(filter(combined_ortholog_variant_unmelt, sample == "dEcto")[, "Gamma"])
  ),
  virus2 = c(
    mean(filter(combined_ortholog_variant_unmelt, sample == "H.sapiens")[, "Beta"]),
    mean(filter(combined_ortholog_variant_unmelt, sample == "dEcto")[, "Beta"])
  )
)
lm_gamma_beta <- lm(gamma_beta_df[, "virus2"] ~ gamma_beta_df[, "virus1"])

# Layers, bottom to top: reference line, orthologs, human mutants (small,
# grey), and the H.sapiens/dEcto/Control samples (blue). No sample labels are
# drawn on this plot; the disabled label layer is kept below for reference.
Gamma_Beta_scatterplot <- ggplot() +
  geom_abline(
    intercept = coef(lm_gamma_beta)[1], slope = coef(lm_gamma_beta)[2],
    alpha = 0.1, size = 4
  ) +
  geom_point(
    data = filter(combined_ortholog_variant_unmelt, set_label == "ortholog"),
    aes(x = Gamma, y = Beta), alpha = 0.4
  ) +
  geom_point(
    data = filter(combined_ortholog_variant_unmelt, set_label == "humutant"),
    aes(x = Gamma, y = Beta), alpha = 0.4, size = 1, color = "grey50"
  ) +
  geom_point(
    data = filter(combined_ortholog_variant_unmelt, sample %in% c("H.sapiens","dEcto","Control")),
    aes(x = Gamma, y = Beta), color = "blue", size = 2, alpha = 0.4
  )
  #geom_text_repel(data = combined_ortholog_variant_unmelt %>% filter(sample == "R.ferrumequinum"), aes(x = Gamma, y = Beta, label = sample), color = "red", size = 1.5, segment.color = "orange", segment.alpha = 0.4, force = 0.5)
Gamma_Beta_scatterplot
ggsave(
  filename = "Plots/Gamma_Beta_scatterplot.pdf",
  plot = Gamma_Beta_scatterplot,
  height = 1.5, width = 2.25
)


# Gamma (x) vs Delta (y). The reference line is fit through two anchor points:
# the mean H.sapiens value and the mean dEcto value for each variant.
gamma_delta_df <- data.frame(
  virus1 = c(
    mean(filter(combined_ortholog_variant_unmelt, sample == "H.sapiens")[, "Gamma"]),
    mean(filter(combined_ortholog_variant_unmelt, sample == "dEcto")[, "Gamma"])
  ),
  virus2 = c(
    mean(filter(combined_ortholog_variant_unmelt, sample == "H.sapiens")[, "Delta"]),
    mean(filter(combined_ortholog_variant_unmelt, sample == "dEcto")[, "Delta"])
  )
)
lm_gamma_delta <- lm(gamma_delta_df[, "virus2"] ~ gamma_delta_df[, "virus1"])

# Layers, bottom to top: reference line, orthologs, human mutants (small,
# grey), the H.sapiens/dEcto/Control samples (blue), and a label for every
# sample.
Gamma_Delta_scatterplot <- ggplot() +
  geom_abline(
    intercept = coef(lm_gamma_delta)[1], slope = coef(lm_gamma_delta)[2],
    alpha = 0.1, size = 4
  ) +
  geom_point(
    data = filter(combined_ortholog_variant_unmelt, set_label == "ortholog"),
    aes(x = Gamma, y = Delta), alpha = 0.4
  ) +
  geom_point(
    data = filter(combined_ortholog_variant_unmelt, set_label == "humutant"),
    aes(x = Gamma, y = Delta), alpha = 0.4, size = 1, color = "grey50"
  ) +
  geom_point(
    data = filter(combined_ortholog_variant_unmelt, sample %in% c("H.sapiens","dEcto","Control")),
    aes(x = Gamma, y = Delta), color = "blue", size = 2, alpha = 0.4
  ) +
  geom_text_repel(
    data = combined_ortholog_variant_unmelt,
    aes(x = Gamma, y = Delta, label = sample),
    color = "red", size = 1.5, segment.color = "orange", segment.alpha = 0.4, force = 0.5
  )
Gamma_Delta_scatterplot
ggsave(
  filename = "Plots/Gamma_Delta_scatterplot.pdf",
  plot = Gamma_Delta_scatterplot,
  height = 1.5, width = 2.25
)


# Gamma (x) vs BA1 (y). The reference line is fit through two anchor points:
# the mean H.sapiens value and the mean dEcto value for each variant.
gamma_ba1_df <- data.frame(
  virus1 = c(
    mean(filter(combined_ortholog_variant_unmelt, sample == "H.sapiens")[, "Gamma"]),
    mean(filter(combined_ortholog_variant_unmelt, sample == "dEcto")[, "Gamma"])
  ),
  virus2 = c(
    mean(filter(combined_ortholog_variant_unmelt, sample == "H.sapiens")[, "BA1"]),
    mean(filter(combined_ortholog_variant_unmelt, sample == "dEcto")[, "BA1"])
  )
)
lm_gamma_ba1 <- lm(gamma_ba1_df[, "virus2"] ~ gamma_ba1_df[, "virus1"])

# Layers, bottom to top: reference line, orthologs, human mutants (small,
# grey), the H.sapiens/dEcto/Control samples (blue), and a label for every
# sample.
Gamma_BA1_scatterplot <- ggplot() +
  geom_abline(
    intercept = coef(lm_gamma_ba1)[1], slope = coef(lm_gamma_ba1)[2],
    alpha = 0.1, size = 4
  ) +
  geom_point(
    data = filter(combined_ortholog_variant_unmelt, set_label == "ortholog"),
    aes(x = Gamma, y = BA1), alpha = 0.4
  ) +
  geom_point(
    data = filter(combined_ortholog_variant_unmelt, set_label == "humutant"),
    aes(x = Gamma, y = BA1), alpha = 0.4, size = 1, color = "grey50"
  ) +
  geom_point(
    data = filter(combined_ortholog_variant_unmelt, sample %in% c("H.sapiens","dEcto","Control")),
    aes(x = Gamma, y = BA1), color = "blue", size = 2, alpha = 0.4
  ) +
  geom_text_repel(
    data = combined_ortholog_variant_unmelt,
    aes(x = Gamma, y = BA1, label = sample),
    color = "red", size = 1.5, segment.color = "orange", segment.alpha = 0.4, force = 0.5
  )
Gamma_BA1_scatterplot
ggsave(
  filename = "Plots/Gamma_BA1_scatterplot.pdf",
  plot = Gamma_BA1_scatterplot,
  height = 1.5, width = 2.25
)


# Beta (x) vs BA1 (y). The reference line is fit through two anchor points:
# the mean H.sapiens value and the mean dEcto value for each variant.
beta_ba1_df <- data.frame(
  virus1 = c(
    mean(filter(combined_ortholog_variant_unmelt, sample == "H.sapiens")[, "Beta"]),
    mean(filter(combined_ortholog_variant_unmelt, sample == "dEcto")[, "Beta"])
  ),
  virus2 = c(
    mean(filter(combined_ortholog_variant_unmelt, sample == "H.sapiens")[, "BA1"]),
    mean(filter(combined_ortholog_variant_unmelt, sample == "dEcto")[, "BA1"])
  )
)
lm_beta_ba1 <- lm(beta_ba1_df[, "virus2"] ~ beta_ba1_df[, "virus1"])

# Layers, bottom to top: reference line, orthologs, human mutants (small,
# grey), the H.sapiens/dEcto/Control samples (blue), and labels for a
# hand-picked set of samples.
Beta_BA1_scatterplot <- ggplot() +
  geom_abline(
    intercept = coef(lm_beta_ba1)[1], slope = coef(lm_beta_ba1)[2],
    alpha = 0.1, size = 4
  ) +
  geom_point(
    data = filter(combined_ortholog_variant_unmelt, set_label == "ortholog"),
    aes(x = Beta, y = BA1), alpha = 0.4
  ) +
  geom_point(
    data = filter(combined_ortholog_variant_unmelt, set_label == "humutant"),
    aes(x = Beta, y = BA1), alpha = 0.4, size = 1, color = "grey50"
  ) +
  geom_point(
    data = filter(combined_ortholog_variant_unmelt, sample %in% c("H.sapiens","dEcto","Control")),
    aes(x = Beta, y = BA1), color = "blue", size = 2, alpha = 0.4
  ) +
  geom_text_repel(
    data = filter(
      combined_ortholog_variant_unmelt,
      sample %in% c("R.sinicus_215","R.pearsonii","M.javanica","K31D","M.musculus")
    ),
    aes(x = Beta, y = BA1, label = sample),
    color = "red", size = 1.5, segment.color = "orange", segment.alpha = 0.4, force = 0.5
  )
Beta_BA1_scatterplot
ggsave(
  filename = "Plots/Beta_BA1_scatterplot.pdf",
  plot = Beta_BA1_scatterplot,
  height = 1.5, width = 2.25
)


## Make a dataframe of pairwise Pearson's correlations
# Spot check of a single pair, printed when the chunk runs.
cor(
  combined_ortholog_variant_unmelt$Beta,
  combined_ortholog_variant_unmelt$Gamma,
  method = "pearson"
)^2

colnames(combined_ortholog_variant_unmelt)
ortholog_inf_cor_matrix_names <- c("Alpha", "Beta", "Delta", "Gamma", "BA1", "D614G")

# One row per ordered (virus1, virus2) pair; virus1 cycles fastest, and
# `index` numbers the rows in that order.
ortholog_inf_cor_matrix_df <- data.frame(
  index = seq_len(length(ortholog_inf_cor_matrix_names)^2)
)
ortholog_inf_cor_matrix_df$virus1 <- rep(
  ortholog_inf_cor_matrix_names,
  times = length(ortholog_inf_cor_matrix_names)
)
ortholog_inf_cor_matrix_df$virus2 <- rep(
  ortholog_inf_cor_matrix_names,
  each = length(ortholog_inf_cor_matrix_names)
)

# Squared Pearson correlation for each pair, computed across all rows.
ortholog_inf_cor_matrix_df$pearson_rsquared <- vapply(
  seq_len(nrow(ortholog_inf_cor_matrix_df)),
  function(i) {
    cor(
      combined_ortholog_variant_unmelt[, ortholog_inf_cor_matrix_df$virus1[i]],
      combined_ortholog_variant_unmelt[, ortholog_inf_cor_matrix_df$virus2[i]],
      method = "pearson"
    )^2
  },
  numeric(1)
)

# Display order of the variants on both axes.
ortholog_inf_cor_matrix_df$virus1 <- factor(
  ortholog_inf_cor_matrix_df$virus1,
  levels = c("D614G", "Alpha", "Beta", "Gamma", "Delta", "BA1")
)
ortholog_inf_cor_matrix_df$virus2 <- factor(
  ortholog_inf_cor_matrix_df$virus2,
  levels = c("D614G", "Alpha", "Beta", "Gamma", "Delta", "BA1")
)

# Heatmap of r^2 with every tile annotated with its rounded value.
Virus_pairwise_correlations <- ggplot() +
  theme(
    panel.grid.major = element_blank(),
    panel.border = element_blank(),
    panel.background = element_blank(),
    axis.ticks = element_blank(),
    legend.position = "right",
    axis.text.x.top = element_text(angle = 90, hjust = 0, vjust = 0)
  ) +
  labs(x = NULL, y = NULL) +
  scale_fill_gradient(low = "white", high = "purple", limits = c(0, 1)) +
  scale_x_discrete(position = "top") +
  geom_tile(
    data = ortholog_inf_cor_matrix_df,
    aes(x = virus1, y = virus2, fill = pearson_rsquared)
  ) +
  geom_text(
    data = ortholog_inf_cor_matrix_df,
    aes(x = virus1, y = virus2, label = round(pearson_rsquared, 2)),
    angle = 45, size = 2
  )
Virus_pairwise_correlations
ggsave(
  filename = "Plots/Virus_pairwise_correlations.pdf",
  plot = Virus_pairwise_correlations,
  height = 1.9, width = 2.8
)
```

```{r Comparing sequence identity matrix with Pearson's correlation matrix}

#### Need to fix this chunk

## Pair each entry of the RBD identity table (rbd_identity2) with the matching
## pairwise infection r^2 and plot one against the other.
rbd_identity_virus_order <- c("D614G", "Alpha", "Beta","Delta","Gamma","BA1")

## NOTE(review): assumes the rows and columns of rbd_identity2 are already in
## rbd_identity_virus_order -- confirm against where rbd_identity2 is built.
rbd_identity3 <- rbd_identity2
colnames(rbd_identity3) <- rbd_identity_virus_order
rbd_identity3$label <- rbd_identity_virus_order

## Long format: one row per (row label, column name) entry of the table
rbd_identity3_melt <- melt(rbd_identity3, id = "label")
colnames(rbd_identity3_melt) <- c("virus1","virus2","aa_diff")

rbd_identity_ortholog_cor_matrix <- merge(
  rbd_identity3_melt,
  ortholog_inf_cor_matrix_df,
  by = c("virus1","virus2")
)

rbd_identity_ortholog_cor_matrix$virus1 <- factor(rbd_identity_ortholog_cor_matrix$virus1, levels = rbd_identity_virus_order)
rbd_identity_ortholog_cor_matrix$virus2 <- factor(rbd_identity_ortholog_cor_matrix$virus2, levels = rbd_identity_virus_order)

## `index` is the row number assigned when ortholog_inf_cor_matrix_df was
## built; this hard-coded set keeps 15 of its 36 rows.
rbd_identity_unique_pair_index <- c(2:5, 9:11, 16:17, 23, 31:35)
rbd_identity_ortholog_cor_matrix2 <- rbd_identity_ortholog_cor_matrix %>%
  filter(index %in% rbd_identity_unique_pair_index)

rbd_color_manual <- c("D614G" = "#F8766D", "Alpha" = "#B79F00", "Beta" = "#00BA38", "Gamma" = "#00BFC4", "Delta" = "#619CFF", "BA1" = "#F564E3")

## Each pair is drawn twice: a filled dot coloured by virus1 and an open ring
## coloured by virus2, so both members of the pair are visible.
RBD_identity_vs_ortholog_infection_correlation <- ggplot() +
  geom_point(
    data = rbd_identity_ortholog_cor_matrix2,
    aes(x = aa_diff, y = pearson_rsquared, color = virus1),
    shape = 16, size = 1.25, alpha = 0.5
  ) +
  geom_point(
    data = rbd_identity_ortholog_cor_matrix2,
    aes(x = aa_diff, y = pearson_rsquared, color = virus2),
    shape = 1, size = 1.6, stroke = 1.25, alpha = 0.25
  ) +
  scale_color_manual(values = rbd_color_manual) +
  scale_x_continuous(limits = c(0,15)) +
  scale_y_continuous(limits = c(0,1)) +
  theme(legend.position = "right", panel.grid.major = element_blank())
RBD_identity_vs_ortholog_infection_correlation
ggsave(filename = "Plots/RBD_identity_vs_ortholog_infection_correlation.pdf", plot = RBD_identity_vs_ortholog_infection_correlation, height = 1.51, width = 2.65)
```

```{r Try a principal component analysis for ACE2 ortholog infectivites}
## Principal component analysis on the first six columns of
## combined_ortholog_variant_unmelt, with each column scaled to unit variance.
ortholog_infection_for_pca <- combined_ortholog_variant_unmelt[, 1:6]
pca_ortholog_infection <- prcomp(ortholog_infection_for_pca, scale. = TRUE)
pca_ortholog_infection_scores <- as.data.frame(pca_ortholog_infection$x)
pca_ortholog_infection_scores$labels <- combined_ortholog_variant_unmelt$sample

## Rows whose label appears in humutant_labels are tagged "humutant"; every
## other row is tagged "ortholog".
pca_ortholog_infection_scores$set_label <- ifelse(
  pca_ortholog_infection_scores$labels %in% c(humutant_labels),
  "humutant",
  "ortholog"
)

## Row 2 of the importance matrix is the proportion of variance per component
pca_ortholog_importance <- summary(pca_ortholog_infection)$importance
pca_pc1_variance_pct <- round(pca_ortholog_importance[2, 1] * 100, 0)
pca_pc2_variance_pct <- round(pca_ortholog_importance[2, 2] * 100, 0)

## Orthologs that get a text label on the score plot
pca_text_labelled_samples <- c("R.ferrumequinum","R.shameli","R.sinicus_215","R.landeri","M.musculus","R.sinicus_200","S.scrofa","M.javanica","R.alcyone","R.affinis")

## Layers are drawn in this order: labels, orthologs, humutants, the two
## human replicates (orange), then dEcto (green).
ortholog_infection_pca <- ggplot() +
  theme(panel.grid.major = element_blank()) +
  scale_x_continuous(limits = c(-2.3,4)) +
  scale_y_continuous(limits = c(-2.3,2.8)) +
  xlab(paste0("Component 1 (", pca_pc1_variance_pct, "% of variance)")) +
  ylab(paste0("Component 2\n(", pca_pc2_variance_pct, "% of variance)")) +
  geom_text_repel(
    data = pca_ortholog_infection_scores %>% filter(labels %in% pca_text_labelled_samples),
    aes(x = PC1, y = PC2, label = labels),
    segment.color = 'grey90', size = 1.25, segment.alpha = 0.5, color = "red"
  ) + #point.padding = 0.1
  geom_point(
    data = pca_ortholog_infection_scores %>% filter(set_label == "ortholog"),
    aes(x = PC1, y = PC2), size = 1, alpha = 0.5
  ) +
  geom_point(
    data = pca_ortholog_infection_scores %>% filter(set_label == "humutant"),
    aes(x = PC1, y = PC2), alpha = 0.4, size = 1, color = "grey50"
  ) +
  geom_point(
    data = pca_ortholog_infection_scores %>% filter(labels %in% c("H.sapiens(rep1)","H.sapiens(rep2)")),
    aes(x = PC1, y = PC2), size = 1, alpha = 0.5, color = "orange"
  ) +
  geom_point(
    data = pca_ortholog_infection_scores %>% filter(labels %in% c("dEcto")),
    aes(x = PC1, y = PC2), size = 1, alpha = 0.5, color = "green"
  )
ortholog_infection_pca
ggsave(filename = "Plots/Ortholog_infection_pca.pdf", plot = ortholog_infection_pca, height = 1.5, width = 2.25)

## Look at loadings for principal components 1 and 2
pca_ortholog_rotation <- summary(pca_ortholog_infection)$rotation

## Loadings for first principal component
pca_ortholog_rotation[, 1]
## Loadings for second principal component
pca_ortholog_rotation[, 2]

## Two-column frame of loadings; row names carry the input column names
ortholog_pca_loadings_frame <- data.frame(pca_ortholog_rotation[, 1], pca_ortholog_rotation[, 2])
colnames(ortholog_pca_loadings_frame) <- c("PC 1","PC 2")
ortholog_pca_loadings_frame$label <- rownames(ortholog_pca_loadings_frame)
ortholog_pca_loadings_frame_melt <- melt(ortholog_pca_loadings_frame, id = "label")

ortholog_pca_loadings_frame_melt$label <- factor(
  ortholog_pca_loadings_frame_melt$label,
  levels = c("D614G", "Alpha","Beta","Gamma","Delta","BA1")
)

## One facet row per principal component, one point per input column
Ortholog_PCA_loadings_plot <- ggplot() +
  theme(panel.grid = element_blank(), axis.text.x = element_text(angle = -90, hjust = 0, vjust = 0.5)) +
  labs(x = NULL, y = "PCA loadings") +
  scale_y_continuous(breaks = c(-0.4,0,0.4)) +
  geom_hline(yintercept = 0, alpha = 0.5) +
  geom_point(data = ortholog_pca_loadings_frame_melt, aes(x = label, y = value), size = 1) +
  facet_grid(rows = vars(variable))
Ortholog_PCA_loadings_plot
ggsave(filename = "Plots/Ortholog_PCA_loadings_plot.pdf", plot = Ortholog_PCA_loadings_plot, height = 1.25, width = 1.5)

```

```{r Coefficient of variation on ortholog susceptibility}
## Per-sample coefficient of variation (sd / mean) of the melted values.
## Every value is shifted by +2 before summarising.
## NOTE(review): presumably the shift keeps the per-sample mean away from
## zero so the CV is well defined -- confirm.
combined_ortholog_variant_melt2 <- combined_ortholog_variant_melt %>%
  mutate(value = value+2)

combined_ortholog_variant_melt_summary2 <- combined_ortholog_variant_melt2 %>%
  group_by(sample) %>%
  summarize(mean = mean(value), sd = sd(value)) %>%
  arrange(mean) %>%
  mutate(cv = sd/mean) %>%
  mutate(sample2 = as.character(sample)) %>%
  mutate(nchar = nchar(sample2))

## Samples are split by label length: 6 or more characters are tagged
## "ortholog", shorter labels are tagged "mutant".
cv_summary_orthologs <- combined_ortholog_variant_melt_summary2 %>%
  filter(nchar >= 6) %>%
  mutate(type = "ortholog")
cv_summary_mutants <- combined_ortholog_variant_melt_summary2 %>%
  filter(nchar < 6) %>%
  mutate(type = "mutant")
combined_ortholog_variant_melt_summary3 <- rbind(cv_summary_orthologs, cv_summary_mutants)

## CV histogram on a log10 x-axis, one facet row per type
Ortholog_CV_plot <- ggplot() +
  scale_x_log10(limits = c(0.03,0.4)) +
  labs(x = "CV", y = "count") +
  geom_histogram(
    data = combined_ortholog_variant_melt_summary3,
    aes(x = cv),
    bins = 11, fill = "grey80", color = "black"
  ) +
  facet_grid(rows = vars(type))
Ortholog_CV_plot
ggsave(filename = "Plots/Ortholog_CV_plot.pdf", plot = Ortholog_CV_plot, height = 1.5, width = 1.3)

## Samples with a CV of at least 0.11
high_cv_set <- combined_ortholog_variant_melt_summary3 %>% filter(cv >= 0.11)
```

```{r Make a more generalizable function for comparing variants for all of their infetivities}
## For troubleshooting the function
mini_frame <- combined_ortholog_variant_unmelt[,c("Gamma","BA1","sample")]

## Compare two viruses across all samples via a linear fit of the second
## column on the first.
##
## Argument:
##   mini_frame  data frame whose first two columns hold the numeric values
##               for the two viruses being compared (the column names are
##               reported back as name1 / name2) and which has a `sample`
##               column.
##
## Returns a data frame with columns sample, residuals, name1, name2 and
## category, made of three stacked parts:
##   * category "all":         one row per input row with its residual
##   * category "outlier":     the rows whose residual lies more than one
##                             standard deviation from the mean residual
##   * category "correlation": a single row whose `residuals` field holds the
##                             Pearson correlation of the two columns
## The correlation row is appended as a character vector, so the `residuals`
## column comes back as character; callers convert it with as.numeric().
##
## The earlier version also fitted a line through the H.sapiens / dEcto means
## and built two diagnostic ggplot objects. None of them was printed or
## returned, so they were removed.
virus_infection_comparison <- function(mini_frame){
  first_virus <- colnames(mini_frame)[1]
  second_virus <- colnames(mini_frame)[2]

  ## Fit once and keep the residual of every sample
  pair_fit <- lm(mini_frame[, second_virus] ~ mini_frame[, first_virus])
  lm1_res <- data.frame(sample = mini_frame$sample, "residuals" = pair_fit$residuals)

  ## Outlier window: mean residual minus / plus one standard deviation
  residual_bounds <- mean(lm1_res$residuals) + c(-1, 1) * sd(lm1_res$residuals)

  all_samples <- lm1_res %>%
    mutate(name1 = first_virus, name2 = second_virus, category = "all")
  outliers <- all_samples %>%
    filter(residuals < residual_bounds[1] | residuals > residual_bounds[2]) %>%
    mutate(category = "outlier")

  ## The summary row reuses the `sample` and `residuals` slots for the
  ## literal "correlation" tag and the correlation coefficient
  correlation_row <- c(
    "correlation",
    cor(mini_frame[,1], mini_frame[,2]),
    first_virus,
    second_virus,
    "correlation"
  )

  rbind(rbind(all_samples, outliers), correlation_row)
}

## Run virus_infection_comparison() for every ordered pair of distinct
## variants and stack the results under a single all-NA seed row.
virus_list <- c("D614G", "Alpha","Beta","Gamma","Delta","BA1")
final_df <- data.frame("sample" = NA, "residuals" = NA, "name1" = NA, "name2" = NA, "category" = NA)

for (first_virus in virus_list) {
  ## setdiff() keeps virus_list order, so pairs are visited in the same
  ## order as a full inner loop that skips the matching index
  for (second_virus in setdiff(virus_list, first_virus)) {
    pair_columns <- c(first_virus, second_virus, "sample")
    final_df <- rbind(
      final_df,
      virus_infection_comparison(combined_ortholog_variant_unmelt[, pair_columns])
    )
  }
}

final_df$name1 <- factor(final_df$name1, levels = virus_list)
final_df$name2 <- factor(final_df$name2, levels = virus_list)

## Pull out one category of rows and turn `residuals` back into numbers.
## The all-NA seed row matches no category and so is dropped everywhere.
rows_for_category <- function(category_name) {
  final_df %>%
    filter(category == !!category_name) %>%
    mutate(residuals = as.numeric(residuals))
}

all_residuals <- rows_for_category("all")
correlations <- rows_for_category("correlation")
outlier_residuals <- rows_for_category("outlier")
```

```{r Making a histogram of residual values}
## Sort residuals from largest to smallest to see which ones are the biggest
all_residuals2 <- all_residuals %>% arrange(desc(residuals))

## Facet order for the histogram; samples that are not listed become NA
residuals_histogram_samples <- c("R.ferrumequinum","R.shameli","R.sinicus_215","R.landeri","M.musculus","R.sinicus_200","R.pearsonii","S.scrofa","M.javanica")
all_residuals2$sample <- factor(all_residuals2$sample, levels = residuals_histogram_samples)

## One facet row per sample, each with its own y-axis range
Residuals_histogram <- ggplot() +
  theme(strip.text.y = element_text(angle = 0)) +
  scale_x_continuous(breaks = c(-1,0,1)) +
  scale_y_continuous(breaks = c(0,5,10,100,200,300,400)) +
  geom_histogram(
    data = all_residuals2,
    aes(x = residuals),
    binwidth = .1, color = "black", fill = "grey80"
  ) +
  facet_grid(rows = vars(sample), scales = "free_y")
Residuals_histogram
ggsave(filename = "Plots/Residuals_histogram.pdf", plot = Residuals_histogram, height = 6, width = 2.5)

#residuals_sd <- all_residuals %>% group_by(sample) %>% summarize(sd_residual = sd(residuals)) %>% arrange(desc(sd_residual))
#ggplot() + geom_histogram(data = residuals_sd, aes(x = sd_residual), binwidth = .05, fill = "grey80", color = "black")
#sd_residuals_high <- residuals_sd %>% filter(sd_residual >= 0.3)
```

```{r Calculate resiuals for particular ACE2 orthologs when comparing between pairs of SARS2 variants}
## Residual heatmaps for individual ACE2 orthologs across pairs of variants.
## The same subset-and-plot recipe used to be copy-pasted once per ortholog;
## it now lives in the two helpers below. Every object name and output file
## is the same as before.

## Row positions kept from one ortholog's rows of all_residuals. With the 30
## ordered pairs produced by the nested loop over virus_list, these positions
## are the 15 pairs in which name1 comes before name2 in virus_list.
## NOTE(review): this assumes the ortholog contributes exactly one row per
## ordered pair, in loop order -- confirm if the input samples change.
ortholog_residual_pair_rows <- c(seq(1,5),seq(7,10),seq(13,15),seq(19,20),25)

## Return the rows of `residuals_df` for one sample, reduced to `pair_rows`.
subset_ortholog_residuals <- function(residuals_df, sample_name, pair_rows = ortholog_residual_pair_rows) {
  sample_rows <- residuals_df %>% filter(sample == !!sample_name)
  sample_rows[pair_rows, ]
}

## Build the name1-by-name2 tile plot of residuals for one sample. The fill
## scale is fixed to [-1.4, 1.4] so all ortholog plots share one colour scale.
plot_ortholog_residuals <- function(residuals_df, sample_name) {
  ggplot() +
    theme(panel.grid.major = element_blank(), panel.border = element_blank(), panel.background = element_blank(), axis.ticks = element_blank(),
          legend.position = "none", axis.text.x.top = element_text(angle = 90, hjust = 0, vjust = 0)) +
    labs(x = sample_name, y = "Residuals") +
    scale_fill_gradient2(low  = "blue", mid = "white", high = "red", limits = c(-1.4,1.4)) +
    scale_x_discrete(position = "top") +
    geom_tile(data = residuals_df, aes(x = name1, y = name2, fill = residuals)) +
    geom_text(data = residuals_df, aes(x = name1, y = name2, label = round(residuals,1)), angle = 45, size = 2)
}

mouse_residuals <- subset_ortholog_residuals(all_residuals, "M.musculus")
mouse_residuals_plot <- plot_ortholog_residuals(mouse_residuals, "M.musculus")
mouse_residuals_plot
ggsave(file = "Plots/mouse_residuals_plot.pdf", mouse_residuals_plot, height = 1.9, width = 2)

sinicus_215_residuals <- subset_ortholog_residuals(all_residuals, "R.sinicus_215")
sinicus_215_residuals_plot <- plot_ortholog_residuals(sinicus_215_residuals, "R.sinicus_215")
sinicus_215_residuals_plot
ggsave(file = "Plots/sinicus_215_residuals_plot.pdf", sinicus_215_residuals_plot, height = 1.9, width = 2)

sinicus_200_residuals <- subset_ortholog_residuals(all_residuals, "R.sinicus_200")
sinicus_200_residuals_plot <- plot_ortholog_residuals(sinicus_200_residuals, "R.sinicus_200")
sinicus_200_residuals_plot
ggsave(file = "Plots/sinicus_200_residuals_plot.pdf", sinicus_200_residuals_plot, height = 1.9, width = 2)

shameli_residuals <- subset_ortholog_residuals(all_residuals, "R.shameli")
shameli_residuals_plot <- plot_ortholog_residuals(shameli_residuals, "R.shameli")
shameli_residuals_plot
ggsave(file = "Plots/shameli_residuals_plot.pdf", shameli_residuals_plot, height = 1.9, width = 2)

landeri_residuals <- subset_ortholog_residuals(all_residuals, "R.landeri")
landeri_residuals_plot <- plot_ortholog_residuals(landeri_residuals, "R.landeri")
landeri_residuals_plot
ggsave(file = "Plots/landeri_residuals_plot.pdf", landeri_residuals_plot, height = 1.9, width = 2)

pearsonii_residuals <- subset_ortholog_residuals(all_residuals, "R.pearsonii")
pearsonii_residuals_plot <- plot_ortholog_residuals(pearsonii_residuals, "R.pearsonii")
pearsonii_residuals_plot
ggsave(file = "Plots/pearsonii_residuals_plot.pdf", pearsonii_residuals_plot, height = 1.9, width = 2)

pangolin_residuals <- subset_ortholog_residuals(all_residuals, "M.javanica")
pangolin_residuals_plot <- plot_ortholog_residuals(pangolin_residuals, "M.javanica")
pangolin_residuals_plot
ggsave(file = "Plots/pangolin_residuals_plot.pdf", pangolin_residuals_plot, height = 1.9, width = 2)

alcyone_residuals <- subset_ortholog_residuals(all_residuals, "R.alcyone")
alcyone_residuals_plot <- plot_ortholog_residuals(alcyone_residuals, "R.alcyone")
alcyone_residuals_plot
ggsave(file = "Plots/alcyone_residuals_plot.pdf", alcyone_residuals_plot, height = 1.9, width = 2)

pig_residuals <- subset_ortholog_residuals(all_residuals, "S.scrofa")
pig_residuals_plot <- plot_ortholog_residuals(pig_residuals, "S.scrofa")
pig_residuals_plot
ggsave(file = "Plots/pig_residuals_plot.pdf", pig_residuals_plot, height = 1.9, width = 2)

ferrum_residuals <- subset_ortholog_residuals(all_residuals, "R.ferrumequinum")
ferrum_residuals_plot <- plot_ortholog_residuals(ferrum_residuals, "R.ferrumequinum")
ferrum_residuals_plot
ggsave(file = "Plots/Ferrum_residuals_plot.pdf", ferrum_residuals_plot, height = 1.9, width = 2)

```

```{r Most variable minimal dataset}
## Heatmap restricted to a hand-picked set of orthologs
most_variable_orthologs <- c("R.ferrumequinum","M.musculus","R.shameli","R.sinicus_215","R.sinicus_200","R.landeri","R.pearsonii","M.javanica")
ortholog_most_variable <- ortholog2 %>%
  filter(ortholog %in% most_variable_orthologs)

## Numeric matrix of the first six columns, with ortholog names as row names
ortholog_most_variable_matrix <- as.matrix(ortholog_most_variable[, 1:6])
rownames(ortholog_most_variable_matrix) <- ortholog_most_variable$ortholog #paste("r_",rbd_parsed_df$V1,sep="")

## Cluster the rows (orthologs) ...
ortholog_most_variable_matrix_dist <- dist(ortholog_most_variable_matrix, method = 'euclidean')
ortholog_most_variable_matrix_hclust <- hclust(ortholog_most_variable_matrix_dist)

## ... and the columns, by clustering the transposed matrix
ortholog_most_variable_matrix_dist2 <- dist(t(ortholog_most_variable_matrix), method = 'euclidean')
ortholog_most_variable_matrix_hclust2 <- hclust(ortholog_most_variable_matrix_dist2)

## NOTE(review): hclust()$labels lists the rows in input order, not in
## dendrogram order (that is $order) -- confirm this y-axis order is intended.
ortholog3_melt <- ortholog2_melt %>%
  filter(ortholog %in% most_variable_orthologs)
ortholog3_melt$ortholog <- factor(
  ortholog3_melt$ortholog,
  levels = ortholog_most_variable_matrix_hclust$labels
)
ortholog3_melt$variable <- factor(
  ortholog3_melt$variable,
  levels = c("D614G","Delta","Alpha","Beta","Gamma","BA1")
)

Most_variable_orthologs_s2variants <- ggplot() +
  geom_tile(data = ortholog3_melt, aes(x = variable, y = ortholog, fill = value)) +
  scale_fill_gradient(low = "white", high = "red") +
  labs(x = NULL, y = NULL) +
  theme(axis.text.x = element_text(hjust = 1, vjust = 1, angle = 45))
Most_variable_orthologs_s2variants
ggsave(filename = "Plots/Most_variable_orthologs_s2variants.pdf", plot = Most_variable_orthologs_s2variants, height = 1.45, width = 2.8)

```

```{r Overlaying unique infection patterns on ACE2 ortholog PCA}

## Cluster all orthologs on the first six columns of ortholog2 and label the
## tree with the seventh column
ortholog2_dist <- dist(ortholog2[, 1:6])
ortholog2_hclust <- hclust(ortholog2_dist)
ortholog2_hclust$labels <- ortholog2[, 7]
plot(ortholog2_hclust)

ggdendrogram(ortholog2_hclust, rotate = FALSE, size = 2)

# c("R.ferrumequinum","R.shameli","R.sinicus_215","R.landeri","M.musculus","R.sinicus_200","R.pearsonii","S.scrofa","M.javanica"))

## Hand-assigned groups of orthologs
highly_infectable <- c("R.alcyone", "S.scrofa", "R.affinis", "H.sapiens")
poorly_infectable <- c("R.sinicus_275", "R.sinicus_472")
n501y_favoring <- c("M.musculus", "R.sinicus_200","R.sinicus_215")
n501y_disfavoring <- c("R.shameli")

ba1_favoring <- c("R.pearsonii")
## The next two groups are defined here but are not drawn in the plot below
high_but_d614g_disfavoring <- c("R.landeri")
high_but_ba1_disfavoring <- c("M.javanica")

## Coloured point layer for the members of one group
ortholog_group_layer <- function(group_members, point_color) {
  geom_point(
    data = ortholog_identity_scores %>% filter(label %in% group_members),
    aes(x = PC1, y = PC2),
    alpha = 0.75, color = point_color
  )
}

## Row 2 of the importance matrix is the proportion of variance per component
ortholog_identity_importance <- summary(ortholog_identity_pca)$importance
identity_pc1_variance_pct <- round(ortholog_identity_importance[2, 1] * 100, 0)
identity_pc2_variance_pct <- round(ortholog_identity_importance[2, 2] * 100, 0)

## All orthologs in faint black, then each group in its own colour on top
ortholog_identity_PCA_plot <- ggplot() +
  xlab(paste0("Principal component 1\n(", identity_pc1_variance_pct, "% of variance)")) +
  ylab(paste0("Principal component 2\n(", identity_pc2_variance_pct, "% of variance)")) +
  geom_point(data = ortholog_identity_scores, aes(x = PC1, y = PC2), alpha = 0.25) +
  ortholog_group_layer(highly_infectable, "green") +
  ortholog_group_layer(poorly_infectable, "black") +
  ortholog_group_layer(n501y_favoring, "red") +
  ortholog_group_layer(n501y_disfavoring, "blue") +
  ortholog_group_layer(ba1_favoring, "purple") +
  geom_text_repel(
    data = ortholog_identity_scores,
    aes(x = PC1, y = PC2, label = label),
    color = "red", size = 2, segment.color = "orange"
  )
ortholog_identity_PCA_plot
ggsave(filename = "Plots/Ortholog_identity_PCA_plot_overlay.pdf", plot = ortholog_identity_PCA_plot, height = 2.25, width = 2.5)

# https://stackoverflow.com/questions/59361706/color-one-point-with-several-colors-in-r-ggplot2
library(ggforce)
## Draw one pie per ortholog at its position on the identity PCA. The pie
## radius follows the ortholog's largest per-variant value and the slices
## show each variant's share of the ortholog's summed values.
ortholog_identity_scores2 <- ortholog_identity_scores
ortholog_identity_scores2$ortholog <- ortholog_identity_scores2$label

## Negative (and zero) values are set to 0 before scaling
ortholog2_melt3 <- ortholog2_melt2
ortholog2_melt3[ortholog2_melt3$value <= 0,"value"] <- 0

## radius: value rescaled so the largest value in the table maps to 0.5
ortholog2_melt3$radius <- ortholog2_melt3$value/max(ortholog2_melt3$value)/2
ortholog2_melt3$amount <- 1/6

## Pool the two human replicates under one name so they are summarised
## together below
ortholog2_melt3[ortholog2_melt3$ortholog %in% c("H.sapiens(rep1)","H.sapiens(rep2)"),"ortholog"] <- "H.sapiens"

## One row per ortholog and variant with its PCA coordinates attached, then
## per-ortholog slice fraction and pie size. The grouped mutate replaces a
## row-by-row loop that re-filtered the whole table for every row. The result
## stays grouped by ortholog, as it was before.
ortholog2_melt4 <- merge(ortholog2_melt3, ortholog_identity_scores2[,c("ortholog","PC1","PC2")], by = "ortholog") %>%
  group_by(ortholog, variable) %>%
  summarize(
    radius = median(radius),
    amount = mean(amount),
    PC1 = mean(PC1),
    PC2 = mean(PC2),
    .groups = "drop"
  ) %>%
  group_by(ortholog) %>%
  mutate(
    fraction = radius / sum(radius),
    infectivity = max(radius)
  )

## Pie layer for a subset of orthologs, optionally shifted off the PCA
## position so overlapping pies stay readable
ortholog_pie_layer <- function(pie_data, shift_x = 0, shift_y = 0) {
  geom_arc_bar(
    data = pie_data,
    aes(x0 = PC1 + shift_x, y0 = PC2 + shift_y, r0 = 0, r = infectivity/2, amount = fraction, fill = variable),
    stat = "pie", linewidth = 0.1
  )
}

Pieplot <- ggplot() +
  theme(legend.position = "top", panel.grid = element_blank()) +
  scale_x_continuous(limits = c(-5.3,4)) + scale_y_continuous(limits = c(-2.5,2.3)) +
  coord_equal()+ # so one gets actually circles
  geom_point(data = ortholog_identity_scores, aes(x = PC1, y = PC2), size = 0.1) +
  geom_text_repel(data = ortholog_identity_scores, aes(x = PC1, y = PC2, label = label), color = "red", size = 1.5, segment.color = "orange", box.padding = 0.75, min.segment.length = unit(0, 'lines'), max.overlaps = 20, segment.alpha = 0.3) +
  ortholog_pie_layer(ortholog2_melt4 %>% filter(!(ortholog %in% c("R.landeri","R.alcyone","R.shameli","R.affinis")))) +
  ortholog_pie_layer(ortholog2_melt4 %>% filter(ortholog %in% c("R.landeri")), shift_x = -0.25) +
  ortholog_pie_layer(ortholog2_melt4 %>% filter(ortholog %in% c("R.alcyone")), shift_x = 0.25) +
  ortholog_pie_layer(ortholog2_melt4 %>% filter(ortholog %in% c("R.shameli")), shift_y = 0.2) +
  ortholog_pie_layer(ortholog2_melt4 %>% filter(ortholog %in% c("R.affinis")), shift_x = 0.2) +
  geom_point(data = ortholog_identity_scores %>% filter(label %in% c("R.sinicus_275","R.sinicus_472")), aes(x = PC1, y = PC2), size = 0.05, color = "white") +
  NULL; Pieplot
ggsave(file = "Plots/Pieplot.pdf", Pieplot, height = 3, width = 3.75)

```

```{r Cumulative host compatibility plot}
## Copy the melted infection data and raise every value below 0.01 up to 0.01
## (presumably so that zero / near-zero values can still be placed on the log10 x axis).
ortholog2_melt_nozero <- ortholog2_melt
under_floor <- ortholog2_melt_nozero$value < 0.01
ortholog2_melt_nozero[under_floor, "value"] <- 0.01

## An ortholog / variant pair is called compatible when its (floored) value is at least 0.2
ortholog2_melt_nozero$compatible <- ortholog2_melt_nozero$value >= 0.2

## Subsets that are re-drawn in colour over the grey histogram of all values
floored_incompatible <- ortholog2_melt_nozero %>% filter(!compatible, value == 0.01)
compatible_only <- ortholog2_melt_nozero %>% filter(compatible)

Ortholog_compatibility_histogram <- ggplot() +
  labs(x = "Normalized\ninfection", y = "Count") +
  scale_y_continuous(limits = c(0, 35), expand = c(0, 0)) +
  scale_x_log10(limits = c(0.008, 3), expand = c(0, 0), breaks = c(0.1, 1)) +
  ## Dashed line at the 0.2 compatibility cutoff
  geom_vline(xintercept = 0.2, linetype = 2) +
  ## All values in grey, then values sitting at the 0.01 floor in cyan, then compatible values in green
  geom_histogram(data = ortholog2_melt_nozero, aes(x = value), bins = 27, color = "black", fill = "grey80") +
  geom_histogram(data = floored_incompatible, aes(x = value), bins = 27, color = "black", fill = "cyan") +
  geom_histogram(data = compatible_only, aes(x = value), bins = 27, color = "black", fill = "green")
Ortholog_compatibility_histogram
ggsave("Plots/Ortholog_compatibility_histogram.pdf", plot = Ortholog_compatibility_histogram, width = 1.25, height = 1.6)
```


```{r Cumulative host compatibility plot}
## Tile plot of the compatibility call (0 = white, 1 = green) for every ortholog / variant pair.
## Orthologs are ordered by ortholog_identity3_matrix_hclust_order followed by "Control";
## rows whose ortholog is not among those levels become NA and are dropped.
ortholog2_melt_nozero2 <- ortholog2_melt_nozero %>%
  mutate(
    compat = as.integer(as.logical(compatible)),
    ortholog = factor(ortholog, levels = c(ortholog_identity3_matrix_hclust_order,"Control"))
  ) %>%
  filter(!is.na(ortholog))

Variant_compatibility_sampling_6set <- ggplot(data = ortholog2_melt_nozero2) +
  geom_tile(aes(x = variable, y = ortholog, fill = compat), color = "black") +
  scale_fill_gradient(low = "white", high = "green") +
  labs(x = NULL, y = NULL) +
  theme(axis.text.x = element_text(hjust = 1, vjust = 1, angle = 45))
Variant_compatibility_sampling_6set
ggsave("Plots/Variant_compatibility_sampling_6set.pdf", plot = Variant_compatibility_sampling_6set, width = 2.8, height = 2.2)


## Swap the infection values for the TRUE/FALSE compatibility calls and cast back to a wide
## table with one column per variant (D614G, Alpha, Beta, Gamma, Delta, BA1)
compatibility_calls_long <- ortholog2_melt_nozero %>%
  mutate(value = compatible) %>%
  select(!compatible)
ortholog2_nozero <- cast(compatibility_calls_long)

## Tally in order
## Each *_var column is 1 when the ortholog is compatible with D614G or with any of the
## variants added so far (in the order below), and 0 otherwise.
ortholog2_nozero3 <- ortholog2_nozero
variant_order <- c("D614G","Alpha","Beta","Gamma","Delta","BA1")
tally_columns <- c("zero_var","one_var","two_var","three_var","four_var","five_var")

compatible_so_far <- rep(FALSE, nrow(ortholog2_nozero3))
for (tally_step in seq_along(variant_order)) {
  ## Running OR across the variants seen up to this step
  compatible_so_far <- compatible_so_far | ortholog2_nozero3[[variant_order[tally_step]]]
  ortholog2_nozero3[[tally_columns[tally_step]]] <- as.numeric(compatible_so_far)
}

## Keep only the ortholog name plus the cumulative columns, and go back to long format
ortholog2_nozero4 <- data.frame(ortholog2_nozero3[,c("ortholog", tally_columns)])
ortholog2_nozero4$ortholog <- as.character(ortholog2_nozero4$ortholog)
ortholog2_nozero4_melt <- melt(ortholog2_nozero4, id.vars = "ortholog")

## Same ortholog ordering as the clustering order, with "Control" last; unmatched orthologs are dropped
ortholog2_nozero4_melt <- ortholog2_nozero4_melt %>%
  mutate(ortholog = factor(ortholog, levels = c(ortholog_identity3_matrix_hclust_order,"Control"))) %>%
  filter(!is.na(ortholog))

Variant_compatibility_cumulative <- ggplot(data = ortholog2_nozero4_melt) +
  geom_tile(aes(x = variable, y = ortholog, fill = value), color = "black") +
  scale_fill_gradient(low = "white", high = "darkgreen") +
  labs(x = NULL, y = NULL) +
  theme(axis.text.x = element_text(hjust = 1, vjust = 1, angle = 45))
Variant_compatibility_cumulative
ggsave("Plots/Variant_compatibility_cumulative.pdf", plot = Variant_compatibility_cumulative, width = 2.8, height = 2.25)


## Finally make a line chart
## For selected steps of the cumulative tally, sum the 0/1 compatibility values across orthologs
## and express the total as a fraction.

## x position of each plotted step, keyed by tally column name. Looking the position up by name
## (rather than assigning c(0,2,3,5) by row position) keeps each step correctly labelled even if
## the group order of the summary changes.
tally_step_index <- c(zero_var = 0, two_var = 2, three_var = 3, five_var = 5)

ortholog2_nozero4_melt_summary <- ortholog2_nozero4_melt %>%
  filter(variable %in% names(tally_step_index)) %>%
  group_by(variable) %>%
  summarize(sum = sum(value)) %>%
  mutate(
    index = unname(tally_step_index[as.character(variable)]),
    ## NOTE(review): 14 is a hard-coded denominator, presumably the number of orthologs tallied -- confirm
    fraction = sum/14
  )


Variant_compatibility_cumulative_linegraph <- ggplot() +
  theme(axis.text.x = element_blank(), panel.grid = element_blank(), axis.ticks.x = element_blank()) +
  labs(x = NULL, y = NULL) +
  scale_y_continuous(limits = c(0,1), expand = c(0,0), breaks = c(0,0.5,1), position = "right") +
  ## Faint dashed reference line at a fraction of 0.5
  geom_hline(yintercept = 0.5, linetype = 2, alpha = 0.4) +
  geom_line(data = ortholog2_nozero4_melt_summary, aes(x = index, y = fraction), alpha = 0.5) +
  ## Larger black point underneath a default-sized green point gives each marker a black rim
  geom_point(data = ortholog2_nozero4_melt_summary, aes(x = index, y = fraction), size = 2, color = "black") +
  geom_point(data = ortholog2_nozero4_melt_summary, aes(x = index, y = fraction), color = "green")
Variant_compatibility_cumulative_linegraph
ggsave(filename = "Plots/Variant_compatibility_cumulative_linegraph.pdf", plot = Variant_compatibility_cumulative_linegraph, height = 0.5, width = 2.4)
```

```{r Final comparison of variant correlations}

## Join, per virus1 / virus2 pair, the pearson_rsquared columns of the structure, ortholog-infection
## and mutant-infection correlation tables, and then the RBD identity table.
pair_keys <- c("virus1","virus2")
rsquared_columns <- c(pair_keys, "pearson_rsquared")

structure_vs_ortholog <- merge(
  struct_cor_matrix_df[, rsquared_columns],
  ortholog_inf_cor_matrix_df[, rsquared_columns],
  by = pair_keys
)
structure_ortholog_mutant <- merge(
  structure_vs_ortholog,
  mut_inf_cor_matrix_df[, rsquared_columns],
  by = pair_keys
)
final_variant_correlations_combined <- merge(structure_ortholog_mutant, rbd_identity3_melt, by = pair_keys)

## Rename by position: the two keys, the three r-squared columns in merge order, then the column
## contributed by rbd_identity3_melt (assumes that table adds exactly one column -- confirm)
colnames(final_variant_correlations_combined) <- c("virus1","virus2","structure","ortholog_inf","mut_inf","aa_diff")

## Quick exploratory scatter plots (printed only, not saved)
ggplot(data = final_variant_correlations_combined, aes(x = structure, y = aa_diff)) + geom_point()

ggplot(data = final_variant_correlations_combined, aes(x = aa_diff, y = mut_inf)) + geom_point()
ggplot(data = final_variant_correlations_combined, aes(x = aa_diff, y = ortholog_inf)) + geom_point()

## Draft 1: mutant-infection vs ortholog-infection r-squared for every variant pair. Each pair is
## drawn as an up triangle (filled by virus2) and a down triangle (filled by virus1) nudged
## 0.01 apart along x, plus a small point at the un-nudged position.
Final_correlations_plot <- ggplot(data = final_variant_correlations_combined) +
  scale_x_continuous(limits = c(-0.01, 1.01)) +
  scale_y_continuous(limits = c(0.2, 1.01)) +
  geom_point(aes(x = mut_inf+0.01, y = ortholog_inf, fill = virus2), shape = 24) +
  geom_point(aes(x = mut_inf-0.01, y = ortholog_inf, fill = virus1), shape = 25) +
  geom_point(aes(x = mut_inf, y = ortholog_inf, fill = virus1), size = 0.5)
Final_correlations_plot
ggsave("Plots/Final_correlations_plot.pdf", plot = Final_correlations_plot, width = 4, height = 2)

## Draft 2: same idea with smaller triangles nudged 0.01 apart along y instead, and no centre point.
## It reuses the object name and the output path, so it overwrites the file written by draft 1.
Final_correlations_plot <- ggplot(data = final_variant_correlations_combined) +
  scale_x_continuous(limits = c(-0.01, 1.01)) +
  scale_y_continuous(limits = c(0.2, 1.02)) +
  geom_point(aes(x = mut_inf, y = ortholog_inf+0.01, fill = virus2), shape = 24, size = 1) +
  geom_point(aes(x = mut_inf, y = ortholog_inf-0.01, fill = virus1), shape = 25, size = 1)
Final_correlations_plot
ggsave("Plots/Final_correlations_plot.pdf", plot = Final_correlations_plot, width = 3.5, height = 2)


## Final version of the correlation plot: one point per variant pair, coloured by both members.

## Drop rows whose structure r-squared is exactly 1 (presumably the self-comparisons -- confirm),
## then sort from highest to lowest ortholog-infection r-squared.
final_variant_correlations_combined_filtered <- final_variant_correlations_combined %>%
  filter(structure != 1) %>%
  arrange(desc(ortholog_inf))

## seq_len() rather than seq(1, n) so that an empty table yields an empty index instead of c(1, 0)
final_variant_correlations_combined_filtered$index <- seq_len(nrow(final_variant_correlations_combined_filtered))

## Keep every other row (1st, 3rd, 5th, ...) for however many rows there are.
## NOTE(review): presumably this removes the mirrored (virus2, virus1) copy of each pair, which
## relies on the two orientations sorting next to each other -- confirm.
final_variant_correlations_combined_filtered2 <- final_variant_correlations_combined_filtered %>%
  filter(index %% 2 == 1)

## Shared factor level order for both members of the pair (this also sets the legend order)
variant_factor_levels <- c("D614G", "Alpha", "Beta","Delta","Gamma","BA1")
final_variant_correlations_combined_filtered2$virus1 <- factor(final_variant_correlations_combined_filtered2$virus1, levels = variant_factor_levels)
final_variant_correlations_combined_filtered2$virus2 <- factor(final_variant_correlations_combined_filtered2$virus2, levels = variant_factor_levels)

## Fixed colour per variant, matched by name
rbd_color_manual <- c("D614G" = "#F8766D", "Alpha" = "#B79F00", "Beta" = "#00BA38", "Gamma" = "#00BFC4", "Delta" = "#619CFF", "BA1" = "#F564E3")

Final_correlations_plot <- ggplot() + theme(legend.position = "right", panel.grid.major = element_blank()) +
  scale_color_manual(values = rbd_color_manual) +
  scale_x_continuous(limits = c(-0.01,1.01)) + scale_y_continuous(limits = c(0.2,1.02)) +
  ## Filled disc coloured by virus1, with an open ring coloured by virus2 drawn around it
  geom_point(data = final_variant_correlations_combined_filtered2, aes(x = mut_inf, y = ortholog_inf, color = virus1), shape = 16, size = 1.25, alpha = 0.5) +
  geom_point(data = final_variant_correlations_combined_filtered2, aes(x = mut_inf, y = ortholog_inf, color = virus2), shape = 1, size = 1.6, stroke = 1.25, alpha = 0.25)
Final_correlations_plot
ggsave(filename = "Plots/Final_correlations_plot.pdf", plot = Final_correlations_plot, height = 2.25, width = 3.5)


```

```{r Additional plots as part of the revision process}

```