Skip to content

Commit

Permalink
PREDICT extra records and higher taxonomy added
Browse files Browse the repository at this point in the history
  • Loading branch information
Colin J. Carlson committed Jul 19, 2021
1 parent 42fabe5 commit affc13b
Show file tree
Hide file tree
Showing 21 changed files with 37,461 additions and 23,204 deletions.
Expand Up @@ -306,7 +306,7 @@ predict %>% filter(is.na(Host)) %>% pull(HostOriginal) %>% unique()

predict %<>% select(-VirusIntermediate)

write_csv(predict, "Intermediate/Unformatted/PREDICTUnformatted.csv")
write_csv(predict, "Intermediate/Unformatted/PREDICTMainUnformatted.csv")

# ##### Double check the NCBItaxonomy on the viruses I guess
# # test %>% filter(is.na(HostGenus))
5 changes: 3 additions & 2 deletions Code/02_2b_Format PREDICT.R
Expand Up @@ -64,10 +64,11 @@ predict %<>% left_join(classer)

### Format

predict <- bind_rows(temp, predict)
predict <- bind_rows(temp, predict %>% mutate(HostTaxID = as.double(HostTaxID),
VirusTaxID = as.double(VirusTaxID)))

predict %<>% mutate_at(c("Host", "HostGenus", "HostFamily", "HostOrder", "HostClass",
"Virus", "VirusGenus", "VirusFamily", "VirusOrder", "VirusClass"),
tolower)

write_csv(predict, "Intermediate/Formatted/PREDICTFormatted.csv")
write_csv(predict, "Intermediate/Formatted/PREDICTMainFormatted.csv")
115 changes: 115 additions & 0 deletions Code/02_2c_Digest PREDICT PCR.R
@@ -0,0 +1,115 @@

if(!exists('vdict')) {source('Code/001_TaxizeFunctions.R')}

library(tidyverse)
library(magrittr)
library(lubridate)
library(naniar)

# Load the two PREDICT exports: predict.1 is the original (main) data set and
# predict.2 is the PCR test table that may hold records missing from it.
predict.1 <- read_csv("~/Github/ept/PredictData (2).csv")
predict.2 <- read_csv("~/Github/ept/PREDICT_PCR_Tests.csv")

# Normalise host/virus names in the main data so both tables share join keys.
predict.1 %<>% # select(`Species Scientific Name Based on Field Morphology`,
  #        Virus) %>%
  rename(Host = "Species Scientific Name Based on Field Morphology") %>%
  distinct() %>%
  mutate(Host = str_replace(Host, " \\*", "")) %>%
  # fixed(): the "." in " cf." is a literal dot, not a regex wildcard
  mutate(Host = str_replace(Host, fixed(" cf."), "")) %>%
  mutate(Virus = str_replace(Virus, "strain of ", "")) %>%
  mutate(Host = str_to_lower(Host), Virus = str_to_lower(Virus))

predict.2 %<>% # select(ScientificName, Virus) %>%
  rename(Host = ScientificName) %>%
  filter(!is.na(Virus)) %>%
  # Record the uncertain-ID flag BEFORE " cf." is stripped; detecting it
  # afterwards (as this script used to) can never find the marker.
  mutate(HostFlagID = str_detect(Host, fixed("cf.")),
         Host = str_replace(Host, fixed(" cf."), "")) %>%
  distinct() %>%
  mutate(Host = str_to_lower(Host), Virus = str_to_lower(Virus))

# Only grab the 50 or so records the original data are missing
predict.raw <- anti_join(predict.2, predict.1, by = c("Host", "Virus"))

# A couple sanity checks
# table(predict.raw$TestResult)
# table(predict.raw$TestType)

predict.raw %<>%
  select(Host,
         Virus,
         PREDICT_SampleID,
         HostFlagID,
         GenbankAccessionNumber) %>%

  # Rename the columns
  rename(NCBIAccession = "GenbankAccessionNumber") %>%

  # Collapse the Genbank info: one row per host/virus/sample, with all
  # accessions pasted into a single comma-separated string
  group_by_at(vars(-NCBIAccession)) %>%
  summarize(NCBIAccession = str_c(NCBIAccession, collapse = ", ")) %>%
  # Drop the leftover grouping so later steps operate on the whole table
  ungroup() %>%
  distinct() %>%

  # Keep the column order the script has always written out
  select(Host, Virus, PREDICT_SampleID, NCBIAccession, HostFlagID) %>%

  # Keep only the text before the first "(" in the virus name.
  # NOTE(review): this appears to leave a trailing space when the name is
  # followed by " (" -- confirm whether a str_trim() is wanted here.
  mutate(Virus = word(Virus, 1, sep = "\\("))

# Let's do some higher classifications to this

# Reduce a name to its first two space-separated words.
#
# x: a single name (character scalar).
# Returns `x` unchanged when its first word is NA (e.g. `x` itself is NA);
# otherwise the first two words joined by a space, with a missing second
# word dropped.
twowords <- function(x) {
  first.two <- word(x, 1:2, sep = " ")
  if (is.na(first.two[1])) {
    return(x)
  }
  str_c(na.omit(first.two), collapse = " ")
}
# Look up host taxonomy on the unique host names, each cut down to its
# first two words by twowords().
host.tax <- hdict(sapply(unique(predict.raw$Host), twowords))

# Keep the raw host string as HostOriginal and attach the lookup result
# (joined on whatever columns the two tables share).
predict.raw <- predict.raw %>%
  rename(HostOriginal = "Host") %>%
  left_join(host.tax)

# Now the viruses

# First some cleaning: rewrite two names before the virus lookup

predict.raw <- mutate(predict.raw,
                      Virus = recode(Virus,
                                     "influenza a" = "influenza a virus",
                                     "alpha coronavirus nl63" = "coronavirus nl63"))

# Sorted unique virus names, then their taxonomy
ncbi.names <- sort(unique(pull(predict.raw, Virus)))

ncbi.tax <- vdict(ncbi.names)

# Manual taxonomy patches for names the lookup cannot classify by itself.
# Row sets are built with which() so NA comparisons are dropped rather than
# being carried into the subscripted assignments.

# PREDICT placeholder names carry their family in the prefix
cov.rows <- which(str_detect(ncbi.tax$Virus, "predict_cov"))
ncbi.tax[cov.rows, "VirusFamily"] <- "coronaviridae"
ncbi.tax[cov.rows, "VirusOrder"] <- "nidovirales"

pmv.rows <- which(str_detect(ncbi.tax$Virus, "predict_pmv"))
ncbi.tax[pmv.rows, "VirusFamily"] <- "paramyxoviridae"
ncbi.tax[pmv.rows, "VirusOrder"] <- "mononegavirales"

# Named coronavirus strains: strain name -> genus (family/order are shared)
cov.strains <- c("philippines/diliman1525g2/2008" = "betacoronavirus",
                 "kenya bat coronavirus/btky66/65/63/60" = "alphacoronavirus",
                 "kenya bat coronavirus/btky83/59/58" = "alphacoronavirus")

for (strain in names(cov.strains)) {
  strain.rows <- which(ncbi.tax$Virus == strain)
  ncbi.tax[strain.rows, "VirusGenus"] <- cov.strains[[strain]]
  ncbi.tax[strain.rows, "VirusFamily"] <- "coronaviridae"
  ncbi.tax[strain.rows, "VirusOrder"] <- "nidovirales"
}

# "predict_pmv-95" may come back matched to "Atractiella rhizophila"; when it
# does, overwrite the whole row. The check is skipped cleanly when the row is
# absent or its Virus is NA (the old scalar `if` errored in both cases), and
# it compares case-insensitively in case the lookup lower-cases names.
pmv95.rows <- which(ncbi.tax$VirusOriginal == "predict_pmv-95" &
                      str_to_lower(ncbi.tax$Virus) == "atractiella rhizophila")

if (length(pmv95.rows) > 0) {
  # list(), not c(): c() would coerce every value (including FALSE) to a string.
  # Values are positional and assume ncbi.tax has exactly these eight columns.
  ncbi.tax[pmv95.rows, ] <- list("predict_pmv-95", NA, FALSE, "predict_pmv-95",
                                 NA, "paramyxoviridae", "mononegavirales",
                                 "monjiviricetes")
}

# Keep the raw virus string as VirusOriginal and attach the virus taxonomy
predict.raw %<>%
  rename(VirusOriginal = "Virus") %>%
  left_join(ncbi.tax, by = "VirusOriginal")

# Finally, grab the date info

meta <- read_csv("~/Github/ept/PREDICT_Animals_Sampled.csv")

# Build an ID -> collection date lookup.
# NOTE(review): the individual ID is used as the sample ID here -- confirm
# the two identifiers really are interchangeable in these exports.
meta %<>%
  rename(PREDICT_SampleID = PREDICT_IndividualID) %>%
  select(PREDICT_SampleID, SampleDate) %>%
  mutate(CollectionYear = year(SampleDate),
         CollectionMonth = month(SampleDate),
         CollectionDay = day(SampleDate)) %>%
  select(-SampleDate) %>%
  # Repeated identical lookup rows would multiply records in the join below
  distinct()

# Explicit key: the ID is the only column the two tables are meant to share
predict.raw %<>%
  left_join(meta, by = "PREDICT_SampleID") %>%
  select(-PREDICT_SampleID)

write_csv(predict.raw, "Intermediate/Unformatted/PREDICTPCRUnformatted.csv")
70 changes: 70 additions & 0 deletions Code/02_2d_Format PREDICT PCR.R
@@ -0,0 +1,70 @@

library(tidyverse)
library(magrittr)

# Read the output of the PCR digest step, which writes PREDICTPCRUnformatted.csv
# (this script previously pointed at PREDICTSupplementUnformatted.csv).
predict <- read_csv("Intermediate/Unformatted/PREDICTPCRUnformatted.csv")

# Zero-row template listing every output column with its type. Binding the
# data onto it (see bind_rows below) fixes the column order and adds any
# column the data lacks.
temp <- data.frame(
  # Association and NCBI identifiers
  Host                 = character(0),
  Virus                = character(0),
  HostTaxID            = numeric(0),
  VirusTaxID           = numeric(0),
  HostNCBIResolved     = logical(0),
  VirusNCBIResolved    = logical(0),
  # Host taxonomy
  HostGenus            = character(0),
  HostFamily           = character(0),
  HostOrder            = character(0),
  HostClass            = character(0),
  HostOriginal         = character(0),
  HostSynonyms         = character(0),
  # Virus taxonomy
  VirusGenus           = character(0),
  VirusFamily          = character(0),
  VirusOrder           = character(0),
  VirusClass           = character(0),
  VirusOriginal        = character(0),
  # Quality flags
  HostFlagID           = logical(0),
  VirusFlagContaminant = logical(0),
  # Detection and provenance
  DetectionMethod      = character(0),
  DetectionOriginal    = character(0),
  Database             = character(0),
  DatabaseVersion      = character(0),
  PublicationYear      = numeric(0),
  ReferenceText        = character(0),
  PMID                 = numeric(0),
  NCBIAccession        = character(0),
  # Release and collection dates
  ReleaseYear          = numeric(0),
  ReleaseMonth         = numeric(0),
  ReleaseDay           = numeric(0),
  CollectionYear       = numeric(0),
  CollectionMonth      = numeric(0),
  CollectionDay        = numeric(0),
  stringsAsFactors = FALSE
)

# Grab the VirusClass values

# One row per distinct virus order; VirusClass is filled in by the loop below.
# rep() keeps this valid when there are no orders at all (a bare NA would make
# data.frame() fail on differing row counts).
virus.orders <- unique(na.omit(predict$VirusOrder))

classer <- data.frame(VirusOrder = virus.orders,
                      VirusClass = rep(NA_character_, length(virus.orders)),
                      stringsAsFactors = FALSE)

# seq_len() so that an empty table skips the loop (1:0 would run it twice)
for (i in seq_len(nrow(classer))) {
  ncbi.high <- taxize::classification(taxize::get_uid(classer$VirusOrder[i]),
                                      db = "ncbi")
  lineage <- ncbi.high[[1]]
  # A failed lookup does not return a lineage table; leave VirusClass as NA
  if (is.data.frame(lineage)) {
    class.name <- lineage$name[which(lineage$rank == "class")]
    # Some lineages have no "class" rank; assigning a zero-length value errors
    if (length(class.name) > 0) {
      classer$VirusClass[i] <- class.name[1]
    }
  }
}

# Joined on whatever columns the two tables share (VirusOrder at least)
predict %<>% left_join(classer)

### Format

# Same guard as the main PREDICT formatter: the template's ID columns are
# double, so cast ours before binding to avoid a column type clash.
for (id.col in intersect(c("HostTaxID", "VirusTaxID"), names(predict))) {
  predict[[id.col]] <- as.double(predict[[id.col]])
}

predict <- bind_rows(temp, predict)

predict %<>% mutate_at(c("Host", "HostGenus", "HostFamily", "HostOrder", "HostClass",
                         "Virus", "VirusGenus", "VirusFamily", "VirusOrder", "VirusClass"),
                       tolower)

# NOTE(review): DatabaseVersion says June 28 2021 but ReleaseMonth is 8
# (August) -- confirm whether this should be 6.
predict %<>% mutate(DetectionMethod = "PCR/Sequencing",
                    DetectionOriginal = "PREDICT",
                    Database = "PREDICT",
                    DatabaseVersion = "June282021PCRTests",
                    ReleaseYear = 2021,
                    ReleaseMonth = 8,
                    ReleaseDay = 28)


write_csv(predict, "Intermediate/Formatted/PREDICTPCRFormatted.csv")
33 changes: 33 additions & 0 deletions Code/02_2e_Merge PREDICT and add genera.R
@@ -0,0 +1,33 @@

# Merge the two formatted PREDICT tables and fill in virus genera from the
# SpilloverRankings sheet.

# This script uses read_csv, the stringr helpers and %<>%, so load them here
# rather than relying on whatever an earlier script left attached.
library(tidyverse)
library(magrittr)

p1 <- read_csv("Intermediate/Formatted/PREDICTMainFormatted.csv")
p2 <- read_csv("Intermediate/Formatted/PREDICTPCRFormatted.csv")

predict <- bind_rows(p1, p2)

spill <- read_csv("~/Github/ept/SpilloverRankings.csv")

# Prefixes that precede "PREDICT" in the sheet's virus names. Order is kept
# from the original replace chain.
# "Mammastrovirus" is a typo present in the sheet itself.
# NOTE(review): "Polyomaovirus" may be a typo in the sheet or in this
# pattern -- confirm against the data.
predict.prefixes <- c("Adeno-Associated Virus",
                      "Adenovirus",
                      "Arenavirus",
                      "Coronavirus",
                      "Lentivirus",
                      "Mamastrovirus",
                      "Mammastrovirus",
                      "Paramyxovirus",
                      "Picobirnavirus",
                      "Polyomaovirus",
                      "Posavirus",
                      "Poxvirus")

# "PREDICT xyz" -> "PREDICT_xyz", then strip each "<prefix> " before PREDICT
spill %<>% mutate(`Virus Species` = str_replace(`Virus Species`, "PREDICT ", "PREDICT_"))

for (prefix in predict.prefixes) {
  spill$`Virus Species` <- str_replace(spill$`Virus Species`,
                                       str_c(prefix, " PREDICT"),
                                       "PREDICT")
}

spill %<>%
  select(`Virus Species`, `Virus Genus`) %>%
  filter(!(`Virus Genus` == "Unassigned"))

# Match key: lower-cased original virus name without a leading "strain of ".
# Computed once, since it does not change inside the loop.
virus.key <- str_to_lower(str_replace(predict$VirusOriginal, "strain of ", ""))

for (i in seq_len(nrow(spill))) {
  # which() drops NA comparisons, so NA names never reach the assignment
  hits <- which(virus.key == str_to_lower(spill$`Virus Species`[i]))
  if (length(hits) > 0) {
    # Take row i's genus directly; subsetting the table by species name
    # yields a garbled string when a species is listed more than once.
    predict$VirusGenus[hits] <- str_to_lower(spill$`Virus Genus`[i])
  }
}

predict$VirusGenus[which(predict$VirusOriginal == "strain of Eidolon bat coronavirus")] <- "betacoronavirus"
# this can be reconstructed from the predict.2 object (the PCR Tests) but NOT the HealthMap copy
predict$VirusGenus[which(predict$VirusOriginal == "strain of Bat coronavirus Hipposideros")] <- "betacoronavirus"

write_csv(predict, "Intermediate/Formatted/PREDICTAllFormatted.csv")
File renamed without changes.
2 changes: 1 addition & 1 deletion Code/03_Merge clean files.R
Expand Up @@ -6,7 +6,7 @@ library(tidyverse); library(magrittr); library(vroom)
gb <- vroom("Intermediate/Formatted/GenbankFormatted.csv.gz")
clo <- read_csv("Intermediate/Formatted/CloverFormatted.csv")
#sra <- read_csv("Intermediate/Formatted/SRAFormatted.csv")
pred <- read_csv("Intermediate/Formatted/PredictFormatted.csv")
pred <- read_csv("Intermediate/Formatted/PREDICTAllFormatted.csv")
globi <- read_csv("Intermediate/Formatted/GLOBIFormatted.csv")

if(class(clo$NCBIAccession)=='numeric') {clo %<>% mutate(NCBIAccession = as.character(NCBIAccession))}
Expand Down
File renamed without changes.

0 comments on commit affc13b

Please sign in to comment.