/
FindSARSCoV2Dels.R
69 lines (62 loc) · 2.23 KB
/
FindSARSCoV2Dels.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
FindSARSCoV2Dels=function() #This finds deletions in a region around the cleavage site for the sequence file called subseqs_09-16-20.fasta
{
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install("ShortRead")
library(ShortRead)
Bigdata=readDNAStringSet("subseqs_09-16-20.fasta") #Unzip this file from Github.
Bigdata2=as.matrix(Bigdata)
#Find the errors
errnums=0
listerrs=array(dim=c(10000,6))
for (a in 1:100188)
{
if(length(which(Bigdata2[a,]=="-"))>2)
{
temperrs=which(Bigdata2[1,]!=Bigdata2[a,])
for(ab in 1:length(temperrs))
{
listerrs[errnums+ab,1]=Bigdata2[1,temperrs[ab]][[1]] #Ref base
listerrs[errnums+ab,2]=Bigdata2[a,temperrs[ab]][[1]] #Mut base
listerrs[errnums+ab,3]=temperrs[ab] #base location in alignment
listerrs[errnums+ab,4]=a #Which sequence it came from in the alignment
listerrs[errnums+ab,5]=names(Bigdata)[a] #Which sequence it came from in the alignment
}
errnums=errnums+length(temperrs)
}
}
listerrs=listerrs[1:errnums,]
dat4="Error_List_16_9_20"
write.table(listerrs,dat4)
print(paste("There were",dim(listerrs)[1],"errors.",sep=" "))
#Del adjuster Moves length of deletion to col 1
listerrs=read.table(dat4,as.is=T) #This is 100% not the way to do this- just array to table or something
dropthese=c(errnums+1) #Makes sure blank is removed
listerrs[errnums+1,2]="BLANK" #Adds a blank
listerrs[errnums+1,3]=0
a=1
while (a<(dim(listerrs)[1])) #So messy but too tired to think of a better way to remove all gaps.
{
n=1
if (listerrs[a,2]=="-")
{
while(listerrs[a+n,2]=="-"&(listerrs[a+n,3]-listerrs[a+n-1,3])==1)
{
n=n+1
}
listerrs[a,1]=n
if (n>1)
{
dropthese=c(dropthese,seq(a+1,a+n-1))
a=a+n-1
}
}
a=a+1
}
listerrs=listerrs[-dropthese,]
#Concatenate and list out errors
listerrs[,6]=paste(listerrs[,1],listerrs[,2],listerrs[,3],sep="_")
listerrs=listerrs[which(listerrs[,2]=="-"),]
print(listerrs)
print("I then checked to see whether these deletions were from human sequences in the cleavage site which had not been passaged on Vero cells and did not result in a frameshift mutation. I found 2 sequences England/20238034404/2020 and USA/MO-WUSTL069/2020")
}