/
aDNA_GenoCaller.py
686 lines (555 loc) · 24.6 KB
/
aDNA_GenoCaller.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
#!/usr/bin/env python
# -*- coding: ASCII -*-
###This program calls genotypes from bam files at positions/regions specified in a bed file while taking into account post mortem damage as estimate by MapDamage.
###Otherwise the algorithm is the same as GATK Unified Genotype for diploid calls
###This version of the program will take any genotype with a low quality heterozygote call (Q<30) and convert to the next best homozygote call
###Three files are created:
###An emit all vcf file noting the call for all base pairs in the bed file
###An vcf file with only those sites with evidence for at least one alternative allele and that passes a predetermined QUAL filter
###An haploid emit all vcf, that gives you the most likely base under a haploid model. If two or more basepairs are tied, the reported allele is randomly chosen.
###pysam and matplotlib need to be installed.
###to run type:
##aDNA_GenoCaller <indexed bamfile> <bed file> <reference genome> <5C-T mapdamage file> <3G-A mapdamage file>
from sys import argv
import pysam
import math
import numpy as np
import string
import numpy.ma as ma
import random
import matplotlib
matplotlib.use('Agg')
from matplotlib import rcParams
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
from scipy.optimize import minimize
from scipy.optimize import fmin
from scipy.optimize import fminbound
from scipy.optimize import curve_fit
import time
from random import randint
filenamein=argv[1]
filenameinB=argv[2]
ref_file=argv[3]
mapdamageCT=argv[4]
mapdamageGA=argv[5]
min_RD=1
MQ=15
BQ=15
GQ=30
QUALpass=30
theta=0.001
if filenamein[-4:] == '.bam':
filenameout=string.split(filenamein,'.bam')[0]
else:
filenameout=filenamein[:]
if '/' in filenamein:
filenameout=string.split(filenameout,'/')[-1]
plotfile=filenameout+'.'+filenameinB+'.aDNA.expMDfit_WB.pdf'
MDfile=filenameout+'.'+filenameinB+'.aDNA.fitted_model_params_WB'
filenameout1=filenameout+'.'+filenameinB+'.aDNA.emit_all.vcf'
filenameout2=filenameout+'.'+filenameinB+'.aDNA.vcf'
filenameout3=filenameout+'.'+filenameinB+'.aDNA.haploid.emit_all.vcf_like'
def phred2prob(x):
return 10.0**(-x/10.0)
def prob2phred(x):
return -10*math.log10(x)
#exponential function
def exp_fit(x,a,b,c):
return a*np.exp(-x*b)+c
#stretched exponential (weibell) function
def weibull_fit(x,a,b,c):
return a*np.exp(-(x**c)*b)
#genotype called that incorporates damage for aDNA
def geno_caller_10GT_aDNA(X):
GL=np.zeros(10) #all 10 possible genotypes and order = AA,AC,AG,AT,CC,CG,CT,GG,GT,TT
hap=np.zeros((len(X),4)) #all 4 haploid possibilities, A,C,G,T
all_dic={}
all_dic['A']=0
all_dic['C']=1
all_dic['G']=2
all_dic['T']=3
count=0
for g in range(len(X)):
if all_dic.has_key(X[g][0])==False:
continue
err=phred2prob(X[g][1])
hap[g]=err/3.0
if X[g][0]=='A':
hap[g][0]=1-err
hap[g][2]=((1-X[g][3])*(err/3.0))+(X[g][3]*(1-err))
elif X[g][0]=='C':
hap[g][1]= ((1-X[g][2])*(1-err))+(X[g][2]*(err/3.0))
elif X[g][0]=='G':
hap[g][2]= ((1-X[g][3])*(1-err))+(X[g][3]*(err/3.0))
elif X[g][0]=='T':
hap[g][3]=1-err
hap[g][1]=((1-X[g][2])*(err/3.0))+(X[g][2]*(1-err))
GL[0]=GL[0]+math.log10(hap[g][0])
GL[1]=GL[1]+math.log10((hap[g][0]+hap[g][1])/2)
GL[2]=GL[2]+math.log10((hap[g][0]+hap[g][2])/2)
GL[3]=GL[3]+math.log10((hap[g][0]+hap[g][3])/2)
GL[4]=GL[4]+math.log10(hap[g][1])
GL[5]=GL[5]+math.log10((hap[g][1]+hap[g][2])/2)
GL[6]=GL[6]+math.log10((hap[g][1]+hap[g][3])/2)
GL[7]=GL[7]+math.log10(hap[g][2])
GL[8]=GL[8]+math.log10((hap[g][2]+hap[g][3])/2)
GL[9]=GL[9]+math.log10(hap[g][3])
count+=1
if count==0:
GL.fill(-9)
return GL
#open up mapdamage files and put results in an array
fileCT=open(mapdamageCT,'r')
CTdata=fileCT.read()
fileCT.close()
CTdata=string.split(CTdata,'\n')
if CTdata[-1]=='':
del(CTdata[-1])
fileGA=open(mapdamageGA,'r')
GAdata=fileGA.read()
fileGA.close()
GAdata=string.split(GAdata,'\n')
if GAdata[-1]=='':
del(GAdata[-1])
X_CT=[]
X_GA=[]
Y_CT=[]
Y_GA=[]
for g in range(1,len(CTdata)):
k=string.split(CTdata[g],'\t')
X_CT.append(float(k[0]))
Y_CT.append(float(k[1]))
k=string.split(GAdata[g],'\t')
X_GA.append(float(k[0]))
Y_GA.append(float(k[1]))
X_CT=np.asarray(X_CT)
Y_CT=np.asarray(Y_CT)
X_GA=np.asarray(X_GA)
Y_GA=np.asarray(Y_GA)
try:
#Perform curve fitting of MapDamage results to an exponential function
print '\nFitting the following weibull model to the MapDamage data : a*exp(-(x^c)*b)'
fitParams_CT = curve_fit(weibull_fit, X_CT, Y_CT)
CTa=fitParams_CT[0][0]
CTb=fitParams_CT[0][1]
CTc=fitParams_CT[0][2]
fitParams_GA = curve_fit(weibull_fit, X_GA, Y_GA)
GAa=fitParams_GA[0][0]
GAb=fitParams_GA[0][1]
GAc=fitParams_GA[0][2]
out='Fit the following weibull model to the MapDamage data : a*exp(-(x^c)*b)\n\n'
out=out+'Fitted coefficients for CT changes:\na\t'+str(CTa)+'\nb\t'+str(CTb)+'\nc\t'+str(CTc)+'\n'
out=out+'\nFitted coefficients for GA changes:\na\t'+str(GAa)+'\nb\t'+str(GAb)+'\nc\t'+str(GAc)+'\n'
print '\n'+out
#print coefficient inference to file
fileMD=open(MDfile,'w')
fileMD.write(out)
fileMD.close()
print '\nWrote fitted coefficients to '+MDfile
#set up plot area
rcParams['figure.figsize'] = 10, 6
plt.ylabel('Substitution Frequency', fontsize = 16)
plt.xlabel('Read position', fontsize = 16)
plt.xlim(0,26)
#plot points and fitted curve
plt.plot(X_CT,Y_CT,'ro')
plt.plot(X_GA,Y_GA,'bo')
plt.plot(X_CT,weibull_fit(X_CT, fitParams_CT[0][0], fitParams_CT[0][1], fitParams_CT[0][2]),'r',ms=10,linewidth=2.0,label='C>T')
plt.plot(X_GA,weibull_fit(X_GA, fitParams_GA[0][0], fitParams_GA[0][1], fitParams_GA[0][2]),'b',ms=10,linewidth=2.0,label='G>A')
plt.legend()
# save plot to a file
plt.savefig(plotfile, bbox_inches=0, dpi=600)
plt.close()
print '\nPlotted MapDamage curve fit to '+plotfile
#Make a reference list to determine how much to adjust a particular quality score given it's read position (maxed at 300 here)
CT_decay=[]
GA_decay=[]
for g in range(300):
CTpoint=weibull_fit(g+1,CTa,CTb,CTc)-theta
GApoint=weibull_fit(g+1,GAa,GAb,GAc)-theta
if CTpoint<0.0:
CTpoint=0.0
if GApoint<0.0:
GApoint=0.0
CT_decay.append(CTpoint)
GA_decay.append(GApoint)
#in case weibull does not fit (occurs for low damage) use exponential fit
except:
#Perform curve fitting of MapDamage results to an exponential function
print '\nCould not fit weibull'
print '\nFitting the following exponential model to the MapDamage data : a*exp(-x*b)+c'
fitParams_CT = curve_fit(exp_fit, X_CT, Y_CT)
CTa=fitParams_CT[0][0]
CTb=fitParams_CT[0][1]
CTc=fitParams_CT[0][2]
fitParams_GA = curve_fit(exp_fit, X_GA, Y_GA)
GAa=fitParams_GA[0][0]
GAb=fitParams_GA[0][1]
GAc=fitParams_GA[0][2]
out='Fit the following exponential model to the MapDamage data : a*exp(-x*b)+c\n\n'
out=out+'Fitted coefficients for CT changes:\na\t'+str(CTa)+'\nb\t'+str(CTb)+'\nc\t'+str(CTc)+'\n'
out=out+'\nFitted coefficients for GA changes:\na\t'+str(GAa)+'\nb\t'+str(GAb)+'\nc\t'+str(GAc)+'\n'
print '\n'+out
#print coefficient inference to file
fileMD=open(MDfile,'w')
fileMD.write(out)
fileMD.close()
print '\nWrote fitted coefficients to '+MDfile
#set up plot area
rcParams['figure.figsize'] = 10, 6
plt.ylabel('Substitution Frequency', fontsize = 16)
plt.xlabel('Read position', fontsize = 16)
plt.xlim(0,26)
#plot points and fitted curve
plt.plot(X_CT,Y_CT,'ro')
plt.plot(X_GA,Y_GA,'bo')
plt.plot(X_CT,exp_fit(X_CT, fitParams_CT[0][0], fitParams_CT[0][1], fitParams_CT[0][2]),'r',ms=10,linewidth=2.0,label='C>T')
plt.plot(X_GA,exp_fit(X_GA, fitParams_GA[0][0], fitParams_GA[0][1], fitParams_GA[0][2]),'b',ms=10,linewidth=2.0,label='G>A')
plt.legend()
# save plot to a file
plt.savefig(plotfile, bbox_inches=0, dpi=600)
plt.close()
print '\nPlotted MapDamage curve fit to '+plotfile
#Make a reference list to determine how much to adjust a particular quality score given it's read position (maxed at 300 here)
CT_decay=[]
GA_decay=[]
for g in range(300):
CT_decay.append(exp_fit(g+1,CTa,CTb,CTc)-theta)
GA_decay.append(exp_fit(g+1,GAa,GAb,GAc)-theta)
###open up reference file
ref=pysam.FastaFile(ref_file)
###set up various look up dictionaries
all_dic={}
all_dic={}
all_dic['A']=0
all_dic['C']=1
all_dic['G']=2
all_dic['T']=3
all_dic['N']=-99
all_dic[0]='A'
all_dic[1]='C'
all_dic[2]='G'
all_dic[3]='T'
all_dic[-9]='.'
all_dic[-99]='N'
GL_dic={}
GL_dic['AA']=0
GL_dic['AC']=1
GL_dic['AG']=2
GL_dic['AT']=3
GL_dic['CC']=4
GL_dic['CG']=5
GL_dic['CT']=6
GL_dic['GG']=7
GL_dic['GT']=8
GL_dic['TT']=9
GL_dic[0]='AA'
GL_dic[1]='AC'
GL_dic[2]='AG'
GL_dic[3]='AT'
GL_dic[4]='CC'
GL_dic[5]='CG'
GL_dic[6]='CT'
GL_dic[7]='GG'
GL_dic[8]='GT'
GL_dic[9]='TT'
GL_dic[-9]='./.'
homs=[0,4,7,9]
alt_dic={}
alt_dic[0]=[0,0]
alt_dic[1]=[0,1]
alt_dic[2]=[0,2]
alt_dic[3]=[0,3]
alt_dic[4]=[1,1]
alt_dic[5]=[1,2]
alt_dic[6]=[1,3]
alt_dic[7]=[2,2]
alt_dic[8]=[2,3]
alt_dic[9]=[3,3]
tri_dic={}
tri_dic[1]='A,C'
tri_dic[2]='A,G'
tri_dic[3]='A,T'
tri_dic[5]='C,G'
tri_dic[6]='C,T'
tri_dic[8]='G,T'
LL0_map={}
LL0_map[0]=[0]
LL0_map[1]=[4]
LL0_map[2]=[7]
LL0_map[3]=[9]
LL1_map={}
LL1_map[0]=[1,2,3]
LL1_map[1]=[1,5,6]
LL1_map[2]=[2,5,8]
LL1_map[3]=[3,6,8]
LL2_map={}
LL2_map[0]=[4,5,6,7,8,9]
LL2_map[1]=[0,2,3,7,8,9]
LL2_map[2]=[0,1,3,4,6,9]
LL2_map[3]=[0,1,2,4,5,7]
homs=[0,4,7,9]
###make a list of regions to be interogated
file = open(filenameinB)
data=file.read()
data=string.split(data,'\n')
file.close()
if data[-1]=='':
del(data[-1])
SNPdir={}
SNPll={}
SNPlist=[]
count=0
for g in range(len(data)):
k=string.split(data[g])
key=k[0]+'_'+k[1]+'_'+k[2]
## chromo=k[0]
## start=int(k[1])
## end=int(k[2])
## seq=ref.fetch(chromo,start,end)
## seq=seq.upper()
SNPlist.append(key)
SNPdir[key]=k
###open up bam file (must be indexed)
samfile = pysam.AlignmentFile(filenamein, "rb")
samp_name=samfile.header['RG'][0]['SM']
count=0
###set up output files
fileout=open(filenameout1,'w')
fileout2=open(filenameout2,'w')
fileout3=open(filenameout3,'w')
###set up output headers
out1='##fileformat=VCFv4.1\n'
out2='##fileformat=VCFv4.1\n'
out3='##fileformat=Custom variant caller for_aDNA, emit all haploid\n'
outA='##Caller_arguments=<MappingQuality_filter='+str(MQ)+',BaseQuality_filter='+str(BQ)
outA=outA+',GenotypeQuality_filter='+str(GQ)+',minRD='+str(min_RD)
outA=outA+',bam_in='+filenamein+',bedfile='+filenameinB
outA=outA+',C>T_damagefile='+mapdamageCT
outA=outA+',G>A_damagefile='+mapdamageGA+'>\n'
outB='##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed">\n'
outB=outB+'##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed">\n'
outB=outB+'##INFO=<ID=MQ,Number=1,Type=Float,Description="mean Mapping Quality (not RMS)">\n'
outB=outB+'##INFO=<ID=MQ0,Number=1,Type=Integer,Description="Total Mapping Quality Zero Reads">\n'
outB=outB+'##INFO=<ID=SGC,Number=1,Type=Integer,Description="low GQ heterozygote switched to best homozygote">\n'
outC='##Time created='+(time.strftime("%H:%M:%S"))+' '+(time.strftime("%d/%m/%Y"))+'\n'
for i in range(len(ref.lengths)):
outC=outC+'##contig=<ID='+ref.references[i]+',length='+str(ref.lengths[i])+'>\n'
outC=outC+'##reference=file:'+ref_file+'\n'
out_e='#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t'+samp_name+'\n'
out_f='#CHROM\tPOS\tREF\tCALL\tBEST_PROB\tALL_PROB\tAD\n'
fileout.write(out1+outA+outB+outC+out_e)
fileout2.write(out2+outA+outB+outC+out_e)
fileout3.write(out3+outA+outC+out_f)
###Precomputed prios for QUAL estimation
theta_prior=[0,theta/1,theta/2]
theta_prior[0]=1-sum(theta_prior)
###Start running through regions in the bed
for gg in range(len(SNPlist)):
print 'Calculating genotype likelihoods for locus '+str(gg+1)+' : '+SNPlist[gg]
chromo=SNPdir[SNPlist[gg]][0]
pos_start=int(SNPdir[SNPlist[gg]][1])
pos_end=int(SNPdir[SNPlist[gg]][2])
###set up arrays to store results for a given bed entry (probably not to efficient for SNP data, more suited for long regions)
seq_len=pos_end-pos_start
GLs=np.zeros((seq_len,10),dtype='float32') ##Genotype likelihoods
PLs=np.zeros((seq_len,10),dtype='float32') ##Phred-scale normalized likelihoods
RDs=np.zeros((seq_len,4),dtype='int32') ##Read depth for each of the four bases
GQs=np.zeros((seq_len),dtype='float64') ##Estimated genotype qualitues
REFs=np.zeros((seq_len),dtype='int32') ##Reference allele index
GTs=np.zeros((seq_len),dtype='int32') ##Genotype index [order is AA,AC,AG,AT,CC,CG,CT,GG,GT,TT]
POS=np.zeros((seq_len),dtype='int32') ##1-based position
ALTs=np.zeros((seq_len),dtype='int32') ##Alternate allele index. If trinucleotide, store as 10
QUALs=np.zeros((seq_len),dtype='float32') ##Bayesian quality score
MQ0=np.zeros((seq_len),dtype='int32') ##number of reads with mapping 0
MQm=np.zeros((seq_len),dtype='float32') #mean mapping score
SEG=np.zeros((seq_len),dtype='int32') #hom ref, het with ref, hom alt, het without ref (tri)
SWGQ=np.zeros((seq_len),dtype='int32') #low GQhet switched to hom ref = 1
###make a list of positions
count=0
for ggg in range(pos_start,pos_end):
POS[count]=ggg+1
count+=1
###prepopulate arrays with -9s
GTs.fill(-9)
REFs.fill(-9)
ALTs.fill(-9)
SEG.fill(-9)
###Go base by base through each region in the bam file
for pileupcolumn in samfile.pileup(chromo,pos_start,pos_end,truncate=True,stepper='all'):
nucl=ref.fetch(chromo,pileupcolumn.pos,pileupcolumn.pos+1) ##grab reference sequence for region
nucl=nucl.upper()
var_list=[] ##a list to store read bases
map_list=[] ##a list to store mapping qualities
index=pileupcolumn.pos-pos_start ##tells us what position we are in for the arrays given the nucleotide position
REFs[index]=all_dic[nucl] ##recrod the reference allele
###go read by read for each position
for pileupread in pileupcolumn.pileups:
if not pileupread.is_del and not pileupread.is_refskip: ##ensure not an indel or duplicate
map_list.append(pileupread.alignment.mapping_quality) ##record mapping quality of read
if (pileupread.alignment.mapping_quality>=MQ) and (ord(pileupread.alignment.qual[pileupread.query_position])-33>=BQ) and (REFs[index]<>-99): ###Add base calls meeting the MQ and BQ filters
var_list.append([pileupread.alignment.query_sequence[pileupread.query_position],ord(pileupread.alignment.qual[pileupread.query_position])-33,CT_decay[pileupread.query_position],GA_decay[len(pileupread.alignment.query_sequence)-1-pileupread.query_position]])
###rescale qualities that are greater than 40 to a max of 40.
for ggg in range(len(var_list)):
if var_list[ggg][1]>40:
var_list[ggg][1]=40
###record read depth for each basetype
if len(var_list)>0:
all_list=list(zip(*var_list)[0])
RDs[index]=[all_list.count('A'),all_list.count('C'),all_list.count('G'),all_list.count('T')]
###work out MQ0 and mean mapping quality for the snp (not RMSE)
MQ0[index]=map_list.count(0)
try:
MQm[index]=sum(map_list)/float(len(map_list))
except:
MQm[index]=0.0
###if minimum read depth is met, try calling the genotype
if len(var_list)>=min_RD:
GLs[index]=geno_caller_10GT_aDNA(var_list) ##ancient DNA aware calculation of genotype likelihoods
GTs[index]=np.argmax(GLs[index]) ##record best genotype
PLs[index]=(GLs[index]-np.max(GLs[index]))*-10 ###calculte Phred-scale values
GQs[index]=np.msort(PLs[index])[1] ###record genotype quality
###if GQ is less than threshold and site is heterozygous, swith to best homozygous genotype
if (alt_dic[GTs[index]][0]<>alt_dic[GTs[index]][1]) and (GQs[index]<GQ):
GTs[index]=homs[np.argmax(GLs[index][homs])]
SWGQ[index]=1
###Work out basesian QUALity score
## LL_0=np.sum(10**GLs[index][LL0_map[REFs[index]]])
## LL_1=np.sum(10**GLs[index][LL1_map[REFs[index]]])
## LL_2=np.sum(10**GLs[index][LL2_map[REFs[index]]])
## norconst1=sum([LL_0,LL_1,LL_2]) ###Depristo says this, but it really should be multiplied by the prior
## norconst2=sum([LL_0*theta_prior[0],LL_1*theta_prior[1],LL_2*theta_prior[2]]) ###Depristo says this, but it really should be multiplied by the prior
##
## Pr0=(theta_prior[0]*LL_0)/norconst2
## Pr1=(theta_prior[1]*LL_1)/norconst2
## Pr2=(theta_prior[2]*LL_2)/norconst2
###this new version add constant to log likelihoods to avoid underflow
LL_0=np.sum(10**(GLs[index][LL0_map[REFs[index]]]-np.max(GLs[index]))) ##total likelihood for q=0|X
LL_1=np.sum(10**(GLs[index][LL1_map[REFs[index]]]-np.max(GLs[index]))) ##total likelihood for q=1|X, three different genotypes summed
LL_2=np.sum(10**(GLs[index][LL2_map[REFs[index]]]-np.max(GLs[index]))) ##total likelihood for q=2|X, six different genotypes summed
###calculate normalizing constant for bayes formula
## norconst1=sum([LL_0,LL_1,LL_2]) ###Depristo says this, but it really should be multiplied by the prior
norconst2=sum([LL_0*theta_prior[0],LL_1*theta_prior[1],LL_2*theta_prior[2]]) ###Depristo says this, but it really should be multiplied by the prior
###calculate posterio probabilities for q=0,1 and 2
Pr0=(theta_prior[0]*LL_0)/norconst2
Pr1=(theta_prior[1]*LL_1)/norconst2
Pr2=(theta_prior[2]*LL_2)/norconst2
###work out QUAL. If Pr=0 is greatest, work out qual from probability for q not equalling 0, otherwise, work out from qual=0
if LL0_map[REFs[index]][0]==GTs[index]:
try:
QUALs[index]=-10*math.log10(1-Pr0)
except:
QUALs[index]=100.0 #add hoc solution to deal with extremely small ref homozygote probability
SEG[index]=0
else:
try:
QUALs[index]=-10*math.log10(Pr0)
except:
QUALs[index]=1000.0 #add hoc solution to deal with extremely small ref homozygote probability
###Work out what the reference allele is, and if there is in fact a trinucleotide to
if REFs[index]==alt_dic[GTs[index]][0]: ##homo,alt heterozygote
ALTs[index]=alt_dic[GTs[index]][1]
SEG[index]=1
elif REFs[index]==alt_dic[GTs[index]][1]: ##homo,alt heterozygote
ALTs[index]=alt_dic[GTs[index]][0]
SEG[index]=1
elif alt_dic[GTs[index]][0]==alt_dic[GTs[index]][1]: #homozygote alternate
ALTs[index]=alt_dic[GTs[index]][0]
SEG[index]=2
else: #must be trinucleotide
ALTs[index]=10
SEG[index]=3
###start writing calls to vcf file
for ggg in range(len(GLs)):
LQ=0 ##indicator for a low quality site
out=chromo+'\t'+str(POS[ggg])+'\t.\t'+all_dic[REFs[ggg]]+'\t'
if ALTs[ggg]<10:
out=out+all_dic[ALTs[ggg]] #write none or single alternate allele
else:
out=out+tri_dic[GTs[ggg]] #write trinucleotide alleles
if SEG[ggg]<>-9: ###this sites was callable, even if QUAL was low
out=out+'\t'+str(round(QUALs[ggg],2))+'\t'
###indicate whether site had low QUAL score or not
if QUALs[ggg]>=QUALpass:
out=out+'PASS\t'
LQ=1
else:
out=out+'low_quality\t'
###Info tag information, alternate allele count, frequency, as well as mapping qualities
if SEG[ggg]==0:
out=out+'AC=0;AF=0.0;MQ='+str(round(MQm[ggg],2))+';MQ0='+str(MQ0[ggg])
elif SEG[ggg]==1:
out=out+'AC=1;AF=0.5;MQ='+str(round(MQm[ggg],2))+';MQ0='+str(MQ0[ggg])
elif SEG[ggg]==2:
out=out+'AC=2;AF=1.0;MQ='+str(round(MQm[ggg],2))+';MQ0='+str(MQ0[ggg])
elif SEG[ggg]==3:
out=out+'AC=1,1;AF=0.5,0.5;MQ='+str(round(MQm[ggg],2))+';MQ0='+str(MQ0[ggg])
###Make a note that heterozygote call was converted to homozygous call
if SWGQ[ggg]==1:
out=out+';SGC='+GL_dic[np.argmax(GLs[ggg])]
if SEG[ggg]==0: #site is homozygous reference, just note, GT,DP and GQ
out=out+'\tGT:DP:GQ:PL\t0/0:'+str(RDs[ggg][REFs[ggg]])+':'
if GQs[ggg]>99:
out=out+'99:'
else:
out=out+str(int(round(GQs[ggg])))+':'
##output all 10 PL values. Note the order is not the same as GATK
for gggg in range(len(PLs[ggg])):
out=out+str(int(round(PLs[ggg][gggg])))+','
out=out[:-1]+'\n'
else: #Site is heterozygous so note more information
out=out+'\tGT:AD:DP:GQ:PL\t'
if SEG[ggg]<3: #there is only one alternate allele
if SEG[ggg]==1:
out=out+'0/1:'
elif SEG[ggg]==2:
out=out+'1/1:'
out=out+str(RDs[ggg][REFs[ggg]])+','+str(RDs[ggg][ALTs[ggg]])+':'+str(np.sum(RDs[ggg]))+':'
else: #there are two alternate alleles
out=out+'1/2:'
out=out+str(RDs[ggg][REFs[ggg]])+','+str(RDs[ggg][alt_dic[GTs[ggg]][0]])+','+str(RDs[ggg][alt_dic[GTs[ggg]][1]])+':'+str(np.sum(RDs[ggg]))+':'
##cap reported GQ score to 99
if GQs[ggg]>99:
out=out+'99:'
else:
out=out+str(int(round(GQs[ggg])))+':'
##output all 10 PL values. Note the order is not the same as GATK
for gggg in range(len(PLs[ggg])):
out=out+str(int(round(PLs[ggg][gggg])))+','
out=out[:-1]+'\n'
else: ###Site had not data, i.e. usable read depth 0
out=out+'\t.\t.\t.\t.\t./.\n'
fileout.write(out) ##write to emit all
if (SEG[ggg]>=1) and (LQ==1):
fileout2.write(out) ##write to variable only vcf that passes QUAL threshold
###bayesian best haploid call
out_h=chromo+'\t'+str(POS[ggg])+'\t'+all_dic[REFs[ggg]]+'\t'
if SEG[ggg]>=0:
###Use GLs for homozygous genotypes to calculate probablity of A,C,G,T. Reference unaware, so prior probability of a particular base is equal to the other three bases
norm_const3=np.sum(10**(GLs[ggg][homs]-np.max(GLs[ggg][homs]))*0.25)
post=(10**(GLs[ggg][homs]-np.max(GLs[ggg][homs]))*0.25)/norm_const3
##this section checks if the highest posterior is unique or found for multiple bases
max_val=np.max(post)
matches=[]
post_str=''
for gggg in range(len(post)):
if post[gggg]==max_val:
matches.append(gggg)
post_str=post_str+str(round(post[gggg],3))+','
##if max val is 0.25, there is not information at this site, so set as N
if max_val<=0.25:
out_h=out_h+'N\t0.25\t0.25,0.25,0.25,0.25'
else:
if len(matches)==1: ##get the base with the highest LL
out_h=out_h+all_dic[matches[0]]+'\t'+str(round(post[matches[0]],3))+'\t'+post_str[:-1]
else:
rand_allele=randint(0,len(matches)-1) ##if two or more bases are tied for best likelihood, randomly choose one
out_h=out_h+all_dic[matches[rand_allele]]+'\t'+str(round(post[matches[rand_allele]],3))+'\t'+post_str[:-1]
out_h=out_h+'\t'+str(RDs[ggg][0])+','+str(RDs[ggg][1])+','+str(RDs[ggg][2])+','+str(RDs[ggg][3])+'\n'
else:
out_h=out_h+'N\t0.25\t0.25,0.25,0.25,0.25\t0,0,0,0\n'
fileout3.write(out_h)
fileout.close()
fileout2.close()
fileout3.close()