/
LynchPool_caller.py
149 lines (110 loc) · 4.78 KB
/
LynchPool_caller.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
#!/usr/bin/env python
# -*- coding: ASCII -*-
#####This python script applyies to allele frequency estimator for pool seq data from Lynch et al. 2014 GBE.
#####Please note it is slow, and is only designed for obtain allele frequencies at specific SNPs in smallish numbers (tens to hundreds) of individuals. I do not recommend using this as a general variant caller.
#####Pysam and Numpy must be installed in the version of python used.
#####Bam files must be indexed.
#####To guard against mis-mappings and CNVs, the program only outputs sites with coverage between a third and twice the mean at the set of SNPs considered
#####The SNP file must have the tab seperated fields in the following order: chromosome, position (one-based), reference allele, alternate allele
#####If you want to change things like mapping and base quality threshold, edit the python code under the section "Input arguments"
#####Written (poorly) by Krishna Veeramah (krishna.veeramah@stonybrook.edu)
#####usage is ./LynchPool_called.py <bamfile> <fileoutname> <target_SNP_file> <reference_genome>
###import libraries
import string
import numpy as np
import pysam
import gzip
import math
import copy
from sys import argv
import time
###Input arguments
BAMin=argv[1] #filename of bam with poolseq data
filenameout=argv[2] #creates a vcf
SNPfile=argv[3] #must have the tab seperated fields in the following order chromosome, position (one-based), reference allele, alternate allele
#ref_file=argv[4]
MQ_t=20 #mapping quality threshold
BQ_t=20 #base_qualitythreshold
###converts phred score to probability
def phred2prob(x):
return 10.0**(-x/10.0)
###converts probability to phred score
def prob2phred(x):
return -10*math.log10(x)
###extract base of reads for a give position in a bam.
def extract_bam_SNP_base_only(samfile,chromo,pos,BQ,MQ):
var_list=[]
for pileupcolumn in samfile.pileup(chromo,pos-1,pos,truncate=True,stepper='all'):
for pileupread in pileupcolumn.pileups:
if not pileupread.is_del and not pileupread.is_refskip:
if (pileupread.alignment.mapping_quality>=MQ) and (ord(pileupread.alignment.qual[pileupread.query_position])-33>=BQ):
var_list.append(pileupread.alignment.query_sequence[pileupread.query_position])
return var_list
#####open reference file
##ref=pysam.FastaFile(ref_file)
##chromos=ref.references
###Read SNPlist
file=open(SNPfile,'r')
SNPs=file.read()
SNPs=string.split(SNPs,'\n')
if SNPs[-1]=='':
del(SNPs[-1])
nb_SNPs=len(SNPs)
samfile = pysam.AlignmentFile(BAMin, "rb")
samp_name=samfile.header['RG'][0]['SM']
all_counts=np.zeros((nb_SNPs,3),dtype='float32') #major_allele, minor_allele, other_allele
Mm_allele=np.zeros((nb_SNPs),dtype='int32') #0=ref is major allele, 1=alt is major allele. In a tie, ref is chosen as major
###Iterate through SNPs
for g in range(len(SNPs)):
k=string.split(SNPs[g])
chromo=k[0]
k[1]=int(k[1])
SNPs[g]=k
ref_all=k[2]
alt_all=k[3]
read_list=extract_bam_SNP_base_only(samfile,k[0],k[1],BQ_t,MQ_t)
#all_counts[g]=[read_list.count('A'),read_list.count('C'),read_list.count('G'),read_list.count('T')]
if read_list.count(ref_all)>=read_list.count(alt_all):
all_counts[g]=[read_list.count(ref_all),read_list.count(alt_all),len(read_list)-read_list.count(ref_all)-read_list.count(alt_all)]
else:
all_counts[g]=[read_list.count(alt_all),read_list.count(ref_all),len(read_list)-read_list.count(ref_all)-read_list.count(alt_all)]
Mm_allele[g]=1
if g%100==0:
print g,k
###calulate depth stats
depth=np.sum(all_counts,axis=1)
min_DP=round(np.mean(depth)/3)
max_DP=round(np.mean(depth)*2)
###first iteration through Lynch
#calculate error
p_e=all_counts[:,2]/depth
E=3*p_e/2
p_m=all_counts[:,0]/np.sum(all_counts[:,0:2],axis=1)
p_hat=p_m*(1.0-(2.0*E/3.0))-(E/3.0)/1-(4*E/3)
###second iteration
a=p_hat>0.9
b=depth<min_DP
c=depth>max_DP
exclude=a+b+c
exclude_swap=exclude == False
E_mean=np.mean(E[exclude_swap])
for g in range(len(E)):
if p_hat[g]>0.9:
E[g]=E_mean
p_hat2=p_m*(1.0-(2.0*E/3.0))-(E/3.0)/1-(4*E/3)
outfile=open(filenameout,'w')
header='chrom\tpos\tref\talt\talt_AF\talt_AF_correct\tref:alt:other_dp\tDP\tDP_ok?\n'
outfile.write(header)
for g in range(len(SNPs)):
out=SNPs[g][0]+'\t'+str(SNPs[g][1])+'\t'+SNPs[g][2]+'\t'+SNPs[g][3]+'\t'
if Mm_allele[g]==0:
out=out+str(1-round(p_hat[g],3))+'\t'+str(1-round(p_hat2[g],3))+'\t'+str(int(all_counts[g][0]))+':'+str(int(all_counts[g][1]))
else:
out=out+str(round(p_hat[g],3))+'\t'+str(round(p_hat2[g],3))+'\t'+str(int(all_counts[g][1]))+':'+str(int(all_counts[g][0]))
out=out+':'+str(int(all_counts[g][2]))+'\t'+str(int(depth[g]))+'\t'
if min_DP<=depth[g]<=max_DP:
out=out+'Y\n'
else:
out=out+'N\n'
outfile.write(out)
outfile.close()