/
vcf2fasta.py
101 lines (77 loc) · 3.06 KB
/
vcf2fasta.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
#!/usr/bin/env python
# -*- coding: ASCII -*-
#####This python script takes a vcf file and creates fasta and paml input files for regions specified in a bed file for all samples
#####When encountering a heteroygote, a random allele is chosen. Sequence not noted in the vcf file is inferred from the reference genome
#####Pysam and Pyvcfmust be installed in the version of python used.
#####vcf files must be bgzipped and tabix indexed. Reference genome must also be indexed
#####The region bed file must have the tab seperated fields in the following order: chromosome, start position, end position
#####Written (poorly) by Krishna Veeramah (krishna.veeramah@stonybrook.edu)
#####usage is ./vcf2fasta.py <target_region_file.bed> <out_stem> <reference_genome>
###import libraries
import string
import vcf
import pysam
from sys import argv
from random import randint
###Input arguments
VCFin=argv[1]# tabindex and bgzipped vcf file
filenameout=argv[2] #stem of output file
region_file=argv[3] #must have the tab seperated fields in the following order chromosome, start position, end position (zero_based)
ref_file=argv[4] #reference genome
###Read region file
file=open(region_file,'r')
regions=file.read()
regions=string.split(regions,'\n')
if regions[-1]=='':
del(regions[-1])
###Load up reference genome
ref=pysam.FastaFile(ref_file)
###Load up vcf file
vcf_reader = vcf.Reader(open(VCFin, 'r'))
###Get sample names
samples=vcf_reader.samples
paml_names=[]
for g in range(len(samples)):
if len(samples[g])<10:
paml_names.append(samples[g]+' '*(10-len(samples[g])))
else:
paml_names.append(samples[g][:10])
nb_samps=len(samples)
###Iterate through regions
for g in range(len(regions)):
print 'running '+regions[g]
k=string.split(regions[g])
chromo=k[0]
start=int(k[1])
end=int(k[2])
base=ref.fetch(chromo,start,end)
base=base.upper()
seqs=[]
for gg in range(nb_samps):
seqs.append(list(base))
for record in vcf_reader.fetch(chromo, start, end):
alleles=[record.REF]
for gg in range(len(record.ALT)):
alleles.append(str(record.ALT[gg]))
pos=record.POS
index=pos-start-1
for gg in range(len(record.samples)):
geno=record.samples[gg]['GT']
if (geno<>'0/0') and (geno<>None):
all12=[alleles[int(geno[0])],alleles[int(geno[-1])]]
change_all=all12[randint(0,1)]
seqs[gg][index]=change_all
region_string=string.join(string.split(regions[g],'\t'),"_")
fileout1=open(filenameout+'_'+region_string+'.fasta','w')
fileout2=open(filenameout+'_'+region_string+'.paml','w')
out2=' '+str(len(paml_names))+'\t'+str(len(base))+'\n'
fileout2.write(out2)
for gg in range(len(seqs)):
out1='>'+samples[gg]+'\n'
for ggg in range(0,len(seqs[gg]),60):
out1=out1+string.join(seqs[gg][ggg:ggg+60],'')+'\n'
fileout1.write(out1)
out2=paml_names[gg]+' '+string.join(seqs[gg],'')+'\n'
fileout2.write(out2)
fileout1.close()
print 'Finished. Laterz'