-
Notifications
You must be signed in to change notification settings - Fork 0
/
MobTyper.py
1161 lines (995 loc) · 44 KB
/
MobTyper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/env python
import pysam
import glob
import argparse, sys
import math, time, re, numpy
from intervaltree import Interval, IntervalTree
from collections import Counter, defaultdict
from argparse import RawTextHelpFormatter
from scipy.stats import mode
__author__ = "Ryan Smith (ryanpsmith@wustl.edu)"
__version__ = "$Revision: 0.1.0 $"
__date__ = "$Date: 2015-14-8 $"
def get_args():
parser = argparse.ArgumentParser(formatter_class=RawTextHelpFormatter, description="\
svtyper\n\
author: " + __author__ + "\n\
version: " + __version__ + "\n\
description: Compute genotype of MEI variants based on breakpoint depth")
parser.add_argument('-r', '--ref_bams', type=str, required=False, nargs="+", default=False, help="Reference alignments BAM file or files (space delimited)")
parser.add_argument('-d', '--mob_dirs', type=str, required=True, nargs="+", help="Mobster output dirs (space delimited)")
parser.add_argument('-o', '--output', type=str, required=False, default="/dev/stdout", help='Output VCF')
parser.add_argument('-f', '--splflank', type=int, required=False, default=20, help='min number of split read query bases flanking breakpoint on either side [20]')
parser.add_argument('-F', '--discflank', type=int, required=False, default=20, help='min number of discordant read query bases flanking breakpoint on either side. (should not exceed read length) [20]')
parser.add_argument('-n', '--num_samp', type=int, required=False, default=1000000, help='number of pairs to sample from BAM file for building insert size distribution [1000000]')
parser.add_argument('-v', '--vcf_in', type=argparse.FileType('r'), required=False, default=False, help='Input VCF file. Overrides Mobster predictions: only the input variants will be genotyped')
parser.add_argument('-e', '--excludes', type=argparse.FileType('r'), required=False, default=False, help='Regions to exclude (BED-like file)')
parser.add_argument('-m', '--merge_only', required=False, default=False, action='store_true', help='Only produce a merged VCF from the given mobster dirs')
parser.add_argument('--no_merge', required=False, default=False, action='store_true', help='Do not merge calls by confidence intervals; instead merge by insertion site (taking the largest CI around it)')
parser.add_argument('-q', '--quiet', required=False, default=False, action='store_true', help='Turn off stderr status outputs')
parser.add_argument('-p', '--pair', required=False, default=False, action='store_true', help='Only use paired-end reads')
parser.add_argument('-s', '--split', required=False, default=False, action='store_true', help='Only use split reads')
# parse the arguments
args = parser.parse_args()
#check for improper ref_bam and merge_only args combination
if not args.ref_bams and not args.merge_only:
sys.stderr.write(args.usage)
exit("Error: At least one reference BAM (-r) is required unless --merge_only (-m) is specified\n")
#check matching ref and mobster file lens
if args.ref_bams:
if len(set([len(args.ref_bams), len(args.mob_dirs)])) > 1:
exit("Error: input mobster and bam file lists must be same length\n")
return args
def main():
args = get_args()
#for merge_only mode
if args.merge_only:
mob_vcf, variants = merge_mobster(args.mob_dirs, args.no_merge)
if args.excludes:
variants = filter_excludes(variants, args.excludes)
vcf_out = open(args.output, 'w')
vcf_out.write(mob_vcf.get_header()+"\n")
for var in variants:
vcf_out.write(var.get_var_string()+"\n")
vcf_out.close()
#normal mode
else:
genotype_MEIs(args.ref_bams, args.mob_dirs, args.vcf_in, args.excludes, args.output, args.num_samp, args.splflank, args.discflank, args.no_merge, args.quiet, args.pair, args.split)
return
def filter_excludes(variants, exclude_file):
#need a list of interval tuples to construct the excludes interval tree
excludes = defaultdict(IntervalTree)
filtered = []
for line in exclude_file:
if line[0] != "#":
line = line.strip().split("\t")
chrom = line[0]
start = int(line[1])
stop = int(line[2])
#just using for present/absent so data is meaningless
#intervaltree.addi(start, stop, data)
#excludes are .bed (0-based), variants are .vcf (1-based),
excludes[chrom].addi(start+1, stop+2, 1)
for variant in variants:
if len(excludes[variant.chrom][variant.pos]) == 0:
filtered.append(variant)
return filtered
def merge_mobster(mobster_dirs, no_merge):
predictions = []
for path in mobster_dirs:
if not path.endswith("/"):
path+="/"
try:
calls = glob.glob(path+"*predictions.txt")[0]
predictions.append(open(calls, 'r'))
except:
sys.stderr.write("Error: mobster dirs do not contain correct bam files. Need *_mappedpotentials, *_splitanchors, and *_anchors\n")
exit(0)
return read_mobster(predictions, no_merge)
def read_vcf(vcf_file, samples=True):
'''Parses a vcf file, returning a vcf object and list of variants'''
in_header = True
header = []
vcf = Vcf()
variants = []
# read input VCF
for line in vcf_file:
if in_header:
if line[0] == '#':
header.append(line)
# if line[1] != '#':
# vcf_samples = line.rstrip().split('\t')[9:]
continue
else:
in_header = False
vcf.add_header(header)
#remove samples if specified
if not samples:
vcf.sample_list = []
v = line.rstrip().split('\t')
var = Variant(v, vcf)
variants.append(var)
vcf_file.close()
return vcf, variants
def n_median(num_list):
return int(numpy.median(numpy.array(num_list)))
def n_mode(num_list):
return int(mode(numpy.array(num_list))[0])
def genotype_MEIs(all_bam, mobster_dirs, vcf_in, excludes, vcf_out, num_samp, splflank, discflank, no_merge, quiet, pair_only, split_only):
'''main genotyping function'''
vcf_out = open(vcf_out, 'w+')
#open bam files
all_bam = [pysam.Samfile(bam, 'rb') for bam in all_bam]
#lists to populate with mobster files
predictions = []
realigns = []
splits = []
pairs = []
for path in mobster_dirs:
if not path.endswith("/"):
path+="/"
try:
calls = glob.glob(path+"*predictions.txt")[0]
realign = glob.glob(path+"*mappedpotentials.bam")[0]
split = glob.glob(path+"*splitanchors.bam")[0]
pair = glob.glob(path+"*anchors.bam")[0]
predictions.append(open(calls, 'r'))
realigns.append(pysam.Samfile(realign, 'rb'))
splits.append(pysam.Samfile(split, 'rb'))
pairs.append(pysam.Samfile(pair, 'rb'))
except:
sys.stderr.write("Error: mobster dirs do not contain correct bam files. Need *_mappedpotentials, *_splitanchors, and *_anchors\n")
exit(0)
#if vcf file specified, read it without including samples (we are re-genotyping).
if vcf_in:
get_samples = False
mob_vcf, variants = read_vcf(vcf_in, get_samples)
else:
mob_vcf, variants = read_mobster(predictions, no_merge)
if excludes:
variants = filter_excludes(variants, excludes)
#grab sample names
for bam in all_bam:
mob_vcf.add_sample(bam.header['RG'][0]['SM'])
#create Sample objects for each pair of bam files
sample_list = []
for ref, realign, split, pair in zip(all_bam, realigns, splits, pairs):
sample_list.append(Mob_Sample(ref, realign, split, pair, num_samp))
sample_names = [sample.name for sample in sample_list]
#write VCF header
vcf_out.write(mob_vcf.get_header()+"\n")
#set max insert size z score for fetching read pairs
z=3
#loop through the MEI calls:
var_count = 1
for var in variants:
#skip untig and MT calls (these have caused errors, namely negative coords in GL calls.)
if var.chrom.startswith("GL") or var.chrom == "MT":
continue
if not quiet:
sys.stderr.write(str(var.chrom)+"\t"+str(var.pos)+"\n")
#borders = var.info['CI'].split(",")
var.sample_list = mob_vcf.sample_list
var.reset_genotypes()
# up, down = map(int, var.info['CI'].split(","))
# ci_start = var.pos + up - 1
# ci_stop = var.pos + down - 1
#vcf is 1-based, BAM is 0-based; we're querying BAMs with VCF coords
insert = int(var.pos) - 1
#start = insert + int(borders[0])
#stop = insert + int(borders[1])
mei_type = var.info["MEI"]
#aggregate evidence counts
PE = 0
SR = 0
SU = 0
for sample in sample_list:
#set counters to zero
split_alt, split_ref, pair_alt, pair_ref = 0,0,0,0
split_excludes = set()
if not pair_only:
#count split MEI read events at the site:
for me_read in sample.split.fetch(var.chrom, insert-5, insert+5):
me_tag = me_read.opt('ME').split(";")
# if me_tag[2] == mei_type or me_tag[-2] == "polyA" or me_tag[-2] == "polyT":
if me_tag[2] == mei_type:
split_alt += 1
split_excludes.add(me_read.qname)
#fetch reads overlapping the insertion site
#count ref split reads at the site
split_ref = 0
for ref_read in sample.bam.fetch(var.chrom, insert-5, insert+5):
if not (ref_read.is_duplicate or
ref_read.is_secondary or
ref_read.mapq < 25 or
ref_read.is_unmapped or
ref_read.mate_is_unmapped or
ref_read.qname in sample.excludes or
ref_read.qname in split_excludes):
for p in xrange(ref_read.pos + 1, ref_read.aend + 1):
if p - ref_read.pos >= splflank and ref_read.aend - p >= splflank:
split_ref += 1
break
#count pair MEI and pair ref events at the site
if not split_only:
pair_ref, pair_alt = count_mei_pairedend(var.chrom, var.pos, var.info['CI'], sample, mei_type, z, discflank, split_excludes)
#increment total supporting evidence counts
PE += pair_alt
SR += split_alt
SU += (pair_alt+split_alt)
if split_alt + pair_alt + pair_ref + split_ref > 0:
#use dupe expected alt/ref likelihoods for each GT
is_dup = True
gt_lplist = bayes_gt(split_ref+pair_ref, split_alt+pair_alt, is_dup)
gt_idx = gt_lplist.index(max(gt_lplist))
try:
allele_balance = (split_alt+pair_alt)/float(split_alt+pair_alt+split_ref+pair_ref)
except ZeroDivisionError:
allele_balance = "."
var.genotype(sample.name).set_format('GL', ','.join(['%.0f' % x for x in gt_lplist]))
var.genotype(sample.name).set_format('DP', split_ref+pair_ref+split_alt+pair_alt)
var.genotype(sample.name).set_format('RO', split_ref+pair_ref)
var.genotype(sample.name).set_format('AO', split_alt+pair_alt)
var.genotype(sample.name).set_format('RS', split_ref)
var.genotype(sample.name).set_format('AS', split_alt)
var.genotype(sample.name).set_format('RP', pair_ref)
var.genotype(sample.name).set_format('AP', pair_alt)
var.genotype(sample.name).set_format('AB', allele_balance)
# assign genotypes
gt_sum = 0
for gt in gt_lplist:
try:
gt_sum += 10**gt
except OverflowError:
gt_sum += 0
if gt_sum > 0:
gt_sum_log = math.log(gt_sum, 10)
sample_qual = abs(-10 * (gt_lplist[0] - gt_sum_log)) # phred-scaled probability site is non-reference in this sample
if 1 - (10**gt_lplist[gt_idx] / 10**gt_sum_log) == 0:
phred_gq = 200
else:
phred_gq = abs(-10 * math.log(1 - (10**gt_lplist[gt_idx] / 10**gt_sum_log), 10))
var.genotype(sample.name).set_format('GQ', phred_gq)
var.genotype(sample.name).set_format('SQ', sample_qual)
var.qual += sample_qual
if gt_idx == 1:
var.genotype(sample.name).set_format('GT', '0/1')
elif gt_idx == 2:
var.genotype(sample.name).set_format('GT', '1/1')
elif gt_idx == 0:
var.genotype(sample.name).set_format('GT', '0/0')
else:
var.genotype(sample.name).set_format('GQ', '.')
var.genotype(sample.name).set_format('SQ', '.')
var.genotype(sample.name).set_format('GT', './.')
else:
var.genotype(sample.name).set_format('GT', './.')
var.qual = 0
var.genotype(sample.name).set_format('GQ', '.')
var.genotype(sample.name).set_format('SQ', '.')
var.genotype(sample.name).set_format('GL', '.')
var.genotype(sample.name).set_format('DP', 0)
var.genotype(sample.name).set_format('AO', 0)
var.genotype(sample.name).set_format('RO', 0)
# if detailed:
var.genotype(sample.name).set_format('AS', 0)
var.genotype(sample.name).set_format('RS', 0)
var.genotype(sample.name).set_format('AP', 0)
var.genotype(sample.name).set_format('RP', 0)
var.genotype(sample.name).set_format('AB', '.')
# after all samples have been processed, write
var.set_info('PE', PE)
var.set_info('SR', SR)
var.set_info('SU', SU)
var.var_id = var_count
var_count += 1
vcf_out.write(var.get_var_string() + '\n')
vcf_out.flush()
vcf_out.close()
def read_mobster(prediction_files, no_merge):
'''returns a VCF object and union of variants for all Mobster prediction files given'''
#create Vcf object
mob_vcf = Vcf()
#add ALT header fields
mob_vcf.add_alt("INS:ME:ALU", "Insertion of ALU element")
mob_vcf.add_alt("INS:ME:L1", "Insertion of L1 element")
mob_vcf.add_alt("INS:ME:SVA", "Insertion of SVA element")
mob_vcf.add_alt("INS:ME:HERVK", "Insertion of HERVK element")
#add INFO header fields
mob_vcf.add_info("SVTYPE", "1", "String", "Type of structural variant")
mob_vcf.add_info("MEI", "1", "String", "Type of mobile element insertion")
mob_vcf.add_info("CI", "2", "Integer", "5-prime and 3-prime borders of insertion position")
#mob_vcf.add_info("TSD", "1", "String", "Target site duplication (True/False/Unknown)")
mob_vcf.add_info("SU", "1", "Integer", "Number of paired-end reads supporting the variant across all samples")
mob_vcf.add_info("PE", "1", "Integer", "Number of pieces of evidence supporting the variant across all samples")
mob_vcf.add_info("SR", "1", "Integer", "Number of split reads supporting the variant across all samples")
#Add FORMAT field for genotype
mob_vcf.add_format('GQ', 1, 'Float', 'Genotype quality')
mob_vcf.add_format('SQ', 1, 'Float', 'Phred-scaled probability that this site is variant (MEI insertion in this sample')
mob_vcf.add_format('GL', 'G', 'Float', 'Genotype Likelihood, log10-scaled likelihoods of the data given the called genotype for each possible genotype generated from the reference and alternate alleles given the sample ploidy')
mob_vcf.add_format('DP', 1, 'Integer', 'Read depth')
mob_vcf.add_format('RO', 1, 'Integer', 'Reference allele observation count')
mob_vcf.add_format('AO', 'A', 'Integer', 'Alternate allele observation count')
# mob_vcf.add_format('QR', 1, 'Integer', 'Sum of quality of reference observations')
# mob_vcf.add_format('QA', 'A', 'Integer', 'Sum of quality of alternate observations')
mob_vcf.add_format('RS', 1, 'Integer', 'Reference allele split-read observation count')
mob_vcf.add_format('AS', 'A', 'Integer', 'Alternate allele split-read observation count')
mob_vcf.add_format('RP', 1, 'Integer', 'Reference allele paired-end observation count')
mob_vcf.add_format('AP', 'A', 'Integer', 'Alternate allele paired-end observation count')
mob_vcf.add_format('AB', 'A', 'Float', 'Allele balance, fraction of observations from alternate allele, QA/(QR+QA)')
#list will hold union set of vars
variants = []
for file in prediction_files:
for entry in file:
entry = entry.strip().split("\t")
#skip header rows
if entry[0].startswith("#") or entry[0] == "Chr":
continue
chrom = entry[0]
if chrom.startswith("chr"):
chrom = chrom[3:]
ME = entry[1]
#VCF is one-based
bnd = int(entry[2]) + 1
#Mobster file 0-based; VCF 1-based
start = int(entry[3]) + 1
stop = int(entry[4]) + 1
rel_start = start - bnd
rel_stop = stop - bnd
support5 = entry[10]
if support5 == 'NA':
support5 = 0
support5 = int(support5)
support3 = entry[11]
if support3 == 'NA':
support3 = 0
support3 = int(support3)
split5 = int(entry[12])
split3 = int(entry[13])
pair5 = support5 - split5
pair3 = support3 - split3
#crappy hacks to feed data to Variant() as a vcf row
alt = "<INS:ME:{0}>".format(ME)
#info str template (should match the mob_vcf info list)
info_str = "SVTYPE=INS;MEI={0};CI={1},{2};SU={3};PE={4};SR={5}"
#insert values into info str
info_str = info_str.format(ME, rel_start, rel_stop, support5+support3, pair5+pair3, split5+split3)
var_list = [chrom, bnd, 0, "N", alt, ".", ".", info_str]
var = Variant(var_list, mob_vcf)
variants.append(var)
merged = merge_overlaps(variants, no_merge)
return mob_vcf, merged
def merge_overlaps(variants, no_merge):
mei_types = defaultdict(list)
"load MEI dict with variants"
for var in variants:
mei_types[var.info['MEI']].append(var)
#sort the union set
for ME, varlist in mei_types.viewitems():
mei_types[ME] = sort_vars(varlist, True)
#lets merge by overlapping bordered intervals
merged = []
pos_list = []
current_int = False
current_var = False
if no_merge:
for ME, varlist in mei_types.viewitems():
for var in varlist:
dumped = False
up, down = map(int, var.info['CI'].split(","))
start = var.pos + up
stop = var.pos + down
if not current_var:
current_var = var
current_int = (start, stop)
else:
#if same position:
if var.chrom == current_var.chrom and var.pos == current_var.pos:
#if the interval is larger that previous, take it.
if stop-start > current_int[1] - current_int[0]:
current_int = (start, stop)
else:
rel_start = current_int[0] - current_var.pos
rel_stop = current_int[1] - current_var.pos
current_var.set_info('CI', "{0},{1}".format(rel_start, rel_stop))
merged.append(current_var)
current_var = var
current_int = (start, stop)
dumped = True
if not dumped:
rel_start = current_int[0] - current_var.pos
rel_stop = current_int[1] - current_var.pos
current_var.set_info('CI', "{0},{1}".format(rel_start, rel_stop))
merged.append(current_var)
else:
for ME, varlist in mei_types.viewitems():
for var in varlist:
dumped = False
up, down = map(int, var.info['CI'].split(","))
start = var.pos + up
stop = var.pos + down
if not current_int:
current_int = (start, stop)
current_var = var
pos_list.append(var.pos)
else:
if var.chrom == current_var.chrom and start <= current_int[1]:
start = min(start, current_int[0])
stop = max(stop, current_int[1])
current_int = (start, stop)
pos_list.append(var.pos)
else:
m_pos = n_mode(pos_list)
rel_start = current_int[0] - m_pos
rel_stop = current_int[1] - m_pos
current_var.pos = m_pos
current_var.set_info('CI', "{0},{1}".format(rel_start,rel_stop))
merged.append(current_var)
#reset
current_int = (start, stop)
current_var = var
pos_list = [var.pos]
dumped = True
if not dumped:
m_pos = n_mode(pos_list)
rel_start = current_int[0] - m_pos
rel_stop = current_int[1] - m_pos
current_var.pos = m_pos
current_var.set_info('CI', "{0},{1}".format(rel_start,rel_stop))
merged.append(current_var)
return sort_vars(merged, False)
def count_mei_pairedend(chrom,
pos,
CI,
sample,
mei_type,
z,
discflank,
split_excludes):
"""Function to count number of concordant read pairs supporting the reference call at given var coordinates."""
pair_ref = 0
pair_alt = 0
fetch_flank = sample.get_fetch_flank(z)
tlens = []
# up, down = map(int, CI.split(","))
# ci_start = pos + up - 1
# ci_stop = pos + down - 1
insert = pos - 1
for read in sample.pair.fetch(chrom, max(insert-(fetch_flank), 0), insert+(fetch_flank)):
me_tag = read.opt('ME').split(";")
if me_tag[2] == mei_type:
pair_alt += 1
#pair_excludes = set()
for read in sample.bam.fetch(chrom, max(insert - (fetch_flank), 0), insert+(fetch_flank)):
lib = sample.get_lib(read.opt('RG'))
#improper orientation, unmapped, too far, different chrom, poor mate mapq
if (read.is_reverse == read.mate_is_reverse
or read.is_secondary
or read.is_unmapped
or read.mate_is_unmapped
or read.is_duplicate
or read.aend - discflank < insert
or read.rname != read.rnext
or read.mapq < 25
or read.opt('MQ') < 25
or read.qname in sample.excludes
or read.qname in split_excludes
or read.tlen <= 0):
continue
else:
#get start and end coords for read pair
if read.is_reverse:
end = read.pos
start = get_mate_5prime(sample.bam, read)
if not read.is_reverse:
end = get_mate_5prime(sample.bam, read)
start = read.pos
#continue if reads dont overlap with insert by at least discflank
if not ((end - insert < discflank) or (insert - start < discflank)):
if abs(zscore(read.tlen, lib.mean, lib.sd)) <= 3:
pair_ref+=1
#pair_excludes.add(read.qname)
#return pair_excludes, pair_ref, pair_alt
return pair_ref, pair_alt
class Vcf(object):
def __init__(self):
self.file_format = 'VCFv4.2'
# self.fasta = fasta
self.reference = ''
self.sample_list = []
self.info_list = []
self.format_list = []
self.alt_list = []
self.add_format('GT', 1, 'String', 'Genotype')
def add_header(self, header):
for line in header:
if line.split('=')[0] == '##fileformat':
self.file_format = line.rstrip().split('=')[1]
elif line.split('=')[0] == '##reference':
self.reference = line.rstrip().split('=')[1]
elif line.split('=')[0] == '##INFO':
a = line[line.find('<')+1:line.find('>')]
r = re.compile(r'(?:[^,\"]|\"[^\"]*\")+')
self.add_info(*[b.split('=')[1] for b in r.findall(a)])
elif line.split('=')[0] == '##ALT':
a = line[line.find('<')+1:line.find('>')]
r = re.compile(r'(?:[^,\"]|\"[^\"]*\")+')
self.add_alt(*[b.split('=')[1] for b in r.findall(a)])
elif line.split('=')[0] == '##FORMAT':
a = line[line.find('<')+1:line.find('>')]
r = re.compile(r'(?:[^,\"]|\"[^\"]*\")+')
self.add_format(*[b.split('=')[1] for b in r.findall(a)])
elif line[0] == '#' and line[1] != '#':
self.sample_list = line.rstrip().split('\t')[9:]
# return the VCF header
def get_header(self):
header = '\n'.join(['##fileformat=' + self.file_format,
'##fileDate=' + time.strftime('%Y%m%d'),
'##reference=' + self.reference] + \
[i.hstring for i in self.info_list] + \
[a.hstring for a in self.alt_list] + \
[f.hstring for f in self.format_list] + \
['\t'.join([
'#CHROM',
'POS',
'ID',
'REF',
'ALT',
'QUAL',
'FILTER',
'INFO',
'FORMAT'] + \
self.sample_list
)])
return header
def add_info(self, id, number, type, desc):
if id not in [i.id for i in self.info_list]:
inf = self.Info(id, number, type, desc)
self.info_list.append(inf)
def add_alt(self, id, desc):
if id not in [a.id for a in self.alt_list]:
alt = self.Alt(id, desc)
self.alt_list.append(alt)
def add_format(self, id, number, type, desc):
if id not in [f.id for f in self.format_list]:
fmt = self.Format(id, number, type, desc)
self.format_list.append(fmt)
def add_sample(self, name):
self.sample_list.append(name)
# get the VCF column index of a sample
# NOTE: this is zero-based, like python arrays
def sample_to_col(self, sample):
return self.sample_list.index(sample) + 9
class Info(object):
def __init__(self, id, number, type, desc):
self.id = str(id)
self.number = str(number)
self.type = str(type)
self.desc = str(desc)
# strip the double quotes around the string if present
if self.desc.startswith('"') and self.desc.endswith('"'):
self.desc = self.desc[1:-1]
self.hstring = '##INFO=<ID=' + self.id + ',Number=' + self.number + ',Type=' + self.type + ',Description=\"' + self.desc + '\">'
class Alt(object):
def __init__(self, id, desc):
self.id = str(id)
self.desc = str(desc)
# strip the double quotes around the string if present
if self.desc.startswith('"') and self.desc.endswith('"'):
self.desc = self.desc[1:-1]
self.hstring = '##ALT=<ID=' + self.id + ',Description=\"' + self.desc + '\">'
class Format(object):
def __init__(self, id, number, type, desc):
self.id = str(id)
self.number = str(number)
self.type = str(type)
self.desc = str(desc)
# strip the double quotes around the string if present
if self.desc.startswith('"') and self.desc.endswith('"'):
self.desc = self.desc[1:-1]
self.hstring = '##FORMAT=<ID=' + self.id + ',Number=' + self.number + ',Type=' + self.type + ',Description=\"' + self.desc + '\">'
class Variant(object):
def __init__(self, var_list, vcf):
self.chrom = var_list[0]
self.pos = int(var_list[1])
self.var_id = var_list[2]
self.ref = var_list[3]
self.alt = var_list[4]
if var_list[5] == '.':
self.qual = 0
else:
self.qual = float(var_list[5])
self.filter = var_list[6]
self.sample_list = vcf.sample_list
self.info_list = vcf.info_list
self.info = dict()
self.format_list = vcf.format_list
self.active_formats = list()
self.gts = dict()
# fill in empty sample genotypes
if len(var_list) < 8:
sys.stderr.write('\nError: VCF file must have at least 8 columns\n')
exit(1)
if len(var_list) < 9:
var_list.append("GT")
# make a genotype for each sample at variant
for s in self.sample_list:
try:
s_gt = var_list[vcf.sample_to_col(s)].split(':')[0]
self.gts[s] = Genotype(self, s, s_gt)
# import the existing fmt fields
for j in zip(var_list[8].split(':'), var_list[vcf.sample_to_col(s)].split(':')):
self.gts[s].set_format(j[0], j[1])
except IndexError:
self.gts[s] = Genotype(self, s, './.')
self.info = dict()
i_split = [a.split('=') for a in var_list[7].split(';')] # temp list of split info column
for i in i_split:
if len(i) == 1:
i.append(True)
self.info[i[0]] = i[1]
def reset_genotypes(self):
# make a genotype for each sample at variant
for s in self.sample_list:
self.gts[s] = Genotype(self, s, './.')
def set_info(self, field, value):
if field in [i.id for i in self.info_list]:
self.info[field] = value
else:
sys.stderr.write('\nError: invalid INFO field, \"' + field + '\"\n')
exit(1)
def get_info(self, field):
return self.info[field]
def get_info_string(self):
i_list = list()
for info_field in self.info_list:
if info_field.id in self.info.keys():
if info_field.type == 'Flag':
i_list.append(info_field.id)
else:
i_list.append('%s=%s' % (info_field.id, self.info[info_field.id]))
return ';'.join(i_list)
def get_format_string(self):
f_list = list()
for f in self.format_list:
if f.id in self.active_formats:
f_list.append(f.id)
return ':'.join(f_list)
def genotype(self, sample_name):
if sample_name in self.sample_list:
return self.gts[sample_name]
else:
sys.stderr.write('\nError: invalid sample name, \"' + sample_name + '\"\n')
def get_var_string(self):
s = '\t'.join(map(str,[
self.chrom,
self.pos,
self.var_id,
self.ref,
self.alt,
'%0.2f' % self.qual,
self.filter,
self.get_info_string(),
self.get_format_string(),
'\t'.join(self.genotype(s).get_gt_string() for s in self.sample_list)
]))
return s
class Genotype(object):
def __init__(self, variant, sample_name, gt):
self.format = dict()
self.variant = variant
self.set_format('GT', gt)
def set_format(self, field, value):
if field in [i.id for i in self.variant.format_list]:
self.format[field] = value
if field not in self.variant.active_formats:
self.variant.active_formats.append(field)
# sort it to be in the same order as the format_list in header
self.variant.active_formats.sort(key=lambda x: [f.id for f in self.variant.format_list].index(x))
else:
sys.stderr.write('\nError: invalid FORMAT field, \"' + field + '\"\n')
exit(1)
def get_format(self, field):
return self.format[field]
def get_gt_string(self):
g_list = list()
for f in self.variant.active_formats:
if f in self.format:
if type(self.format[f]) == float:
g_list.append('%0.2f' % self.format[f])
else:
g_list.append(self.format[f])
else:
g_list.append('.')
return ':'.join(map(str,g_list))
# efficient combinatorial function to handle extremely large numbers
def log_choose(n, k):
r = 0.0
# swap for efficiency if k is more than half of n
if k * 2 > n:
k = n - k
for d in xrange(1,k+1):
r += math.log(n, 10)
r -= math.log(d, 10)
n -= 1
return r
# holds a library's insert size and read length information
class Library(object):
def __init__(self, bam, name, num_samp):
self.bam = bam
self.name = name
self.num_samp = num_samp
self.readgroups = list()
self.read_length = int()
self.hist = dict()
self.dens = dict()
self.mean = float()
self.sd = float()
def add_readgroup(self, rg):
self.readgroups.append(rg)
# get read length
def calc_read_length(self):
max_rl = 0
counter = 0
num_samp = 10000
for read in self.bam.fetch():
if read.opt('RG') not in self.readgroups:
continue
if read.qlen > max_rl:
max_rl = read.qlen
if counter == num_samp:
break
counter += 1
self.read_length = max_rl
# generate empirical histogram of the sample's insert size distribution
# CC NOTE: REMOVE BEYOND Z stdev!!!!
def calc_insert_hist(self):
counter = 0
skip = 0
skip_counter = 0
mads = 10
ins_list = []
# Each entry in valueCounts is a value, and its count is
# the number of instances of that value observed in the dataset.
# So valueCount[5] is the number of times 5 has been seen in the data.
valueCounts = Counter()
for read in self.bam:
if skip_counter < skip:
skip_counter += 1
continue
if (read.is_reverse
or not read.mate_is_reverse
or read.is_unmapped
or read.mate_is_unmapped
or read.is_secondary
or read.tlen <= 0
or read.opt('RG') not in self.readgroups):
continue
else:
valueCounts[read.tlen] += 1
counter += 1
if counter == self.num_samp:
break
# remove outliers
med = median(valueCounts)
u_mad = upper_mad(valueCounts, med)
for x in [x for x in list(valueCounts) if x > med + mads * u_mad]:
del valueCounts[x]
self.hist = valueCounts
self.mean = mean(self.hist)
self.sd = stdev(self.hist)
# calculate the density curve for and insert size histogram
def calc_insert_density(self):
dens = Counter()
for i in list(self.hist):
dens[i] = float(self.hist[i])/countRecords(self.hist)
self.dens = dens
# holds each sample's BAM and library information
class Sample(object):
def __init__(self, bam, spl_bam, num_samp):
self.name = bam.header['RG'][0]['SM']
self.bam = bam
self.spl_bam = spl_bam
num_samp
self.lib_dict = dict()
# parse library design
self.rg_to_lib = dict()
for r in self.bam.header['RG']:
try:
lib_name=r['LB']
except KeyError, e:
lib_name=''
# add the new library
if lib_name not in self.lib_dict:
new_lib = Library(self.bam, lib_name, num_samp)
self.lib_dict[lib_name] = new_lib
self.rg_to_lib[r['ID']] = self.lib_dict[lib_name]
self.lib_dict[lib_name].add_readgroup(r['ID'])
# execute calculations
for name in self.lib_dict:
self.lib_dict[name].calc_read_length()
self.lib_dict[name].calc_insert_hist()
self.lib_dict[name].calc_insert_density()
# get the maximum fetch flank for reading the BAM file
def get_fetch_flank(self, z):
return max([lib.mean + (lib.sd * z) for lib in self.lib_dict.values()])
# return the library object for a specified read group
def get_lib(self, readgroup):
return self.rg_to_lib[readgroup]
# holds each sample's BAM and library information
class Mob_Sample(object):
def __init__(self, bam, mob_realign, mob_split, mob_pair, num_samp):
self.name = bam.header['RG'][0]['SM']
self.bam = bam
self.realign = mob_realign
self.split = mob_split
self.pair = mob_pair
num_samp
self.excludes = set()
for read in mob_realign:
if not read.is_unmapped:
name = read.qname[2:-2]
self.excludes.add(name)
self.lib_dict = dict()
# parse library design
self.rg_to_lib = dict()
for r in self.bam.header['RG']:
try:
lib_name=r['LB']
except KeyError, e:
lib_name=''
# add the new library
if lib_name not in self.lib_dict:
new_lib = Library(self.bam, lib_name, num_samp)
self.lib_dict[lib_name] = new_lib
self.rg_to_lib[r['ID']] = self.lib_dict[lib_name]
self.lib_dict[lib_name].add_readgroup(r['ID'])
# execute calculations
for name in self.lib_dict:
self.lib_dict[name].calc_read_length()
self.lib_dict[name].calc_insert_hist()
self.lib_dict[name].calc_insert_density()
# get the maximum fetch flank for reading the BAM file
def get_fetch_flank(self, z):
return max([lib.mean + (lib.sd * z) for lib in self.lib_dict.values()])
# return the library object for a specified read group
def get_lib(self, readgroup):
return self.rg_to_lib[readgroup]
def sort_vars(var_list, CI=False):
'''Sorts a list of var objects'''