/
extract_arrays.py
228 lines (220 loc) · 11 KB
/
extract_arrays.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
from collections import defaultdict, namedtuple
from pyfaidx import Fasta
import pysam
import random
def get_refseq(ref):
refseq = None
fa = Fasta(ref)
for genome_id in fa.keys():
genome = genome_id
refseq = str(fa[genome_id])
return refseq
def site_is_polymorphic(refseq, pileupcolumn, af):
"""
Quick and dirty method to determine if a site
within the user-inputted region is 'polymorphic' in the BAM.
"""
ref_base = refseq[pileupcolumn.pos]
alt_bases = []
# Start counting the number of 'ref' pileup bases.
# Any 'alt' bases will be added to the list above.
read_ref_bases = 0
for pileupread in pileupcolumn.pileups:
if pileupread.alignment.mapping_quality < 20 or pileupread.query_position is None:
continue
# Store gaps with respect to the reference.
if pileupread.is_del or pileupread.is_refskip:
alt_bases.append('-')
continue
read_base = pileupread.alignment.query_sequence[pileupread.query_position]
if read_base == ref_base:
read_ref_bases += 1
elif read_base != ref_base:
alt_bases.append(read_base)
bases = ['A', 'T', 'C', 'G']
# Count the frequency of each alternate base in the pileup.
alt_count = [alt_bases.count(nuc) for nuc in bases]
depth = pileupcolumn.n
# Get the alternate base with the highest frequency in the pileup.
max_nuc = bases[alt_count.index(max(alt_count))]
# Check if the pileup site has an 'alt' allele frequency greater than
# the user inputted limit, and is thus "polymorphic."
if max(alt_count) / float(sum(alt_count) + read_ref_bases) > af:
af = max(alt_count) / float(sum(alt_count) + read_ref_bases)
return True, ref_base, max_nuc, af, depth
else:
return False, ref_base, max_nuc, None, depth
def get_clip(ct):
"""
Ientify the number of bases that are soft or
hard-clipped prior to a supplementary alignment of
a query sequence (i.e., ONT read) using the pysam `cigar_tuple`.
We use the number of bases clipped prior to an alignment as a proxy
for that alignment's relative order in the original query, and therefore,
its relative position in the viral genome.
"""
MATCH = 0
SOFT_CLIP = 4
HARD_CLIP = 5
clip = '-'
# Look at the first CIGAR entry in the CIGAR string.
if ct[0][0] in (SOFT_CLIP, HARD_CLIP): clip = ct[0][1]
elif ct[0][0] == MATCH: clip = 0
return clip
def create_coordinate_dict(bam, chromosome='gi|335317|gb|M35027.1|VACCG',
left_coord=30363, right_coord=30629):
"""
Catalog the start and end coordinates of each alignment
within a query sequence (i.e., sequencing read). We use this
to store the absolute start and end of each read with respect
to the reference, for later filtering.
"""
coords = defaultdict(list)
samfile = pysam.Samfile(bam, "rb")
for pileupread in samfile.fetch(chromosome, left_coord, right_coord):
ref_end, ref_start = pileupread.reference_end, pileupread.reference_start
q_name = str(pileupread.query_name)
# Add the start and end coordinates of the alignment to the `coords` dictionary.
coords[q_name].append(ref_start)
coords[q_name].append(ref_end)
return coords
def create_random_arrays(arrays, max_cn=5, af=0.5):
"""
Method to generate a population of uniformly mutated
viruses, such that the copy number distribution of the
population is the same as the input population, and the
allele frequency of the mutated population is the same
as the input.
"""
cn_freqs = []
max_len = max([len(a) for a in arrays])
for cn in range(1, max_len):
# Get individual CN's frequency in population of arrays
cn_freq = sum([1 if len(x) == cn else 0 for x in arrays]) / float(len(arrays))
cn_freqs.append(cn_freq)
# Set simulated population size
init = len(arrays)
pop_sizes = [int(round(init * x)) for x in cn_freqs]
population = []
# Initialize each array in the population as a list
# of wild-type (0) copies.
for cn in range(1, max_len):
lst = [[0] * cn for _ in range(pop_sizes[cn - 1])]
population.extend(lst)
# Set a limit on the total number of H47R alleles that could
# be observed in the population, given our sequenced data.
total_copies = sum([x * (pop_sizes.index(x) + 1) for x in pop_sizes])
limit = int(round(total_copies * af))
x = 0
# Loop over every copy within every array in the population.
# Mutate each copy at a probability equal to the allele frequency
# of the mutation in the experimental population
random.shuffle(population)
for array in population:
for idx, copy in enumerate(array):
if x > limit: break
probability = random.random()
if probability < af:
array[idx] = 1
x = sum([sum(x) for x in population])
return population
def extract_arrays(bam, refseq, chromosome='gi|335317|gb|M35027.1|VACCG',
left_coord=30300, right_coord=30800,
af=0.05, pos=30489, slop=150, copy_filter=None):
"""
Identify the presence or absence of a variant site in every alignment
within all query sequences.
"""
array_dict = defaultdict(list)
sa_tags = defaultdict(int)
# Create a dictionary of the starts and ends of each alignment from each query sequence.
coordinate_dict = create_coordinate_dict(bam, chromosome, left_coord, right_coord)
samfile = pysam.Samfile(bam, "rb")
# Iterate through the pileup columns within all reads that map to the locus.
for pileupcolumn in samfile.pileup(chromosome, left_coord, right_coord, max_depth=100000, stepper="nofilter"):
# Rather than examining every pileup column within the reads, only look at
# columns within the portion of the reads that map to the locus.
if pileupcolumn.pos != pos: continue
# Get the ref/alt bases, as well as hacky estimates of allele frequency
# and depth at the site.
polymorphic, ref_base, alt_base, af, depth = site_is_polymorphic(refseq, pileupcolumn, af)
# Iterate through each read that maps to the pileup column.
for pileupread in pileupcolumn.pileups:
mapq = pileupread.alignment.mapping_quality
low_qual = False
if mapq < 20: low_qual = True
q_name = pileupread.alignment.query_name
# Extract the number of supplementary alignments associated with
# the query sequence, which we'll use as a proxy for copy number.
try: sa_cn = len(pileupread.alignment.get_tag("SA").split(";"))
except KeyError: sa_cn = 1
sa_tags[q_name] = sa_cn
indel, short = False, False
# Get the soft/hard clip value from the 'left' end of the alignment.
clip_value = get_clip(pileupread.alignment.cigartuples)
# Catalog alignments that have deletions or gaps (with respect to the reference)
# at the pileup site. We may want to filter them out in downstream analyses.
if pileupread.is_del or pileupread.is_refskip:
indel = True
ref_end = pileupread.alignment.reference_end
ref_start = pileupread.alignment.reference_start
# Check that the individual alignment maps fully to at least the ends of the locus.
# If the alignment doesn't map fully, add a placeholder entry to the `array_dict`.
# As before, we need to catalog its existence (if we want to filter later).
if ref_start > left_coord or ref_end < right_coord:
short = True
q_pos = pileupread.query_position
q_seq = pileupread.alignment.query_sequence
if q_pos is not None:
read_base = q_seq[q_pos]
else: read_base = None
# Double check that the alignment's parent query sequence is in our
# dictionary of every query's coordinates. We'll be checking that the
# terminal ends of the query map uniquely outside of the region later.
# Allele calling.
allele = 'mismatch'
if read_base == ref_base: allele = 0
elif read_base == alt_base: allele = 1
# If the alignment has some "imperfection" (is missing a nucleotide at the mutated site,
# doesn't align fully to the locus, low mapping quality, etc.), store it,
# but store its "allele" as a value we can filter on later.
if indel: allele = 'indel'
if low_qual: allele = 'low_qual'
if short: allele = 'short'
# Check that the full query maps outside of the putative duplication.
# If not, skip it.
if q_name not in coordinate_dict: continue
if min(coordinate_dict[q_name]) > left_coord - slop or max(coordinate_dict[q_name]) < right_coord + slop:
continue
array_dict[q_name].append((allele, clip_value, ref_base, alt_base))
# Now, we go back through the dictionary of arrays, and sort each array 5' to 3'.
# Additionally, we remove any arrays with "imperfect" K3L alignments (i.e., copies).
# Going back through the dictionary is time-consuming, but in some cases
# we want to catalog every copy in the array of interest before we can apply filters.
sorted_arrays = []
for array in array_dict:
# Sort each query's set of alignments by each alignment's clipping value.
# A larger clipping value indicates that the alignment is "later" in a query.
# Ultimately, we want to sort with respect to the reference genome.
all_clips = [x[1] for x in array_dict[array]]
contains_bad_clip = any([x == '-' for x in all_clips])
if contains_bad_clip: continue
# Sort clip values w/r/t the reference.
ordered_array = sorted(array_dict[array], key=lambda tup: tup[1])
ordered_array = [x[0] for x in ordered_array]
# Check that we haven't catalogued more copies than there are SA.
if len(ordered_array) > sa_tags[array]: continue
# Setting "hard" filter to be any K3L copy with an indel at position 30490,
# a copy that doesn't align across the whole locus, or both.
if copy_filter == 'hard':
if any([copy in ('indel', 'short', 'mismatch', 'low_qual') for copy in ordered_array]):
continue
# The "soft" filter, used only to calculate gross copy number in the population,
# includes alignments with indels or mismatches at position 30490, since these
# alignments are otherwise indicative of a full K3L copy.
elif copy_filter == 'soft' and any([copy in ('short', 'low_qual') for copy in ordered_array]):
continue
elif copy_filter == 'none': pass
sorted_arrays.append(ordered_array)
results = namedtuple('Results', ['arrays', 'af', 'ref', 'alt', 'depth'])
return results(sorted_arrays, af, ref_base, alt_base, depth)