/
parse_genie.py
300 lines (249 loc) · 11.1 KB
/
parse_genie.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
"""
Author: Dylan Maghini
Date: March 17, 2019
Generate sample by gene matrix for downstream analysis
"""
import sys
import argparse
import synapseclient
import re
def make_panel_dictionary(input_file):
"""
Creates dictionary from panel to genes screened by panel
Parameters:
input_file (str): path to file containing panel to gene info
Returns:
dict: Dictionary mapping panels to set of genes
"""
panels = {}
with open(input_file, 'r') as f:
header = f.readline()
line = f.readline()
# iterate through file to pull out relevant info
while line:
split = line.split('\t')
center = split[5]
gene = split[3]
tf = split[7]
tf = tf.strip()
feat = split[6]
# if True/False, add gene to panel set
if (tf == 'True') and (gene != "") and (feat == 'exon'):
if center in panels:
panels[center].add(gene)
else:
panels[center] = set([gene])
line = f.readline()
return panels
# open clinical sample sheet, pull out sample ids and assays for luad
def find_tumor_ids(input_file, cancers):
"""
Parse out sample IDs and assays for all lung adenocarcinoma tumors
Parameters:
input_file (str): Path to file with sample metadata
cancers (list): A list of oncotree codes for cancer types
Returns:
dict: Dictionary mapping lung adeno sample IDs to sample panel
"""
tumors = {}
with open(input_file, 'r') as f:
header = f.readline()+f.readline()+f.readline()+f.readline()+\
f.readline()
line = f.readline()
# parse out oncotree code, sample ID, and sequence assay
while line:
split = line.strip().split('\t')
sample_id = split[1]
oncotree_code = split[3]
seq_assay_id = split[5]
if oncotree_code in cancers:
tumors[sample_id] = seq_assay_id
line = f.readline()
return tumors
def pull_sample_mutations(input_file, tumors):
"""
Parse out lung adenocarcinoma samples and mutations from mutation file
This function parses a file with extended mutation information to pull out
all relevant mutation information. It constructs a dictionary that maps
sample IDs to a list of lists, where each sublist includes the complete
data for a given mutation in that sample.
Parameters:
input_file (str): Path to file of mutation data
tumors (dict): dictionary mapping sample IDs to assay IDs
Returns:
dict: Dictionary mapping LUAD sample IDs to lists of mutation data
"""
sample_data = {}
with open(input_file, 'r') as f:
f.readline()
header = f.readline()
line = f.readline()
# parse important mutation data from each line
while line:
split = line.strip().split('\t')
sample_id = split[15]
hugo_symbol = split[0]
entrez_gene = split[1]
chrom = split[4]
start = split[5]
stop = split[6]
strand = split[7]
classification = split[8]
variant_type = split[9]
hgvsc = split[34]
hgvsp = split[35]
gene = split[47]
feature = split[48]
symbol = split[60]
hgnc_id = split[62]
swissprot = split[67]
polyphen = split[72]
variant_class = split[94]
# if the sample ID has already been added, add additional mutation
if sample_id in sample_data:
sample_data[sample_id].append([hugo_symbol, entrez_gene, chrom,\
start, stop, strand, \
classification, variant_type, \
hgvsc, hgvsp, gene, feature, \
symbol, hgnc_id, swissprot, \
polyphen, variant_class])
# if sample ID has not already been added, add sample and mutation
elif sample_id in tumors:
sample_data[sample_id] = [[hugo_symbol, entrez_gene, chrom, \
start, stop, strand, classification,\
variant_type, hgvsc, hgvsp, gene, \
feature, symbol, hgnc_id, \
swissprot, polyphen, variant_class]]
line = f.readline()
return sample_data
def make_mutations_list(sample_data):
"""
Creates list of all mutations included in dataset
This function creates a list of all mutations that occur in the sample set,
and filters for certain mutation types (missense, nonsense, frameshift). It
returns a list of all genes, codon symbols, and full mutations (i.e. KRAS,
KRASp.G12, KRASp.G12D).
Parameters:
sample_data (dict): Dictionary from sample to mutation lists
Returns:
list: List of all included mutations
"""
mutations = []
# iterate through all sample mutation data
for sample_id in sample_data:
for mutation in sample_data[sample_id]:
symbol = mutation[0] + mutation[9]
mut_type = mutation[6]
# construct codon symbol (gene name and codon mutated, not including
# amino acid change)
ints = re.findall('\d+', symbol)
if len(ints) >= 1:
codonsymbol = symbol[0:symbol.find(ints[0])] + ints[0]
# add full mutation symbol to mutations
if (symbol not in mutations) and (mut_type == \
"Missense_Mutation" or mut_type == "Nonsense_Mutation" or \
"Frame_Shift" in mut_type):
mutations.append(symbol)
# add codon symbol to mutations
if (codonsymbol not in mutations) and (mut_type == \
"Missense_Mutation" or mut_type == "Nonsense_Mutation" or \
"Frame_Shift" in mut_type):
mutations.append(codonsymbol)
# add gene symbol to mutations
if mutation[0] not in mutations and (mut_type == \
"Missense_Mutation" or mut_type == "Nonsense_Mutation" or \
"Frame_Shift" in mut_type):
mutations.append(mutation[0])
return mutations
# make dict of all mutations that should be detected in each panel, following
# the indexing schema in the 'mutations' list
def make_panel_to_muts(panels, mutations):
"""
Makes dictionary mapping panel to all mutations screened by panel
Parameters:
panels (dict): Dictionary mapping panels to all genes screened by panel
mutations (list): complete list of all mutations in GENIE dataset
Returns:
dict: Dictionary mapping panel to mutations screened in panel
"""
panel_to_muts = {}
for panel in panels:
panel_to_muts[panel] = [False] * len(mutations) # initalize empty list
# look for panel genes in mutations list, add mutation if contains gene
for gene in panels[panel]:
for i, mutation in enumerate(mutations):
if gene in mutation:
panel_to_muts[panel][i] = True
return panel_to_muts
def make_tumor_mutations(sample_data, mutations, panel_to_muts, tumors):
"""
Constructs dictionary mapping mutation presences for each sample
This function parses through all mutations in each sample and constructs a
dictionary mapping sample IDs to integers representing the presence or
absence of each mutation, where -1 represents 'not screened', 0 represents
'not mutated', and 1 represents a mutation.
Parameters:
sample_data (dict): mapping sample IDs to lists of mutation data
mutations (list): all mutations included in dataset
panel_to_muts (dict): mapping panels to mutations screened by panel
tumors (dict): mapping sample IDs to sample assays
Returns:
dict: mapping sample IDs to integers representing presence of mutations
"""
tumor_mutations = {}
# iterate through all mutations in all samples, update mutation info
for sample_id in sample_data:
tumor_mutations[sample_id] = [-1]*len(mutations)
for mutation in sample_data[sample_id]:
symbol = mutation[0] + mutation[9]
mut_type = mutation[6]
ints = re.findall('\d+', symbol)
if len(ints) >= 1:
codonsymbol = symbol[0:symbol.find(ints[0])] + ints[0]
# if mutation present, update gene, symbol, and codon symbol to 1
if (mut_type == "Missense_Mutation") or (mut_type == \
"Nonsense_Mutation") or ("Frame_Shift" in mut_type):
tumor_mutations[sample_id][mutations.index(symbol)] = 1
tumor_mutations[sample_id][mutations.index(mutation[0])] = 1
tumor_mutations[sample_id][mutations.index(codonsymbol)] = 1
# check against panel for not screened vs not mutated
panel_mutations = panel_to_muts[tumors[sample_id]]
for i, val in enumerate(tumor_mutations[sample_id]):
if val == -1:
if panel_mutations[i]==True:
tumor_mutations[sample_id][i] = 0
return tumor_mutations
def main():
parser = argparse.ArgumentParser()
parser.add_argument("-u", "--username", dest = "username", \
help="User name", required=True)
parser.add_argument("-p", "--password", dest = "password", help="Password",\
required=True)
parser.add_argument("-c", "--cancer", nargs = "*", type = str, \
dest = "cancers", help="Cancer Types", \
required=True)
args = parser.parse_args()
for i, cancer in enumerate(args.cancers):
args.cancers[i] = cancer.strip().upper()
# use the synapse client to download relevant files
syn = synapseclient.Synapse()
syn.login(args.username, args.password)
combinedbed = syn.get('syn13251251')
mutationsextended = syn.get('syn13251247')
clinicalpatient = syn.get('syn13251229')
# parse all files and extract relevant information
panels = make_panel_dictionary(combinedbed.path) # panel to gene
tumors = find_tumor_ids(clinicalpatient.path, args.cancers)
sample_data = pull_sample_mutations(mutationsextended.path, tumors)
mutations = make_mutations_list(sample_data) # find all mutations
panel_to_muts = make_panel_to_muts(panels, mutations) # panel to mutation
tumor_mutations = make_tumor_mutations(sample_data, mutations, \
panel_to_muts, tumors)
# parse dict containing all mutations for each tumor, output to tsv
outname = "complete_mutations_table_%s.txt" % "_".join(args.cancers)
with open(outname, "w") as output:
output.write("\t".join(["Sample"] + mutations)+"\n")
for i in tumor_mutations:
output.write("\t".join([i]+list(map(str, tumor_mutations[i])))+"\n")
if __name__ == "__main__":
main()