-
Notifications
You must be signed in to change notification settings - Fork 0
/
PALB2_Analysis.py
435 lines (390 loc) · 20.5 KB
/
PALB2_Analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
import sys
import re
import pandas as pd
from Bio.Seq import Seq
import csv
"""
This is a python script that analyse FASTQ file of 20000 PALB2 variants library
The script classify which variant is a forward and reverse sequence based on the attB
position and then identify the reading sequences, barcodes, and twist barcodes.
Finally, the script identifies which mutation are at which indices and bases on both the variant
and the PALB2 sequences, and then names which amino acids are mutated
All the data are exported into a csv
"""
# create an empty dataFrame object to store the information of the sequences
# 'original_index' refers to the index on the PABL2 gene only (not the entire sequence)
# 'mutation_index' refers to the index on the library variant (the entire sequence)
df = pd.DataFrame(columns=['Header','Variant', 'Sequence', 'Orientation', 'attB_index',
'read_1', 'read_2', 'barcode', 'barcode_length', 'bar_type',
'twist1', 'twist2', 'twist_barcode', 'original_index',
'mutation_index', 'mutation', 'AA_mutation', 'Variant_number'])
# The original PALB2 sequence that is used to find mutation
original_PALB2 = "ATGGACGAGCCTCCCGGGAAGCCCCTCAGCTGTGAGGAGAAGGAAAAGTTAAAGGAGAAACTTGCTTTTTTGAAAAGGGAATACAGCAAGACACTAGCCCGCCTTCAGCGTGCCCAAAGAGCTGAAAAGATTAAGCATTCTATTAAGAAAACAGTAGAAGAACAAGATTGTTTGTCTCAGCAGGATCTCTCACCGCAGCTAAAACACTCAGAACCTAAAAATAAAATATGTGTTTATGACAAGTTACACATCAAAACCCATCTTGATGAAGAAACTGGAGAAAAGACATCTATCACACTTGATGTTGGGCCTGAGTCCTTTAACCCTGGAGATGGCCCAGGAGGATTACCTATACAAAGAACAGATGACACCCAAGAACATTTTCCCCACAGGGTCAGTGACCCTAGTGGTGAGCAAAAGCAGAAGCTGCCAAGCAGAAGAAAGAAGCAGCAGAAGAGGACATTTATTTCACAGGAGAGAGACTGTGTCTTTGGCACTGATTCACTCAGATTGTCTGGGAAAAGACTAAAGGAACAGGAAGAAATCAGTAGCAAAAATCCTGCTAGATCACCAGTAACTGAAATAAGAACTCACCTTTTAAGTCTTAAATCTGAACTTCCAGATTCTCCAGAACCAGTTACAGAAATTAATGAAGACAGTGTATTAATTCCACCAACTGCCCAACCAGAAAAAGGTGTTGATACATTCCTAAGAAGACCTAATTTCACCAGGGCGACTACAGTTCCTTTACAGACTCTATCAGATAGCGGTAGTAGTCAGCACCTTGAACACATTCCTCCTAAAGGTAGCAGTGAACTTACTACTCACGACCTAAAAAACATTAGATTTACTTCACCTGTAAGTTTGGAGGCACAAGGCAAAAAAATGACTGTCTCTACAGATAACCTCCTTGTAAATAAAGCTATAAGTAAAAGTGGCCAACTGCCCACAAGTTCTAATTTAGAGGCAAATATTTCATGTTCTCTAAATGAACTCACCTACAATAACTTACCAGCAAATGAAAACCAAAACTTAAAAGAACAAAATCAAACAGAGAAATCTTTAAAATCTCCCAGTGACACTCTTGATGGCAGGAATGAAAATCTTCAGGAAAGTGAGATTCTAAGTCAACCTAAGAGTCTTAGCCTGGAAGCAACCTCTCCTCTTTCTGCAGAAAAACATTCTTGCACAGTGCCTGAAGGCCTTCTGTTTCCTGCAGAATATTATGTTAGAACAACACGAAGCATGTCCAATTGCCAGAGGAAAGTAGCCGTGGAGGCTGTCATTCAGAGTCATTTGGATGTCAAGAAAAAAGGGTTTAAAAATAAAAATAAGGATGCAAGTAAAAATTTAAACCTTTCCAATGAGGAAACTGACCAAAGTGAAATTAGGATGTCTGGCACATGCACAGGACAACCAAGTTCAAGAACCTCTCAGAAACTTCTCTCATTAACTAAAGTCAGCTCTCCCGCTGGGCCCACTGAAGATAATGACTTGTCTAGGAAGGCAGTTGCCCAAGCACCTGGTAGAAGATACACAGGAAAAAGAAAATCAGCCTGCACCCCAGCATCAGATCATTGTGAACCACTTTTGCCAACTTCTAGCCTGTCGATTGTTAACAGGTCCAAGGAAGAAGTCACCTCACACAAATATCAGCACGAAAAATTATTTATTCAAGTGAAAGGGAAGAAAAGTCGTCATCAAAAAGAGGATTCCCTTTCTTGGAGTAATAGTGCTTATTTATCCTTGGATGATGATGCTTTCACGGCTCCATTTCATAGGGATGGAATGCTGAGTTTAAAGCAACTACTGTCTTTTCTCAGTATCACAGACTTTCAGTTACCTGATGAAGACTTTGGACCTCTTAAGCTTGAAAAAGTGAAGTCCTGCTCAGAAAAACCAGTGGAGCCCTTTGAGTCAAAAATGTTTGGAGAGAGACATCTTAAAGAGGGAAGCTGTATTTTTCCAGAGGAACTGAGTCCTAAACGCATGGATACAGAAATGGAGGACTTAGAGGAAGATCTTATTGTTCTACCAGGAAAATCACATCCCAAAAGGCCAAACTCGCAAAGCCAGCATACAAAGACGGGCCTTTCTTCATCCATATTACTTTATACTCCTTTAAATACGGTTGCGCCTGATGATAATGACAGGCCTACCACAGACATGTGTTCACCTGCTTTCCCCATCTTAGGTACTACTCCAGCCTTTGGCCCTCAAGGCTCCTATGAAAAAGCATCTACAGAAGTTGCTGGACGAACTTGCTGCACACCCCAACTTGCTCATTTGAAAGACTCAGTCTGTCTTGCCAGTGATACTAAACAATTCGACAGTTCAGGCAGCCCAGCAAAACCACATACCACCCTGCAAGTGTCAGGCAGGCAAGGACAACCTACCTGTGACTGTGACTCTGTCCCGCCAGGAACACCTCCACCCATTGAGTCATTCACTTTTAAAGAAAATCAGCTCTGTAGAAACACATGCCAGGAGCTGCATAAACATTCCGTCGAACAGACTGAAACAGCAGAGCTTCCTGCTTCTGATAGCATAAACCCAGGCAACCTACAATTGGTTTCAGAGTTAAAGAATCCTTCAGGTTCCTGTTCCGTAGATGTGAGTGCCATGTTTTGGGAAAGAGCCGGTTGTAAAGAGCCATGTATCATAACTGCTTGCGAAGATGTAGTTTCTCTTTGGAAAGCTCTGGATGCTTGGCAGTGGGAAAAACTTTATACCTGGCACTTCGCAGAGGTTCCAGTATTACAGATAGTTCCAGTGCCTGATGTGTATAATCTCGTGTGTGTAGCTTTGGGAAATTTGGAAATCAGAGAGATCAGGGCATTGTTTTGTTCCTCTGATGATGAAAGTGAAAAGCAAGTACTACTGAAGTCTGGAAATATAAAAGCTGTGCTTGGCCTGACAAAGAGGAGGCTAGTTAGTAGCAGTGGGACCCTTTCTGATCAACAAGTAGAAGTCATGACGTTTGCAGAAGATGGAGGAGGCAAAGAAAACCAATTTTTGATGCCCCCTGAGGAGACTATACTAACTTTTGCTGAGGTCCAAGGGATGCAAGAAGCTCTGCTTGGTACTACTATTATGAACAACATTGTTATTTGGAATTTAAAAACTGGTCAACTCCTGAAAAAGATGCACATTGATGATTCTTACCAAGCTTCAGTCTGTCACAAAGCCTATTCTGAAATGGGGCTTCTCTTTATTGTCCTGAGTCATCCCTGTGCCAAAGAGAGTGAGTCGTTGCGAAGCCCTGTGTTTCAGCTCATTGTGATTAACCCTAAGACGACTCTCAGCGTGGGTGTGATGCTGTACTGTCTTCCTCCAGGGCAGGCTGGCAGGTTCCTGGAAGGTGACGTGAAAGATCACTGTGCAGCAGCAATCTTGACTTCTGGAACAATTGCCATTTGGGACTTACTTCTCGGTCAGTGTACTGCCCTCCTCCCACCTGTCTCTGACCAACATTGGTCTTTTGTGAAATGGTCGGGTACAGACTCTCATTTGCTGGCTGGACAAAAAGATGGAAATATATTTGTATACCACTATTCATAA"
# Search for the attB sequence and return its indices
# If forward strand, then attB is forward and vice versea
def attB_match(search_seq):
attB_forward = "CCGGCTTGTCGACGACGGCGGTCTCCGTCGTCAGGATCATCC"
attB_reverse = "GGATGATCCTGACGACGGAGACCGCCGTCGTCGACAAGCCGG"
forward_match = re.search(attB_forward, search_seq)
reverse_match = re.search(attB_reverse, search_seq)
if forward_match:
return ["forward", forward_match.start()]
if reverse_match:
return ["reverse", reverse_match.start()]
else:
return ["N/A", "N/A"]
# Search for both the reading 1 and 2 sequences and return their indices
# The reading sequence indices are used to determine the barcode in between
def reading_match(search_seq):
read_1 = "ACAAATAGTT"
read_2 = "TGCGAGTAGT"
read1_index = 0
read2_index = 0
read1_match = re.search(read_1, search_seq)
read2_match = re.search(read_2, search_seq)
if read1_match:
read1_index = read1_match.start()
else:
read1_index = "N/A"
if read2_match:
read2_index = read2_match.start()
else:
read2_index = "N/A"
return [read1_index, read2_index]
# Find the barcode sequence in between the reading 1 and 2 sequences and its length
def find_barcode(sequence, read_1, read_2):
if isinstance(read_1, int) and isinstance(read_2, int):
return seq[read_1 + 10:read_2]
# Return the length of the bar code
def bar_length(bar):
try:
return len(bar)
except:
return "N/A"
# Return the types of each barcode: normal, indel, or unbar
def bar_type(length):
try:
if length == 18:
return "normal"
if length <= 6:
return "unbar"
else:
return "indel"
except:
return "N/A"
# Search for both the twist reading 1 and 2 sequences and return their indices
# The twist reading sequence indices are used to determine the twist barcode in between
def twist_match(search_seq):
twist_1 = "AGAGTCAGATCAGTCTCGAG"
twist_2 = "GAATTCCTGACCTCCTTCTC"
twist1_index = 0
twist2_index = 0
twist1_match = re.search(twist_1, search_seq)
twist2_match = re.search(twist_2, search_seq)
if twist1_match:
twist1_index = twist1_match.start()
else:
twist1_index = "N/A"
if twist2_match:
twist2_index = twist2_match.start()
else:
twist2_index = "N/A"
return [twist1_index, twist2_index]
# Find the twist barcode sequence in between the twist reading 1 and 2 sequences and its length
def find_twist(sequence, twist_1, twist_2):
if isinstance(twist_1, int) and isinstance(twist_2, int):
return seq[twist_1 + 20:twist_2]
# Find the start amino acid of the PABL2 sequence to find the mutation
def find_startAA(search_seq):
startAA = "ATGGACGAGCCT"
startAA_match = re.search(startAA, search_seq)
if startAA_match:
return startAA_match.start()
else:
return "N/A"
# Iterate through the PABL2 sequence and compare it with the input sequence
# Only find the mutation if the twist bar and the start amino acid of PABL2 are present
# The indices of the original sequqnce and the PABL sequence are recorded
# The change in bases is also recorded
def findMutation(seq, twist_bar):
if twist_bar is None:
return ["N/A", "N/A", "N/A"]
original_index = []
mutation_index = []
mutation_map = []
j = 0
mutation_counter = 0
startIndex = find_startAA(seq)
if isinstance(startIndex, int):
for i in range(startIndex, len(original_PALB2)):
if mutation_counter >= 10:
break
try:
if seq[i] != original_PALB2[j]:
original_index.append(j)
mutation_index.append(i)
mutation_map.append(str(original_PALB2[j]) + ">" + str(seq[i]))
mutation_counter += 1
except:
return "N/A"
j = j + 1
return [original_index, mutation_index, mutation_map]
else:
return ["N/A", "N/A", "N/A"]
# Translate a codon of 3 bases into the appropriate amino acids
def amino_acids(aa):
if aa == "TTT" or aa == "TTC":
return "Phe"
if aa == "TTA" or aa == "TTG" or aa == "CTT" or aa == "CTC" or aa == "CTA" or aa == "CTG":
return "Leu"
if aa == "ATT" or aa == "ATC" or aa == "ATA":
return "Ile"
if aa == "ATG":
return "Met"
if aa == "GTT" or aa == "GTC" or aa == "GTA" or aa == "GTG":
return "Val"
if aa == "TCT" or aa == "TCC" or aa == "TCA" or aa == "TCG" or aa == "AGT" or aa == "AGC":
return "Ser"
if aa == "CCT" or aa == "CCC" or aa == "CCA" or aa == "CCG":
return "Pro"
if aa == "ACT" or aa == "ACC" or aa == "ACA" or aa == "ACG":
return "Thr"
if aa == "GCT" or aa == "GCC" or aa == "GCA" or aa == "GCG":
return "Ala"
if aa == "TAT" or aa == "TAC":
return "Tyr"
if aa == "TAA" or aa == "TAG" or aa == "TGA":
return "Stop"
if aa == "CAT" or aa == "CAC":
return "His"
if aa == "CAA" or aa == "CAG":
return "Gln"
if aa == "AAT" or aa == "AAC":
return "Asn"
if aa == "AAA" or aa == "AAG":
return "Lys"
if aa == "GAT" or aa == "GAC":
return "Asp"
if aa == "GAA" or aa == "GAG":
return "Glu"
if aa == "TGT" or aa == "TGC":
return "Cys"
if aa == "TGG":
return "Trp"
if aa == "CGT" or aa == "CGC" or aa == "CGA" or aa == "CGG" or aa == "AGA" or aa == "AGG":
return "Arg"
if aa == "GGT" or aa == "GGC" or aa == "GGA" or aa == "GGG":
return "Gly"
def amino_acids2(aa):
if aa == "TTT" or aa == "TTC":
return "F"
if aa == "TTA" or aa == "TTG" or aa == "CTT" or aa == "CTC" or aa == "CTA" or aa == "CTG":
return "L"
if aa == "ATT" or aa == "ATC" or aa == "ATA":
return "I"
if aa == "ATG":
return "M"
if aa == "GTT" or aa == "GTC" or aa == "GTA" or aa == "GTG":
return "V"
if aa == "TCT" or aa == "TCC" or aa == "TCA" or aa == "TCG" or aa == "AGT" or aa == "AGC":
return "S"
if aa == "CCT" or aa == "CCC" or aa == "CCA" or aa == "CCG":
return "P"
if aa == "ACT" or aa == "ACC" or aa == "ACA" or aa == "ACG":
return "T"
if aa == "GCT" or aa == "GCC" or aa == "GCA" or aa == "GCG":
return "A"
if aa == "TAT" or aa == "TAC":
return "Y"
if aa == "TAA" or aa == "TAG" or aa == "TGA":
return "X"
if aa == "CAT" or aa == "CAC":
return "H"
if aa == "CAA" or aa == "CAG":
return "Q"
if aa == "AAT" or aa == "AAC":
return "N"
if aa == "AAA" or aa == "AAG":
return "K"
if aa == "GAT" or aa == "GAC":
return "D"
if aa == "GAA" or aa == "GAG":
return "E"
if aa == "TGT" or aa == "TGC":
return "C"
if aa == "TGG":
return "W"
if aa == "CGT" or aa == "CGC" or aa == "CGA" or aa == "CGG" or aa == "AGA" or aa == "AGG":
return "R"
if aa == "GGT" or aa == "GGC" or aa == "GGA" or aa == "GGG":
return "G"
# A helper method to find which mutation index goes with other indices as part of a codon
# For example, a index of 1 will have 0 and 2 as part of the same codon
def helper_findSameCodonIndex(index):
if index % 3 == 0:
return [index + 1, index + 2]
if index % 3 == 1:
return [index - 1, index + 1]
if index % 3 == 2:
return [index - 1, index - 2]
# A helper method that uses the helper_takeSameCodon_2
# This returns a dictionary that list the each indices that belong to the same codon
# For example, AA1': [687, 689], 'AA2': [1602, 1603, 1604], 'AA3': [1605, 1606]
def helper_takeSameCodon_1(mutation_index):
if mutation_index == "N/A":
return "N/A"
if isinstance(mutation_index, str):
return "N/A"
mutation_index_copy = mutation_index.copy()
aa_map = {}
count = 1
helper_takeSameCodon_2(mutation_index_copy, aa_map, count)
return aa_map
# A recursion helper method that uses helper_findSameCodonIndex
def helper_takeSameCodon_2(mutation_index_copy, aa_map, count):
if len(mutation_index_copy) == 0:
return
taken_index = []
# Use the helper method to find all possible indices that is in the same codon
findSameCodonIndex = helper_findSameCodonIndex(mutation_index_copy[0])
# Append the taken indices and then remove it
taken_index.append(mutation_index_copy[0])
mutation_index_copy.remove(mutation_index_copy[0])
# Find the indices that is in the same codon as the the taken index
for i in range(0, len(mutation_index_copy)):
if mutation_index_copy[i] in findSameCodonIndex:
taken_index.append(mutation_index_copy[i])
# Remove all taken indices from the list of remaining mutation indices
mutation_index_copy = [i for i in mutation_index_copy if i not in taken_index]
aa_map["AA" + str(count)] = taken_index
count = count + 1
# Recursion
helper_takeSameCodon_2(mutation_index_copy, aa_map, count)
# Return the remaining indices of the same codon given a list of indices
# For example, input [0,2] will return [0,1,2]
def helper_getCodonIndex(index_list):
if len(index_list) == 3:
return index_list
if len(index_list) == 2:
if index_list[0] % 3 == 0 and index_list[1] % 3 == 1:
return [index_list[0], index_list[1], index_list[1] + 1]
if index_list[0] % 3 == 1 and index_list[1] % 3 == 2:
return [index_list[0] - 1, index_list[0], index_list[1]]
if index_list[0] % 3 == 0 and index_list[1] % 3 == 2:
return [index_list[0], index_list[0] + 1, index_list[1]]
if len(index_list) == 1:
if index_list[0] % 3 == 0:
return [index_list[0], index_list[0] + 1, index_list[0] + 2]
if index_list[0] % 3 == 1:
return [index_list[0] - 1, index_list[0], index_list[0] + 1]
if index_list[0] % 3 == 2:
return [index_list[0] - 2, index_list[0] - 1, index_list[0]]
# This concantenate a codon of bases from the index list
def helper_getCodonBase(index_list, variant):
codon_base = ""
for i in index_list:
codon_base = codon_base + str(variant[i])
return codon_base
# Map the mutation from the original bases
# This is because variant sequence do not all % 3 so some bases are not part of any codon
# That is why we cannot take the index directly from the varaint, but map from the PALB2
def helper_mutation_map(original_map, seq):
startAA_index = find_startAA(seq)
mutation_map = []
for item in original_map:
new_index = startAA_index + item
mutation_map.append(new_index)
return mutation_map
# Return a dictionary that show which amino acids are mutated and which mutation it is
# For example, return AA1': 'Gln>Ser', 'AA2': 'Thr>Leu', 'AA3': 'Gly>Tyr'
def translate_AA(original_map, seq):
if original_map == "N/A":
return "N/A"
final_map = {}
# Iterate through both the dictionary of codons in the variant as well as the PALB2 gene
# for (k,v), (k2,v2) in zip(original_map.items(), mutation_map.items()):
for k, v in original_map.items():
# Get all 3 bases indices of the codon
get_OriCodon = helper_getCodonIndex(v)
# get_MutCodon = helper_getCodonIndex(v2)
get_MutCodon = helper_mutation_map(get_OriCodon, seq)
# Get the bases of the codon
ori_base = helper_getCodonBase(get_OriCodon, original_PALB2)
mut_base = helper_getCodonBase(get_MutCodon, seq)
# Get the amino acids that the codon translate to
ori_aa = amino_acids2(ori_base)
mut_aa = amino_acids2(mut_base)
# Update the dictionary
final_map[k] = str(ori_aa) + str(round(get_MutCodon[0]/3)-373) + str(mut_aa)
return final_map
# Make a csv file we'll just keep writing to
csv_file = open('Output_PALB2_Analysis2.csv', 'w')
writer = csv.writer(csv_file)
writer.writerow(['Header','Variant', 'Sequence', 'Orientation', 'attB_index',
'read_1', 'read_2', 'barcode', 'barcode_length', 'bar_type',
'twist1', 'twist2', 'twist_barcode', 'original_index',
'mutation_index', 'mutation', 'AA_mutation', 'Variant_number'])
# Add row to the dataframe by iterating through the fastq file
with open(sys.argv[1], 'r') as fh:
count = 1
lines = []
for line in fh:
lines.append(line.rstrip())
# Only choose lines with length 4 which is how fastq file structures
if len(lines) == 4:
# Get the sequence
header = lines[0]
seq = lines[1]
# Find attB
attB = attB_match(seq)
# If the sequence is in reverse orientation, then make it forward
if attB[0] == "reverse":
seq = str(Seq(seq).reverse_complement())
# Find the reading 1 and 2 sequence
reading = reading_match(seq)
# Find the bar code and its length
barcode = find_barcode(seq, reading[0], reading[1])
length = bar_length(barcode)
# Determine the bar type
bartype = bar_type(length)
# Find the twist reading sequence
twist_read = twist_match(seq)
# Find the twist bar code
twist_bar = find_twist(seq, twist_read[0], twist_read[1])
# Find the index mutation
mutation_read = findMutation(seq, twist_bar)
# Find the amino acid mutation
## Get the sequence and the PALB2 mutation index
original_index = mutation_read[0]
mutation_index = mutation_read[1]
## Get the full codon of 3 bases to be translated
original_map = helper_takeSameCodon_1(original_index)
# mutation_map = helper_takeSameCodon_1(mutation_index)
## Translate the codon 3 bases into amino acid
aa_mutation = translate_AA(original_map, seq)
final_map = aa_mutation
# Now also figure out how many aa variants there are in that row
if isinstance(final_map,str):
variant_number = 0
if isinstance(final_map,dict):
variant_number = len(final_map)
# Append row of sequence information above
#df = df.append({'Header': header,'Variant': count, 'Sequence': seq, 'Orientation': attB[0],
# 'attB_index': attB[1], 'read_1': reading[0], 'read_2': reading[1],
# 'barcode': barcode, 'barcode_length': length, 'bar_type': bartype,
# 'twist1': twist_read[0], 'twist2': twist_read[1], 'twist_barcode': twist_bar,
# 'original_index': mutation_read[0], 'mutation_index': mutation_read[1],
# 'mutation': mutation_read[2], 'AA_mutation': final_map, 'Variant_number': variant_number},
# ignore_index=True)
writer.writerow([header,count,"seq",attB[0],attB[1],reading[0],reading[1],
barcode,length,bartype,twist_read[0],twist_read[1],
twist_bar,mutation_read[0], mutation_read[1],mutation_read[2],
final_map,variant_number]) #str(combined)))
# Tracking the progress of appending at each 1000 rows by printing
if count % 1000 == 0:
print(count)
lines = []
count = count + 1
# Export dataframe to csv
#df.to_csv(r'Output_PALB2_Analysis.csv', index=False, header=True)
csv_file.close()