/
phrecon.py
executable file
·328 lines (269 loc) · 12.4 KB
/
phrecon.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
#!/usr/bin/python
#
# Phrecon (Phylo Reconstructor)
#
# This script reconstructs the base sequence by reading an SNP loci file and substituting its bases in at the given
# locations in the provided reference base sequence.
#
# The base reference should only have TWO columns (and no headers), consisting of just the space-delimited locus and base columns.
#
# The SNP input file may contain multiple SNP "fragments" (ids), and phrecon will generate a new sequence for each one
# and write them out in FASTA format into the same output file.
#
# SNP file input should contain three columns and no headers, space-delimited:
# SNP_ID (i.e. query id) | Base position | Base
#
# So something like:
# A12 1045 G
# A12 4056 A
# A12 13004 T
# A35 4 A
# A35 401 C
#
# The above example contains two query SNPs, so phrecon will generate a FASTA file containing the full sequence for each.
#
# Prephix's SNP and base ref output files can be used by phrecon as input.
#
# If the SNP line has a locus of -1 and base "-", and it is the ONLY line for that strainid, then it is treated as
# an empty SNP -- i.e. the same sequence as the ref base (since there are no SNPs reported for that strain).
#
# Usage: phrecon.py <reference base file> <SNP loci input file> [-debug] [-quiet]
#
import argparse
import os
import sys
import re
import Bio
import Bio.Seq
import Bio.SeqIO
import Bio.SeqRecord
import Bio.Alphabet
import multiprocessing
import pprint
import signal
VERSION = "4.6.0"
#####################
# UTILITY FUNCTIONS
#####################
def print_debug(msg):
    '''Logs a message to the logfile and echoes it to STDOUT, but only
    when the global debug flag is set. If the quiet flag is also set,
    the message goes to the logfile only.

    Parameters:
    msg = Text to print/log.
    '''
    if not debug:
        return
    if not quiet:
        print(msg)
    logfile.write("{}\n".format(msg))
def print_all(msg):
    '''Logs a message to the logfile and, unless the quiet flag is set,
    echoes it to STDOUT as well.

    Parameters:
    msg = Text to print/log.
    '''
    logfile.write("{}\n".format(msg))
    if not quiet:
        print(msg)
########################################
# FUNCTIONS RELATED TO MULTIPROCESSING
########################################
def mp_reconstruct_strain(myStrainID,myStrainData,myRefData):
    '''Reconstructs one strain's full sequence (multiprocessing worker).

    Parameters:
    myStrainID = the strain id name.
    myStrainData = dictionary of this strain's SNPs (locus => base).
    myRefData = dictionary of the reference sequence (locus => base).

    Returns a two-tuple of (myStrainID, reconstructed sequence string),
    or (myStrainID, Exception) if an error occurred while building it.

    If the strain data has exactly one entry with locus -1 and base "-",
    it came from an empty SNP file (no SNPs for that strain), so the
    reference sequence is returned unchanged.
    '''
    try:
        # Detect the "empty SNP file" sentinel: a single entry whose base
        # is "-" at locus -1 (int form) or "-1" (string form, multichrom
        # mode keeps loci as strings for lexical sorting).
        # BUGFIX: the old check tested key 0 against value "-1" (key and
        # value transposed), so the sentinel was never recognized and a
        # spurious "-" could be prepended to the sequence.
        isEmptySentinel = False
        if len(myStrainData) == 1:
            sentinelLocus, sentinelBase = list(myStrainData.items())[0]
            isEmptySentinel = sentinelBase == "-" and sentinelLocus in (-1, "-1")
        strainData = myRefData.copy()
        if not isEmptySentinel:
            # Overlay this strain's SNP bases on top of the reference copy.
            strainData.update(myStrainData)
        # Concatenate bases in locus order (numeric or lexical, depending
        # on the key type chosen by the caller). sorted() works on both
        # Python 2 and 3, unlike keys().sort().
        snpSequenceString = "".join([strainData[locus] for locus in sorted(strainData)])
    except Exception as error:
        return (myStrainID,error)
    else:
        return (myStrainID,snpSequenceString)
def mp_init_worker():
    '''Pool initializer run in each worker process. Makes the worker
    ignore SIGINT so a KeyboardInterrupt is delivered only to the main
    process, which is responsible for terminating the pool cleanly.
    '''
    signal.signal(signal.SIGINT, signal.SIG_IGN)
def mp_store_result_callback(result):
    '''Callback invoked by the Pool for each completed async task.

    result is the two-tuple returned by mp_reconstruct_strain: the strain
    id, and either the reconstructed sequence string or the Exception that
    was raised while building it. The payload is stored in the shared
    resultTable; the lock keeps log output from interleaving or garbling
    when several callbacks fire close together.
    '''
    strainID, payload = result
    logFileLock.acquire()
    if isinstance(payload, Exception):
        print_all("*** ERROR: Exception thrown while processing strain {}: {}".format(strainID,pprint.pformat(payload)))
    else:
        print_all("Now reconstructing strain {}".format(strainID))
    resultTable[strainID] = payload
    logFileLock.release()
########################
# MAIN
########################
if __name__ == '__main__':
    # Overall flow: parse arguments, load the reference base file and the
    # SNP input file into dictionaries, reconstruct each strain's sequence
    # in a worker pool, then write everything out as FASTA via Biopython.
    print "\nPhrecon (Phylo Reconstructor) v{}\n\n".format(VERSION)
    # Setup argument parsing
    parser = argparse.ArgumentParser()
    parser.add_argument('reffilename', help="Reference base file name with line format [LOCUS] [BASE]")
    parser.add_argument('snpfilename', help="SNP data file name with line format [STRAIN_ID] [LOCUS] [BASE]")
    parser.add_argument('-debug','--debug', action="store_true", help="Enable debugging output")
    parser.add_argument('-quiet','--quiet', action="store_true", help="Run in quiet mode (suppress most screen output)")
    parser.add_argument('-multichrom','--multichrom', action="store_true", help="Run in multichrom mode (lexical sorting)")
    parser.add_argument('-cpus','--cpus', type=int, help="Indicates how many CPUs to use during multiprocessing. Default is the detected number of CPUs from Python's multiprocessing module. Setting this to 1 basically disables multiprocessing.",default=multiprocessing.cpu_count())
    args = parser.parse_args()
    # Open inputs; log and output files are both named after the SNP input file.
    reffile = open(args.reffilename,"r")
    infile = open(args.snpfilename,"r")
    logfilename = "{}.reconstructed.log".format(args.snpfilename)
    logfile = open(logfilename,"w")
    #outfile is not needed since BioPython will write to it. Just need to generate the filename here.
    outfilename = "{}.reconstructed".format(args.snpfilename)
    # Module-level flags read by print_debug()/print_all().
    debug = args.debug
    quiet = args.quiet
    multichrom = args.multichrom
    cpu_num=args.cpus
    if debug:
        print "Producing debug output.\n"
    if quiet:
        print "Producing quiet (no stdout) output. Logfile is still generated.\n"
    print_all("CPUs to use: {}".format(cpu_num))
    refData = {} # This is a dictionary of the ref base. Key is locus, value is base.
    print_all("Loading loci entries from reference file...")
    # Read in and store reference data in REF_DATA table.
    lineNumber=0
    # Expect reference data to be in a two-column format: Loci Base
    refRe = re.compile("^(?P<locus>\S+)\s+(?P<base>[A-Z]+)$")
    for line in reffile:
        lineNumber += 1
        refMatch = refRe.match(line)
        if refMatch:
            locus = str(refMatch.group('locus'))
            # If multichrom then use lexical sorting on locus value.
            if multichrom:
                pass
            else:
                # If it isn't multichrom, then convert it to integer for numeric sort.
                locus = int(locus)
            base = refMatch.group('base')
            # Check for conflict. Fail if mismatched base at locus, ignore if identical base at locus, insert if no row exists.
            if locus in refData:
                if refData[locus] != base:
                    print_all("*** ERROR: Duplicate loci found! Read base {} at loci {}, which collides with previous base {} at same loci! On line {} of reference file.".format(base,locus,refData[locus],lineNumber))
                    print "Failed."
                    sys.exit(1)
                else:
                    print_debug("Skipping duplicate (but identical) locus/base at line {} for locus {} and base {}\n".format(lineNumber,locus,base))
            else:
                refData[locus] = base
        else:
            # Any line not matching the two-column pattern is fatal.
            print_all("*** ERROR: Reference file format not recognized. Expected two columns -- [Loci (alphnumeric)] [Base (A-Z)]. Got {} at line {} instead.".format(line,lineNumber))
            print "Failed.\n"
            sys.exit(1)
    print
    reffile.close()
    print_all("Read {} loci entries from reference file.".format(lineNumber))
    print_all("Loading loci entries from strain snp data file...")
    # Read in SNP data and store in the snpData dictionary
    # The snpData dictionary has strainIDs as the key, and the values are
    # a dictionary of locus keys and base values for that strain.
    lineNumber=0
    warnings=0
    strainCount = 0
    strainNameList = [] # Store strain IDs in a list to preserve order when writing to file later.
    # Expect snp data file lines to be in a three-column format: StrainID Loci Base
    # NOTE(review): columns here are TAB-delimited per this regex, while the
    # header comment says space-delimited -- confirm against prephix output.
    snpRe = re.compile("^(?P<strainid>\S+)\t(?P<locus>\S+|(-1))\t(?P<base>[A-Z]+|-)$")
    currentStrain = ""
    snpData = {}
    for line in infile:
        lineNumber += 1
        snpMatch = snpRe.match(line)
        # Quit on bad lines. Bad data!
        if snpMatch == None:
            print_all("*** ERROR: Bad data format on line {} of SNP loci file. Expect three columns [Strain ID (Alphanumeric)] [Loci (Numeric)] [Base (alphabetic)]\nGot {} instead.".format(lineNumber,line))
            print "Failed.\n"
            sys.exit(1)
        strainid = snpMatch.group('strainid')
        locus = str(snpMatch.group('locus'))
        # If multichrom, treat as string (default) for lexical sorting, otherwise
        # convert to numeric sorting for regular snps
        if multichrom:
            pass
        else:
            locus = int(locus)
        base = snpMatch.group('base')
        # A change of strain id starts a new strain record. This assumes the
        # input file groups all of a strain's lines together (contiguous ids).
        if currentStrain != strainid:
            strainCount += 1
            currentStrain = strainid
            strainNameList.append(currentStrain)
            snpData[currentStrain] = {}
        # Sanity checks - These just generate warnings
        # Check that the locus exists in the reference base.
        # NOTE: this also drops the empty-SNP sentinel line (-1, "-"), leaving
        # that strain with an empty dict, which reconstructs as the reference.
        if not locus in refData:
            print_all("*** WARNING: Encountered SNP loci {}, which does NOT exist in base ref sequence! (line {} of SNP input file) Skipping...".format(locus,lineNumber))
            warnings += 1
            continue
        # Check that the locus doesn't already exist for this strain
        if locus in snpData[currentStrain]:
            print_all("*** WARNING: Duplicate loci found for same strain id! (line {} of SNP input file, colliding on loci {}) Skipping (will use existing loci)...".format(lineNumber,locus))
            warnings += 1
            continue
        # Load into SNP dictionary
        print_debug("Diag: read in {}, {}, {}\n".format(strainid,locus,base))
        snpData[currentStrain][locus]=base
    print
    infile.close()
    print_all("Read {} loci entries from snp data file.".format(lineNumber))
    seqRecordList = []
    # Store the reference base sequence as the first output in the sequence record array
    print_all("Generating reference sequence record.")
    refKeys = refData.keys()
    refKeys.sort()
    refSequenceString = "".join([ refData[locus] for locus in refKeys])
    # NOTE(review): Bio.Alphabet.Alphabet is passed as a class, not an
    # instance -- Biopython tolerates this, but verify against the Bio.Seq docs.
    refSeq = Bio.Seq.Seq(refSequenceString,Bio.Alphabet.Alphabet)
    refSeqRecord = Bio.SeqRecord.SeqRecord(refSeq,id="REF",description="")
    seqRecordList.append(refSeqRecord)
    ##################
    # MULTIPROCESSING -- Reconstruct the input strains in parallel and store those into the sequence record array.
    ##################
    # Setup a lock -- this is just to neatly interleave output to the log file (otherwise missing or garbles lines can occur).
    logFileLock = multiprocessing.Lock()
    resultTable = {} # Dictionary of strainID => base sequence string. Filled by multiprocesing call.
    mpPool = multiprocessing.Pool(processes=cpu_num,initializer=mp_init_worker)
    print_all("Submitting strain reconstruction requests to pool...")
    try:
        # Fan out one reconstruction task per strain; results arrive via
        # mp_store_result_callback in whatever order workers finish.
        for strainID in strainNameList:
            mpPool.apply_async(mp_reconstruct_strain,args=(strainID,snpData[strainID],refData),callback=mp_store_result_callback)
    except KeyboardInterrupt:
        # Workers ignore SIGINT (see mp_init_worker); the main process
        # handles Ctrl-C by tearing the pool down.
        print "Aborting..."
        mpPool.terminate()
        mpPool.join()
    else:
        print_all("Waiting for working processes to return...")
        mpPool.close()
        mpPool.join()
    # Process results from multiprocessing.
    # Namely this means generating the list of SeqRecords (in same order as the snp input file)
    # to write to the output file with Biopython.
    print_all("Processing results...")
    for strainID in strainNameList:
        # NOTE(review): if a worker returned an Exception for this strain,
        # this lookup/usage is not guarded -- confirm intended error handling.
        snpSequenceString = resultTable[strainID]
        snpSeq = Bio.Seq.Seq(snpSequenceString,Bio.Alphabet.Alphabet)
        snpSeqRecord = Bio.SeqRecord.SeqRecord(snpSeq,id=strainID,description="")
        seqRecordList.append(snpSeqRecord)
    # Write it all out.
    print_all("Writing output file...")
    Bio.SeqIO.write(seqRecordList,outfilename,"fasta")
    print_all("\n{} strains processed from SNP loci input file.".format(strainCount));
    print_all("\n{} warnings were detected.".format(warnings));
    print "Log file for this run can be found in {}\n".format(logfilename)