/
atram.py
executable file
·214 lines (166 loc) · 7.84 KB
/
atram.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
#!/usr/bin/env python3
"""
Start atram.
This wrapper module parses the input arguments and passes them to the module
that does the actual processing (core_atram.py).
"""
import os
import argparse
import textwrap
import lib.db as db
import lib.log as log
import lib.bio as bio
import lib.util as util
import lib.blast as blast
import lib.assembler as assembly
from lib.core_atram import assemble
def parse_command_line():
"""Process command-line arguments."""
description = """
This is the aTRAM script. It takes a query sequence and a blast
database built with the atram_preprocessor.py script and builds an
assembly.
If you specify more than one query sequence and/or more than one blast
database then aTRAM will build one assembly for each query/blast
DB pair.
NOTE: You may use a text file to hold the command-line arguments
like: @/path/to/args.txt. This is particularly useful when specifying
multiple blast databases or multiple query sequences.
"""
parser = argparse.ArgumentParser(
fromfile_prefix_chars='@',
formatter_class=argparse.RawDescriptionHelpFormatter,
description=textwrap.dedent(description))
parser.add_argument('--version', action='version',
version='%(prog)s {}'.format(db.ATRAM_VERSION))
group = parser.add_argument_group('required arguments')
group.add_argument(
'-b', '--blast-db', '--sra', '--db', '--database',
required=True, metavar='DB', nargs='+',
help="""This needs to match the DB prefix you entered for
atram_preprocessor.py. You may repeat this argument to run the
--query sequence(s) against multiple blast databases.""")
group.add_argument(
'-q', '--query', '--target', '--probe', required=False, nargs='+',
help="""The path to the fasta file with sequences of interest. You may
repeat this argument. If you do then Each --query sequence file
will be run against every --blast-db.""")
group.add_argument(
'-Q', '--query-split', '--target-split', required=False, nargs='+',
help="""The path to the fasta file with multiple sequences of interest.
This will take every sequence in the fasta file and treat it as if
it were its own --query argument. So every sequence in
--query-split will be run against every --blast-db.""")
group.add_argument(
'-o', '--output-prefix', required=True,
help="""This is the prefix of all of the output files. So you can
identify different blast output file sets. You may include a
directory as part of the prefix. aTRAM will add suffixes to
differentiate output files.""")
group.add_argument(
'-a', '--assembler', default='none',
choices=['abyss', 'trinity', 'velvet', 'spades', 'none'],
help="""Which assembler to use. Choosing "none" (the default) will do
a single blast run and stop before any assembly.""")
group.add_argument(
'-i', '--iterations', type=int, default=5, metavar='N',
help="""The number of pipeline iterations. The default is "5".""")
group.add_argument(
'-p', '--protein', action='store_true',
help="""Are the query sequences protein? aTRAM will guess if you skip
this argument.""")
group.add_argument(
'--fraction', type=float, default=1.0,
help="""Use only the specified fraction of the aTRAM database. The
default is 1.0.""")
cpus = min(10, os.cpu_count() - 4 if os.cpu_count() > 4 else 1)
group.add_argument(
'--cpus', '--processes', '--max-processes', type=int, default=cpus,
help="""Number of CPU processors to use. This will also be used for
the assemblers when possible. We will use {} out of {} CPUs.
""".format(cpus, os.cpu_count()))
group.add_argument('--log-file', help="""Log file (full path)".""")
group.add_argument(
'--path',
help="""If the assembler or blast you want to use is not in your $PATH\
then use this to prepend directories to your path.""")
group.add_argument(
'-t', '--temp-dir', metavar='DIR',
help="""Place temporary files in this directory. All files will be
deleted after aTRAM completes. The directory must exist.""")
group.add_argument(
'--keep-temp-dir', action='store_true',
help="""This flag will keep the temporary files in the --temp-dir
around for debugging.""")
group.add_argument(
'-T', '--timeout', metavar='SECONDS', default=300, type=int,
help="""How many seconds to wait for an assembler before stopping the
run. To wait forever set this to 0. The default is "300"
(5 minutes).""")
group = parser.add_argument_group(
'optional values for blast-filtering contigs')
group.add_argument(
'--no-filter', action='store_true',
help="""Do not filter the assembled contigs. This will: set both the
--bit-score and --contig-length to 0""")
group.add_argument(
'--bit-score', type=float, default=70.0, metavar='SCORE',
help="""Remove contigs that have a value less than this. The default
is "70.0". This is turned off by the --no-filter argument.""")
group.add_argument(
'--contig-length', '--length', type=int, default=100,
help="""Remove blast hits that are shorter than this length. The
default is "100". This is turned off by the --no-filter argument.
""")
blast.command_line_args(parser)
assembly.command_line_args(parser)
args = vars(parser.parse_args())
check_query_args(args)
blast.check_args(args)
# Set defaults and adjust arguments based on other arguments
args['cov_cutoff'] = assembly.default_cov_cutoff(args['cov_cutoff'])
args['blast_db'] = blast.touchup_blast_db_names(args['blast_db'])
args['kmer'] = assembly.default_kmer(args['kmer'], args['assembler'])
args['max_target_seqs'] = blast.default_max_target_seqs(
args['max_target_seqs'], args['blast_db'], args['max_memory'])
setup_blast_args(args)
set_protein_arg(args)
setup_path_arg(args)
find_programs(args)
util.temp_dir_exists(args['temp_dir'], args.get('debug_dir'))
util.set_blast_batch_size(args['batch_size'])
return args
def setup_path_arg(args):
"""Prepend to PATH environment variable if requested."""
if args['path']:
os.environ['PATH'] = '{}:{}'.format(args['path'], os.environ['PATH'])
def setup_blast_args(args):
"""Set up the blast args."""
if args['no_filter']:
args['bit_score'] = 0
args['contig_length'] = 0
def check_query_args(args):
"""Validate the query arguments."""
if not args.get('query') and not args.get('query_split'):
err = 'You must have at least one --query or --query-split argument.'
log.fatal(err)
def set_protein_arg(args):
"""Set up the protein argument."""
if not args['protein'] and args['query']:
args['protein'] = bio.fasta_file_has_protein(args['query'])
def find_programs(args):
"""Make sure we can find the programs needed by the assembler and blast."""
blast.find_program('makeblastdb')
blast.find_program('tblastn')
blast.find_program('blastn')
assembly.find_program(
'abyss', 'bwa', args['assembler'], not args['no_long_reads'])
assembly.find_program('trinity', 'Trinity', args['assembler'])
assembly.find_program(
'trinity', 'Trinity', args['assembler'], args['bowtie2'])
assembly.find_program('velvet', 'velveth', args['assembler'])
assembly.find_program('velvet', 'velvetg', args['assembler'])
assembly.find_program('spades', 'spades.py', args['assembler'])
if __name__ == '__main__':
ARGS = parse_command_line()
assemble(ARGS)