/
atram_preprocessor.py
executable file
·170 lines (135 loc) · 6.21 KB
/
atram_preprocessor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
#!/usr/bin/env python3
"""
Start the atram preprocessor.
This wrapper module parses the input arguments and passes them to the module
that does the actual preprocessing (core_preprocessor.py).
"""
import os
from os.path import join
import argparse
import textwrap
from datetime import date
import lib.db as db
import lib.util as util
import lib.blast as blast
from lib.core_preprocessor import preprocess
def parse_command_line():
    """Process command-line arguments.

    Build the argument parser for the preprocessor, parse the command line
    (arguments may also be read from a file given as ``@file``), and then
    post-process the result.

    Post-processing performed here, in order:
      * prepend ``--path`` to the ``PATH`` environment variable, if given;
      * gather every input file from the four read-file options and pass
        them, with the requested shard count, to
        ``blast.default_shard_count`` to fill in ``shard_count``;
      * call ``blast.make_blast_output_dir`` with the blast DB prefix;
      * call ``blast.find_program('makeblastdb')``;
      * call ``util.temp_dir_exists`` with the ``--temp-dir`` value.

    Returns:
        dict: The parsed arguments keyed by argparse destination name
        (e.g. ``end_1``, ``blast_db``, ``shard_count``).
    """
    description = """
        This script prepares data for use by the atram.py
        script. It takes fasta or fastq files of paired-end (or
        single-end) sequence reads and creates a set of atram
        databases.
        You need to prepare the sequence read archive files so that the
        header lines contain only a sequence ID with the optional
        paired-end suffix at the end of the header line. The separator
        for the optional trailing paired-end suffix may be a space,
        a slash "/", a dot ".", or an underscore "_".
        For example:
        >DBRHHJN1:427:H9YYAADXX:1:1101:10001:77019/1
        GATTAA...
        >DBRHHJN1:427:H9YYAADXX:1:1101:10001:77019/2
        ATAGCC...
        >DBRHHJN1:427:H9YYAADXX:1:1101:10006:63769/2
        CGAAAA...
        """
    parser = argparse.ArgumentParser(
        fromfile_prefix_chars='@',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=textwrap.dedent(description))

    parser.add_argument('--version', action='version',
                        version='%(prog)s {}'.format(db.ATRAM_VERSION))

    parser.add_argument(
        '--end-1', '-1', metavar='FASTA_or_FASTQ', nargs='+',
        help="""Sequence read archive files that have only end 1 sequences. The
            sequence names do not need an end suffix, we will assume the suffix
            is always 1. The files are in fasta or fastq format. You may enter
            more than one file or you may use wildcards.
            """)

    parser.add_argument(
        '--end-2', '-2', metavar='FASTA_or_FASTQ', nargs='+',
        help="""Sequence read archive files that have only end 2 sequences.
            The sequence names do not need an end suffix, we will assume the
            suffix is always 2. The files are in fasta or fastq format. You may
            enter more than one file or you may use wildcards.
            """)

    parser.add_argument(
        '--mixed-ends', '-m', metavar='FASTA_or_FASTQ', nargs='+',
        help="""Sequence read archive files that have a mix of both end 1 and
            end 2 sequences (or single ends). The files are in fasta or fastq
            format. You may enter more than one file or you may use wildcards.
            """)

    parser.add_argument(
        '--single-ends', '-0', metavar='FASTA_or_FASTQ', nargs='+',
        help="""Sequence read archive files that have only unpaired sequences.
            Any sequence suffix will be ignored. The files are in fasta or
            fastq format. You may enter more than one file or you may use
            wildcards.
            """)

    group = parser.add_argument_group('preprocessor arguments')

    # The default DB prefix is dated so runs on different days do not collide.
    blast_db = join('.', 'atram_' + date.today().isoformat())
    group.add_argument(
        '-b', '--blast-db', '--db', default=blast_db, metavar='DB',
        help="""This is the prefix of all of the blast database files. So you
            can identify different blast database sets. You may include a
            directory as part of the prefix. The default is "{}".
            """.format(blast_db))

    # os.cpu_count() returns None when the count cannot be determined; treat
    # that as a single CPU instead of failing on a None-to-int comparison.
    # Leave 4 CPUs free when possible and never default to more than 10.
    cpu_total = os.cpu_count() or 1
    cpus = min(10, cpu_total - 4 if cpu_total > 4 else 1)
    group.add_argument(
        '--cpus', '--processes', '--max-processes', type=int, default=cpus,
        help="""Number of CPU threads to use. On this machine the default is
            ("{}")""".format(cpus))

    group.add_argument(
        '-t', '--temp-dir', metavar='DIR',
        help="""Place temporary files in this directory. All files will be
            deleted after aTRAM completes. The directory must exist.""")

    group.add_argument(
        '--keep-temp-dir', action='store_true',
        help="""This flag will keep the temporary files in the --temp-dir
            around for debugging.""")

    group.add_argument(
        '-l', '--log-file',
        help="""Log file (full path). The default is to use the DB and program
            name to come up with a name like "<DB>_atram_preprocessor.log".""")

    group.add_argument(
        '-s', '--shards', '--number', type=int, metavar='SHARDS',
        dest='shard_count',
        help="""Number of blast DB shards to create. The default is to have
            each shard contain roughly 250MB of sequence data.""")

    group.add_argument(
        '--path',
        help="""If blast or makeblastdb is not in your $PATH then use this to
            prepend directories to your path.""")

    group.add_argument(
        '--fasta', action='store_true',
        help="""Are these fasta files? If you do not specify either --fasta or
            --fastq then aTRAM will guess the file type by looking at the last
            character of the file name.""")

    group.add_argument(
        '--fastq', action='store_true',
        help="""Are these fastq files? If you do not specify either --fasta or
            --fastq then aTRAM will guess the file type by looking at the last
            character of the file name.""")

    group.add_argument(
        '--gzip', action='store_true',
        help="""Are these gzip files?""")

    group.add_argument(
        '--bzip', action='store_true',
        help="""Are these bzip files?""")

    args = vars(parser.parse_args())

    # Prepend to PATH environment variable if requested. Use the platform's
    # separator and tolerate an unset/empty PATH rather than raising KeyError.
    if args['path']:
        search_path = [args['path']]
        if os.environ.get('PATH'):
            search_path.append(os.environ['PATH'])
        os.environ['PATH'] = os.pathsep.join(search_path)

    # Collect every input file; options that were not given are None.
    all_files = []
    for ends in ['mixed_ends', 'end_1', 'end_2', 'single_ends']:
        if args.get(ends):
            all_files.extend(args[ends])

    args['shard_count'] = blast.default_shard_count(
        args['shard_count'], all_files)

    blast.make_blast_output_dir(args['blast_db'])

    blast.find_program('makeblastdb')

    util.temp_dir_exists(args['temp_dir'])

    return args
# Script entry point: parse the command line and hand the resulting argument
# dictionary straight to the core preprocessor.
if __name__ == '__main__':
    preprocess(parse_command_line())