Skip to content

Commit

Permalink
Adding --single-ends argument to the preprocessor #212
Browse files Browse the repository at this point in the history
  • Loading branch information
rafelafrance committed Jan 18, 2018
1 parent 0236a8d commit 9706f55
Show file tree
Hide file tree
Showing 2 changed files with 106 additions and 21 deletions.
45 changes: 27 additions & 18 deletions atram_preprocessor.py
Expand Up @@ -44,14 +44,14 @@ def preprocess(args):
def load_seqs(args, db_conn):
"""Load sequences from a fasta/fastq files into the atram database."""
# We have to clamp the end suffix depending on the file type.
for (arg, clamp) in [('mixed_ends', None), ('end_1', '1'),
for (arg, clamp) in [('mixed_ends', ''), ('end_1', '1'),
('end_2', '2'), ('single_ends', '')]:
if args.get(arg):
for file_name in args[arg]:
load_one_file(db_conn, file_name, clamp)
load_one_file(db_conn, file_name, arg, clamp)


def load_one_file(db_conn, file_name, seq_end_clamp=''):
def load_one_file(db_conn, file_name, arg, seq_end_clamp=''):
"""Load sequences from a fasta/fastq file into the atram database."""
log.info('Loading "{}" into sqlite database'.format(file_name))

Expand All @@ -68,7 +68,10 @@ def load_one_file(db_conn, file_name, seq_end_clamp=''):
match = blast.PARSE_HEADER.match(title)
if match.group(2):
seq_name = match.group(1)
seq_end = match.group(2)
if arg == 'mixed_ends':
seq_end = match.group(2)
else:
seq_end = seq_end_clamp
else:
seq_name = title
seq_end = seq_end_clamp
Expand Down Expand Up @@ -183,26 +186,33 @@ def parse_command_line(temp_dir_default):

parser.add_argument('--mixed-ends', '-m', metavar='FASTA', nargs='+',
help='''Sequence read archive files that have a mix of
both end 1 and end 2 sequences. The sequence names
MUST have an end suffix like "/1" or "_2". The
files are in fasta or fastq format. You may enter
more than one file or you may use wildcards.''')
both end 1 and end 2 sequences (or single ends).
The files are in fasta or fastq format. You may
enter more than one file or you may use wildcards.
''')

parser.add_argument('--end-1', '-1', metavar='FASTA', nargs='+',
help='''Sequence read archive files that have only
end 1 sequences. The sequence names do not need an
end suffix, we will assume the suffix is 1 if it
is missing. The files are in fasta or fastq
format. You may enter more than one file or you
may use wildcards.''')
end suffix, we will assume the suffix is always 1.
The files are in fasta or fastq format. You may
enter more than one file or you may use wildcards.
''')

parser.add_argument('--end-2', '-2', metavar='FASTA', nargs='+',
help='''Sequence read archive files that have only
end 2 sequences. The sequence names do not need an
end suffix, we will assume the suffix is 2 if it
is missing. The files are in fasta or fastq
format. You may enter more than one file or you
may use wildcards.''')
end suffix, we will assume the suffix is always 2.
The files are in fasta or fastq format. You may
enter more than one file or you may use wildcards.
''')

parser.add_argument('--single-ends', '-S', metavar='FASTA', nargs='+',
help='''Sequence read archive files that have only
unpaired sequences. Any sequence suffix will be
ignored. The files are in fasta or fastq format.
You may enter more than one file or you may use
wildcards.''')

parser.add_argument('--version', action='version',
version='%(prog)s {}'.format(db.ATRAM_VERSION))
Expand All @@ -221,8 +231,7 @@ def parse_command_line(temp_dir_default):
cpus = min(10, os.cpu_count() - 4 if os.cpu_count() > 4 else 1)
group.add_argument('--cpus', '--processes', '--max-processes',
type=int, default=cpus,
help='''Number of CPU threads to use. This will also be
used for the assemblers when possible. On this
help='''Number of CPU threads to use. On this
machine the default is ("{}")'''.format(cpus))

group.add_argument('-t', '--temp-dir', metavar='DIR',
Expand Down
82 changes: 79 additions & 3 deletions tests/test_atram_preprocessor.py
Expand Up @@ -62,11 +62,11 @@ def test_preprocess(

@patch('lib.log.info')
@patch('lib.db.insert_sequences_batch')
def test_load_one_file_1(self, insert_sequences_batch, info):
def test_load_one_file_mixed(self, insert_sequences_batch, info):
db.BATCH_SIZE = 5

file_1 = join('tests', 'data', 'load_seq1.txt')
atram_preprocessor.load_one_file(self.db_conn, file_1)
atram_preprocessor.load_one_file(self.db_conn, file_1, 'mixed_ends')

msg = 'Loading "{}" into sqlite database'.format(file_1)
info.assert_called_once_with(msg)
Expand All @@ -85,13 +85,89 @@ def test_load_one_file_1(self, insert_sequences_batch, info):
('seq4', '2', 'AAAAAAAAAAGGGGGGGGGG')])]
insert_sequences_batch.assert_has_calls(calls)

@patch('lib.log.info')
@patch('lib.db.insert_sequences_batch')
def test_load_one_file_end1(self, insert_sequences_batch, info):
    """Check load_one_file when called with arg 'end_1' and clamp '1'.

    Every expected record carries the sequence end '1', and the loading
    message must have been logged exactly once.
    """
    db.BATCH_SIZE = 5

    fasta = join('tests', 'data', 'load_seq1.txt')
    atram_preprocessor.load_one_file(self.db_conn, fasta, 'end_1', '1')

    info.assert_called_once_with(
        'Loading "{}" into sqlite database'.format(fasta))

    # (name, sequence) pairs for each expected insert call: five records
    # in the first call, four in the second. Note that the name 'seq5/3'
    # is expected to be stored whole.
    batches = (
        [('seq1', 'AAAAAAAAAA'),
         ('seq2', 'AAAAAAAAAAGGGGGGGGGG'),
         ('seq3', 'AAAAAAAAAA'),
         ('seq4', 'AAAAAAAAAA'),
         ('seq5/3', 'AAAAAAAAAAGGGGGGGGGG')],
        [('seq1', 'AAAAAAAAAA'),
         ('seq2', 'AAAAAAAAAAGGGGGGGGGG'),
         ('seq3', 'AAAAAAAAAA'),
         ('seq4', 'AAAAAAAAAAGGGGGGGGGG')])
    expected = [
        call(self.db_conn, [(name, '1', seq) for name, seq in batch])
        for batch in batches]
    insert_sequences_batch.assert_has_calls(expected)

@patch('lib.log.info')
@patch('lib.db.insert_sequences_batch')
def test_load_one_file_end2(self, insert_sequences_batch, info):
    """Check load_one_file when called with arg 'end_2' and clamp '2'.

    Every expected record carries the sequence end '2', and the loading
    message must have been logged exactly once.
    """
    db.BATCH_SIZE = 5

    fasta = join('tests', 'data', 'load_seq1.txt')
    atram_preprocessor.load_one_file(self.db_conn, fasta, 'end_2', '2')

    info.assert_called_once_with(
        'Loading "{}" into sqlite database'.format(fasta))

    # (name, sequence) pairs for each expected insert call: five records
    # in the first call, four in the second. Note that the name 'seq5/3'
    # is expected to be stored whole.
    batches = (
        [('seq1', 'AAAAAAAAAA'),
         ('seq2', 'AAAAAAAAAAGGGGGGGGGG'),
         ('seq3', 'AAAAAAAAAA'),
         ('seq4', 'AAAAAAAAAA'),
         ('seq5/3', 'AAAAAAAAAAGGGGGGGGGG')],
        [('seq1', 'AAAAAAAAAA'),
         ('seq2', 'AAAAAAAAAAGGGGGGGGGG'),
         ('seq3', 'AAAAAAAAAA'),
         ('seq4', 'AAAAAAAAAAGGGGGGGGGG')])
    expected = [
        call(self.db_conn, [(name, '2', seq) for name, seq in batch])
        for batch in batches]
    insert_sequences_batch.assert_has_calls(expected)

@patch('lib.log.info')
@patch('lib.db.insert_sequences_batch')
def test_load_one_file_single(self, insert_sequences_batch, info):
    """Check load_one_file when called with arg 'single_ends' and clamp ''.

    Every expected record carries an empty sequence end, and the loading
    message must have been logged exactly once.
    """
    db.BATCH_SIZE = 5

    fasta = join('tests', 'data', 'load_seq1.txt')
    atram_preprocessor.load_one_file(
        self.db_conn, fasta, 'single_ends', '')

    info.assert_called_once_with(
        'Loading "{}" into sqlite database'.format(fasta))

    # (name, sequence) pairs for each expected insert call: five records
    # in the first call, four in the second. Note that the name 'seq5/3'
    # is expected to be stored whole.
    batches = (
        [('seq1', 'AAAAAAAAAA'),
         ('seq2', 'AAAAAAAAAAGGGGGGGGGG'),
         ('seq3', 'AAAAAAAAAA'),
         ('seq4', 'AAAAAAAAAA'),
         ('seq5/3', 'AAAAAAAAAAGGGGGGGGGG')],
        [('seq1', 'AAAAAAAAAA'),
         ('seq2', 'AAAAAAAAAAGGGGGGGGGG'),
         ('seq3', 'AAAAAAAAAA'),
         ('seq4', 'AAAAAAAAAAGGGGGGGGGG')])
    expected = [
        call(self.db_conn, [(name, '', seq) for name, seq in batch])
        for batch in batches]
    insert_sequences_batch.assert_has_calls(expected)

@patch('lib.log.info')
@patch('lib.db.insert_sequences_batch')
def test_load_one_file_2(self, insert_sequences_batch, info):
db.BATCH_SIZE = 5

file_1 = join('tests', 'data', 'load_seq2.txt')
atram_preprocessor.load_one_file(self.db_conn, file_1)
atram_preprocessor.load_one_file(self.db_conn, file_1, 'mixed_ends')

msg = 'Loading "{}" into sqlite database'.format(file_1)
info.assert_called_once_with(msg)
Expand Down

0 comments on commit 9706f55

Please sign in to comment.