Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP: Fix sig kmers #2856

Open
wants to merge 4 commits into
base: latest
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
11 changes: 11 additions & 0 deletions doc/faq.md
Original file line number Diff line number Diff line change
Expand Up @@ -245,3 +245,14 @@ read mapping between the metagenome and the relevant reference genome
or, if you are interested in retrieving accessory elements, you can
try out
[spacegraphcats](https://spacegraphcats.github.io/spacegraphcats/02-spacegraphcats-use-cases/).

## How does memory usage for sourmash change with k-mer size?

sourmash hashes k-mers into 64-bit numbers, so the size of what is
stored is independent of the k-mer size. The only impact of k-mer size
on sourmash behavior is then more on the biology side - how many
matches do you gain (or lose) with that k-mer size? And do you have a
lot of new k-mers that pop up with a longer k-mer size (e.g. because
of included variation)? These questions must be answered by experimentation
and may be data-set specific.

1 change: 1 addition & 0 deletions doc/sidebar.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ X and Linux. They require about 5 GB of disk space and 5 GB of RAM.
* [Publications about sourmash](publications.md)
* [A guide to the internal design and structure of sourmash](sourmash-internals.md)
* [Funding acknowledgements](funding.md)
* [Frequently asked questions](faq.md)

## Developing and extending sourmash

Expand Down
2 changes: 1 addition & 1 deletion src/sourmash/minhash.py
Original file line number Diff line number Diff line change
Expand Up @@ -406,7 +406,7 @@ def kmers_and_hashes(self, sequence, *, force=False, is_protein=False):
if translate:
# forward AND reverse complement => twice the k-mers
n_kmers = (len(sequence) - ksize + 1) * 2
assert n_kmers == len(hashvals)
assert n_kmers == len(hashvals), (n_kmers, len(hashvals))

# generate reverse complement of sequence
seqrc = screed.rc(sequence)
Expand Down
2 changes: 1 addition & 1 deletion src/sourmash/sig/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -1169,7 +1169,7 @@
# output matching k-mers:
if kmer_w:
seq = record.sequence
kh_iter = seq_mh.kmers_and_hashes(seq, force=False,
kh_iter = seq_mh.kmers_and_hashes(seq, force=True,

Check warning on line 1172 in src/sourmash/sig/__main__.py

View check run for this annotation

Codecov / codecov/patch

src/sourmash/sig/__main__.py#L1172

Added line #L1172 was not covered by tests
is_protein=is_protein)
for kmer, hashval in kh_iter:
if hashval in query_mh.hashes:
Expand Down
70 changes: 70 additions & 0 deletions tests/test_cmd_signature.py
Original file line number Diff line number Diff line change
Expand Up @@ -4521,6 +4521,76 @@ def test_sig_kmers_2_hp(runtmp):
assert check_mh2.similarity(mh) == 1.0


def test_sig_kmers_3_bad_dna_ignore(runtmp):
# test sig kmers on dna w/bad DNA ('N') - should be ignored.
seqfile = utils.get_test_data('short.bad.fa')

runtmp.sourmash('sketch', 'dna', seqfile, '-p', 'scaled=1')
ss = sourmash.load_one_signature(runtmp.output('short.bad.fa.sig'))
mh = ss.minhash
assert mh.moltype == 'DNA'

runtmp.sourmash('sig', 'kmers', '--sig', 'short.bad.fa.sig',
'--seq', seqfile,
'--save-kmers', 'short.csv',
'--save-sequences', 'matched.fa')

out = runtmp.last_result.out
print(out)
err = runtmp.last_result.err
print(err)

assert 'total hashes in merged signature: 795' in err
assert 'found 795 distinct matching hashes (100.0%)' in err

# check FASTA output
assert os.path.exists(runtmp.output('matched.fa'))
with screed.open(runtmp.output('matched.fa')) as f:
records = list(f)
assert len(records) == 1
assert len(records[0].sequence) == 1012, len(records[0].sequence)

seq_mh = mh.copy_and_clear()
for record in records:
seq_mh.add_sequence(record.sequence, force=True)
assert seq_mh.similarity(mh) == 1.0

# check CSV output w/k-mers and hashes etc
assert os.path.exists(runtmp.output('short.csv'))
with open(runtmp.output('short.csv'), newline='') as fp:
r = csv.DictReader(fp)
rows = list(r)
assert len(rows) == 795

check_mh = mh.copy_and_clear()
check_mh2 = mh.copy_and_clear()
for row in rows:
check_mh.add_sequence(row['kmer'], force=True)
check_mh2.add_hash(int(row['hashval']))
assert check_mh.similarity(mh) == 1.0
assert check_mh2.similarity(mh) == 1.0


def test_sig_kmers_3_bad_dna_fail(runtmp):
# test sig kmers on dna w/bad DNA ('N') and --check-seq - should fail
seqfile = utils.get_test_data('short.bad.fa')

runtmp.sourmash('sketch', 'dna', seqfile, '-p', 'scaled=1')
ss = sourmash.load_one_signature(runtmp.output('short.bad.fa.sig'))
mh = ss.minhash
assert mh.moltype == 'DNA'

with pytest.raises(SourmashCommandFailed):
runtmp.sourmash('sig', 'kmers', '--sig', 'short.bad.fa.sig',
'--seq', seqfile,
'--save-kmers', 'short.csv',
'--save-sequences', 'matched.fa',
'--check-sequence')

assert "ERROR in sequence 'shortName'" in runtmp.last_result.err
assert "invalid DNA character in input k-mer: TCTGATCTCNGGATAAANAAGCGATCCCAGT" in runtmp.last_result.err


def test_sig_check_1(runtmp):
# basic check functionality
sigfiles = glob.glob(utils.get_test_data('gather/GCF*.sig'))
Expand Down
2 changes: 1 addition & 1 deletion tests/test_minhash.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,7 +264,7 @@ def test_seq_to_hashes_bad_kmers_as_zeroes_2():
seq = "ATGAGAGACGATAGACAGATGACN"

with pytest.raises(ValueError):
hashes = mh.seq_to_hashes(seq, bad_kmers_as_zeroes=True)
hashes = mh.seq_to_hashes(seq, bad_kmers_as_zeroes=True, force=False)


def test_seq_to_hashes_translated_short():
Expand Down