sourmash-bio · ctb · Nov 28, 2023 · Nov 28, 2023 · Nov 28, 2023 · Nov 28, 2023
diff --git a/doc/faq.md b/doc/faq.md
@@ -245,3 +245,14 @@ read mapping between the metagenome and the relevant reference genome
 or, if you are interested in retrieving accessory elements, you can
 try out
 [spacegraphcats](https://spacegraphcats.github.io/spacegraphcats/02-spacegraphcats-use-cases/).
+
+## How does memory usage for sourmash change with k-mer size?
+
+sourmash hashes k-mers into 64-bit numbers, so the size of what is
+stored is independent of the k-mer size. The only impact of k-mer size
+on sourmash behavior is then more on the biology side - how many
+matches do you gain (or lose) with that k-mer size? And do you have a
+lot of new k-mers that pop up with a longer k-mer size (e.g. because
+of included variation)? These questions must be answered by experimentation
+and may be data-set specific.
+
diff --git a/doc/sidebar.md b/doc/sidebar.md
@@ -38,6 +38,7 @@ X and Linux. They require about 5 GB of disk space and 5 GB of RAM.
 * [Publications about sourmash](publications.md)
 * [A guide to the internal design and structure of sourmash](sourmash-internals.md)
 * [Funding acknowledgements](funding.md)
+* [Frequently asked questions](faq.md)
 
 ## Developing and extending sourmash
 

diff --git a/src/sourmash/minhash.py b/src/sourmash/minhash.py
@@ -406,7 +406,7 @@ def kmers_and_hashes(self, sequence, *, force=False, is_protein=False):
         if translate:
             # forward AND reverse complement => twice the k-mers
             n_kmers = (len(sequence) - ksize + 1) * 2
-            assert n_kmers == len(hashvals)
+            assert n_kmers == len(hashvals), (n_kmers, len(hashvals))
 
             # generate reverse complement of sequence
             seqrc = screed.rc(sequence)

diff --git a/src/sourmash/sig/__main__.py b/src/sourmash/sig/__main__.py
@@ -1169,7 +1169,7 @@
                     # output matching k-mers:
                     if kmer_w:
                         seq = record.sequence
-                        kh_iter = seq_mh.kmers_and_hashes(seq, force=False,
+                        kh_iter = seq_mh.kmers_and_hashes(seq, force=True,
                                                           is_protein=is_protein)
                         for kmer, hashval in kh_iter:
                             if hashval in query_mh.hashes:

diff --git a/tests/test_cmd_signature.py b/tests/test_cmd_signature.py
@@ -4521,6 +4521,76 @@ def test_sig_kmers_2_hp(runtmp):
     assert check_mh2.similarity(mh) == 1.0
 
 
+def test_sig_kmers_3_bad_dna_ignore(runtmp):
+    # test sig kmers on dna w/bad DNA ('N') - should be ignored.
+    seqfile = utils.get_test_data('short.bad.fa')
+
+    runtmp.sourmash('sketch', 'dna', seqfile, '-p', 'scaled=1')
+    ss = sourmash.load_one_signature(runtmp.output('short.bad.fa.sig'))
+    mh = ss.minhash
+    assert mh.moltype == 'DNA'
+
+    runtmp.sourmash('sig', 'kmers', '--sig', 'short.bad.fa.sig',
+                    '--seq', seqfile,
+                    '--save-kmers', 'short.csv',
+                    '--save-sequences', 'matched.fa')
+
+    out = runtmp.last_result.out
+    print(out)
+    err = runtmp.last_result.err
+    print(err)
+
+    assert 'total hashes in merged signature: 795' in err
+    assert 'found 795 distinct matching hashes (100.0%)' in err
+
+    # check FASTA output
+    assert os.path.exists(runtmp.output('matched.fa'))
+    with screed.open(runtmp.output('matched.fa')) as f:
+        records = list(f)
+    assert len(records) == 1
+    assert len(records[0].sequence) == 1012, len(records[0].sequence)
+
+    seq_mh = mh.copy_and_clear()
+    for record in records:
+        seq_mh.add_sequence(record.sequence, force=True)
+    assert seq_mh.similarity(mh) == 1.0
+
+    # check CSV output w/k-mers and hashes etc
+    assert os.path.exists(runtmp.output('short.csv'))
+    with open(runtmp.output('short.csv'), newline='') as fp:
+        r = csv.DictReader(fp)
+        rows = list(r)
+        assert len(rows) == 795
+
+    check_mh = mh.copy_and_clear()
+    check_mh2 = mh.copy_and_clear()
+    for row in rows:
+        check_mh.add_sequence(row['kmer'], force=True)
+        check_mh2.add_hash(int(row['hashval']))
+    assert check_mh.similarity(mh) == 1.0
+    assert check_mh2.similarity(mh) == 1.0
+
+
+def test_sig_kmers_3_bad_dna_fail(runtmp):
+    # test sig kmers on dna w/bad DNA ('N') and --check-seq - should fail
+    seqfile = utils.get_test_data('short.bad.fa')
+
+    runtmp.sourmash('sketch', 'dna', seqfile, '-p', 'scaled=1')
+    ss = sourmash.load_one_signature(runtmp.output('short.bad.fa.sig'))
+    mh = ss.minhash
+    assert mh.moltype == 'DNA'
+
+    with pytest.raises(SourmashCommandFailed):
+        runtmp.sourmash('sig', 'kmers', '--sig', 'short.bad.fa.sig',
+                        '--seq', seqfile,
+                        '--save-kmers', 'short.csv',
+                        '--save-sequences', 'matched.fa',
+                        '--check-sequence')
+
+    assert "ERROR in sequence 'shortName'" in runtmp.last_result.err
+    assert "invalid DNA character in input k-mer: TCTGATCTCNGGATAAANAAGCGATCCCAGT" in runtmp.last_result.err
+
+
 def test_sig_check_1(runtmp):
     # basic check functionality
     sigfiles = glob.glob(utils.get_test_data('gather/GCF*.sig'))

diff --git a/tests/test_minhash.py b/tests/test_minhash.py
@@ -264,7 +264,7 @@ def test_seq_to_hashes_bad_kmers_as_zeroes_2():
     seq = "ATGAGAGACGATAGACAGATGACN"
 
     with pytest.raises(ValueError):
-        hashes = mh.seq_to_hashes(seq, bad_kmers_as_zeroes=True)
+        hashes = mh.seq_to_hashes(seq, bad_kmers_as_zeroes=True, force=False)
 
 
 def test_seq_to_hashes_translated_short():