feat: add nonpareil wrapper (#1294)

### Description  ### QC  * [x] I confirm that: For all wrappers added by this PR, * there is a test case which covers any introduced changes, * `input:` and `output:` file paths in the resulting rule can be changed arbitrarily, * either the wrapper can only use a single core, or the example rule contains a `threads: x` statement with `x` being a reasonable default, * rule names in the test case are in [snake_case](https://en.wikipedia.org/wiki/Snake_case) and somehow tell what the rule is about or match the tools purpose or name (e.g., `map_reads` for a step that maps reads), * all `environment.yaml` specifications follow [the respective best practices](https://stackoverflow.com/a/64594513/2352071), * wherever possible, command line arguments are inferred and set automatically (e.g. based on file extensions in `input:` or `output:`), * all fields of the example rules in the `Snakefile`s and their entries are explained via comments (`input:`/`output:`/`params:` etc.), * `stderr` and/or `stdout` are logged correctly (`log:`), depending on the wrapped tool, * temporary files are either written to a unique hidden folder in the working directory, or (better) stored where the Python function `tempfile.gettempdir()` points to (see [here](https://docs.python.org/3/library/tempfile.html#tempfile.gettempdir); this also means that using any Python `tempfile` default behavior works), * the `meta.yaml` contains a link to the documentation of the respective tool or command, * `Snakefile`s pass the linting (`snakemake --lint`), * `Snakefile`s are formatted with [snakefmt](https://github.com/snakemake/snakefmt), * Python wrapper scripts are formatted with [black](https://black.readthedocs.io). * Conda environments use a minimal amount of channels, in recommended ordering. E.g. for bioconda, use (conda-forge, bioconda, nodefaults, as conda-forge should have highest priority and defaults channels are usually not needed because most packages are in conda-forge nowadays).
snakemake · May 13, 2023 · 53318ed · 53318ed
1 parent 1b396df
commit 53318ed
Show file tree

Hide file tree

Showing 13 changed files with 212 additions and 5 deletions.
diff --git a/bio/nonpareil/environment.yaml b/bio/nonpareil/environment.yaml
@@ -0,0 +1,6 @@
+channels:
+  - conda-forge
+  - bioconda
+  - nodefaults
+dependencies:
+  - nonpareil =3.4.1
diff --git a/bio/nonpareil/meta.yaml b/bio/nonpareil/meta.yaml
@@ -0,0 +1,15 @@
+name: nonpareil
+description: Nonpareil uses the redundancy of the reads in metagenomic datasets to estimate the average coverage and predict the amount of sequences that will be required to achieve “nearly complete coverage”.
+url: https://nonpareil.readthedocs.io/en/latest/
+authors:
+  - Filipe G. Vieira
+input:
+  - reads in FASTA/Q format (can be gziped or bziped)
+output:
+  - redund_sum: redundancy summary TSV file with six columns, representing sequencing effort, summary of the distribution of redundancy (average redundancy, standard deviation, quartile 1, median, and quartile 3).
+  - redund_val: redundancy values TSV file with three columns (similar to redundancy summary, but provides ALL results), representing sequencing effort, ID of the replicate and estimated redundancy value.
+  - mate_distr: mate distribution file, with the number of reads in the dataset matching a query read.
+  - log: log of internal Nonpareil processing.
+params:
+  - alg: nonpareil algorithm, either `kmer` or `alignment` (mandatory).
+  - extra: additional program arguments
diff --git a/bio/nonpareil/test/Snakefile b/bio/nonpareil/test/Snakefile
@@ -0,0 +1,15 @@
+rule nonpareil:
+    input:
+        "reads/{sample}",
+    output:
+        redund_sum="results/{sample}.npo",
+        redund_val="results/{sample}.npa",
+        mate_distr="results/{sample}.npc",
+        log="results/{sample}.log",
+    log:
+        "logs/{sample}.log",
+    params:
+        alg="kmer",
+        extra="-X 1 -k 3 -F",
+    wrapper:
+        "master/bio/nonpareil"
diff --git a/bio/nonpareil/test/reads/a.fa b/bio/nonpareil/test/reads/a.fa
@@ -0,0 +1,30 @@
+>1
+ACGGCAT
+>2
+ACGGCAT
+>3
+ACGGCAT
+>4
+ACGGCAT
+>5
+ACGGCAT
+>6
+ACGGCAT
+>7
+ACGGCAT
+>8
+ACGGCAT
+>9
+ACGGCAT
+>10
+ACGGCAT
+>11
+ACGGCAT
+>12
+ACGGCAT
+>13
+ACGGCAT
+>14
+ACGGCAT
+>15
+ACGGCAT
diff --git a/bio/nonpareil/test/reads/a.fas.bz2 b/bio/nonpareil/test/reads/a.fas.bz2
diff --git a/bio/nonpareil/test/reads/a.fasta.gz b/bio/nonpareil/test/reads/a.fasta.gz
diff --git a/bio/nonpareil/test/reads/a.fastq.gz b/bio/nonpareil/test/reads/a.fastq.gz
diff --git a/bio/nonpareil/test/reads/a.fq b/bio/nonpareil/test/reads/a.fq
@@ -0,0 +1,60 @@
+@1
+ACGGCAT
++
+!!!!!!!
+@2
+ACGGCAT
++
+!!!!!!!
+@3
+ACGGCAT
++
+!!!!!!!
+@4
+ACGGCAT
++
+!!!!!!!
+@5
+ACGGCAT
++
+!!!!!!!
+@6
+ACGGCAT
++
+!!!!!!!
+@7
+ACGGCAT
++
+!!!!!!!
+@8
+ACGGCAT
++
+!!!!!!!
+@9
+ACGGCAT
++
+!!!!!!!
+@10
+ACGGCAT
++
+!!!!!!!
+@11
+ACGGCAT
++
+!!!!!!!
+@12
+ACGGCAT
++
+!!!!!!!
+@13
+ACGGCAT
++
+!!!!!!!
+@14
+ACGGCAT
++
+!!!!!!!
+@15
+ACGGCAT
++
+!!!!!!!
diff --git a/bio/nonpareil/test/reads/a.fq.bz2 b/bio/nonpareil/test/reads/a.fq.bz2
diff --git a/bio/nonpareil/wrapper.py b/bio/nonpareil/wrapper.py
@@ -0,0 +1,64 @@
+__author__ = "Filipe G. Vieira"
+__copyright__ = "Copyright 2023, Filipe G. Vieira"
+__license__ = "MIT"
+
+from os import path
+import tempfile
+from snakemake.shell import shell
+
+log = snakemake.log_fmt_shell(stdout=True, stderr=True)
+extra = snakemake.params.get("extra", "")
+
+uncomp = ""
+in_name, in_ext = path.splitext(snakemake.input[0])
+if in_ext in [".gz", ".bz2"]:
+    uncomp = "zcat" if in_ext == ".gz" else "bzcat"
+    in_name, in_ext = path.splitext(in_name)
+
+# Infer output format
+if in_ext in [".fa", ".fas", ".fasta"]:
+    in_format = "fasta"
+elif in_ext in [".fq", ".fastq"]:
+    in_format = "fastq"
+else:
+    raise ValueError("invalid input format")
+
+# Redundancy summary
+redund_sum = snakemake.output.get("redund_sum", "")
+if redund_sum:
+    redund_sum = f"-o {redund_sum}"
+
+# Redundancy values
+redund_val = snakemake.output.get("redund_val", "")
+if redund_val:
+    redund_val = f"-a {redund_val}"
+
+# Mate distribution
+mate_distr = snakemake.output.get("mate_distr", "")
+if mate_distr:
+    mate_distr = f"-C {mate_distr}"
+
+# Log
+out_log = snakemake.output.get("log", "")
+if out_log:
+    out_log = f"-l {out_log}"
+
+
+with tempfile.NamedTemporaryFile() as tmp:
+    in_uncomp = snakemake.input[0]
+    if uncomp:
+        in_uncomp = tmp.name
+        shell("{uncomp} {snakemake.input[0]} > {in_uncomp}")
+
+    shell(
+        "nonpareil"
+        " -T {snakemake.params.alg}"
+        " -s {in_uncomp}"
+        " -f {in_format}"
+        " {extra}"
+        " {redund_sum}"
+        " {redund_val}"
+        " {mate_distr}"
+        " {out_log}"
+        " {log}"
+    )
diff --git a/bio/preseq/lc_extrap/meta.yaml b/bio/preseq/lc_extrap/meta.yaml
@@ -1,8 +1,7 @@
 name: preseq lc_extrap
 description: >
   ``preseq`` estimates the library complexity of existing sequencing data to then estimate the yield of future experiments based on their design.
-  For usage information, please see ``preseq``'s command line help (this seems more up to date than the available `documentation from 2014 <http://smithlabresearch.org/wp-content/uploads/manual.pdf>`_ ).
-  For more information about ``preseq``, also see the `source code <https://github.com/smithlabcode/preseq>`_.
+url: https://github.com/smithlabcode/preseq
 authors:
   - Antonie Vietor
 input:

diff --git a/bio/vsearch/wrapper.py b/bio/vsearch/wrapper.py
@@ -31,6 +31,4 @@
 output = [out for _, out in sorted(zip(out_gz or out_bz2, out_list))]
 
 
-shell(
-    "vsearch --threads {snakemake.threads}" " {input}" " {extra}" " {log}" " {output}"
-)
+shell("vsearch --threads {snakemake.threads} {input} {extra} {log} {output}")
diff --git a/test.py b/test.py
@@ -139,6 +139,26 @@ def run(wrapper, cmd, check_log=None):
             os.chdir(origdir)
 
 
+
+@skip_if_not_modified
+def test_nonpareil():
+    run(
+        "bio/nonpareil",
+        [
+            "snakemake",
+            "--cores",
+            "1",
+            "--use-conda",
+            "-F",
+            "results/a.fa.npo",
+            "results/a.fas.bz2.npo",
+            "results/a.fasta.gz.npo",
+            "results/a.fq.npo",
+            "results/a.fq.bz2.npo",
+            "results/a.fastq.gz.npo",
+        ]
+    )
+
 @skip_if_not_modified
 def test_indelqual():
     run(