From cb1372ba61ea3f92050b974e0dfa61e42b496806 Mon Sep 17 00:00:00 2001 From: "Filipe G. Vieira" <1151762+fgvieira@users.noreply.github.com> Date: Wed, 22 Mar 2023 11:19:44 +0100 Subject: [PATCH] feat: added wrapper for tadpole (#1152) ### Description Add wrapper for tadpole (from bbtools). ### QC * [x] I confirm that: For all wrappers added by this PR, * there is a test case which covers any introduced changes, * `input:` and `output:` file paths in the resulting rule can be changed arbitrarily, * either the wrapper can only use a single core, or the example rule contains a `threads: x` statement with `x` being a reasonable default, * rule names in the test case are in [snake_case](https://en.wikipedia.org/wiki/Snake_case) and somehow tell what the rule is about or match the tool's purpose or name (e.g., `map_reads` for a step that maps reads), * all `environment.yaml` specifications follow [the respective best practices](https://stackoverflow.com/a/64594513/2352071), * wherever possible, command line arguments are inferred and set automatically (e.g. based on file extensions in `input:` or `output:`), * all fields of the example rules in the `Snakefile`s and their entries are explained via comments (`input:`/`output:`/`params:` etc.), * `stderr` and/or `stdout` are logged correctly (`log:`), depending on the wrapped tool, * temporary files are either written to a unique hidden folder in the working directory, or (better) stored where the Python function `tempfile.gettempdir()` points to (see [here](https://docs.python.org/3/library/tempfile.html#tempfile.gettempdir); this also means that using any Python `tempfile` default behavior works), * the `meta.yaml` contains a link to the documentation of the respective tool or command, * `Snakefile`s pass the linting (`snakemake --lint`), * `Snakefile`s are formatted with [snakefmt](https://github.com/snakemake/snakefmt), * Python wrapper scripts are formatted with [black](https://black.readthedocs.io). 
* Conda environments use a minimal amount of channels, in recommended ordering. E.g. for bioconda, use (conda-forge, bioconda, nodefaults, as conda-forge should have highest priority and defaults channels are usually not needed because most packages are in conda-forge nowadays). --- bio/bbtools/bbduk/environment.yaml | 1 + bio/bbtools/bbduk/meta.yaml | 9 ++- bio/bbtools/bbduk/test/Snakefile | 2 +- bio/bbtools/tadpole/environment.yaml | 8 +++ bio/bbtools/tadpole/meta.yaml | 17 +++++ bio/bbtools/tadpole/test/Snakefile | 70 +++++++++++++++++++++ bio/bbtools/tadpole/test/reads/pe/a.1.fastq | 4 ++ bio/bbtools/tadpole/test/reads/pe/a.2.fastq | 4 ++ bio/bbtools/tadpole/test/reads/se/a.fastq | 4 ++ bio/bbtools/tadpole/wrapper.py | 49 +++++++++++++++ test.py | 17 +++++ 11 files changed, 179 insertions(+), 6 deletions(-) create mode 100644 bio/bbtools/tadpole/environment.yaml create mode 100644 bio/bbtools/tadpole/meta.yaml create mode 100644 bio/bbtools/tadpole/test/Snakefile create mode 100644 bio/bbtools/tadpole/test/reads/pe/a.1.fastq create mode 100644 bio/bbtools/tadpole/test/reads/pe/a.2.fastq create mode 100644 bio/bbtools/tadpole/test/reads/se/a.fastq create mode 100644 bio/bbtools/tadpole/wrapper.py diff --git a/bio/bbtools/bbduk/environment.yaml b/bio/bbtools/bbduk/environment.yaml index 6600f9af4c..66495824ab 100644 --- a/bio/bbtools/bbduk/environment.yaml +++ b/bio/bbtools/bbduk/environment.yaml @@ -4,4 +4,5 @@ channels: - nodefaults dependencies: - bbmap =39.01 + - python =3.11.0 - snakemake-wrapper-utils =0.5.0 diff --git a/bio/bbtools/bbduk/meta.yaml b/bio/bbtools/bbduk/meta.yaml index bfae87fc01..61e4378541 100644 --- a/bio/bbtools/bbduk/meta.yaml +++ b/bio/bbtools/bbduk/meta.yaml @@ -5,15 +5,14 @@ description: | authors: - Filipe G. Vieira input: - - sample: list of paths. 
Raw fastq file with R1 reads, raw fastq file with R2 reads (PE only, optional) + - sample: list of raw R1 and (if PE) R2 fastq file(s) output: - - trimmed: trimmed fastq file with R1 reads, trimmed fastq file with R2 reads (PE only, optional) + - trimmed: list of trimmed R1 and (if PE) R2 fastq file(s) - singleton: fastq file with singleton reads (optional) - discarded: fastq file with discarded reads (optional) - - stats: stats file (optonal) + - stats: stats file (optional) params: - - extra: Optional parameters + - extra: additional program arguments - adapters: Literal adapters sequences notes: | * The `java_opts` param allows for additional arguments to be passed to the java compiler, e.g. "-XX:ParallelGCThreads=10" (not for `-XmX` or `-Djava.io.tmpdir`, since they are handled automatically). - * The `extra` param allows for additional program arguments. diff --git a/bio/bbtools/bbduk/test/Snakefile b/bio/bbtools/bbduk/test/Snakefile index 06cb7ed99c..6af9c6d59e 100644 --- a/bio/bbtools/bbduk/test/Snakefile +++ b/bio/bbtools/bbduk/test/Snakefile @@ -26,7 +26,7 @@ rule bbduk_pe: discarded="trimmed/pe/{sample}.discarded.fastq", stats="trimmed/pe/{sample}.stats.txt", log: - "logs/fastp/pe/{sample}.log" + "logs/bbduk/pe/{sample}.log" params: extra = lambda w, input: "ref={},adapters,artifacts ktrim=r k=23 mink=11 hdist=1 tpe tbo trimpolygright=10 minlen=25 maxns=30 entropy=0.5 entropywindow=50 entropyk=5".format(input.adapters), threads: 7 diff --git a/bio/bbtools/tadpole/environment.yaml b/bio/bbtools/tadpole/environment.yaml new file mode 100644 index 0000000000..78cbccb6ae --- /dev/null +++ b/bio/bbtools/tadpole/environment.yaml @@ -0,0 +1,8 @@ +channels: + - conda-forge + - bioconda + - nodefaults +dependencies: + - bbmap =39.01 + - python =3.11.0 + - snakemake-wrapper-utils =0.5.2 diff --git a/bio/bbtools/tadpole/meta.yaml b/bio/bbtools/tadpole/meta.yaml new file mode 100644 index 0000000000..6d0f709b7a --- /dev/null +++ b/bio/bbtools/tadpole/meta.yaml @@ 
-0,0 +1,17 @@ +name: Tadpole +url: https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/tadpole-guide/ +description: | + Run Tadpole. +authors: + - Filipe G. Vieira +input: + - sample: list of R1 and (if PE) R2 fastq file(s) + - extra: kmer data, but not for error-correction or extension (optional) +output: + - out: output R1 and (if PE) R2 fastq file(s) + - discarded: fastq file with discarded reads (optional) +params: + - mode: Run mode (one of `contig`, `extend`, `correct`, `insert`, or `discard`; mandatory) + - extra: additional program arguments +notes: | + * The `java_opts` param allows for additional arguments to be passed to the java compiler, e.g. `-XX:ParallelGCThreads=10` (not for `-Xmx` or `-Djava.io.tmpdir`, since they are handled automatically). diff --git a/bio/bbtools/tadpole/test/Snakefile b/bio/bbtools/tadpole/test/Snakefile new file mode 100644 index 0000000000..2628da720f --- /dev/null +++ b/bio/bbtools/tadpole/test/Snakefile @@ -0,0 +1,70 @@ +rule tadpole_correct_se: + input: + sample=["reads/se/{sample}.fastq"], + output: + out="out/correct_se/{sample}.fastq.gz", + discarded="out/correct_se/{sample}.discarded.fastq.gz", + log: + "logs/correct_se/{sample}.log", + params: + mode="correct", + extra="", + threads: 2 + resources: + mem_mb=1024, + wrapper: + "master/bio/bbtools/tadpole" + + +rule tadpole_correct_pe: + input: + sample=["reads/pe/{sample}.1.fastq", "reads/pe/{sample}.2.fastq"], + output: + out=["out/correct_pe/{sample}.1.fastq", "out/correct_pe/{sample}.2.fastq"], + discarded="out/correct_pe/{sample}.discarded.fastq", + log: + "logs/correct_pe/{sample}.log", + params: + mode="correct", + extra="", + threads: 2 + resources: + mem_mb=1024, + wrapper: + "master/bio/bbtools/tadpole" + + +rule tadpole_extend_se: + input: + sample=["reads/se/{sample}.fastq"], + output: + out="out/extend_se/{sample}.fastq.gz", + discarded="out/extend_se/{sample}.discarded.fastq.gz", 
+ log: + "logs/extend_se/{sample}.log", + params: + mode="extend", + extra="", + threads: 2 + resources: + mem_mb=1024, + wrapper: + "master/bio/bbtools/tadpole" + + +rule tadpole_extend_pe: + input: + sample=["reads/pe/{sample}.1.fastq", "reads/pe/{sample}.2.fastq"], + output: + out=["out/extend_pe/{sample}.1.fastq", "out/extend_pe/{sample}.2.fastq"], + discarded="out/extend_pe/{sample}.discarded.fastq", + log: + "logs/extend_pe/{sample}.log", + params: + mode="extend", + extra="", + threads: 2 + resources: + mem_mb=1024, + wrapper: + "master/bio/bbtools/tadpole" diff --git a/bio/bbtools/tadpole/test/reads/pe/a.1.fastq b/bio/bbtools/tadpole/test/reads/pe/a.1.fastq new file mode 100644 index 0000000000..42735560ae --- /dev/null +++ b/bio/bbtools/tadpole/test/reads/pe/a.1.fastq @@ -0,0 +1,4 @@ +@1 +ACGGCAT ++ +!!!!!!! diff --git a/bio/bbtools/tadpole/test/reads/pe/a.2.fastq b/bio/bbtools/tadpole/test/reads/pe/a.2.fastq new file mode 100644 index 0000000000..42735560ae --- /dev/null +++ b/bio/bbtools/tadpole/test/reads/pe/a.2.fastq @@ -0,0 +1,4 @@ +@1 +ACGGCAT ++ +!!!!!!! diff --git a/bio/bbtools/tadpole/test/reads/se/a.fastq b/bio/bbtools/tadpole/test/reads/se/a.fastq new file mode 100644 index 0000000000..42735560ae --- /dev/null +++ b/bio/bbtools/tadpole/test/reads/se/a.fastq @@ -0,0 +1,4 @@ +@1 +ACGGCAT ++ +!!!!!!! diff --git a/bio/bbtools/tadpole/wrapper.py b/bio/bbtools/tadpole/wrapper.py new file mode 100644 index 0000000000..d10d1d4277 --- /dev/null +++ b/bio/bbtools/tadpole/wrapper.py @@ -0,0 +1,49 @@ +__author__ = "Filipe G. Vieira" +__copyright__ = "Copyright 2023, Filipe G. 
Vieira" +__license__ = "MIT" + + +from snakemake.shell import shell +from snakemake_wrapper_utils.java import get_java_opts + + +java_opts = get_java_opts(snakemake) +extra = snakemake.params.get("extra", "") +log = snakemake.log_fmt_shell(stdout=True, stderr=True) + + +assert snakemake.params.mode in ["contig", "extend", "correct", "insert", "discard"] + + +n = len(snakemake.input.sample) +assert ( + n == 1 or n == 2 +), "input->sample must have 1 (single-end) or 2 (paired-end) elements." + +if n == 1: + reads = "in={}".format(snakemake.input.sample) + out = "out={}".format(snakemake.output.out) +else: + reads = "in={} in2={}".format(*snakemake.input.sample) + out = "out={} out2={}".format(*snakemake.output.out) + + +in_extra = snakemake.input.get("extra", "") +if in_extra: + reads += f" extra={in_extra}" + + +discarded = snakemake.output.get("discarded", "") +if discarded: + out += f" outd={discarded}" + + +shell( + "tadpole.sh {java_opts}" + " threads={snakemake.threads}" + " mode={snakemake.params.mode}" + " {reads}" + " {extra}" + " {out}" + " {log}" +) diff --git a/test.py b/test.py index 632de6536f..f08d503b0e 100644 --- a/test.py +++ b/test.py @@ -140,6 +140,23 @@ def run(wrapper, cmd, check_log=None): +@skip_if_not_modified +def test_tadpole(): + run( + "bio/bbtools/tadpole", + [ + "snakemake", + "--cores", + "2", + "--use-conda", + "-F", + "out/correct_se/a.fastq.gz", + "out/correct_pe/a.1.fastq", + "out/extend_se/a.fastq.gz", + "out/extend_pe/a.1.fastq", + ], + ) + @skip_if_not_modified def test_seqkit_stats(): run(