fix: for vsearch, fixed bug with bz2 output and added general log (#1775)

### Description

Fixed bug with `bz2` output and added the general log to keep the console clean and be able to keep track of progress.

### QC

* [x] I confirm that:

For all wrappers added by this PR,

* there is a test case which covers any introduced changes,
* `input:` and `output:` file paths in the resulting rule can be changed arbitrarily,
* either the wrapper can only use a single core, or the example rule contains a `threads: x` statement with `x` being a reasonable default,
* rule names in the test case are in [snake_case](https://en.wikipedia.org/wiki/Snake_case) and somehow tell what the rule is about or match the tool's purpose or name (e.g., `map_reads` for a step that maps reads),
* all `environment.yaml` specifications follow [the respective best practices](https://stackoverflow.com/a/64594513/2352071),
* wherever possible, command line arguments are inferred and set automatically (e.g. based on file extensions in `input:` or `output:`),
* all fields of the example rules in the `Snakefile`s and their entries are explained via comments (`input:`/`output:`/`params:` etc.),
* `stderr` and/or `stdout` are logged correctly (`log:`), depending on the wrapped tool,
* temporary files are either written to a unique hidden folder in the working directory, or (better) stored where the Python function `tempfile.gettempdir()` points to (see [here](https://docs.python.org/3/library/tempfile.html#tempfile.gettempdir); this also means that using any Python `tempfile` default behavior works),
* the `meta.yaml` contains a link to the documentation of the respective tool or command,
* `Snakefile`s pass the linting (`snakemake --lint`),
* `Snakefile`s are formatted with [snakefmt](https://github.com/snakemake/snakefmt),
* Python wrapper scripts are formatted with [black](https://black.readthedocs.io).
* Conda environments use a minimal amount of channels, in recommended ordering. E.g.
for bioconda, use (conda-forge, bioconda, nodefaults), as conda-forge should have the highest priority and the defaults channel is usually not needed because most packages are in conda-forge nowadays.
snakemake · Dec 18, 2023 · 28e76d0 · 28e76d0
1 parent 0fd1007
commit 28e76d0
Show file tree

Hide file tree

Showing 5 changed files with 29 additions and 11 deletions.
diff --git a/bio/vsearch/environment.linux-64.pin.txt b/bio/vsearch/environment.linux-64.pin.txt
@@ -8,7 +8,8 @@ https://conda.anaconda.org/conda-forge/linux-64/libgomp-13.2.0-h807b86a_3.conda#
 https://conda.anaconda.org/conda-forge/linux-64/_openmp_mutex-4.5-2_gnu.tar.bz2#73aaf86a425cc6e73fcf236a5a46396d
 https://conda.anaconda.org/conda-forge/linux-64/libgcc-ng-13.2.0-h807b86a_3.conda#23fdf1fef05baeb7eadc2aed5fb0011f
 https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-hd590300_5.conda#69b8b6202a07720f448be700e300ccf4
-https://conda.anaconda.org/conda-forge/linux-64/gzip-1.13-hd590300_0.conda#cb8143aa2e59e9684c41dfdf74af38ac
 https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.2.13-hd590300_5.conda#f36c115f1ee199da648e0597ec2047ad
+https://conda.anaconda.org/conda-forge/linux-64/pbzip2-1.1.13-h1fcc475_2.conda#e1bf3c0868789f3ddf5d1aeb47bc60a6
+https://conda.anaconda.org/conda-forge/linux-64/pigz-2.8-h2797004_0.conda#1832561770273ca7cf52b989dd83e6c3
 https://conda.anaconda.org/conda-forge/linux-64/zlib-1.2.13-hd590300_5.conda#68c34ec6149623be41a1933ab996a209
 https://conda.anaconda.org/bioconda/linux-64/vsearch-2.26.1-h6a68c12_0.tar.bz2#c68cb1de8ca3df2c8fa9c3234d775d0c
diff --git a/bio/vsearch/environment.yaml b/bio/vsearch/environment.yaml
@@ -4,5 +4,5 @@ channels:
   - nodefaults
 dependencies:
   - vsearch =2.26.1
-  - gzip
-  - bzip2
+  - pigz
+  - pbzip2
diff --git a/bio/vsearch/meta.yaml b/bio/vsearch/meta.yaml
@@ -12,3 +12,4 @@ params:
   - extra: additional program arguments
 notes: |
   * Keys for `input` and `output` files need to match `vsearch` arguments, (e.g. input) `uchime_denovo`, `cluster_fast`, `fastx_uniques`, `maskfasta`, `fastq_convert`, `fastq_mergepairs`, or (e.g. output) `chimeras`, `fastaout`, `fastqout`, `output`.
+  * An extra `log` file (named `vsearch`) can be specified that will be passed to `vsearch` option `--log`.
diff --git a/bio/vsearch/test/Snakefile b/bio/vsearch/test/Snakefile
@@ -5,6 +5,7 @@ rule vsearch_cluster_fast:
         profile="out/cluster_fast/{sample}.profile",
     log:
         "logs/vsearch/cluster_fast/{sample}.log",
+        vsearch="out/maskfasta/{sample}.log",
     params:
         extra="--id 0.2 --sizeout --minseqlength 5",
     threads: 1
@@ -19,6 +20,7 @@ rule vsearch_maskfasta:
         output="out/maskfasta/{sample}.fasta",
     log:
         "logs/vsearch/maskfasta/{sample}.log",
+        vsearch="out/maskfasta/{sample}.log",
     params:
         extra="--hardmask",
     threads: 1
@@ -33,9 +35,10 @@ rule vsearch_fastx_uniques:
         fastqout="out/fastx_uniques/{sample}.fastq",
     log:
         "logs/vsearch/fastx_uniques/{sample}.log",
+        vsearch="out/fastx_uniques/{sample}.log",
     params:
         extra="--strand both --minseqlength 5",
-    threads: 2
+    threads: 1
     wrapper:
         "master/bio/vsearch"
 
@@ -47,6 +50,7 @@ rule vsearch_fastx_uniques_gzip:
         fastqout="out/fastx_uniques/{sample}.fastq.gz",
     log:
         "logs/vsearch/fastx_uniques/{sample}.log",
+        vsearch="out/fastx_uniques/{sample}.log",
     params:
         extra="--strand both --minseqlength 5",
     threads: 2
@@ -61,6 +65,7 @@ rule vsearch_fastx_uniques_bzip2:
         fastqout="out/fastx_uniques/{sample}.fastq.bz2",
     log:
         "logs/vsearch/fastx_uniques/{sample}.log",
+        vsearch="out/fastx_uniques/{sample}.log",
     params:
         extra="--strand both --minseqlength 5",
     threads: 2
@@ -75,6 +80,7 @@ rule vsearch_fastq_convert:
         fastqout="out/fastq_convert/{sample}.fastq",
     log:
         "logs/vsearch/fastq_convert/{sample}.log",
+        vsearch="out/fastq_convert/{sample}.log",
     params:
         extra="--fastq_ascii 33 --fastq_asciiout 64",
     threads: 2

diff --git a/bio/vsearch/wrapper.py b/bio/vsearch/wrapper.py
@@ -6,19 +6,25 @@
 
 
 extra = snakemake.params.get("extra", "")
-if snakemake.log:
-    log = f"--log {snakemake.log}"
+log = snakemake.log.get("vsearch", "")
+if log:
+    extra += f" --log {log}"
 
 
+# Parse input files
 input = " ".join([f"--{key} {value}" for key, value in snakemake.input.items()])
 
-
+# Parse output files
 out_list = list()
 for key, value in snakemake.output.items():
     if value.endswith(".gz"):
-        out_list.append(f"--{key} /dev/stdout | gzip > {value}")
+        out_list.append(
+            f"--{key} /dev/stdout | pigz --processes {snakemake.threads} --stdout > {value}"
+        )
     elif value.endswith(".bz2"):
-        out_list.append(f"--{key} /dev/stdout | bzip2 > {value}")
+        out_list.append(
+            f"--{key} /dev/stdout | pbzip2 -p{snakemake.threads} --compress --stdout > {value}"
+        )
     else:
         out_list.append(f"--{key} {value}")
 
@@ -28,7 +34,11 @@
 assert sum(out_gz + out_bz2) <= 1, "only one output can be compressed"
 
 # Move compressed file (if any) to last
-output = [out for _, out in sorted(zip(out_gz or out_bz2, out_list))]
+output = [
+    out for _, out in sorted(zip([x | y for x, y in zip(out_gz, out_bz2)], out_list))
+]
 
 
-shell("vsearch --threads {snakemake.threads} {input} {extra} {log} {output}")
+shell(
+    "(vsearch --threads {snakemake.threads} {input} {extra} {output}) 2> {snakemake.log[0]}"
+)