From ecc45e819f5e72b71ee777fa336da8c61a84758c Mon Sep 17 00:00:00 2001 From: "Filipe G. Vieira" <1151762+fgvieira@users.noreply.github.com> Date: Wed, 4 Oct 2023 15:55:48 +0200 Subject: [PATCH] fix: Fix MultiQC output file issues (#1796) ### Description ### QC * [x] I confirm that: For all wrappers added by this PR, * there is a test case which covers any introduced changes, * `input:` and `output:` file paths in the resulting rule can be changed arbitrarily, * either the wrapper can only use a single core, or the example rule contains a `threads: x` statement with `x` being a reasonable default, * rule names in the test case are in [snake_case](https://en.wikipedia.org/wiki/Snake_case) and somehow tell what the rule is about or match the tool's purpose or name (e.g., `map_reads` for a step that maps reads), * all `environment.yaml` specifications follow [the respective best practices](https://stackoverflow.com/a/64594513/2352071), * wherever possible, command line arguments are inferred and set automatically (e.g. based on file extensions in `input:` or `output:`), * all fields of the example rules in the `Snakefile`s and their entries are explained via comments (`input:`/`output:`/`params:` etc.), * `stderr` and/or `stdout` are logged correctly (`log:`), depending on the wrapped tool, * temporary files are either written to a unique hidden folder in the working directory, or (better) stored where the Python function `tempfile.gettempdir()` points to (see [here](https://docs.python.org/3/library/tempfile.html#tempfile.gettempdir); this also means that using any Python `tempfile` default behavior works), * the `meta.yaml` contains a link to the documentation of the respective tool or command, * `Snakefile`s pass the linting (`snakemake --lint`), * `Snakefile`s are formatted with [snakefmt](https://github.com/snakemake/snakefmt), * Python wrapper scripts are formatted with [black](https://black.readthedocs.io). 
* Conda environments use a minimal number of channels, in recommended ordering. E.g. for bioconda, use (conda-forge, bioconda, nodefaults), as conda-forge should have highest priority and defaults channels are usually not needed because most packages are in conda-forge nowadays. --- bio/multiqc/environment.yaml | 2 +- bio/multiqc/meta.yaml | 5 +++-- bio/multiqc/test/Snakefile | 21 +++++++++++-------- bio/multiqc/wrapper.py | 40 +++++++++++++++++++++--------------- test.py | 2 +- 5 files changed, 41 insertions(+), 29 deletions(-) diff --git a/bio/multiqc/environment.yaml b/bio/multiqc/environment.yaml index 26933abdfe..41e10942a2 100644 --- a/bio/multiqc/environment.yaml +++ b/bio/multiqc/environment.yaml @@ -3,4 +3,4 @@ channels: - bioconda - nodefaults dependencies: - - multiqc =1.15 + - multiqc =1.16 diff --git a/bio/multiqc/meta.yaml b/bio/multiqc/meta.yaml index f1df113718..7544c6b087 100644 --- a/bio/multiqc/meta.yaml +++ b/bio/multiqc/meta.yaml @@ -6,7 +6,8 @@ authors: - Julian de Ruiter input: - input directory containing qc files, default behaviour is to extract folder path from the provided files or parent folder if a folder is provided. 
-params: - - use_input_files_only: if this variable is set to True input will be used as it is, i.e no folder will be extract from provided file names output: - qc report (html) + - multiqc data folder or zip (optional) +params: + - use_input_files_only: if this variable is set to True input will be used as it is, i.e no folder will be extract from provided file names diff --git a/bio/multiqc/test/Snakefile b/bio/multiqc/test/Snakefile index bebe3a9d47..d28554ec6c 100644 --- a/bio/multiqc/test/Snakefile +++ b/bio/multiqc/test/Snakefile @@ -1,24 +1,27 @@ rule multiqc_dir: input: - expand("samtools_stats/{sample}.txt", sample=["a", "b"]) + expand("samtools_stats/{sample}.txt", sample=["a", "b"]), output: - "qc/multiqc.html" + "qc/multiqc.html", + directory("qc/multiqc_data"), params: - extra="" # Optional: extra parameters for multiqc. + extra="--data-dir", # Optional: extra parameters for multiqc. log: - "logs/multiqc.log" + "logs/multiqc.log", wrapper: "master/bio/multiqc" + rule multiqc_file: input: - expand("samtools_stats/{sample}.txt", sample=["a"]) + expand("samtools_stats/{sample}.txt", sample=["a"]), output: - "qc/multiqc_a.html" + "qc/multiqc.a.html", + "qc/multiqc.a_data.zip", params: - extra="", # Optional: extra parameters for multiqc. - use_input_files_only=True, # Optional, use only a.txt and don't search folder samtools_stats for files + extra="--zip-data-dir", # Optional: extra parameters for multiqc. 
+ use_input_files_only=True, # Optional, use only a.txt and don't search folder samtools_stats for files log: - "logs/multiqc.log" + "logs/multiqc.log", wrapper: "master/bio/multiqc" diff --git a/bio/multiqc/wrapper.py b/bio/multiqc/wrapper.py index 3b92e5efea..aa35064694 100644 --- a/bio/multiqc/wrapper.py +++ b/bio/multiqc/wrapper.py @@ -6,31 +6,39 @@ __license__ = "MIT" -from os import path - +import tempfile +from pathlib import Path from snakemake.shell import shell extra = snakemake.params.get("extra", "") +log = snakemake.log_fmt_shell(stdout=True, stderr=True) + + # Set this to False if multiqc should use the actual input directly # instead of parsing the folders where the provided files are located use_input_files_only = snakemake.params.get("use_input_files_only", False) - if not use_input_files_only: - input_data = set(path.dirname(fp) for fp in snakemake.input) + input_data = set(Path(fp).parent for fp in snakemake.input) else: input_data = set(snakemake.input) -output_dir = path.dirname(snakemake.output[0]) -output_name = path.basename(snakemake.output[0]) -log = snakemake.log_fmt_shell(stdout=True, stderr=True) -shell( - "multiqc" - " {extra}" - " --force" - " -o {output_dir}" - " -n {output_name}" - " {input_data}" - " {log}" -) +with tempfile.TemporaryDirectory() as tmpdir: + shell( + "multiqc" + " {extra}" + " --outdir {tmpdir}" + " --filename out" + " {input_data}" + " {log}" + ) + + for output in snakemake.output: + if output.endswith("_data"): + ext = "_data" + elif output.endswith(".zip"): + ext = "_data.zip" + else: + ext = Path(output).suffix + shell("mv {tmpdir}/out{ext} {output}") diff --git a/test.py b/test.py index 054b9bd8cc..fc7b9cdb68 100644 --- a/test.py +++ b/test.py @@ -3200,7 +3200,7 @@ def test_multiqc(): def test_multiqc_a(): run( "bio/multiqc", - ["snakemake", "--cores", "1", "qc/multiqc_a.html", "--use-conda", "-F"], + ["snakemake", "--cores", "1", "qc/multiqc.a.html", "--use-conda", "-F"], )