fix: inconsistent output filenames trim-galore (#1451)

### Description Now output files can be changed arbitrarily. I think it closes #967 . ### QC * [ ] I confirm that: For all wrappers added by this PR, * there is a test case which covers any introduced changes, * `input:` and `output:` file paths in the resulting rule can be changed arbitrarily, * either the wrapper can only use a single core, or the example rule contains a `threads: x` statement with `x` being a reasonable default, * rule names in the test case are in [snake_case](https://en.wikipedia.org/wiki/Snake_case) and somehow tell what the rule is about or match the tools purpose or name (e.g., `map_reads` for a step that maps reads), * all `environment.yaml` specifications follow [the respective best practices](https://stackoverflow.com/a/64594513/2352071), * wherever possible, command line arguments are inferred and set automatically (e.g. based on file extensions in `input:` or `output:`), * all fields of the example rules in the `Snakefile`s and their entries are explained via comments (`input:`/`output:`/`params:` etc.), * `stderr` and/or `stdout` are logged correctly (`log:`), depending on the wrapped tool, * temporary files are either written to a unique hidden folder in the working directory, or (better) stored where the Python function `tempfile.gettempdir()` points to (see [here](https://docs.python.org/3/library/tempfile.html#tempfile.gettempdir); this also means that using any Python `tempfile` default behavior works), * the `meta.yaml` contains a link to the documentation of the respective tool or command, * `Snakefile`s pass the linting (`snakemake --lint`), * `Snakefile`s are formatted with [snakefmt](https://github.com/snakemake/snakefmt), * Python wrapper scripts are formatted with [black](https://black.readthedocs.io). * Conda environments use a minimal amount of channels, in recommended ordering. E.g. for bioconda, use (conda-forge, bioconda, nodefaults, as conda-forge should have highest priority and defaults channels are usually not needed because most packages are in conda-forge nowadays).
snakemake · Jun 26, 2023 · b915e37 · b915e37
1 parent 064acf8
commit b915e37
Show file tree

Hide file tree

Showing 8 changed files with 194 additions and 42 deletions.
diff --git a/bio/trim_galore/pe/test/Snakefile b/bio/trim_galore/pe/test/Snakefile
@@ -2,10 +2,27 @@ rule trim_galore_pe:
     input:
         ["reads/{sample}.1.fastq.gz", "reads/{sample}.2.fastq.gz"],
     output:
-        "trimmed/{sample}.1_val_1.fq.gz",
-        "trimmed/{sample}.1.fastq.gz_trimming_report.txt",
-        "trimmed/{sample}.2_val_2.fq.gz",
-        "trimmed/{sample}.2.fastq.gz_trimming_report.txt",
+        fasta_fwd="trimmed/{sample}_R1.fq.gz",
+        report_fwd="trimmed/reports/{sample}_R1_trimming_report.txt",
+        fasta_rev="trimmed/{sample}_R2.fq.gz",
+        report_rev="trimmed/reports/{sample}_R2_trimming_report.txt",
+    threads: 1
+    params:
+        extra="--illumina -q 20",
+    log:
+        "logs/trim_galore/{sample}.log",
+    wrapper:
+        "master/bio/trim_galore/pe"
+
+
+rule trim_galore_pe_uncompressed:
+    input:
+        ["reads/{sample}_R1.fastq", "reads/{sample}_R2.fastq"],
+    output:
+        fasta_fwd="trimmed/{sample}_R1.fastq",
+        report_fwd="trimmed/reports/{sample}_R1_trimming_report.txt",
+        fasta_rev="trimmed/{sample}_R2.fastq",
+        report_rev="trimmed/reports/{sample}_R2_trimming_report.txt",
     threads: 1
     params:
         extra="--illumina -q 20",

diff --git a/bio/trim_galore/pe/test/reads/a_R1.fastq b/bio/trim_galore/pe/test/reads/a_R1.fastq
@@ -0,0 +1,4 @@
+@1
+ACGGCAT
++
+!!!!!!!
diff --git a/bio/trim_galore/pe/test/reads/a_R2.fastq b/bio/trim_galore/pe/test/reads/a_R2.fastq
@@ -0,0 +1,4 @@
+@1
+ACGGCAT
++
+!!!!!!!
diff --git a/bio/trim_galore/pe/wrapper.py b/bio/trim_galore/pe/wrapper.py
@@ -7,14 +7,44 @@
 
 
 from snakemake.shell import shell
-import os.path
+import tempfile
+import re
+import os
 
 
-log = snakemake.log_fmt_shell()
+def report_filename(infile: str) -> str:
+    """Infer report output file name from input
+    >>> report_filename('reads/sample.1.fastq.gz')
+    'sample.1.fastq.gz_trimming_report.txt
+    >>> report_filename('reads/sample_R2.fastq.gz')
+    'sample_R2.fastq.gz_trimming_report.txt
+    """
+    return os.path.basename(infile) + "_trimming_report.txt"
+
+
+def fasta_filename(infile: str, infix: str, out_gzip: bool) -> str:
+    """Infer fasta output file name from input
+    >>> fasta_filename('reads/sample.1.fq.gz', infix = '_val_1', out_gzip = False)
+    'sample.1_val_1.fq.gz'
+    >>> fasta_filename('reads/sample_R2.fastq', infix = '_val_2', out_gzip = True)
+    'sample_R2_val_2.fq.gz'
+    """
+    base_input = os.path.basename(infile)
+    suffix = ".gz" if out_gzip or infile.endswith(".gz") else ""
+    REGEX_RULES = [r"\.fastq$", "\.fastq\.gz$", r"\.fq$", r"\.fq\.gz$"]
+    for regex in REGEX_RULES:
+        if re.search(regex, base_input):
+            return re.sub(regex, f"{infix}.fq", base_input) + suffix
+    return base_input + infix + suffix
+
+
+log = snakemake.log_fmt_shell(stdout=True, stderr=True)
+extra = snakemake.params.get("extra", "")
 
 # Check that two input files were supplied
 n = len(snakemake.input)
 assert n == 2, "Input must contain 2 files. Given: %r." % n
+infile_fwd, infile_rev = snakemake.input[0:2]
 
 # Don't run with `--fastqc` flag
 if "--fastqc" in snakemake.params.get("extra", ""):
@@ -30,20 +60,36 @@
 m = len(snakemake.output)
 assert m == 4, "Output must contain 4 files. Given: %r." % m
 
-# Check that all output files are in the same directory
-out_dir = os.path.dirname(snakemake.output[0])
-for file_path in snakemake.output[1:]:
-    assert out_dir == os.path.dirname(file_path), (
-        "trim_galore can only output files to a single directory."
-        " Please indicate only one directory for the output files."
+fasta_fwd, fasta_rev, report_fwd, report_rev = (
+    snakemake.output.get(key)
+    for key in ["fasta_fwd", "fasta_rev", "report_fwd", "report_rev"]
+)
+
+out_gzip = any((fasta_fwd.endswith("gz"), fasta_rev.endswith("gz")))
+if out_gzip:
+    extra += " --gzip"
+
+with tempfile.TemporaryDirectory() as tmpdir:
+    shell(
+        "(trim_galore"
+        " {extra}"
+        " --cores {snakemake.threads}"
+        " --paired"
+        " -o {tmpdir}"
+        " {infile_fwd} {infile_rev})"
+        " {log}"
     )
 
-shell(
-    "(trim_galore"
-    " {snakemake.params.extra}"
-    " --cores {snakemake.threads}"
-    " --paired"
-    " -o {out_dir}"
-    " {snakemake.input})"
-    " {log}"
-)
+    if report_fwd:
+        shell(f"mv {tmpdir}/{report_filename(infile_fwd)} {report_fwd}")
+    if report_rev:
+        shell(f"mv {tmpdir}/{report_filename(infile_rev)} {report_rev}")
+
+    if fasta_fwd:
+        shell(
+            f"mv {tmpdir}/{fasta_filename(infile_fwd, '_val_1', out_gzip)} {fasta_fwd}"
+        )
+    if fasta_rev:
+        shell(
+            f"mv {tmpdir}/{fasta_filename(infile_rev, '_val_2', out_gzip)} {fasta_rev}"
+        )
diff --git a/bio/trim_galore/se/test/Snakefile b/bio/trim_galore/se/test/Snakefile
@@ -2,8 +2,22 @@ rule trim_galore_se:
     input:
         "reads/{sample}.fastq.gz",
     output:
-        "trimmed/{sample}_trimmed.fq.gz",
-        "trimmed/{sample}.fastq.gz_trimming_report.txt",
+        fasta="trimmed/{sample}_trimmed.fq.gz",
+        report="trimmed/report/{sample}.fastq.gz_trimming_report.txt",
+    params:
+        extra="--illumina -q 20",
+    log:
+        "logs/trim_galore/{sample}.log",
+    wrapper:
+        "master/bio/trim_galore/se"
+
+
+rule trim_galore_se_uncompressed:
+    input:
+        "reads/{sample}.fastq",
+    output:
+        fasta="trimmed/{sample}_trimmed.fastq",
+        report="trimmed/report/{sample}.fastq_trimming_report.txt",
     params:
         extra="--illumina -q 20",
     threads: 1

diff --git a/bio/trim_galore/se/test/reads/a.fastq b/bio/trim_galore/se/test/reads/a.fastq
@@ -0,0 +1,4 @@
+@1
+ACGGCAT
++
+!!!!!!!
diff --git a/bio/trim_galore/se/wrapper.py b/bio/trim_galore/se/wrapper.py
@@ -7,10 +7,41 @@
 
 
 from snakemake.shell import shell
-import os.path
+import os
+import re
+import tempfile
 
 
-log = snakemake.log_fmt_shell()
+def report_filename(infile: str) -> str:
+    """Infer report output file name from input
+    >>> report_filename('reads/sample.fastq.gz')
+    'sample.fastq.gz_trimming_report.txt'
+    """
+    return os.path.basename(infile) + "_trimming_report.txt"
+
+
+def fasta_filename(infile: str, out_gzip: bool) -> str:
+    """Infer fasta output file name from input
+    >>> fasta_filename('reads/sample.fq.gz', out_gzip = False)
+    'sample_trimmed.fq.gz'
+    >>> fasta_filename('reads/sample.fastq.gz', out_gzip = False)
+    'sample_trimmed.fq.gz'
+    >>> fasta_filename('reads/sample.fastq', out_gzip = False)
+    'sample_trimmed.fq'
+    >>> fasta_filename('reads/sample.fastq', out_gzip = True)
+    'sample_trimmed.fq.gz'
+    """
+    base_input = os.path.basename(infile)
+    suffix = ".gz" if out_gzip or infile.endswith(".gz") else ""
+    REGEX_RULES = [r"\.fastq$", "\.fastq\.gz$", r"\.fq$", r"\.fq\.gz$"]
+    for regex in REGEX_RULES:
+        if re.search(regex, base_input):
+            return re.sub(regex, "_trimmed.fq", base_input) + suffix
+    return base_input + "_trimmed.fq" + suffix
+
+
+log = snakemake.log_fmt_shell(stdout=True, stderr=True)
+extra = snakemake.params.get("extra", "")
 
 # Don't run with `--fastqc` flag
 if "--fastqc" in snakemake.params.get("extra", ""):
@@ -22,23 +53,31 @@
         "the input and output files instead."
     )
 
+# Check that input files were supplied
+n = len(snakemake.input)
+assert n == 1, "Input must contain 1 files. Given: %r." % n
+infile = snakemake.input[0]
+
 # Check that two output files were supplied
 m = len(snakemake.output)
 assert m == 2, "Output must contain 2 files. Given: %r." % m
+fasta, report = (snakemake.output.get(key) for key in ["fasta", "report"])
+
+out_gzip = fasta.endswith("gz")
+if out_gzip:
+    extra += " --gzip"
 
-# Check that all output files are in the same directory
-out_dir = os.path.dirname(snakemake.output[0])
-for file_path in snakemake.output[1:]:
-    assert out_dir == os.path.dirname(file_path), (
-        "trim_galore can only output files to a single directory."
-        " Please indicate only one directory for the output files."
+with tempfile.TemporaryDirectory() as tmpdir:
+    shell(
+        "(trim_galore"
+        " {extra}"
+        " --cores {snakemake.threads}"
+        " -o {tmpdir}"
+        " {infile})"
+        " {log}"
     )
+    if report:
+        shell(f"mv {tmpdir}/{report_filename(infile)} {report}")
 
-shell(
-    "(trim_galore"
-    " {snakemake.params.extra}"
-    " --cores {snakemake.threads}"
-    " -o {out_dir}"
-    " {snakemake.input})"
-    " {log}"
-)
+    if fasta:
+        shell(f"mv {tmpdir}/{fasta_filename(infile, out_gzip)} {fasta}")
diff --git a/test.py b/test.py
@@ -170,9 +170,10 @@ def test_nonpareil_plot():
             "--use-conda",
             "-F",
             "results/a.pdf",
-        ]
+        ],
     )
 
+
 @skip_if_not_modified
 def test_indelqual():
     run(
@@ -2724,7 +2725,14 @@ def test_happy_prepy():
 def test_happy_prepy():
     run(
         "bio/hap.py/pre.py",
-        ["snakemake", "--cores", "1", "normalized/variants.vcf.gz", "--use-conda", "-F"],
+        [
+            "snakemake",
+            "--cores",
+            "1",
+            "normalized/variants.vcf.gz",
+            "--use-conda",
+            "-F",
+        ],
     )
 
 
@@ -3724,7 +3732,15 @@ def test_subread_featurecounts():
 def test_trim_galore_pe():
     run(
         "bio/trim_galore/pe",
-        ["snakemake", "--cores", "1", "trimmed/a.1_val_1.fq.gz", "--use-conda", "-F"],
+        ["snakemake", "--cores", "1", "trimmed/a_R1.fq.gz", "--use-conda", "-F"],
+    )
+
+
+@skip_if_not_modified
+def test_trim_galore_pe_uncompressed():
+    run(
+        "bio/trim_galore/pe",
+        ["snakemake", "--cores", "1", "trimmed/a_R2.fastq", "--use-conda", "-F"],
     )
 
 
@@ -3736,6 +3752,14 @@ def test_trim_galore_se():
     )
 
 
+@skip_if_not_modified
+def test_trim_galore_se_uncompressed():
+    run(
+        "bio/trim_galore/se",
+        ["snakemake", "--cores", "1", "trimmed/a_trimmed.fastq", "--use-conda", "-F"],
+    )
+
+
 @skip_if_not_modified
 def test_trimmomatic_pe():
     """Four tests, one per fq-gz combination"""