Skip to content

Commit

Permalink
fix: inconsistent output filenames trim-galore (#1451)
Browse files Browse the repository at this point in the history
### Description

Now output files can be changed arbitrarily. I think it closes #967 . 


### QC

* [ ] I confirm that:

For all wrappers added by this PR, 

* there is a test case which covers any introduced changes,
* `input:` and `output:` file paths in the resulting rule can be changed
arbitrarily,
* either the wrapper can only use a single core, or the example rule
contains a `threads: x` statement with `x` being a reasonable default,
* rule names in the test case are in
[snake_case](https://en.wikipedia.org/wiki/Snake_case) and somehow tell
what the rule is about or match the tools purpose or name (e.g.,
`map_reads` for a step that maps reads),
* all `environment.yaml` specifications follow [the respective best
practices](https://stackoverflow.com/a/64594513/2352071),
* wherever possible, command line arguments are inferred and set
automatically (e.g. based on file extensions in `input:` or `output:`),
* all fields of the example rules in the `Snakefile`s and their entries
are explained via comments (`input:`/`output:`/`params:` etc.),
* `stderr` and/or `stdout` are logged correctly (`log:`), depending on
the wrapped tool,
* temporary files are either written to a unique hidden folder in the
working directory, or (better) stored where the Python function
`tempfile.gettempdir()` points to (see
[here](https://docs.python.org/3/library/tempfile.html#tempfile.gettempdir);
this also means that using any Python `tempfile` default behavior
works),
* the `meta.yaml` contains a link to the documentation of the respective
tool or command,
* `Snakefile`s pass the linting (`snakemake --lint`),
* `Snakefile`s are formatted with
[snakefmt](https://github.com/snakemake/snakefmt),
* Python wrapper scripts are formatted with
[black](https://black.readthedocs.io).
* Conda environments use a minimal amount of channels, in recommended
ordering. E.g. for bioconda, use (conda-forge, bioconda, nodefaults, as
conda-forge should have highest priority and defaults channels are
usually not needed because most packages are in conda-forge nowadays).
  • Loading branch information
currocam committed Jun 26, 2023
1 parent 064acf8 commit b915e37
Show file tree
Hide file tree
Showing 8 changed files with 194 additions and 42 deletions.
25 changes: 21 additions & 4 deletions bio/trim_galore/pe/test/Snakefile
Expand Up @@ -2,10 +2,27 @@ rule trim_galore_pe:
input:
["reads/{sample}.1.fastq.gz", "reads/{sample}.2.fastq.gz"],
output:
"trimmed/{sample}.1_val_1.fq.gz",
"trimmed/{sample}.1.fastq.gz_trimming_report.txt",
"trimmed/{sample}.2_val_2.fq.gz",
"trimmed/{sample}.2.fastq.gz_trimming_report.txt",
fasta_fwd="trimmed/{sample}_R1.fq.gz",
report_fwd="trimmed/reports/{sample}_R1_trimming_report.txt",
fasta_rev="trimmed/{sample}_R2.fq.gz",
report_rev="trimmed/reports/{sample}_R2_trimming_report.txt",
threads: 1
params:
extra="--illumina -q 20",
log:
"logs/trim_galore/{sample}.log",
wrapper:
"master/bio/trim_galore/pe"


rule trim_galore_pe_uncompressed:
input:
["reads/{sample}_R1.fastq", "reads/{sample}_R2.fastq"],
output:
fasta_fwd="trimmed/{sample}_R1.fastq",
report_fwd="trimmed/reports/{sample}_R1_trimming_report.txt",
fasta_rev="trimmed/{sample}_R2.fastq",
report_rev="trimmed/reports/{sample}_R2_trimming_report.txt",
threads: 1
params:
extra="--illumina -q 20",
Expand Down
4 changes: 4 additions & 0 deletions bio/trim_galore/pe/test/reads/a_R1.fastq
@@ -0,0 +1,4 @@
@1
ACGGCAT
+
!!!!!!!
4 changes: 4 additions & 0 deletions bio/trim_galore/pe/test/reads/a_R2.fastq
@@ -0,0 +1,4 @@
@1
ACGGCAT
+
!!!!!!!
80 changes: 63 additions & 17 deletions bio/trim_galore/pe/wrapper.py
Expand Up @@ -7,14 +7,44 @@


from snakemake.shell import shell
import os.path
import tempfile
import re
import os


log = snakemake.log_fmt_shell()
def report_filename(infile: str) -> str:
"""Infer report output file name from input
>>> report_filename('reads/sample.1.fastq.gz')
'sample.1.fastq.gz_trimming_report.txt
>>> report_filename('reads/sample_R2.fastq.gz')
'sample_R2.fastq.gz_trimming_report.txt
"""
return os.path.basename(infile) + "_trimming_report.txt"


def fasta_filename(infile: str, infix: str, out_gzip: bool) -> str:
"""Infer fasta output file name from input
>>> fasta_filename('reads/sample.1.fq.gz', infix = '_val_1', out_gzip = False)
'sample.1_val_1.fq.gz'
>>> fasta_filename('reads/sample_R2.fastq', infix = '_val_2', out_gzip = True)
'sample_R2_val_2.fq.gz'
"""
base_input = os.path.basename(infile)
suffix = ".gz" if out_gzip or infile.endswith(".gz") else ""
REGEX_RULES = [r"\.fastq$", "\.fastq\.gz$", r"\.fq$", r"\.fq\.gz$"]
for regex in REGEX_RULES:
if re.search(regex, base_input):
return re.sub(regex, f"{infix}.fq", base_input) + suffix
return base_input + infix + suffix


log = snakemake.log_fmt_shell(stdout=True, stderr=True)
extra = snakemake.params.get("extra", "")

# Check that two input files were supplied
n = len(snakemake.input)
assert n == 2, "Input must contain 2 files. Given: %r." % n
infile_fwd, infile_rev = snakemake.input[0:2]

# Don't run with `--fastqc` flag
if "--fastqc" in snakemake.params.get("extra", ""):
Expand All @@ -30,20 +60,36 @@
m = len(snakemake.output)
assert m == 4, "Output must contain 4 files. Given: %r." % m

# Check that all output files are in the same directory
out_dir = os.path.dirname(snakemake.output[0])
for file_path in snakemake.output[1:]:
assert out_dir == os.path.dirname(file_path), (
"trim_galore can only output files to a single directory."
" Please indicate only one directory for the output files."
fasta_fwd, fasta_rev, report_fwd, report_rev = (
snakemake.output.get(key)
for key in ["fasta_fwd", "fasta_rev", "report_fwd", "report_rev"]
)

out_gzip = any((fasta_fwd.endswith("gz"), fasta_rev.endswith("gz")))
if out_gzip:
extra += " --gzip"

with tempfile.TemporaryDirectory() as tmpdir:
shell(
"(trim_galore"
" {extra}"
" --cores {snakemake.threads}"
" --paired"
" -o {tmpdir}"
" {infile_fwd} {infile_rev})"
" {log}"
)

shell(
"(trim_galore"
" {snakemake.params.extra}"
" --cores {snakemake.threads}"
" --paired"
" -o {out_dir}"
" {snakemake.input})"
" {log}"
)
if report_fwd:
shell(f"mv {tmpdir}/{report_filename(infile_fwd)} {report_fwd}")
if report_rev:
shell(f"mv {tmpdir}/{report_filename(infile_rev)} {report_rev}")

if fasta_fwd:
shell(
f"mv {tmpdir}/{fasta_filename(infile_fwd, '_val_1', out_gzip)} {fasta_fwd}"
)
if fasta_rev:
shell(
f"mv {tmpdir}/{fasta_filename(infile_rev, '_val_2', out_gzip)} {fasta_rev}"
)
18 changes: 16 additions & 2 deletions bio/trim_galore/se/test/Snakefile
Expand Up @@ -2,8 +2,22 @@ rule trim_galore_se:
input:
"reads/{sample}.fastq.gz",
output:
"trimmed/{sample}_trimmed.fq.gz",
"trimmed/{sample}.fastq.gz_trimming_report.txt",
fasta="trimmed/{sample}_trimmed.fq.gz",
report="trimmed/report/{sample}.fastq.gz_trimming_report.txt",
params:
extra="--illumina -q 20",
log:
"logs/trim_galore/{sample}.log",
wrapper:
"master/bio/trim_galore/se"


rule trim_galore_se_uncompressed:
input:
"reads/{sample}.fastq",
output:
fasta="trimmed/{sample}_trimmed.fastq",
report="trimmed/report/{sample}.fastq_trimming_report.txt",
params:
extra="--illumina -q 20",
threads: 1
Expand Down
4 changes: 4 additions & 0 deletions bio/trim_galore/se/test/reads/a.fastq
@@ -0,0 +1,4 @@
@1
ACGGCAT
+
!!!!!!!
71 changes: 55 additions & 16 deletions bio/trim_galore/se/wrapper.py
Expand Up @@ -7,10 +7,41 @@


from snakemake.shell import shell
import os.path
import os
import re
import tempfile


log = snakemake.log_fmt_shell()
def report_filename(infile: str) -> str:
"""Infer report output file name from input
>>> report_filename('reads/sample.fastq.gz')
'sample.fastq.gz_trimming_report.txt'
"""
return os.path.basename(infile) + "_trimming_report.txt"


def fasta_filename(infile: str, out_gzip: bool) -> str:
"""Infer fasta output file name from input
>>> fasta_filename('reads/sample.fq.gz', out_gzip = False)
'sample_trimmed.fq.gz'
>>> fasta_filename('reads/sample.fastq.gz', out_gzip = False)
'sample_trimmed.fq.gz'
>>> fasta_filename('reads/sample.fastq', out_gzip = False)
'sample_trimmed.fq'
>>> fasta_filename('reads/sample.fastq', out_gzip = True)
'sample_trimmed.fq.gz'
"""
base_input = os.path.basename(infile)
suffix = ".gz" if out_gzip or infile.endswith(".gz") else ""
REGEX_RULES = [r"\.fastq$", "\.fastq\.gz$", r"\.fq$", r"\.fq\.gz$"]
for regex in REGEX_RULES:
if re.search(regex, base_input):
return re.sub(regex, "_trimmed.fq", base_input) + suffix
return base_input + "_trimmed.fq" + suffix


log = snakemake.log_fmt_shell(stdout=True, stderr=True)
extra = snakemake.params.get("extra", "")

# Don't run with `--fastqc` flag
if "--fastqc" in snakemake.params.get("extra", ""):
Expand All @@ -22,23 +53,31 @@
"the input and output files instead."
)

# Check that input files were supplied
n = len(snakemake.input)
assert n == 1, "Input must contain 1 files. Given: %r." % n
infile = snakemake.input[0]

# Check that two output files were supplied
m = len(snakemake.output)
assert m == 2, "Output must contain 2 files. Given: %r." % m
fasta, report = (snakemake.output.get(key) for key in ["fasta", "report"])

out_gzip = fasta.endswith("gz")
if out_gzip:
extra += " --gzip"

# Check that all output files are in the same directory
out_dir = os.path.dirname(snakemake.output[0])
for file_path in snakemake.output[1:]:
assert out_dir == os.path.dirname(file_path), (
"trim_galore can only output files to a single directory."
" Please indicate only one directory for the output files."
with tempfile.TemporaryDirectory() as tmpdir:
shell(
"(trim_galore"
" {extra}"
" --cores {snakemake.threads}"
" -o {tmpdir}"
" {infile})"
" {log}"
)
if report:
shell(f"mv {tmpdir}/{report_filename(infile)} {report}")

shell(
"(trim_galore"
" {snakemake.params.extra}"
" --cores {snakemake.threads}"
" -o {out_dir}"
" {snakemake.input})"
" {log}"
)
if fasta:
shell(f"mv {tmpdir}/{fasta_filename(infile, out_gzip)} {fasta}")
30 changes: 27 additions & 3 deletions test.py
Expand Up @@ -170,9 +170,10 @@ def test_nonpareil_plot():
"--use-conda",
"-F",
"results/a.pdf",
]
],
)


@skip_if_not_modified
def test_indelqual():
run(
Expand Down Expand Up @@ -2724,7 +2725,14 @@ def test_happy_prepy():
def test_happy_prepy():
run(
"bio/hap.py/pre.py",
["snakemake", "--cores", "1", "normalized/variants.vcf.gz", "--use-conda", "-F"],
[
"snakemake",
"--cores",
"1",
"normalized/variants.vcf.gz",
"--use-conda",
"-F",
],
)


Expand Down Expand Up @@ -3724,7 +3732,15 @@ def test_subread_featurecounts():
def test_trim_galore_pe():
run(
"bio/trim_galore/pe",
["snakemake", "--cores", "1", "trimmed/a.1_val_1.fq.gz", "--use-conda", "-F"],
["snakemake", "--cores", "1", "trimmed/a_R1.fq.gz", "--use-conda", "-F"],
)


@skip_if_not_modified
def test_trim_galore_pe_uncompressed():
run(
"bio/trim_galore/pe",
["snakemake", "--cores", "1", "trimmed/a_R2.fastq", "--use-conda", "-F"],
)


Expand All @@ -3736,6 +3752,14 @@ def test_trim_galore_se():
)


@skip_if_not_modified
def test_trim_galore_se_uncompressed():
run(
"bio/trim_galore/se",
["snakemake", "--cores", "1", "trimmed/a_trimmed.fastq", "--use-conda", "-F"],
)


@skip_if_not_modified
def test_trimmomatic_pe():
"""Four tests, one per fq-gz combination"""
Expand Down

0 comments on commit b915e37

Please sign in to comment.