feat: remove existing run directory option for strelka (#1297)

### Description When strelka reruns it doesn't like if the run directory exists. Add an option to delete it before. ### QC  * [x] I confirm that: For all wrappers added by this PR, * there is a test case which covers any introduced changes, * `input:` and `output:` file paths in the resulting rule can be changed arbitrarily, * either the wrapper can only use a single core, or the example rule contains a `threads: x` statement with `x` being a reasonable default, * rule names in the test case are in [snake_case](https://en.wikipedia.org/wiki/Snake_case) and somehow tell what the rule is about or match the tools purpose or name (e.g., `map_reads` for a step that maps reads), * all `environment.yaml` specifications follow [the respective best practices](https://stackoverflow.com/a/64594513/2352071), * wherever possible, command line arguments are inferred and set automatically (e.g. based on file extensions in `input:` or `output:`), * all fields of the example rules in the `Snakefile`s and their entries are explained via comments (`input:`/`output:`/`params:` etc.), * `stderr` and/or `stdout` are logged correctly (`log:`), depending on the wrapped tool, * temporary files are either written to a unique hidden folder in the working directory, or (better) stored where the Python function `tempfile.gettempdir()` points to (see [here](https://docs.python.org/3/library/tempfile.html#tempfile.gettempdir); this also means that using any Python `tempfile` default behavior works), * the `meta.yaml` contains a link to the documentation of the respective tool or command, * `Snakefile`s pass the linting (`snakemake --lint`), * `Snakefile`s are formatted with [snakefmt](https://github.com/snakemake/snakefmt), * Python wrapper scripts are formatted with [black](https://black.readthedocs.io). * Conda environments use a minimal amount of channels, in recommended ordering. E.g. for bioconda, use (conda-forge, bioconda, nodefaults, as conda-forge should have highest priority and defaults channels are usually not needed because most packages are in conda-forge nowadays). --------- Co-authored-by: Christopher Schröder <cschroeder@c46.ikim.uk-essen.de> Co-authored-by: Johannes Köster <johannes.koester@uni-due.de>
snakemake · Jun 12, 2023 · 6f384dd · 6f384dd
1 parent e8a2b20
commit 6f384dd
Show file tree

Hide file tree

Showing 6 changed files with 62 additions and 31 deletions.
diff --git a/bio/strelka/germline/meta.yaml b/bio/strelka/germline/meta.yaml
@@ -2,3 +2,4 @@ name: "Strelka germline"
 description: Call germline variants with Strelka.
 authors:
   - Jan Forster
+  - Christopher Schröder
diff --git a/bio/strelka/germline/test/Snakefile b/bio/strelka/germline/test/Snakefile
@@ -4,16 +4,19 @@ rule strelka_germline:
         bam="mapped/{sample}.bam",
         # path to reference genome fasta and index
         fasta="genome.fasta",
-        fasta_index="genome.fasta.fai"
+        fasta_index="genome.fasta.fai",
     output:
         # Strelka results - either use directory or complete file path
-        directory("strelka/{sample}")
+        variants="strelka/{sample}.vcf.gz",
+        variants_index="strelka/{sample}.vcf.gz.tbi",
+        sample_genomes=["strelka/{sample}.genome.vcf.gz"],
+        sample_genomes_indices=["strelka/{sample}.genome.vcf.gz.tbi"],
     log:
-        "logs/strelka/germline/{sample}.log"
+        "logs/strelka/germline/{sample}.log",
     params:
         # optional parameters
         config_extra="",
-        run_extra=""
+        run_extra="",
     threads: 8
     wrapper:
         "master/bio/strelka/germline"
diff --git a/bio/strelka/germline/wrapper.py b/bio/strelka/germline/wrapper.py
@@ -3,6 +3,8 @@
 __email__ = "jan.forster@uk-essen.de"
 __license__ = "MIT"
 
+import tempfile
+import glob
 
 from pathlib import Path
 from snakemake.shell import shell
@@ -16,22 +18,47 @@
 if isinstance(bam, str):
     bam = [bam]
 
-if snakemake.output[0].endswith(".vcf.gz"):
-    run_dir = Path(snakemake.output[0]).parents[2]
-else:
-    run_dir = snakemake.output
+if snakemake.output.get("sample_genomes"):
+    assert len(bam) == len(
+        snakemake.output.get("sample_genomes")
+    ), "number of input bams and sample_genomes must be equal "
+
+if snakemake.output.get("sample_genomes_indices"):
+    assert len(bam) == len(
+        snakemake.output.get("sample_genomes_indices")
+    ), "number of input bams and sample_genomes_indices must be equal "
 
 bam_input = " ".join(f"--bam {b}" for b in bam)
 
-shell(
-    "(configureStrelkaGermlineWorkflow.py "  # configure the strelka run
-    "{bam_input} "  # input bam
-    "--referenceFasta {snakemake.input.fasta} "  # reference genome
-    "--runDir {run_dir} "  # output directory
-    "{config_extra} "  # additional parameters for the configuration
-    "&& {run_dir}/runWorkflow.py "  # run the strelka workflow
-    "-m local "  # run in local mode
-    "-j {snakemake.threads} "  # number of threads
-    "{run_extra}) "  # additional parameters for the run
-    "{log}"
-)  # logging
+with tempfile.TemporaryDirectory() as tmpdir:
+    shell(
+        "(configureStrelkaGermlineWorkflow.py "  # configure the strelka run
+        "{bam_input} "  # input bam
+        "--referenceFasta {snakemake.input.fasta} "  # reference genome
+        "--runDir {tmpdir} "  # output directory
+        "{config_extra} "  # additional parameters for the configuration
+        "&& {tmpdir}/runWorkflow.py "  # run the strelka workflow
+        "-m local "  # run in local mode
+        "-j {snakemake.threads} "  # number of threads
+        "{run_extra}) "  # additional parameters for the run
+        "{log}"
+    )  # logging
+
+    if snakemake.output.get("variants"):
+        shell(
+            "cat {tmpdir}/results/variants/variants.vcf.gz > {snakemake.output.variants:q}"
+        )
+    if snakemake.output.get("variants_index"):
+        shell(
+            "cat {tmpdir}/results/variants/variants.vcf.gz.tbi > {snakemake.output.variants_index:q}"
+        )
+    if targets := snakemake.output.get("sample_genomes"):
+        origins = glob.glob(f"{tmpdir}/results/variants/genome.S*.vcf.gz")
+        assert len(origins) == len(targets)
+        for origin, target in zip(origins, targets):
+            shell(f"cat {origin} > {target}")
+    if targets := snakemake.output.get("sample_genomes_indices"):
+        origins = glob.glob(f"{tmpdir}/results/variants/genome.S*.vcf.gz.tbi")
+        assert len(origins) == len(targets)
+        for origin, target in zip(origins, targets):
+            shell(f"cat {origin} > {target}")
diff --git a/bio/strelka/somatic/meta.yaml b/bio/strelka/somatic/meta.yaml
@@ -3,6 +3,7 @@ description: |
   Strelka calls somatic and germline small variants from mapped sequencing reads
 authors:
   - Thibault Dayris
+  - Christopher Schröder
 input:
   - A tumor bam file, with its index.
   - A reference genome sequence in fasta format, with its index.

diff --git a/bio/strelka/somatic/test/Snakefile b/bio/strelka/somatic/test/Snakefile
@@ -4,19 +4,18 @@ rule strelka:
         # are optional input
         # normal = "data/b.bam",
         # normal_index = "data/b.bam.bai"
-        tumor = "data/{tumor}.bam",
-        tumor_index = "data/{tumor}.bam.bai",
-        fasta = "data/genome.fasta",
-        fasta_index = "data/genome.fasta.fai"
+        tumor="data/{tumor}.bam",
+        tumor_index="data/{tumor}.bam.bai",
+        fasta="data/genome.fasta",
+        fasta_index="data/genome.fasta.fai",
     output:
         # Strelka output - can be directory or full file path
-        directory("{tumor}_vcf")
-    threads:
-        1
+        directory("{tumor}_vcf"),
+    threads: 1
     params:
-        run_extra = "",
-        config_extra = ""
+        run_extra="",
+        config_extra="",
     log:
-        "logs/strelka_{tumor}.log"
+        "logs/strelka_{tumor}.log",
     wrapper:
         "master/bio/strelka/somatic"
diff --git a/test.py b/test.py
@@ -3684,7 +3684,7 @@ def test_snpeff_download():
 def test_strelka_germline():
     run(
         "bio/strelka/germline",
-        ["snakemake", "--cores", "1", "strelka/a", "--use-conda", "-F"],
+        ["snakemake", "--cores", "1", "strelka/a.vcf.gz", "--use-conda", "-F"],
     )