feat: bwa memx (#691)

### Description  ### QC  * [x] I confirm that: For all wrappers added by this PR, * there is a test case which covers any introduced changes, * `input:` and `output:` file paths in the resulting rule can be changed arbitrarily, * either the wrapper can only use a single core, or the example rule contains a `threads: x` statement with `x` being a reasonable default, * rule names in the test case are in [snake_case](https://en.wikipedia.org/wiki/Snake_case) and somehow tell what the rule is about or match the tools purpose or name (e.g., `map_reads` for a step that maps reads), * all `environment.yaml` specifications follow [the respective best practices](https://stackoverflow.com/a/64594513/2352071), * wherever possible, command line arguments are inferred and set automatically (e.g. based on file extensions in `input:` or `output:`), * all fields of the example rules in the `Snakefile`s and their entries are explained via comments (`input:`/`output:`/`params:` etc.), * `stderr` and/or `stdout` are logged correctly (`log:`), depending on the wrapped tool, * temporary files are either written to a unique hidden folder in the working directory, or (better) stored where the Python function `tempfile.gettempdir()` points to (see [here](https://docs.python.org/3/library/tempfile.html#tempfile.gettempdir); this also means that using any Python `tempfile` default behavior works), * the `meta.yaml` contains a link to the documentation of the respective tool or command, * `Snakefile`s pass the linting (`snakemake --lint`), * `Snakefile`s are formatted with [snakefmt](https://github.com/snakemake/snakefmt), * Python wrapper scripts are formatted with [black](https://black.readthedocs.io). * Conda environments use a minimal amount of channels, in recommended ordering. E.g. for bioconda, use (conda-forge, bioconda, nodefaults, as conda-forge should have highest priority and defaults channels are usually not needed because most packages are in conda-forge nowadays).
snakemake · Oct 27, 2022 · f281336 · f281336
1 parent 3296efd
commit f281336
Show file tree

Hide file tree

Showing 27 changed files with 496 additions and 0 deletions.
diff --git a/bio/bwa-memx/index/environment.yaml b/bio/bwa-memx/index/environment.yaml
@@ -0,0 +1,8 @@
+channels:
+  - conda-forge
+  - bioconda
+  - nodefaults
+dependencies:
+  - bwa == 0.7.17
+  - bwa-mem2 ==2.2.1
+  - bwa-meme ==1.0.4
diff --git a/bio/bwa-memx/index/meta.yaml b/bio/bwa-memx/index/meta.yaml
@@ -0,0 +1,5 @@
+name: "bwa-mem2 index"
+description: "Creates a bwa-mem, bwa-mem2 or bwa-meme index."
+authors:
+  - Christopher Schröder
+  - Patrik Smeds
diff --git a/bio/bwa-memx/index/test/Snakefile b/bio/bwa-memx/index/test/Snakefile
@@ -0,0 +1,66 @@
+rule bwa_mem_index:
+    input:
+        "{genome}",
+    output:
+        multiext(
+            "{genome}",
+            ".amb",
+            ".ann",
+            ".bwt",
+            ".pac",
+            ".sa",
+        ),
+    log:
+        "logs/bwa-mem_index/{genome}.log",
+    params:
+        bwa="bwa-mem",
+    threads: 8
+    wrapper:
+        "master/bio/bwa-memx/index"
+
+
+rule bwa_mem2_index:
+    input:
+        "{genome}",
+    output:
+        multiext(
+            "{genome}",
+            ".0123",
+            ".amb",
+            ".ann",
+            ".bwt.2bit.64",
+            ".pac",
+        ),
+    log:
+        "logs/bwa-mem2_index/{genome}.log",
+    params:
+        bwa="bwa-mem2",
+    threads: 8
+    wrapper:
+        "master/bio/bwa-memx/index"
+
+
+rule bwa_meme_index:
+    input:
+        "{genome}",
+    output:
+        multiext(
+            "{genome}",
+            ".0123",
+            ".amb",
+            ".ann",
+            ".pac",
+            ".pos_packed",
+            ".suffixarray_uint64",
+            ".suffixarray_uint64_L0_PARAMETERS",
+            ".suffixarray_uint64_L1_PARAMETERS",
+            ".suffixarray_uint64_L2_PARAMETERS",
+        ),
+    log:
+        "logs/bwa-meme_index/{genome}.log",
+    params:
+        bwa="bwa-meme",
+        num_models=100000,  #[hide]
+    threads: 8
+    wrapper:
+        "master/bio/bwa-memx/index"
diff --git a/bio/bwa-memx/index/test/genome.fasta b/bio/bwa-memx/index/test/genome.fasta
@@ -0,0 +1,2 @@
+>Sheila
+GCTAGCTCAGAAAAAAAAAA
diff --git a/bio/bwa-memx/index/wrapper.py b/bio/bwa-memx/index/wrapper.py
@@ -0,0 +1,85 @@
+__author__ = "Christopher Schröder, Patrik Smeds"
+__copyright__ = "Copyright 2022, Christopher Schröder, Patrik Smeds"
+__email__ = "christopher.schroeder@tu-dortmund.de, patrik.smeds@gmail.com"
+__license__ = "MIT"
+
+from os import path
+
+from snakemake.shell import shell
+
+log = snakemake.log_fmt_shell(stdout=True, stderr=True)
+
+bwa = snakemake.params.get("bwa", "bwa-mem")
+
+# Check inputs/arguments.
+if len(snakemake.input) == 0:
+    raise ValueError("A reference genome has to be provided.")
+elif len(snakemake.input) > 1:
+    raise ValueError("Please provide exactly one reference genome as input.")
+
+if bwa == "bwa-mem":
+    valid_suffixes = {
+        ".amb",
+        ".ann",
+        ".bwt",
+        ".pac",
+        ".sa",
+    }
+    cmd = "bwa index {prefix} {snakemake.input[0]}"
+elif bwa == "bwa-mem2":
+    valid_suffixes = {
+        ".0123",
+        ".amb",
+        ".ann",
+        ".bwt.2bit.64",
+        ".pac",
+    }
+    cmd = "bwa-mem2 index -p {prefix} {snakemake.input[0]}"
+elif bwa == "bwa-meme":
+    valid_suffixes = {
+        ".0123",
+        ".amb",
+        ".ann",
+        ".pac",
+        ".pos_packed",
+        ".suffixarray_uint64",
+        ".suffixarray_uint64_L0_PARAMETERS",
+        ".suffixarray_uint64_L1_PARAMETERS",
+        ".suffixarray_uint64_L2_PARAMETERS",
+    }
+
+    cmd = "bwa-meme index -a meme -p {prefix} {snakemake.input[0]} -t {snakemake.threads} && bwa-meme-train-prmi -t {snakemake.threads} --data-path {dirname} {suffixarray} {basename} pwl,linear,linear_spline {num_models}"
+else:
+    raise ValueError(
+        "Unexpected value for params.bwa ({}). Must be bwa-mem, bwa-mem2 or bwa-meme.".format(
+            bwa
+        )
+    )
+
+
+def get_valid_suffix(path):
+    for suffix in valid_suffixes:
+        if path.endswith(suffix):
+            return suffix
+
+
+prefixes = set()
+for s in snakemake.output:
+    suffix = get_valid_suffix(s)
+    if suffix is None:
+        raise ValueError(f"{s} cannot be generated by bwa-meme index (invalid suffix).")
+    prefixes.add(s[: -len(suffix)])
+
+if len(prefixes) != 1:
+    raise ValueError("Output files must share common prefix up to their file endings.")
+(prefix,) = prefixes
+
+suffixarray = snakemake.input[0] + ".suffixarray_uint64"
+dirname = path.dirname(suffixarray)
+basename = path.basename(suffixarray)
+num_models = snakemake.params.get("num_models", 268435456)  # change only for testing!
+
+if not dirname:
+    dirname = "."
+
+shell(f"({cmd}) {log}")
diff --git a/bio/bwa-memx/mem/environment.yaml b/bio/bwa-memx/mem/environment.yaml
@@ -0,0 +1,12 @@
+channels:
+  - conda-forge
+  - bioconda
+  - nodefaults
+dependencies:
+  - bwa == 0.7.17
+  - bwa-mem2 ==2.2.1
+  - bwa-meme ==1.0.4
+  - samtools ==1.15
+  - samblaster == 0.1.26
+  - mbuffer == 20160228
+  - picard-slim =2.27
diff --git a/bio/bwa-memx/mem/meta.yaml b/bio/bwa-memx/mem/meta.yaml
@@ -0,0 +1,7 @@
+name: "bwa-memx"
+description: Collection of bwa-mem, bwa-mem2 and bwa-meme.
+note: Depending on the "bwa" param, bwa-mem, bwa-mem2 or bwa-meme is used for alignment. Correspondingly different indices are expected as input.
+authors:
+  - Christopher Schröder
+  - Johannes Köster
+  - Julian de Ruiter
diff --git a/bio/bwa-memx/mem/test/Snakefile b/bio/bwa-memx/mem/test/Snakefile
@@ -0,0 +1,115 @@
+rule bwa_memx_test:  # [hide]
+    input:  # [hide]
+        "mapped/mem/a.cram",  # [hide]
+        "mapped/mem2/a.cram",  # [hide]
+        "mapped/meme/a.cram",  # [hide]
+
+
+rule L2_PARAMETERS_extract:  #[hide]
+    input:  #[hide]
+        "genome.fasta.suffixarray_uint64_L2_PARAMETERS.gz",  #[hide]
+    output:  #[hide]
+        temp("genome.fasta.suffixarray_uint64_L2_PARAMETERS"),  #[hide]
+    conda:  #[hide]
+        "gzip.yaml"  #[hide]
+    log:  #[hide]
+        "log/extract.l2.log",  #[hide]
+    shell:  #[hide]
+        "zcat {input} > {output} 2> {log}"  #[hide]
+
+
+rule bwa_memx_mem:
+    input:
+        reads=["reads/{sample}.1.fastq", "reads/{sample}.2.fastq"],
+        reference="genome.fasta",
+        idx=multiext(
+            "genome.fasta",
+            ".amb",
+            ".ann",
+            ".bwt",
+            ".pac",
+            ".sa",
+        ),
+    output:
+        "mapped/mem/{sample}.cram",  # Output can be .cram, .bam, or .sam
+    log:
+        "logs/bwa_memx/{sample}.log",
+    params:
+        bwa="bwa-mem",  # Can be 'bwa-mem, bwa-mem2 or bwa-meme.
+        extra=r"-R '@RG\tID:{sample}\tSM:{sample}' -M",
+        sort="samtools",  # Can be 'none' or 'samtools or picard'.
+        sort_order="coordinate",  # Can be 'coordinate' (default) or 'queryname'.
+        sort_extra="",  # Extra args for samtools.
+        dedup="mark",  # Can be 'none' (default), 'mark' or 'remove'.
+        dedup_extra="-M",  # Extra args for samblaster.
+        exceed_thread_limit=True,  # Set threads als for samtools sort / view (total used CPU may exceed threads!)
+        embed_ref=True,  # Embed reference when writing cram.
+    threads: 8
+    wrapper:
+        "master/bio/bwa-memx/mem"
+
+
+rule bwa_memx_mem2:
+    input:
+        reads=["reads/{sample}.1.fastq", "reads/{sample}.2.fastq"],
+        reference="genome.fasta",
+        idx=multiext(
+            "genome.fasta",
+            ".0123",
+            ".amb",
+            ".ann",
+            ".bwt.2bit.64",
+            ".pac",
+        ),
+    output:
+        "mapped/mem2/{sample}.cram",
+    log:
+        "logs/bwa_memx/{sample}.log",
+    params:
+        bwa="bwa-mem2",
+        extra=r"-R '@RG\tID:{sample}\tSM:{sample}' -M",
+        sort="picard",
+        sort_order="queryname",
+        sort_extra="",
+        dedup="none",
+        dedup_extra="-M",
+        exceed_thread_limit=True,
+        embed_ref=True,
+    threads: 8
+    wrapper:
+        "master/bio/bwa-memx/mem"
+
+
+rule bwa_memx_meme:
+    input:
+        reads=["reads/{sample}.1.fastq", "reads/{sample}.2.fastq"],
+        reference="genome.fasta",
+        idx=multiext(
+            "genome.fasta",
+            ".0123",
+            ".amb",
+            ".ann",
+            ".pac",
+            ".pos_packed",
+            ".suffixarray_uint64",
+            ".suffixarray_uint64_L0_PARAMETERS",
+            ".suffixarray_uint64_L1_PARAMETERS",
+            ".suffixarray_uint64_L2_PARAMETERS",
+        ),
+    output:
+        "mapped/meme/{sample}.cram",
+    log:
+        "logs/bwa_memx/{sample}.log",
+    params:
+        bwa="bwa-meme",
+        extra=r"-R '@RG\tID:{sample}\tSM:{sample}' -M",
+        sort="picard",
+        sort_order="coordinate",
+        sort_extra="",
+        dedup="remove",
+        dedup_extra="-M",
+        exceed_thread_limit=False,
+        embed_ref=False,
+    threads: 8
+    wrapper:
+        "master/bio/bwa-memx/mem"
diff --git a/bio/bwa-memx/mem/test/genome.fasta b/bio/bwa-memx/mem/test/genome.fasta
@@ -0,0 +1,2 @@
+>Sheila
+GCTAGCTCAGAAAAAAAAAA
diff --git a/bio/bwa-memx/mem/test/genome.fasta.0123 b/bio/bwa-memx/mem/test/genome.fasta.0123
diff --git a/bio/bwa-memx/mem/test/genome.fasta.amb b/bio/bwa-memx/mem/test/genome.fasta.amb
@@ -0,0 +1 @@
+20 1 0
diff --git a/bio/bwa-memx/mem/test/genome.fasta.ann b/bio/bwa-memx/mem/test/genome.fasta.ann
@@ -0,0 +1,3 @@
+20 1 11
+0 Sheila (null)
+0 20 0
diff --git a/bio/bwa-memx/mem/test/genome.fasta.bwt b/bio/bwa-memx/mem/test/genome.fasta.bwt
diff --git a/bio/bwa-memx/mem/test/genome.fasta.bwt.2bit.64 b/bio/bwa-memx/mem/test/genome.fasta.bwt.2bit.64
diff --git a/bio/bwa-memx/mem/test/genome.fasta.fai b/bio/bwa-memx/mem/test/genome.fasta.fai
@@ -0,0 +1 @@
+Sheila	20	8	20	21
diff --git a/bio/bwa-memx/mem/test/genome.fasta.pac b/bio/bwa-memx/mem/test/genome.fasta.pac
diff --git a/bio/bwa-memx/mem/test/genome.fasta.pos_packed b/bio/bwa-memx/mem/test/genome.fasta.pos_packed
diff --git a/bio/bwa-memx/mem/test/genome.fasta.sa b/bio/bwa-memx/mem/test/genome.fasta.sa
diff --git a/bio/bwa-memx/mem/test/genome.fasta.suffixarray_uint64 b/bio/bwa-memx/mem/test/genome.fasta.suffixarray_uint64
diff --git a/bio/bwa-memx/mem/test/genome.fasta.suffixarray_uint64_L0_PARAMETERS b/bio/bwa-memx/mem/test/genome.fasta.suffixarray_uint64_L0_PARAMETERS
diff --git a/bio/bwa-memx/mem/test/genome.fasta.suffixarray_uint64_L1_PARAMETERS b/bio/bwa-memx/mem/test/genome.fasta.suffixarray_uint64_L1_PARAMETERS
diff --git a/bio/bwa-memx/mem/test/genome.fasta.suffixarray_uint64_L2_PARAMETERS.gz b/bio/bwa-memx/mem/test/genome.fasta.suffixarray_uint64_L2_PARAMETERS.gz
diff --git a/bio/bwa-memx/mem/test/gzip.yaml b/bio/bwa-memx/mem/test/gzip.yaml
@@ -0,0 +1,4 @@
+channels:
+  - conda-forge
+dependencies:
+  - gzip ==1.12
diff --git a/bio/bwa-memx/mem/test/reads/a.1.fastq b/bio/bwa-memx/mem/test/reads/a.1.fastq
@@ -0,0 +1,4 @@
+@1
+ACGGCAT
++
+!!!!!!!
diff --git a/bio/bwa-memx/mem/test/reads/a.2.fastq b/bio/bwa-memx/mem/test/reads/a.2.fastq
@@ -0,0 +1,4 @@
+@1
+ACGGCAT
++
+!!!!!!!