feat: Add wrapper for bwa-meme (#555)

* bwa-meme initial commit * wrapper path * Update bio/bwa-meme/index/test/Snakefile Co-authored-by: Johannes Köster <johannes.koester@uni-due.de> * Update bio/bwa-meme/mem/test/Snakefile_samtools Co-authored-by: Johannes Köster <johannes.koester@uni-due.de> * samtools test to normal test Co-authored-by: Johannes Köster <johannes.koester@uni-due.de>
snakemake · Sep 17, 2022 · 92579c4 · 92579c4
1 parent 06329dd
commit 92579c4
Show file tree

Hide file tree

Showing 13 changed files with 303 additions and 0 deletions.
diff --git a/bio/bwa-meme/index/environment.yaml b/bio/bwa-meme/index/environment.yaml
@@ -0,0 +1,6 @@
+channels:
+  - conda-forge
+  - bioconda
+  - nodefaults
+dependencies:
+  - bwa-meme ==1.0.4
diff --git a/bio/bwa-meme/index/meta.yaml b/bio/bwa-meme/index/meta.yaml
@@ -0,0 +1,5 @@
+name: "bwa-mem2 index"
+description: "Creates a bwa-meme index."
+authors:
+  - Christopher Schröder
+  - Patrik Smeds
diff --git a/bio/bwa-meme/index/test/Snakefile b/bio/bwa-meme/index/test/Snakefile
@@ -0,0 +1,21 @@
+rule bwa_mem2_index:
+    input:
+        "{genome}",
+    output:
+        multiext(
+            "{genome}",
+            ".0123",
+            ".amb",
+            ".ann",
+            ".pac",
+            ".pos_packed",
+            ".suffixarray_uint64",
+            ".suffixarray_uint64_L0_PARAMETERS",
+            ".suffixarray_uint64_L1_PARAMETERS",
+            ".suffixarray_uint64_L2_PARAMETERS"
+        )
+    log:
+        "logs/bwa-meme_index/{genome}.log",
+    threads: 8
+    wrapper:
+        "master/bio/bwa-meme/index"
diff --git a/bio/bwa-meme/index/test/genome.fasta b/bio/bwa-meme/index/test/genome.fasta
@@ -0,0 +1,2 @@
+>Sheila
+GCTAGCTCAGAAAAAAAAAA
diff --git a/bio/bwa-meme/index/wrapper.py b/bio/bwa-meme/index/wrapper.py
@@ -0,0 +1,50 @@
+__author__ = "Christopher Schröder, Patrik Smeds"
+__copyright__ = "Copyright 2022, Christopher Schröder, Patrik Smeds"
+__email__ = "christopher.schroeder@tu-dortmund.de, patrik.smeds@gmail.com"
+__license__ = "MIT"
+
+from os import path
+
+from snakemake.shell import shell
+
+log = snakemake.log_fmt_shell(stdout=True, stderr=True)
+
+# Check inputs/arguments.
+if len(snakemake.input) == 0:
+    raise ValueError("A reference genome has to be provided.")
+elif len(snakemake.input) > 1:
+    raise ValueError("Please provide exactly one reference genome as input.")
+
+valid_suffixes = {
+    ".0123",
+    ".amb",
+    ".ann",
+    ".pac",
+    ".pos_packed",
+    ".suffixarray_uint64",
+    ".suffixarray_uint64_L0_PARAMETERS",
+    ".suffixarray_uint64_L1_PARAMETERS",
+    ".suffixarray_uint64_L2_PARAMETERS",
+}
+
+
+def get_valid_suffix(path):
+    for suffix in valid_suffixes:
+        if path.endswith(suffix):
+            return suffix
+
+
+prefixes = set()
+for s in snakemake.output:
+    suffix = get_valid_suffix(s)
+    if suffix is None:
+        raise ValueError(f"{s} cannot be generated by bwa-meme index (invalid suffix).")
+    prefixes.add(s[: -len(suffix)])
+
+if len(prefixes) != 1:
+    raise ValueError("Output files must share common prefix up to their file endings.")
+(prefix,) = prefixes
+
+shell(
+    "(bwa-meme index -a meme -p {prefix} {snakemake.input[0]} -t {snakemake.threads} && build_rmis_dna.sh {snakemake.input[0]}) {log}"
+)
diff --git a/bio/bwa-meme/mem/environment.yaml b/bio/bwa-meme/mem/environment.yaml
@@ -0,0 +1,9 @@
+channels:
+  - conda-forge
+  - bioconda
+  - nodefaults
+dependencies:
+  - bwa-meme ==1.0.4
+  - samtools ==1.15
+  - samblaster == 0.1.26
+  - mbuffer == 20160228
diff --git a/bio/bwa-meme/mem/meta.yaml b/bio/bwa-meme/mem/meta.yaml
@@ -0,0 +1,6 @@
+name: "bwa-meme"
+description: BWA-MEME is a practical and efficient seeding algorithm based on a suffix array search algorithm that solves the challenges in utilizing learned indices for SMEM search which is extensively used in the seeding phase. It achieves up to 3.45× speedup in seeding throughput over BWA-MEM2 by reducing the number of instructions by 4.60×, memory accesses by 8.77× and LLC misses by 2.21×, while ensuring the identical SAM output to BWA-MEM2.
+authors:
+  - Christopher Schröder
+  - Johannes Köster
+  - Julian de Ruiter
diff --git a/bio/bwa-meme/mem/test/Snakefile b/bio/bwa-meme/mem/test/Snakefile
@@ -0,0 +1,53 @@
+rule bwa_meme_index: #[hide]
+    input: #[hide]
+        "genome.fasta", #[hide]
+    output: #[hide]
+        "{genome}.0123", #[hide]
+        "{genome}.amb", #[hide]
+        "{genome}.ann", #[hide]
+        "{genome}.pac", #[hide]
+        "{genome}.pos_packed", #[hide]
+        "{genome}.suffixarray_uint64", #[hide]
+        "{genome}.suffixarray_uint64_L0_PARAMETERS", #[hide]
+        "{genome}.suffixarray_uint64_L1_PARAMETERS", #[hide]
+        "{genome}.suffixarray_uint64_L2_PARAMETERS", #[hide]
+    threads: 8 #[hide]
+    log: #[hide]
+        "logs/bwa_meme_index/{genome}.log", #[hide]
+    wrapper: #[hide]
+        "master/bio/bwa-meme/index" #[hide]
+#[hide]
+#[hide]
+rule bwa_meme_mem:
+    input:
+        reads=["reads/{sample}.1.fastq", "reads/{sample}.2.fastq"],
+        # Index can be a list of (all) files created by bwa, or one of them
+        reference="genome.fasta",
+        idx=multiext(
+            "genome.fasta",
+            ".0123",
+            ".amb",
+            ".ann",
+            ".pac",
+            ".pos_packed",
+            ".suffixarray_uint64",
+            ".suffixarray_uint64_L0_PARAMETERS",
+            ".suffixarray_uint64_L1_PARAMETERS",
+            ".suffixarray_uint64_L2_PARAMETERS",
+        ),
+    output:
+        "mapped/{sample}.cram",  # Output can be .cram, .bam, or .sam
+    log:
+        "logs/bwa_meme/{sample}.log",
+    params:
+        extra=r"-R '@RG\tID:{sample}\tSM:{sample}' -M",
+        sort="samtools",  # Can be 'none' or 'samtools'.
+        sort_order="coordinate",  # Can be 'coordinate' (default) or 'queryname'.
+        sort_extra="",  # Extra args for samtools.
+        dedup="mark",  # Can be 'none' (default), 'mark' or 'remove'.
+        dedup_extra="-M",  # Extra args for samblaster.
+        exceed_thread_limit=True,  # Set threads als for samtools sort / view (total used CPU may exceed threads!)
+        embed_ref=True,  # Embed reference when writing cram.
+    threads: 8
+    wrapper:
+        "master/bio/bwa-meme/mem"
diff --git a/bio/bwa-meme/mem/test/genome.fasta b/bio/bwa-meme/mem/test/genome.fasta
@@ -0,0 +1,2 @@
+>Sheila
+GCTAGCTCAGAAAAAAAAAA
diff --git a/bio/bwa-meme/mem/test/reads/a.1.fastq b/bio/bwa-meme/mem/test/reads/a.1.fastq
@@ -0,0 +1,4 @@
+@1
+ACGGCAT
++
+!!!!!!!
diff --git a/bio/bwa-meme/mem/test/reads/a.2.fastq b/bio/bwa-meme/mem/test/reads/a.2.fastq
@@ -0,0 +1,4 @@
+@1
+ACGGCAT
++
+!!!!!!!
diff --git a/bio/bwa-meme/mem/wrapper.py b/bio/bwa-meme/mem/wrapper.py
@@ -0,0 +1,125 @@
+__author__ = "Christopher Schröder, Johannes Köster, Julian de Ruiter"
+__copyright__ = (
+    "Copyright 2020, Christopher Schröder, Johannes Köster and Julian de Ruiter"
+)
+__email__ = "christopher.schroeder@tu-dortmund.de koester@jimmy.harvard.edu, julianderuiter@gmail.com"
+__license__ = "MIT"
+
+
+from os import path
+
+from snakemake.shell import shell
+
+
+# Extract arguments.
+extra = snakemake.params.get("extra", "")
+
+sort = snakemake.params.get("sort", "none")
+sort_order = snakemake.params.get("sort_order", "coordinate")
+sort_extra = snakemake.params.get("sort_extra", "")
+embed_ref = snakemake.params.get("embed_ref", False)
+
+# Option to set the threads of samtools sort and view to the snakemake limit.
+# In theory, bwa and alternate and samtools view starts only when sort is
+# finished, so that never more threads are used than the limit. But it can
+# not always be guaranteed.
+exceed_thread_limit = snakemake.params.get("exceed_thread_limit", False)
+dedup = snakemake.params.get("dedup", "none")
+dedup_extra = snakemake.params.get("dedup_extra", "")
+
+# Detect output format.
+if snakemake.output[0].endswith(".sam"):
+    output_format = "cram"
+elif snakemake.output[0].endswith(".bam"):
+    output_format = "bam"
+elif snakemake.output[0].endswith(".cram"):
+    output_format = "cram"
+else:
+    raise ValueError("output file format must be .sam, .bam or .cram")
+
+if embed_ref:
+    output_format += ",embed_ref"
+
+if exceed_thread_limit:
+    samtools_threads = snakemake.threads
+else:
+    samtools_threads = 1
+
+reference = snakemake.input.get("reference")
+
+log = snakemake.log_fmt_shell(stdout=False, stderr=True)
+
+# Check inputs/arguments.
+if not isinstance(snakemake.input.reads, str) and len(snakemake.input.reads) not in {
+    1,
+    2,
+}:
+    raise ValueError("input must have 1 (single-end) or 2 (paired-end) elements")
+
+if sort_order not in {"coordinate", "queryname"}:
+    raise ValueError("Unexpected value for sort_order ({})".format(sort_order))
+
+# Determine which pipe command to use for converting to bam or sorting.
+if sort == "none":
+    # Simply convert to bam using samtools view.
+    pipe_cmd = "samtools view -h -O {output_format} -o {snakemake.output[0]} -T {reference} -@ {samtools_threads} -"
+
+elif sort == "samtools":
+
+    pipe_cmd = "samtools sort {sort_extra} -O {output_format} -o {snakemake.output[0]} --reference {reference} -@ {samtools_threads} -"
+
+    # Add name flag if needed.
+    if sort_order == "queryname":
+        sort_extra += " -n"
+
+    prefix = path.splitext(snakemake.output[0])[0]
+    sort_extra += " -T " + prefix + ".tmp"
+
+    # Sort alignments using samtools sort.
+
+# elif sort == "picard":
+#     # Sort alignments using picard SortSam.
+#     pipe_cmd = (
+#         "picard SortSam {sort_extra} INPUT=/dev/stdin"
+#         " OUTPUT={snakemake.output[0]} SORT_ORDER={sort_order}"
+#     )
+
+else:
+    raise ValueError("Unexpected value for params.sort ({})".format(sort))
+
+
+# Determine which pipe command to use for converting to bam or sorting.
+if dedup == "none":
+    # Do not detect duplicates.
+    dedup_cmd = ""
+
+elif dedup == "mark":
+    # Mark duplicates using samblaster.
+    dedup_cmd = "samblaster -q {dedup_extra} | "
+
+elif dedup == "remove":
+    dedup_cmd = "samblaster -q -r {dedup_extra} | "
+
+# elif sort == "picard":
+#     # Sort alignments using picard SortSam.
+#     pipe_cmd = (
+#         "picard SortSam {sort_extra} INPUT=/dev/stdin"
+#         " OUTPUT={snakemake.output[0]} SORT_ORDER={sort_order}"
+#     )
+
+else:
+    raise ValueError("Unexpected value for params.dedup ({})".format(dedup))
+
+
+duplicates = snakemake.params.get("mark_duplicates", False)
+
+
+shell(
+    "(bwa-meme mem -7"
+    " -t {snakemake.threads}"
+    " {extra}"
+    " {reference}"
+    " {snakemake.input.reads}"
+    " | mbuffer -q -m 10G "
+    " | " + dedup_cmd + pipe_cmd + ") {log}"
+)
diff --git a/test.py b/test.py
@@ -1640,6 +1640,22 @@ def test_bwa_mem2_sort_samtools():
         ],
     )
 
+@skip_if_not_modified
+def test_bwa_meme():
+    run(
+        "bio/bwa-mem2/mem",
+        [
+            "snakemake",
+            "--cores",
+            "1",
+            "mapped/a.cram",
+            "--use-conda",
+            "-F",
+            "-s",
+            "Snakefile_samtools",
+        ],
+    )
+
 
 @skip_if_not_modified
 def test_bwa_mem2_sort_picard():