Skip to content

Commit

Permalink
feat: Add wrapper for bwa-meme (#555)
Browse files Browse the repository at this point in the history
* bwa-meme initial commit

* wrapper path

* Update bio/bwa-meme/index/test/Snakefile

Co-authored-by: Johannes Köster <johannes.koester@uni-due.de>

* Update bio/bwa-meme/mem/test/Snakefile_samtools

Co-authored-by: Johannes Köster <johannes.koester@uni-due.de>

* samtools test to normal test

Co-authored-by: Johannes Köster <johannes.koester@uni-due.de>
  • Loading branch information
christopher-schroeder and johanneskoester committed Sep 17, 2022
1 parent 06329dd commit 92579c4
Show file tree
Hide file tree
Showing 13 changed files with 303 additions and 0 deletions.
6 changes: 6 additions & 0 deletions bio/bwa-meme/index/environment.yaml
@@ -0,0 +1,6 @@
channels:
- conda-forge
- bioconda
- nodefaults
dependencies:
- bwa-meme ==1.0.4
5 changes: 5 additions & 0 deletions bio/bwa-meme/index/meta.yaml
@@ -0,0 +1,5 @@
name: "bwa-mem2 index"
description: "Creates a bwa-meme index."
authors:
- Christopher Schröder
- Patrik Smeds
21 changes: 21 additions & 0 deletions bio/bwa-meme/index/test/Snakefile
@@ -0,0 +1,21 @@
rule bwa_mem2_index:
input:
"{genome}",
output:
multiext(
"{genome}",
".0123",
".amb",
".ann",
".pac",
".pos_packed",
".suffixarray_uint64",
".suffixarray_uint64_L0_PARAMETERS",
".suffixarray_uint64_L1_PARAMETERS",
".suffixarray_uint64_L2_PARAMETERS"
)
log:
"logs/bwa-meme_index/{genome}.log",
threads: 8
wrapper:
"master/bio/bwa-meme/index"
2 changes: 2 additions & 0 deletions bio/bwa-meme/index/test/genome.fasta
@@ -0,0 +1,2 @@
>Sheila
GCTAGCTCAGAAAAAAAAAA
50 changes: 50 additions & 0 deletions bio/bwa-meme/index/wrapper.py
@@ -0,0 +1,50 @@
__author__ = "Christopher Schröder, Patrik Smeds"
__copyright__ = "Copyright 2022, Christopher Schröder, Patrik Smeds"
__email__ = "christopher.schroeder@tu-dortmund.de, patrik.smeds@gmail.com"
__license__ = "MIT"

from os import path

from snakemake.shell import shell

log = snakemake.log_fmt_shell(stdout=True, stderr=True)

# Check inputs/arguments.
if len(snakemake.input) == 0:
raise ValueError("A reference genome has to be provided.")
elif len(snakemake.input) > 1:
raise ValueError("Please provide exactly one reference genome as input.")

valid_suffixes = {
".0123",
".amb",
".ann",
".pac",
".pos_packed",
".suffixarray_uint64",
".suffixarray_uint64_L0_PARAMETERS",
".suffixarray_uint64_L1_PARAMETERS",
".suffixarray_uint64_L2_PARAMETERS",
}


def get_valid_suffix(path):
for suffix in valid_suffixes:
if path.endswith(suffix):
return suffix


prefixes = set()
for s in snakemake.output:
suffix = get_valid_suffix(s)
if suffix is None:
raise ValueError(f"{s} cannot be generated by bwa-meme index (invalid suffix).")
prefixes.add(s[: -len(suffix)])

if len(prefixes) != 1:
raise ValueError("Output files must share common prefix up to their file endings.")
(prefix,) = prefixes

shell(
"(bwa-meme index -a meme -p {prefix} {snakemake.input[0]} -t {snakemake.threads} && build_rmis_dna.sh {snakemake.input[0]}) {log}"
)
9 changes: 9 additions & 0 deletions bio/bwa-meme/mem/environment.yaml
@@ -0,0 +1,9 @@
channels:
- conda-forge
- bioconda
- nodefaults
dependencies:
- bwa-meme ==1.0.4
- samtools ==1.15
- samblaster == 0.1.26
- mbuffer == 20160228
6 changes: 6 additions & 0 deletions bio/bwa-meme/mem/meta.yaml
@@ -0,0 +1,6 @@
name: "bwa-meme"
description: BWA-MEME is a practical and efficient seeding algorithm based on a suffix array search algorithm that solves the challenges in utilizing learned indices for SMEM search which is extensively used in the seeding phase. It achieves up to 3.45× speedup in seeding throughput over BWA-MEM2 by reducing the number of instructions by 4.60×, memory accesses by 8.77× and LLC misses by 2.21×, while ensuring the identical SAM output to BWA-MEM2.
authors:
- Christopher Schröder
- Johannes Köster
- Julian de Ruiter
53 changes: 53 additions & 0 deletions bio/bwa-meme/mem/test/Snakefile
@@ -0,0 +1,53 @@
rule bwa_meme_index: #[hide]
input: #[hide]
"genome.fasta", #[hide]
output: #[hide]
"{genome}.0123", #[hide]
"{genome}.amb", #[hide]
"{genome}.ann", #[hide]
"{genome}.pac", #[hide]
"{genome}.pos_packed", #[hide]
"{genome}.suffixarray_uint64", #[hide]
"{genome}.suffixarray_uint64_L0_PARAMETERS", #[hide]
"{genome}.suffixarray_uint64_L1_PARAMETERS", #[hide]
"{genome}.suffixarray_uint64_L2_PARAMETERS", #[hide]
threads: 8 #[hide]
log: #[hide]
"logs/bwa_meme_index/{genome}.log", #[hide]
wrapper: #[hide]
"master/bio/bwa-meme/index" #[hide]
#[hide]
#[hide]
rule bwa_meme_mem:
input:
reads=["reads/{sample}.1.fastq", "reads/{sample}.2.fastq"],
# Index can be a list of (all) files created by bwa, or one of them
reference="genome.fasta",
idx=multiext(
"genome.fasta",
".0123",
".amb",
".ann",
".pac",
".pos_packed",
".suffixarray_uint64",
".suffixarray_uint64_L0_PARAMETERS",
".suffixarray_uint64_L1_PARAMETERS",
".suffixarray_uint64_L2_PARAMETERS",
),
output:
"mapped/{sample}.cram", # Output can be .cram, .bam, or .sam
log:
"logs/bwa_meme/{sample}.log",
params:
extra=r"-R '@RG\tID:{sample}\tSM:{sample}' -M",
sort="samtools", # Can be 'none' or 'samtools'.
sort_order="coordinate", # Can be 'coordinate' (default) or 'queryname'.
sort_extra="", # Extra args for samtools.
dedup="mark", # Can be 'none' (default), 'mark' or 'remove'.
dedup_extra="-M", # Extra args for samblaster.
exceed_thread_limit=True, # Set threads als for samtools sort / view (total used CPU may exceed threads!)
embed_ref=True, # Embed reference when writing cram.
threads: 8
wrapper:
"master/bio/bwa-meme/mem"
2 changes: 2 additions & 0 deletions bio/bwa-meme/mem/test/genome.fasta
@@ -0,0 +1,2 @@
>Sheila
GCTAGCTCAGAAAAAAAAAA
4 changes: 4 additions & 0 deletions bio/bwa-meme/mem/test/reads/a.1.fastq
@@ -0,0 +1,4 @@
@1
ACGGCAT
+
!!!!!!!
4 changes: 4 additions & 0 deletions bio/bwa-meme/mem/test/reads/a.2.fastq
@@ -0,0 +1,4 @@
@1
ACGGCAT
+
!!!!!!!
125 changes: 125 additions & 0 deletions bio/bwa-meme/mem/wrapper.py
@@ -0,0 +1,125 @@
__author__ = "Christopher Schröder, Johannes Köster, Julian de Ruiter"
__copyright__ = (
"Copyright 2020, Christopher Schröder, Johannes Köster and Julian de Ruiter"
)
__email__ = "christopher.schroeder@tu-dortmund.de koester@jimmy.harvard.edu, julianderuiter@gmail.com"
__license__ = "MIT"


from os import path

from snakemake.shell import shell


# Extract arguments.
extra = snakemake.params.get("extra", "")

sort = snakemake.params.get("sort", "none")
sort_order = snakemake.params.get("sort_order", "coordinate")
sort_extra = snakemake.params.get("sort_extra", "")
embed_ref = snakemake.params.get("embed_ref", False)

# Option to set the threads of samtools sort and view to the snakemake limit.
# In theory, bwa and alternate and samtools view starts only when sort is
# finished, so that never more threads are used than the limit. But it can
# not always be guaranteed.
exceed_thread_limit = snakemake.params.get("exceed_thread_limit", False)
dedup = snakemake.params.get("dedup", "none")
dedup_extra = snakemake.params.get("dedup_extra", "")

# Detect output format.
if snakemake.output[0].endswith(".sam"):
output_format = "cram"
elif snakemake.output[0].endswith(".bam"):
output_format = "bam"
elif snakemake.output[0].endswith(".cram"):
output_format = "cram"
else:
raise ValueError("output file format must be .sam, .bam or .cram")

if embed_ref:
output_format += ",embed_ref"

if exceed_thread_limit:
samtools_threads = snakemake.threads
else:
samtools_threads = 1

reference = snakemake.input.get("reference")

log = snakemake.log_fmt_shell(stdout=False, stderr=True)

# Check inputs/arguments.
if not isinstance(snakemake.input.reads, str) and len(snakemake.input.reads) not in {
1,
2,
}:
raise ValueError("input must have 1 (single-end) or 2 (paired-end) elements")

if sort_order not in {"coordinate", "queryname"}:
raise ValueError("Unexpected value for sort_order ({})".format(sort_order))

# Determine which pipe command to use for converting to bam or sorting.
if sort == "none":
# Simply convert to bam using samtools view.
pipe_cmd = "samtools view -h -O {output_format} -o {snakemake.output[0]} -T {reference} -@ {samtools_threads} -"

elif sort == "samtools":

pipe_cmd = "samtools sort {sort_extra} -O {output_format} -o {snakemake.output[0]} --reference {reference} -@ {samtools_threads} -"

# Add name flag if needed.
if sort_order == "queryname":
sort_extra += " -n"

prefix = path.splitext(snakemake.output[0])[0]
sort_extra += " -T " + prefix + ".tmp"

# Sort alignments using samtools sort.

# elif sort == "picard":
# # Sort alignments using picard SortSam.
# pipe_cmd = (
# "picard SortSam {sort_extra} INPUT=/dev/stdin"
# " OUTPUT={snakemake.output[0]} SORT_ORDER={sort_order}"
# )

else:
raise ValueError("Unexpected value for params.sort ({})".format(sort))


# Determine which pipe command to use for converting to bam or sorting.
if dedup == "none":
# Do not detect duplicates.
dedup_cmd = ""

elif dedup == "mark":
# Mark duplicates using samblaster.
dedup_cmd = "samblaster -q {dedup_extra} | "

elif dedup == "remove":
dedup_cmd = "samblaster -q -r {dedup_extra} | "

# elif sort == "picard":
# # Sort alignments using picard SortSam.
# pipe_cmd = (
# "picard SortSam {sort_extra} INPUT=/dev/stdin"
# " OUTPUT={snakemake.output[0]} SORT_ORDER={sort_order}"
# )

else:
raise ValueError("Unexpected value for params.dedup ({})".format(dedup))


duplicates = snakemake.params.get("mark_duplicates", False)


shell(
"(bwa-meme mem -7"
" -t {snakemake.threads}"
" {extra}"
" {reference}"
" {snakemake.input.reads}"
" | mbuffer -q -m 10G "
" | " + dedup_cmd + pipe_cmd + ") {log}"
)
16 changes: 16 additions & 0 deletions test.py
Expand Up @@ -1640,6 +1640,22 @@ def test_bwa_mem2_sort_samtools():
],
)

@skip_if_not_modified
def test_bwa_meme():
run(
"bio/bwa-mem2/mem",
[
"snakemake",
"--cores",
"1",
"mapped/a.cram",
"--use-conda",
"-F",
"-s",
"Snakefile_samtools",
],
)


@skip_if_not_modified
def test_bwa_mem2_sort_picard():
Expand Down

0 comments on commit 92579c4

Please sign in to comment.