feat: support cram output in picardtools markduplicates (#486)

* cleaner code * testcase * snakefmt * add 'bams' term * add cram test
snakemake · May 13, 2022 · 218fd39 · 218fd39
1 parent 4fa0d56
commit 218fd39
Show file tree

Hide file tree

Showing 9 changed files with 1,055 additions and 8 deletions.
diff --git a/bio/picard/markduplicates/environment.yaml b/bio/picard/markduplicates/environment.yaml
@@ -4,4 +4,5 @@ channels:
   - defaults
 dependencies:
   - picard =2.26
+  - samtools =1.15.1
   - snakemake-wrapper-utils ==0.1.3
diff --git a/bio/picard/markduplicates/meta.yaml b/bio/picard/markduplicates/meta.yaml
@@ -3,10 +3,11 @@ description: |
   Mark PCR and optical duplicates with picard tools. For more information about MarkDuplicates see `picard documentation <https://broadinstitute.github.io/picard/command-line-overview.html#MarkDuplicates>`_.
 authors:
   - Johannes Köster
+  - Christopher Schröder
 input:
-  - bam file(s)
+  - bam/cram file(s)
 output:
-  - bam file with marked or removed duplicates
+  - bam/cram file with marked or removed duplicates
 notes: |
   * The `java_opts` param allows for additional arguments to be passed to the java compiler, e.g. "-XX:ParallelGCThreads=10" (not for `-XmX` or `-Djava.io.tmpdir`, since they are handled automatically).
   * The `extra` param allows for additional program arguments.

diff --git a/bio/picard/markduplicates/test/Snakefile b/bio/picard/markduplicates/test/Snakefile
@@ -1,6 +1,6 @@
 rule mark_duplicates:
     input:
-        "mapped/{sample}.bam",
+        bams="mapped/{sample}.bam",
     # optional to specify a list of BAMs; this has the same effect
     # of marking duplicates on separate read groups for a sample
     # and then merging
@@ -19,3 +19,28 @@ rule mark_duplicates:
         mem_mb=1024,
     wrapper:
         "master/bio/picard/markduplicates"
+
+
+rule mark_duplicates_cram:
+    input:
+        bams="mapped/{sample}.bam",
+        ref="ref/genome.fasta",
+    # optional to specify a list of BAMs; this has the same effect
+    # of marking duplicates on separate read groups for a sample
+    # and then merging
+    output:
+        bam="dedup/{sample}.cram",
+        metrics="dedup/{sample}.metrics.txt",
+    log:
+        "logs/picard/dedup/{sample}.log",
+    params:
+        extra="--REMOVE_DUPLICATES true",
+        embed_ref=True,  # set true if the fasta reference should be embedded into the cram
+    # optional specification of memory usage of the JVM that snakemake will respect with global
+    # resource restrictions (https://snakemake.readthedocs.io/en/latest/snakefiles/rules.html#resources)
+    # and which can be used to request RAM during cluster job submission as `{resources.mem_mb}`:
+    # https://snakemake.readthedocs.io/en/latest/executing/cluster.html#job-properties
+    resources:
+        mem_mb=1024,
+    wrapper:
+        "master/bio/picard/markduplicates"
diff --git a/bio/picard/markduplicates/test/mapped/a.bam b/bio/picard/markduplicates/test/mapped/a.bam
diff --git a/bio/picard/markduplicates/test/mapped/a.bam.bai b/bio/picard/markduplicates/test/mapped/a.bam.bai
diff --git a/bio/picard/markduplicates/test/ref/genome.fasta b/bio/picard/markduplicates/test/ref/genome.fasta
diff --git a/bio/picard/markduplicates/test/ref/genome.fasta.fai b/bio/picard/markduplicates/test/ref/genome.fasta.fai
@@ -0,0 +1 @@
+1	59940	56	60	61
diff --git a/bio/picard/markduplicates/wrapper.py b/bio/picard/markduplicates/wrapper.py
@@ -1,4 +1,4 @@
-__author__ = "Johannes Köster"
+__author__ = "Johannes Köster, Christopher Schröder"
 __copyright__ = "Copyright 2016, Johannes Köster"
 __email__ = "koester@jimmy.harvard.edu"
 __license__ = "MIT"
@@ -13,19 +13,30 @@
 extra = snakemake.params.get("extra", "")
 java_opts = get_java_opts(snakemake)
 
-bams = snakemake.input
+bams = snakemake.input.bams
 if isinstance(bams, str):
     bams = [bams]
 bams = list(map("--INPUT {}".format, bams))
 
+if snakemake.output.bam.endswith(".cram"):
+    output = "/dev/stdout"
+    if snakemake.params.embed_ref:
+        view_options = "-O cram,embed_ref"
+    else:
+        view_options = "-O cram"
+    convert = f" | samtools view -@ {snakemake.threads} {view_options} --reference {snakemake.input.ref} -o {snakemake.output.bam}"
+else:
+    output = snakemake.output.bam
+    convert = ""
+
 with tempfile.TemporaryDirectory() as tmpdir:
     shell(
-        "picard MarkDuplicates"  # Tool and its subcommand
+        "(picard MarkDuplicates"  # Tool and its subcommand
         " {java_opts}"  # Automatic java option
         " {extra}"  # User defined parmeters
         " {bams}"  # Input bam(s)
         " --TMP_DIR {tmpdir}"
-        " --OUTPUT {snakemake.output.bam}"  # Output bam
+        " --OUTPUT {output}"  # Output bam
         " --METRICS_FILE {snakemake.output.metrics}"  # Output metrics
-        " {log}"  # Logging
+        " {convert} ) {log}"  # Logging
     )
diff --git a/test.py b/test.py
@@ -2201,6 +2201,14 @@ def test_picard_markduplicates():
     )
 
 
+@skip_if_not_modified
+def test_picard_markduplicates_cram():
+    run(
+        "bio/picard/markduplicates",
+        ["snakemake", "--cores", "1", "dedup/a.cram", "--use-conda", "-F"],
+    )
+
+
 @skip_if_not_modified
 def test_picard_markduplicateswithmatecigar():
     run(