feat: Salmon decoy-aware gentrome (#490)

* [fix] (template): Missing code in wrappers' doc. Error #187 * salmon decoy initial commit * string formatting * fix comment * fix comment * meta.yaml fixed * logging Co-authored-by: tdayris <tdayris@gustaveroussy.fr>
snakemake · Jun 14, 2022 · 5bb3eab · 5bb3eab
1 parent ca246fb
commit 5bb3eab
Show file tree

Hide file tree

Showing 7 changed files with 111 additions and 0 deletions.
diff --git a/bio/salmon/decoys/environment.yaml b/bio/salmon/decoys/environment.yaml
@@ -0,0 +1,6 @@
+channels:
+  - conda-forge
+  - defaults
+dependencies:
+  - bzip2 =1.0.8
+  - gzip =1.12
diff --git a/bio/salmon/decoys/meta.yaml b/bio/salmon/decoys/meta.yaml
@@ -0,0 +1,16 @@
+name: decoys
+url: https://combine-lab.github.io/alevin-tutorial/2019/selective-alignment/
+description: Generate gentrome sequences and gather decoy sequences name
+authors:
+  - Thibault Dayris
+input:
+  - transcriptome: Path to transcriptome sequences, fasta (gz/bz2) formatted.
+  - genome: Path to genome sequences, fasta (gz/bz2) formatted.
+output:
+  - gentrome: Path to gentrome, fasta (gz/bz2) formatted.
+  - decoys: Path to text file contianing decoy sequence names.
+notes: |
+  Provide transcriptome and genome under the same format (raw fasta, gzipped
+  or bgzipped). In case of compressed input, this wrapper requires 2 threads:
+  one for on-the-fly decompression and one for actual decoy sequences
+  acquisition.
diff --git a/bio/salmon/decoys/test/Snakefile b/bio/salmon/decoys/test/Snakefile
@@ -0,0 +1,12 @@
+rule test_salmon_decoy:
+    input:
+        transcriptome="transcriptome.fasta.gz",
+        genome="genome.fasta.gz",
+    output:
+        gentrome="gentrome.fasta.gz",
+        decoys="decoys.txt",
+    threads: 2
+    log:
+        "decoys.log"
+    wrapper:
+        "master/bio/salmon/decoys"
diff --git a/bio/salmon/decoys/test/genome.fasta.gz b/bio/salmon/decoys/test/genome.fasta.gz
diff --git a/bio/salmon/decoys/test/transcriptome.fasta.gz b/bio/salmon/decoys/test/transcriptome.fasta.gz
diff --git a/bio/salmon/decoys/wrapper.py b/bio/salmon/decoys/wrapper.py
@@ -0,0 +1,62 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+"""Snakemake wrapper for gentrome and decoy sequences acquisition"""
+
+__author__ = "Thibault Dayris"
+__copyright__ = "Copyright 2022, Thibault Dayris"
+__email__ = "thibault.dayris@gustaveroussy.fr"
+__license__ = "MIT"
+
+
+from snakemake.shell import shell
+
+log = snakemake.log_fmt_shell(stdout=False, stderr=True, append=True)
+required_thread_nb = 1
+
+genome = snakemake.input["genome"]
+if genome.endswith(".gz"):
+    genome = f"<( gzip --stdout --decompress {genome} )"
+    required_thread_nb += 1  # Add a thread for gzip uncompression
+elif genome.endswith(".bz2"):
+    genome = f"<( bzip2 --stdout --decompress {genome} )"
+    required_thread_nb += 1  # Add a thread for bzip2 uncompression
+
+if snakemake.threads < required_thread_nb:
+    raise ValueError(
+        f"Salmon decoy wrapper requires exactly {required_thread_nb} threads, "
+        f"but only {snakemake.threads} were provided"
+    )
+
+sequences = [
+    snakemake.input["transcriptome"],
+    snakemake.input["genome"],
+    snakemake.output["gentrome"],
+]
+if all(fasta.endswith(".gz") for fasta in sequences):
+    # Then all input sequences are gzipped. The output will also be gzipped.
+    pass
+elif all(fasta.endswith(".bz2") for fasta in sequences):
+    # Then all input sequences are bgzipped. The output will also be bgzipped.
+    pass
+elif all(fasta.endswith((".fa", ".fna", ".fasta")) for fasta in sequences):
+    # Then all input sequences are raw fasta. The output will also be raw fasta.
+    pass
+else:
+    raise ValueError(
+        "Mixed compression status: Either all fasta sequences are compressed "
+        "with the *same* compression algorithm, or none of them are compressed."
+    )
+
+# Gathering decoy sequences names
+# Sed command works as follow:
+# -n       = do not print all lines
+# s/ .*//g = Remove anything after spaces. (remove comments)
+# s/>//p  = Remove '>' character at the begining of sequence names. Print names.
+shell("( sed -n 's/ .*//g;s/>//p' {genome} ) > {snakemake.output.decoys} {log}")
+
+# Building big gentrome file
+shell(
+    "cat {snakemake.input.transcriptome} {snakemake.input.genome} "
+    "> {snakemake.output.gentrome} {log}"
+)
diff --git a/test.py b/test.py
@@ -3000,6 +3000,21 @@ def test_trinity():
     )
 
 
+@skip_if_not_modified
+def test_salmon_decoys():
+    run(
+        "bio/salmon/decoys",
+        [
+            "snakemake",
+            "--cores",
+            "2",
+            "--use-conda",
+            "-F",
+            "gentrome.fasta.gz"
+        ]
+    )
+
+
 @skip_if_not_modified
 def test_salmon_index():
     run(