Skip to content

Commit

Permalink
feat: Salmon decoy-aware gentrome (#490)
Browse files Browse the repository at this point in the history
* [fix] (template): Missing code in wrappers' doc. Error #187

* salmon decoy initial commit

* string formatting

* fix comment

* fix comment

* meta.yaml fixed

* logging

Co-authored-by: tdayris <tdayris@gustaveroussy.fr>
  • Loading branch information
tdayris and tdayris committed Jun 14, 2022
1 parent ca246fb commit 5bb3eab
Show file tree
Hide file tree
Showing 7 changed files with 111 additions and 0 deletions.
6 changes: 6 additions & 0 deletions bio/salmon/decoys/environment.yaml
@@ -0,0 +1,6 @@
channels:
- conda-forge
- defaults
dependencies:
- bzip2 =1.0.8
- gzip =1.12
16 changes: 16 additions & 0 deletions bio/salmon/decoys/meta.yaml
@@ -0,0 +1,16 @@
name: decoys
url: https://combine-lab.github.io/alevin-tutorial/2019/selective-alignment/
description: Generate gentrome sequences and gather decoy sequence names
authors:
- Thibault Dayris
input:
- transcriptome: Path to transcriptome sequences, fasta (gz/bz2) formatted.
- genome: Path to genome sequences, fasta (gz/bz2) formatted.
output:
- gentrome: Path to gentrome, fasta (gz/bz2) formatted.
- decoys: Path to text file containing decoy sequence names.
notes: |
Provide transcriptome and genome in the same format (raw fasta, gzipped
or bzip2-compressed). In case of compressed input, this wrapper requires 2 threads:
one for on-the-fly decompression and one for actual decoy sequences
acquisition.
12 changes: 12 additions & 0 deletions bio/salmon/decoys/test/Snakefile
@@ -0,0 +1,12 @@
# Test rule for the salmon decoys wrapper: from a gzip-compressed transcriptome
# and genome, it requests a gzip-compressed gentrome and a decoy name list.
rule test_salmon_decoy:
input:
transcriptome="transcriptome.fasta.gz",
genome="genome.fasta.gz",
output:
gentrome="gentrome.fasta.gz",
decoys="decoys.txt",
# Two threads: the wrapper asks for one extra thread when the genome is
# compressed (on-the-fly decompression).
threads: 2
log:
"decoys.log"
wrapper:
"master/bio/salmon/decoys"
Binary file added bio/salmon/decoys/test/genome.fasta.gz
Binary file not shown.
Binary file added bio/salmon/decoys/test/transcriptome.fasta.gz
Binary file not shown.
62 changes: 62 additions & 0 deletions bio/salmon/decoys/wrapper.py
@@ -0,0 +1,62 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""Snakemake wrapper for gentrome and decoy sequences acquisition"""

__author__ = "Thibault Dayris"
__copyright__ = "Copyright 2022, Thibault Dayris"
__email__ = "thibault.dayris@gustaveroussy.fr"
__license__ = "MIT"


from snakemake.shell import shell

# Send stderr of the shell commands to the rule's log file. append=True is
# used because this script issues two shell calls that share that log.
log = snakemake.log_fmt_shell(stdout=False, stderr=True, append=True)

# Minimum number of threads needed: one for the command itself, plus one
# when the genome has to be decompressed on the fly.
required_thread_nb = 1

# `genome` becomes the expression handed to sed: either the plain path, or a
# bash process substitution streaming the decompressed content.
genome = snakemake.input["genome"]
if genome.endswith(".gz"):
    genome = f"<( gzip --stdout --decompress {genome} )"
    required_thread_nb += 1  # Add a thread for gzip decompression
elif genome.endswith(".bz2"):
    genome = f"<( bzip2 --stdout --decompress {genome} )"
    required_thread_nb += 1  # Add a thread for bzip2 decompression

# Having more threads than required is fine; only a shortage is an error,
# hence "at least" in the message.
if snakemake.threads < required_thread_nb:
    raise ValueError(
        f"Salmon decoy wrapper requires at least {required_thread_nb} threads, "
        f"but only {snakemake.threads} were provided"
    )

# Both inputs and the gentrome output must carry a suffix from the same group:
# they are either all gzip-compressed, all bzip2-compressed, or all raw fasta.
sequences = [
    snakemake.input["transcriptome"],
    snakemake.input["genome"],
    snakemake.output["gentrome"],
]
accepted_suffix_groups = (
    (".gz",),  # gzip-compressed inputs and output
    (".bz2",),  # bzip2-compressed inputs and output
    (".fa", ".fna", ".fasta"),  # raw fasta inputs and output
)
same_format = any(
    all(fasta.endswith(suffixes) for fasta in sequences)
    for suffixes in accepted_suffix_groups
)
if not same_format:
    raise ValueError(
        "Mixed compression status: Either all fasta sequences are compressed "
        "with the *same* compression algorithm, or none of them are compressed."
    )

# Step 1: write the decoy names, taken from the genome sequence headers.
# How the sed script works:
#   -n        do not print lines unless explicitly requested
#   s/ .*//g  drop everything from the first space onwards (header comments)
#   s/>//p    remove the '>' character and print the line; lines where no
#             '>' was removed are not printed
decoys_command = (
    "( sed -n 's/ .*//g;s/>//p' {genome} ) "
    "> {snakemake.output.decoys} {log}"
)
shell(decoys_command)

# Step 2: build the gentrome by concatenating the transcriptome, then the
# genome, into a single file.
gentrome_command = (
    "cat {snakemake.input.transcriptome} {snakemake.input.genome} "
    "> {snakemake.output.gentrome} {log}"
)
shell(gentrome_command)
15 changes: 15 additions & 0 deletions test.py
Expand Up @@ -3000,6 +3000,21 @@ def test_trinity():
)


@skip_if_not_modified
def test_salmon_decoys():
    """Run the bio/salmon/decoys wrapper test, targeting gentrome.fasta.gz."""
    snakemake_command = [
        "snakemake",
        "--cores",
        "2",
        "--use-conda",
        "-F",
        "gentrome.fasta.gz",
    ]
    run("bio/salmon/decoys", snakemake_command)


@skip_if_not_modified
def test_salmon_index():
run(
Expand Down

0 comments on commit 5bb3eab

Please sign in to comment.