Skip to content

Commit

Permalink
feat: added support for gzip output files in ensembl annotation downl…
Browse files Browse the repository at this point in the history
…oad wrapper (#475)

* Added support for gzip output files

* Added test for gtf.gz

Co-authored-by: Johannes Köster <johannes.koester@uni-due.de>
  • Loading branch information
fgvieira and johanneskoester committed Apr 25, 2022
1 parent b9e25ae commit 42696c2
Show file tree
Hide file tree
Showing 4 changed files with 56 additions and 15 deletions.
2 changes: 2 additions & 0 deletions bio/reference/ensembl-annotation/meta.yaml
Expand Up @@ -2,3 +2,5 @@ name: ensembl-annotation
description: Download annotation of genomic sites (e.g. transcripts) from ENSEMBL FTP servers, and store them in a single .gtf or .gff3 file.
authors:
- Johannes Köster
output:
- Ensembl GTF or GFF3 annotation file
22 changes: 18 additions & 4 deletions bio/reference/ensembl-annotation/test/Snakefile
@@ -1,14 +1,28 @@
rule get_annotation:
output:
"refs/annotation.gtf"
"refs/annotation.gtf",
params:
species="homo_sapiens",
release="87",
build="GRCh37",
fmt="gtf",
flavor="" # optional, e.g. chr_patch_hapl_scaff, see Ensembl FTP.
flavor="", # optional, e.g. chr_patch_hapl_scaff, see Ensembl FTP.
log:
"logs/get_annotation.log"
"logs/get_annotation.log",
cache: True # save space and time with between workflow caching (see docs)
wrapper:
"master/bio/reference/ensembl-annotation"


rule get_annotation_gz:
output:
"refs/annotation.gtf.gz",
params:
species="homo_sapiens",
release="87",
build="GRCh37",
flavor="", # optional, e.g. chr_patch_hapl_scaff, see Ensembl FTP.
log:
"logs/get_annotation.log",
cache: True # save space and time with between workflow caching (see docs)
wrapper:
"master/bio/reference/ensembl-annotation"
35 changes: 26 additions & 9 deletions bio/reference/ensembl-annotation/wrapper.py
Expand Up @@ -5,43 +5,60 @@

import subprocess
import sys
from pathlib import Path
from snakemake.shell import shell


log = snakemake.log_fmt_shell(stdout=False, stderr=True)


species = snakemake.params.species.lower()
release = int(snakemake.params.release)
fmt = snakemake.params.fmt
build = snakemake.params.build
flavor = snakemake.params.get("flavor", "")
release = int(snakemake.params.release)
# Derive the annotation format and the compression flag from the suffixes of
# the first output file, e.g. "annotation.gtf.gz" -> out_fmt="gtf", out_gz=True.
out_suffixes = Path(snakemake.output[0]).suffixes
# A trailing ".gz" means the downloaded file is kept compressed.
out_gz = bool(out_suffixes) and out_suffixes[-1] == ".gz"
if out_gz:
    out_suffixes.pop()
# With no (remaining) suffix, fall back to an empty format so the format
# validation below raises its ValueError instead of an IndexError here.
out_fmt = out_suffixes[-1].lstrip(".") if out_suffixes else ""


branch = ""
if release >= 81 and build == "GRCh37":
# use the special grch37 branch for new releases
branch = "grch37/"


flavor = snakemake.params.get("flavor", "")
if flavor:
flavor += "."

log = snakemake.log_fmt_shell(stdout=False, stderr=True)

suffix = ""
if fmt == "gtf":
if out_fmt == "gtf":
suffix = "gtf.gz"
elif fmt == "gff3":
elif out_fmt == "gff3":
suffix = "gff3.gz"
else:
raise ValueError(
"invalid format specified. Only 'gtf[.gz]' and 'gff3[.gz]' are currently supported."
)


url = "ftp://ftp.ensembl.org/pub/{branch}release-{release}/{fmt}/{species}/{species_cap}.{build}.{release}.{flavor}{suffix}".format(
url = "ftp://ftp.ensembl.org/pub/{branch}release-{release}/{out_fmt}/{species}/{species_cap}.{build}.{release}.{flavor}{suffix}".format(
release=release,
build=build,
species=species,
fmt=fmt,
out_fmt=out_fmt,
species_cap=species.capitalize(),
suffix=suffix,
flavor=flavor,
branch=branch,
)


try:
shell("(curl -L {url} | gzip -d > {snakemake.output[0]}) {log}")
if out_gz:
shell("curl -L {url} > {snakemake.output[0]} {log}")
else:
shell("(curl -L {url} | gzip -d > {snakemake.output[0]}) {log}")
except subprocess.CalledProcessError as e:
if snakemake.log:
sys.stderr = open(snakemake.log[0], "a")
Expand Down
12 changes: 10 additions & 2 deletions test.py
Expand Up @@ -3604,10 +3604,18 @@ def test_ensembl_sequence_chromosome_old_release():


@skip_if_not_modified
def test_ensembl_annotation():
def test_ensembl_annotation_gtf():
run(
"bio/reference/ensembl-annotation",
["snakemake", "--cores", "1", "--use-conda", "-F"],
["snakemake", "--cores", "1", "refs/annotation.gtf", "--use-conda", "-F"],
)


@skip_if_not_modified
def test_ensembl_annotation_gtf_gz():
run(
"bio/reference/ensembl-annotation",
["snakemake", "--cores", "1", "refs/annotation.gtf.gz", "--use-conda", "-F"],
)


Expand Down

0 comments on commit 42696c2

Please sign in to comment.