feat: allow for specifying multiple chromosomes in ensembl reference …

…dna sequence download wrapper (#2376) This will concatenate multiple chromosomes in the order in which they are listed under `params: chromosome:`.   ### Description  ### QC  * [x] I confirm that: For all wrappers added by this PR, * there is a test case which covers any introduced changes, * `input:` and `output:` file paths in the resulting rule can be changed arbitrarily, * either the wrapper can only use a single core, or the example rule contains a `threads: x` statement with `x` being a reasonable default, * rule names in the test case are in [snake_case](https://en.wikipedia.org/wiki/Snake_case) and somehow tell what the rule is about or match the tool's purpose or name (e.g., `map_reads` for a step that maps reads), * all `environment.yaml` specifications follow [the respective best practices](https://stackoverflow.com/a/64594513/2352071), * wherever possible, command line arguments are inferred and set automatically (e.g. based on file extensions in `input:` or `output:`), * all fields of the example rules in the `Snakefile`s and their entries are explained via comments (`input:`/`output:`/`params:` etc.), * `stderr` and/or `stdout` are logged correctly (`log:`), depending on the wrapped tool, * temporary files are either written to a unique hidden folder in the working directory, or (better) stored where the Python function `tempfile.gettempdir()` points to (see [here](https://docs.python.org/3/library/tempfile.html#tempfile.gettempdir); this also means that using any Python `tempfile` default behavior works), * the `meta.yaml` contains a link to the documentation of the respective tool or command, * `Snakefile`s pass the linting (`snakemake --lint`), * `Snakefile`s are formatted with [snakefmt](https://github.com/snakemake/snakefmt), * Python wrapper scripts are formatted with [black](https://black.readthedocs.io). * Conda environments use a minimal number of channels, in recommended ordering. E.g. 
for bioconda, use (conda-forge, bioconda, nodefaults), as conda-forge should have the highest priority and the defaults channel is usually not needed because most packages are in conda-forge nowadays.
snakemake · Nov 23, 2023 · c5590f0 · c5590f0
1 parent 03bf2d9
commit c5590f0
Show file tree

Hide file tree

Showing 3 changed files with 29 additions and 6 deletions.
diff --git a/bio/reference/ensembl-sequence/test/Snakefile b/bio/reference/ensembl-sequence/test/Snakefile
@@ -13,15 +13,31 @@ rule get_genome:
         "master/bio/reference/ensembl-sequence"
 
 
-rule get_chromosome:
+rule get_single_chromosome:
     output:
-        "refs/chr1.fasta",
+        "refs/chr2.fasta",
     params:
         species="saccharomyces_cerevisiae",
         datatype="dna",
         build="R64-1-1",
         release="101",
-        chromosome="I",  # optional: restrict to chromosome
+        chromosome="II",  # optional: restrict to one or multiple chromosomes, for multiple see below
+        # branch="plants",  # optional: specify branch
+    log:
+        "logs/get_genome.log",
+    cache: "omit-software"  # save space and time with between workflow caching (see docs)
+    wrapper:
+        "master/bio/reference/ensembl-sequence"
+
+rule get_multiple_chromosome:
+    output:
+        "refs/chr1_and_chr2.fasta",
+    params:
+        species="saccharomyces_cerevisiae",
+        datatype="dna",
+        build="R64-1-1",
+        release="101",
+        chromosome=["I", "II"],  # optional: restrict to one or multiple chromosomes
         # branch="plants",  # optional: specify branch
     log:
         "logs/get_genome.log",

diff --git a/bio/reference/ensembl-sequence/wrapper.py b/bio/reference/ensembl-sequence/wrapper.py
@@ -30,7 +30,7 @@
 chromosome = snakemake.params.get("chromosome", "")
 if datatype == "dna":
     if chromosome:
-        suffixes = ["dna.chromosome.{}.fa.gz".format(chromosome)]
+        suffixes = [f"dna.chromosome.{chrom}.fa.gz" for chrom in chromosome]
     else:
         suffixes = ["dna.primary_assembly.fa.gz", "dna.toplevel.fa.gz"]
 elif datatype == "cdna":
@@ -62,7 +62,7 @@
     except sp.CalledProcessError:
         continue
 
-    shell("(curl -L {url} | gzip -d > {snakemake.output[0]}) {log}")
+    shell("(curl -L {url} | gzip -d >> {snakemake.output[0]}) {log}")
     success = True
     break
 

diff --git a/test.py b/test.py
@@ -5321,7 +5321,14 @@ def test_ensembl_sequence_old_release():
 def test_ensembl_sequence_chromosome():
     run(
         "bio/reference/ensembl-sequence",
-        ["snakemake", "--cores", "1", "refs/chr1.fasta", "--use-conda", "-F"],
+        ["snakemake", "--cores", "1", "refs/chr2.fasta", "--use-conda", "-F"],
+    )
+
+
+def test_ensembl_sequence_chromosomes():
+    run(
+        "bio/reference/ensembl-sequence",
+        ["snakemake", "--cores", "1", "refs/chr1_and_chr2.fasta", "--use-conda", "-F"],
     )