feat: Gffread (#1291)

### Description This PR adds [gffread](https://github.com/gpertea/gffread) to the list of available wrappers. ### QC  * [X] I confirm that: For all wrappers added by this PR, * there is a test case which covers any introduced changes, * `input:` and `output:` file paths in the resulting rule can be changed arbitrarily, * either the wrapper can only use a single core, or the example rule contains a `threads: x` statement with `x` being a reasonable default, * rule names in the test case are in [snake_case](https://en.wikipedia.org/wiki/Snake_case) and somehow tell what the rule is about or match the tools purpose or name (e.g., `map_reads` for a step that maps reads), * all `environment.yaml` specifications follow [the respective best practices](https://stackoverflow.com/a/64594513/2352071), * wherever possible, command line arguments are inferred and set automatically (e.g. based on file extensions in `input:` or `output:`), * all fields of the example rules in the `Snakefile`s and their entries are explained via comments (`input:`/`output:`/`params:` etc.), * `stderr` and/or `stdout` are logged correctly (`log:`), depending on the wrapped tool, * temporary files are either written to a unique hidden folder in the working directory, or (better) stored where the Python function `tempfile.gettempdir()` points to (see [here](https://docs.python.org/3/library/tempfile.html#tempfile.gettempdir); this also means that using any Python `tempfile` default behavior works), * the `meta.yaml` contains a link to the documentation of the respective tool or command, * `Snakefile`s pass the linting (`snakemake --lint`), * `Snakefile`s are formatted with [snakefmt](https://github.com/snakemake/snakefmt), * Python wrapper scripts are formatted with [black](https://black.readthedocs.io). * Conda environments use a minimal amount of channels, in recommended ordering. E.g. for bioconda, use (conda-forge, bioconda, nodefaults, as conda-forge should have highest priority and defaults channels are usually not needed because most packages are in conda-forge nowadays). --------- Co-authored-by: tdayris <tdayris@gustaveroussy.fr> Co-authored-by: tdayris <thibault.dayris@gustaveroussy.fr> Co-authored-by: Johannes Köster <johannes.koester@uni-due.de> Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: snakedeploy-bot[bot] <115615832+snakedeploy-bot[bot]@users.noreply.github.com> Co-authored-by: Felix Mölder <felix.moelder@uni-due.de> Co-authored-by: Christopher Schröder <christopher.schroeder@tu-dortmund.de> Co-authored-by: Filipe G. Vieira <1151762+fgvieira@users.noreply.github.com>
snakemake · May 12, 2023 · d613ddf · d613ddf
1 parent b08081f
commit d613ddf
Show file tree

Hide file tree

Showing 7 changed files with 171 additions and 0 deletions.
diff --git a/bio/gffread/environment.yaml b/bio/gffread/environment.yaml
@@ -0,0 +1,6 @@
+channels:
+  - conda-forge
+  - bioconda
+  - nodefaults
+dependencies:
+  - gffread =0.12.7
diff --git a/bio/gffread/meta.yaml b/bio/gffread/meta.yaml
@@ -0,0 +1,20 @@
+name: gffread
+url: http://ccb.jhu.edu/software/stringtie/gff.shtml
+description: >
+  Validate, filter, convert and perform various other operations on GFF/GTF files with Gffread
+author:
+  - Thibault Dayris
+input:
+  - fasta: Path to genome file (FASTA formatted).
+  - annotation: Path to genome annotation (GTF/GTF/BED formatted).
+  - ids: Optional path to records/transcript to keep.
+  - nids: Optional path to records/transcripts to discard.
+  - seq_info: Optional path to sequence information, a TSV formatted text file containing `<seq-name> <seq-length> <seq-description>`
+  - sort_by: Optional path to a text file containing the ordered list of reference sequences.
+  - attr: Optional text file containing comma-separated list of annotation attributes to keep.
+  - chr_replace: Optional path to a TSV-formatted text file containing `<original_ref_ID> <new_ref_ID>`.
+output:
+  - records: Path to genome sequence/annotation in the requested format, containing the requested information.
+  - dupinfo: Optional path to clustering/merging information
+notes: |
+  Input/output formats are automatically detected from their file extension.
diff --git a/bio/gffread/test/Snakefile b/bio/gffread/test/Snakefile
@@ -0,0 +1,20 @@
+rule test_gffread:
+    input:
+        fasta="genome.fasta",
+        annotation="annotation.gtf",
+        # ids="",  # Optional path to records to keep
+        # nids="",  # Optional path to records to drop
+        # seq_info="",  # Optional path to sequence information
+        # sort_by="",  # Optional path to the ordered list of reference sequences
+        # attr="",  # Optional annotation attributes to keep.
+        # chr_replace="",  # Optional path to <original_ref_ID> <new_ref_ID>
+    output:
+        records="transcripts.fa",
+        # dupinfo="",  # Optional path to clustering/merging information
+    threads: 1
+    log:
+        "logs/gffread.log",
+    params:
+        extra="",
+    wrapper:
+        "master/bio/gffread"
diff --git a/bio/gffread/test/annotation.gtf b/bio/gffread/test/annotation.gtf
@@ -0,0 +1,11 @@
+#!genome-build ManuallyMadeForExample
+#!genome-version MMFE01
+#!genome-date 2023-04
+#!genome-build-accession NCBI:MMFE01
+#!genebuild-last-updated 2023-04
+chromosome1	manually_made	gene	160	208	.	+	.	gene_id "ENMG01"; gene_version "1"; gene_name "ManuallyMadeGene1"; gene_source "manually_made"; gene_biotype "protein_coding";
+chromosome1	manually_made	transcript	160	208	.	+	.	gene_id "ENMG01"; gene_version "1"; transcript_id "transcript1"; transcript_version "1"; gene_name "ManuallyMadeGene1"; gene_source "manually_made"; gene_biotype "protein_coding"; transcript_name "ENMT01"; transcript_source "manually_made"; transcript_biotype "processed_transcript"; tag "basic";
+chromosome1	manually_made	exon	160	208	.	+	.	gene_id "ENMG01"; gene_version "1"; transcript_id "transcript1"; transcript_version "1"; exon_number "1"; gene_name "ManuallyMadeGene1"; gene_source "manually_made"; gene_biotype "protein_coding"; transcript_name: "ENMT01"; transcript_source "manually_made"; transcript_biotype "processed_transcript"; tag "basic"; exon_id "ENEX01"; exon_version "1";
+chromosome1	manually_made	gene	160	240	.	+	.	gene_id "ENMG02"; gene_version "1"; gene_name "ManuallyMadeGene2"; gene_source "manually_made"; gene_biotype "protein_coding";
+chromosome1	manually_made	transcript	160	240	.	+	.	gene_id "ENMG02"; gene_version "1"; transcript_id "transcript2"; transcript_version "1"; gene_name "ManuallyMadeGene2"; gene_source "manually_made"; gene_biotype "protein_coding"; transcript_name "ENMT02"; transcript_source "manually_made"; transcript_biotype "processed_transcript"; tag "basic";
+chromosome1	manually_made	exon	160	240	.	+	.	gene_id "ENMG02"; gene_version "1"; transcript_id "transcript2"; transcript_version "1"; exon_number "1"; gene_name "ManuallyMadeGene2"; gene_source "manually_made"; gene_biotype "protein_coding"; transcript_name: "ENMT02"; transcript_source "manually_made"; transcript_biotype "processed_transcript"; tag "basic"; exon_id "ENEX02"; exon_version "1";
diff --git a/bio/gffread/test/genome.fasta b/bio/gffread/test/genome.fasta
@@ -0,0 +1,13 @@
+>chromosome1
+NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
+NNNNNNNNCTAGTAATACACGGATCTCCTCGGCGGAAGATTCCTACCGAAGCATCATCGTAACTTAATTACGTGATGTG
+CCAGGCTCGTATGTACATCGCTCCTCAAAGTGAGGGGAAGTCCTAATCGGATACCGATTGGACTCTTGAGTACCGGCCC
+TGTCGTACCGCTTCCCCCTTGAGCCGCTAGTAATCGATGCTCTACGAATAGGGCACCATCCTCGTTGTGCGCTACCACG
+GATTAGGCGCATCTCCCTGAGTCGGTTTAAAGATTGTTACCGTCCACCGTTGTCATATCAATATTATTAACAAGTTCGG
+TGGTAGGCATCTTATGGAAGGCTTACGGTTGCACCTTCCCTCAATCTCTTGCGACCATACTGTTATTCGGCGGGAACAC
+CGGTCTAACTGCGGTTAAGATAAGATTGCTAAGAATATTGTCGACTGGGATCCGGTTTATTATAGGATCTTCAGCTGTG
+GTTCCGCGACCACAACATCTAGCATGGGGGGCTCCGTGTGTTTCGAAGCGCCCATCATTTCGTAGCCACATATTGGAAT
+TAGCTGCCTTCAGAGTGATAATTAATCGCATAGGTAGGAGCACCCTCGTGAGGTCTTACTTGCCGGCCCGGTTTCATTC
+CAGAATCTGAGTTACCCGTGTTATGTCATGATCCTTGTATGCGTACTCTTGATAGGTAACCCGGAGTGCCCACCACGCA
+AGTTTATATAATCCCCGGGGAACAGGCTGTTGCCCAAAAGACTAGGCCCGTGTAGCTTTGCCCCGGATTCTCGTTAGTC
+GAGCGTTATGCTTTATATNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
diff --git a/bio/gffread/wrapper.py b/bio/gffread/wrapper.py
@@ -0,0 +1,87 @@
+__author__ = "Thibault Dayris"
+__copyright__ = "Copyright 2023, Thibault Dayris"
+__mail__ = "thibault.dayris@gustaveroussy.fr"
+__license__ = "MIT"
+
+
+from snakemake.shell import shell
+
+extra = snakemake.params.get("extra", "")
+log = snakemake.log_fmt_shell(stdout=False, stderr=True)
+
+annotation = snakemake.input.annotation
+records = snakemake.output.records
+
+# Input format control
+if annotation.endswith(".bed"):
+    extra += " --in-bed "
+elif annotation.endswith(".tlf"):
+    extra += " --in-tlf "
+elif annotation.endswith(".gtf"):
+    pass
+else:
+    raise ValueError("Unknown annotation format")
+
+
+# Output format control
+if records.endswith((".gtf", ".gff", ".gff3")):
+    extra += " -T "
+elif records.endswith(".bed"):
+    extra += " --bed "
+elif records.endswith(".tlf"):
+    extra += " --tlf "
+elif records.endswith((".fasta", ".fa", ".fna")):
+    pass
+else:
+    raise ValueError("Unknown records format")
+
+
+# Optional input files
+ids = snakemake.input.get("ids", "")
+if ids:
+    extra += f" --ids {ids} "
+
+nids = snakemake.input.get("nids", "")
+if nids:
+    if ids:
+        raise ValueError(
+            "Provide either sequences ids to keep, or to drop."
+            " Or else, an empty file is produced."
+        )
+    extra += f" --nids {nids} "
+
+seq_info = snakemake.input.get("seq_info", "")
+if seq_info:
+    extra += f" -s {seq_info} "
+
+sort_by = snakemake.input.get("sort_by", "")
+if sort_by:
+    extra += f" --sort-by {sort_by} "
+
+attr = snakemake.input.get("attr", "")
+if attr:
+    if not records.endswith((".gtf", ".gff", ".gff3")):
+        raise ValueError(
+            "GTF attributes specified in input, "
+            "but records are not in GTF/GFF format."
+        )
+    extra += f" --attrs {attr} "
+
+chr_replace = snakemake.input.get("chr_replace", "")
+if chr_replace:
+    extra += f" -m {chr_replace} "
+
+
+# Optional output files
+dupinfo = snakemake.output.get("dupinfo", "")
+if dupinfo:
+    extra += f" -d {dupinfo} "
+
+
+shell(
+    "gffread {extra} "
+    "-o {records} "
+    "{snakemake.input.fasta} "
+    "{annotation} "
+    "{log} "
+)
diff --git a/test.py b/test.py
@@ -255,6 +255,20 @@ def test_seqkit_rmdup():
     )
 
 
+@skip_if_not_modified
+def test_gffread():
+    run(
+        "bio/gffread",
+        [
+            "snakemake",
+            "--cores",
+            "1",
+            "--use-conda",
+            "-F",
+            "transcripts.fa"
+        ]
+    )
+
 @skip_if_not_modified
 def test_seqkit_fx2tab():
     run(