feat: Gseapy (#1822)

### Description This PR adds `gseapy` to the list of available wrappers. I plan to add replot, dotplots, heatmaps, and barplots functions in the future. ### QC  * [X] I confirm that: For all wrappers added by this PR, * there is a test case which covers any introduced changes, * `input:` and `output:` file paths in the resulting rule can be changed arbitrarily, * either the wrapper can only use a single core, or the example rule contains a `threads: x` statement with `x` being a reasonable default, * rule names in the test case are in [snake_case](https://en.wikipedia.org/wiki/Snake_case) and somehow tell what the rule is about or match the tools purpose or name (e.g., `map_reads` for a step that maps reads), * all `environment.yaml` specifications follow [the respective best practices](https://stackoverflow.com/a/64594513/2352071), * wherever possible, command line arguments are inferred and set automatically (e.g. based on file extensions in `input:` or `output:`), * all fields of the example rules in the `Snakefile`s and their entries are explained via comments (`input:`/`output:`/`params:` etc.), * `stderr` and/or `stdout` are logged correctly (`log:`), depending on the wrapped tool, * temporary files are either written to a unique hidden folder in the working directory, or (better) stored where the Python function `tempfile.gettempdir()` points to (see [here](https://docs.python.org/3/library/tempfile.html#tempfile.gettempdir); this also means that using any Python `tempfile` default behavior works), * the `meta.yaml` contains a link to the documentation of the respective tool or command, * `Snakefile`s pass the linting (`snakemake --lint`), * `Snakefile`s are formatted with [snakefmt](https://github.com/snakemake/snakefmt), * Python wrapper scripts are formatted with [black](https://black.readthedocs.io). * Conda environments use a minimal amount of channels, in recommended ordering. E.g. for bioconda, use (conda-forge, bioconda, nodefaults, as conda-forge should have highest priority and defaults channels are usually not needed because most packages are in conda-forge nowadays). --------- Co-authored-by: tdayris <tdayris@gustaveroussy.fr> Co-authored-by: tdayris <thibault.dayris@gustaveroussy.fr> Co-authored-by: Johannes Köster <johannes.koester@uni-due.de> Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: snakedeploy-bot[bot] <115615832+snakedeploy-bot[bot]@users.noreply.github.com> Co-authored-by: Felix Mölder <felix.moelder@uni-due.de> Co-authored-by: Christopher Schröder <christopher.schroeder@tu-dortmund.de>
snakemake · Oct 30, 2023 · 2a50eb0 · 2a50eb0
1 parent 99e9f60
commit 2a50eb0
Show file tree

Hide file tree

Showing 10 changed files with 320 additions and 0 deletions.
diff --git a/bio/gseapy/gsea/environment.yaml b/bio/gseapy/gsea/environment.yaml
@@ -0,0 +1,6 @@
+channels:
+  - conda-forge
+  - bioconda
+  - nodefaults
+dependencies:
+  - gseapy =1.0.6
diff --git a/bio/gseapy/gsea/meta.yaml b/bio/gseapy/gsea/meta.yaml
@@ -0,0 +1,20 @@
+name: gseapy gsea
+url: https://github.com/zqfang/GSEApy
+description: GSEApy, Gene Set Enrichment Analysis in Python
+authors:
+  - Thibault Dayris
+input:
+  - gmt: Optional path to gene sets (GMT formatted). If no GMT file is provided, then a `gene_sets` value must be given in `params`.
+  - expr: Path to expression table, required for `gsea` and `ssgsea`. (RNK formatted)
+  - cls: Path to categorical class file, required for `gsea`. (CLS formatted)
+  - rank: Path to pre-ranked genes, required for `prerank`. (TSV/CSV formatted)
+  - gene_list: Path to gene list file, required for `enrichr`. (TXT/TSV/CSV formatted)
+  - background: Optional path to a background file for `enrichr`. (TXT/TSV/CSV formatted)
+output:
+  - outdir: Path to output directory
+  - pkl: Path to serialized results (Pickle)
+params:
+  - extra: Dictionnary of arguments given to GSEApy's subfunction, besides IO and threading options.
+  - gene_sets: Non-file gene set names from Biomart.
+notes: |
+  GSEApy's subfunctions are selected directly from input files.
diff --git a/bio/gseapy/gsea/test/Snakefile b/bio/gseapy/gsea/test/Snakefile
@@ -0,0 +1,77 @@
+rule test_gseapy_enrichr:
+    input:
+        gene_list="genes_list.txt",
+        # gmt="", Optional path to a GMT file
+    output:
+        enrichr_dir=directory("KEGG_2016"),
+    threads: 1
+    params:
+        extra={"cutoff": 1},
+        gene_sets=["KEGG_2016"],  # Optional list of gene sets available on biomart
+    log:
+        wrapper="logs/gseapy/wrapper.enrichr.log",
+        gseapy="logs/gseapy/enrichr.log",
+    wrapper:
+        "master/bio/gseapy/gsea"
+
+
+rule test_gseapy_gsea:
+    input:
+        expr="genes_expr.tsv",
+        cls="sample_class.cls",
+        gmt="geneset.gmt",
+    output:
+        gmt="gene_sets.gmt",
+        csv="gsea.results.csv",
+    threads: 1
+    params:
+        extra={
+            "min_size": 1,
+        },
+        # gene_sets = [""] # Optional list of gene sets available on biomart
+    log:
+        wrapper="logs/gseapy/wrapper.gsea.log",
+        gseapy="logs/gseapy/gsea.log",
+    wrapper:
+        "master/bio/gseapy/gsea"
+
+
+rule test_gseapy_ssgsea:
+    input:
+        expr="genes_expr.tsv",
+        gmt="geneset.gmt",
+    output:
+        gmt="gene_sets.gmt",
+        csv="ssgsea.results.csv",
+    threads: 1
+    params:
+        extra={
+            "min_size": 1,
+        },
+        # gene_sets = [""] # Optional list of gene sets available on biomart
+    log:
+        wrapper="logs/gseapy/wrapper.ssgsea.log",
+        gseapy="logs/gseapy/ssgsea.log",
+    wrapper:
+        "master/bio/gseapy/gsea"
+
+
+rule test_gseapy_prerank:
+    input:
+        rank="genes.rnk",
+        gmt="geneset.gmt",
+    output:
+        gmt="gene_sets.gmt",
+        csv="prerank.results.csv",
+        prerank=directory("prerank_results_dir"),
+    threads: 1
+    params:
+        extra={
+            "min_size": 1,
+        },
+        # gene_sets = [""] # Optional list of gene sets available on biomart
+    log:
+        wrapper="logs/gseapy/wrapper.prerank.log",
+        gseapy="logs/gseapy/prerank.log",
+    wrapper:
+        "master/bio/gseapy/gsea"
diff --git a/bio/gseapy/gsea/test/genes.rnk b/bio/gseapy/gsea/test/genes.rnk
@@ -0,0 +1,6 @@
+SCARA3	0.9
+CLIC6	0.8
+TACSTD2	0.02
+TINAGL1	-0.2
+PTX3	-0.4
+CTLA2B	-0.98
diff --git a/bio/gseapy/gsea/test/genes_expr.tsv b/bio/gseapy/gsea/test/genes_expr.tsv
@@ -0,0 +1,9 @@
+Gene_id	Sample1	Sample2	Sample3	Sample4	Sample5	Sample6
+SCARA3	1	1	3	4	1	5
+CLIC6	0	0	14	18	0	15
+TACSTD2	0	0	0	0	0	0
+TINAGL1	15	15	15	15	15	15
+PTX3	1	1	3	4	1	5
+CLIC6	0	0	14	18	0	15
+TACSTD2	0	0	0	0	0	0
+TINAGL1	15	15	15	15	15	15
diff --git a/bio/gseapy/gsea/test/genes_list.txt b/bio/gseapy/gsea/test/genes_list.txt
@@ -0,0 +1,13 @@
+CTLA2B
+SCARA3
+LOC100044683
+CMBL
+CLIC6
+IL13RA1
+TACSTD2
+DKKL1
+CSF1
+CITED1
+SYNPO2L
+TINAGL1
+PTX3
diff --git a/bio/gseapy/gsea/test/geneset.gmt b/bio/gseapy/gsea/test/geneset.gmt
@@ -0,0 +1,3 @@
+PosGS	PositiveGeneSet	SCARA3	CLIC6	TACSTD2
+NegGS	NegativeGeneSet	TINAGL1	PTX3	Gene12	CTLA2B
+MisGS	FilteredOutGeneSet	Gene145	Gene39
diff --git a/bio/gseapy/gsea/test/sample_class.cls b/bio/gseapy/gsea/test/sample_class.cls
@@ -0,0 +1,3 @@
+6 3 3
+# WT Mut
+WT WT Mut Mut WT Mut
diff --git a/bio/gseapy/gsea/wrapper.py b/bio/gseapy/gsea/wrapper.py
@@ -0,0 +1,132 @@
+# coding: utf-8
+
+__author__ = "Thibault Dayris"
+__copyright__ = "Copyright 2023, Thibault Dayris"
+__mail__ = "thibault.dayris@gustaveroussy.fr"
+__license__ = "MIT"
+
+import glob
+import gseapy
+import logging
+import shutil
+
+from tempfile import TemporaryDirectory
+
+
+def mouve_output_files(method, tempdir):
+    """
+    Move the predictable output files individually
+    and let the rest be in an additional directory
+    """
+    log = snakemake.log.get("gseapy")
+    if log:
+        logging.info(f"Moving gseapy logs to {log}")
+        shutil.move(
+            # The logging file contains a timestamp
+            # there is only one logging file
+            glob.glob(f"{tempdir}/gseapy.{method}.*.log")[0],
+            log,
+        )
+
+    if method != "enrichr":
+        # `enrichr` method does not produce gmt files
+        gmt = snakemake.output.get("gmt")
+        if gmt:
+            logging.info(f"Moving gseapy GMT to {gmt}")
+            shutil.move(f"{tempdir}/gene_sets.gmt", gmt)
+
+        # `enrichr` method does not produce any csv
+        csv = snakemake.output.get("csv")
+        if csv:
+            logging.info(f"Moving gseapy CSV to {csv}")
+            if method == "gsea":
+                shutil.move(f"{tempdir}/gseapy.phenotype.{method}.report.csv", csv)
+            else:
+                shutil.move(f"{tempdir}/gseapy.gene_set.{method}.report.csv", csv)
+
+        # Only produced with pre-ranking method.
+        # File names depend on gseapy results and are not predictable
+        prerank_dir = snakemake.output.get("prerank")
+        if method == "prerank" and prerank_dir:
+            logging.info(f"Moving gseapy pre-rank results to {prerank_dir}")
+            shutil.move(f"{tempdir}/{method}/", prerank_dir)
+
+    # Only produced with enrichr method.
+    # File names depend on gseapy results and are not predictable
+    else:
+        # Since logging file has already been moved, only
+        # PDF/TXT results remains
+        enrichr_dir = snakemake.output.get("enrichr_dir")
+        if enrichr_dir:
+            logging.info(f"Moving gseapy-biomart enrichr to {enrichr_dir}")
+            shutil.move(f"{tempdir}/", enrichr_dir)
+
+
+class TooManyThreadsRequested(Exception):
+    def __init__(self):
+        super().__init__("This subcommand uses only one threads.")
+
+
+# Building Gene sets object
+gmt_path = snakemake.input.get("gmt", [])
+gene_sets = snakemake.params.get("gene_sets", [])
+if isinstance(gene_sets, str):
+    gene_sets = [gene_sets]
+
+if isinstance(gmt_path, list):
+    gene_sets += gmt_path
+else:
+    gene_sets.append(gmt_path)
+
+extra = snakemake.params.get("extra", {})
+
+# Running gseapy
+result = None
+
+with TemporaryDirectory() as tempdir:
+    if snakemake.input.get("rank"):
+        print("Using Pre-rank method")
+        result = gseapy.prerank(
+            rnk=snakemake.input.rank,
+            gene_sets=gene_sets,
+            outdir=tempdir,
+            threads=snakemake.threads,
+            **extra,
+        )
+        mouve_output_files(method="prerank", tempdir=tempdir)
+
+    elif snakemake.input.get("expr"):
+        if snakemake.input.get("cls"):
+            print("Using GSEA method")
+            result = gseapy.gsea(
+                data=snakemake.input.expr,
+                gene_sets=gene_sets,
+                cls=snakemake.input.cls,
+                threads=snakemake.threads,
+                outdir=tempdir,
+                **extra,
+            )
+            mouve_output_files(method="gsea", tempdir=tempdir)
+        else:
+            if snakemake.threads > 1:
+                raise TooManyThreadsRequested()
+
+            print("Using Single-Sample GSEA method")
+            result = gseapy.ssgsea(
+                data=snakemake.input.expr, gene_sets=gene_sets, outdir=tempdir, **extra
+            )
+            mouve_output_files(method="ssgsea", tempdir=tempdir)
+    elif snakemake.input.get("gene_list"):
+        if snakemake.threads > 1:
+            raise TooManyThreadsRequested()
+
+        print("Using Biomart EnrichR method")
+        result = gseapy.enrichr(
+            gene_list=snakemake.input.gene_list,
+            gene_sets=gene_sets,
+            outdir=tempdir,
+            **extra,
+        )
+        mouve_output_files(method="enrichr", tempdir=tempdir)
+    else:
+        raise ValueError("Could not decide between GSEApy functions")
diff --git a/test.py b/test.py
@@ -4456,6 +4456,57 @@ def test_salmon_quant():
     )
 
 
+@skip_if_not_modified
+def test_gseapy_gsea():
+    run(
+        "bio/gseapy/gsea",
+        [
+            "snakemake",
+            "--cores",
+            "1",
+            "--use-conda",
+            "-F",
+            "KEGG_2016"
+        ]
+    )
+
+    run(
+        "bio/gseapy/gsea",
+        [
+            "snakemake",
+            "--cores",
+            "1",
+            "--use-conda",
+            "-F",
+            "gsea.results.csv"
+        ]
+    )
+
+    run(
+        "bio/gseapy/gsea",
+        [
+            "snakemake",
+            "--cores",
+            "1",
+            "--use-conda",
+            "-F",
+            "ssgsea.results.csv"
+        ]
+    )
+
+    run(
+        "bio/gseapy/gsea",
+        [
+            "snakemake",
+            "--cores",
+            "1",
+            "--use-conda",
+            "-F",
+            "prerank_results_dir"
+        ]
+    )
+
+
 @skip_if_not_modified
 def test_sourmash_compute():
     run(