feat: Add csvtk wrapper (#2681)

Add wrapper for CSVtk ### QC  * [x] I confirm that: For all wrappers added by this PR, * there is a test case which covers any introduced changes, * `input:` and `output:` file paths in the resulting rule can be changed arbitrarily, * either the wrapper can only use a single core, or the example rule contains a `threads: x` statement with `x` being a reasonable default, * rule names in the test case are in [snake_case](https://en.wikipedia.org/wiki/Snake_case) and somehow tell what the rule is about or match the tools purpose or name (e.g., `map_reads` for a step that maps reads), * all `environment.yaml` specifications follow [the respective best practices](https://stackoverflow.com/a/64594513/2352071), * the `environment.yaml` pinning has been updated by running `snakedeploy pin-conda-envs environment.yaml` on a linux machine, * wherever possible, command line arguments are inferred and set automatically (e.g. based on file extensions in `input:` or `output:`), * all fields of the example rules in the `Snakefile`s and their entries are explained via comments (`input:`/`output:`/`params:` etc.), * `stderr` and/or `stdout` are logged correctly (`log:`), depending on the wrapped tool, * temporary files are either written to a unique hidden folder in the working directory, or (better) stored where the Python function `tempfile.gettempdir()` points to (see [here](https://docs.python.org/3/library/tempfile.html#tempfile.gettempdir); this also means that using any Python `tempfile` default behavior works), * the `meta.yaml` contains a link to the documentation of the respective tool or command, * `Snakefile`s pass the linting (`snakemake --lint`), * `Snakefile`s are formatted with [snakefmt](https://github.com/snakemake/snakefmt), * Python wrapper scripts are formatted with [black](https://black.readthedocs.io). * Conda environments use a minimal amount of channels, in recommended ordering. E.g. 
for bioconda, use (conda-forge, bioconda, nodefaults), as conda-forge should have the highest priority and the defaults channel is usually not needed because most packages are in conda-forge nowadays.
snakemake · Feb 27, 2024 · 495053e · 495053e
1 parent 1edbead
commit 495053e
Show file tree

Hide file tree

Showing 13 changed files with 253 additions and 17 deletions.
diff --git a/test.py b/test.py
@@ -1111,6 +1111,35 @@ def test_arriba_star_meta():
     )
 
 
+@skip_if_not_modified
+def test_csvtk():
+    """Run the csvtk wrapper test workflow once per example target.
+
+    Each target is requested through its own `run` call, in the order
+    listed below.
+    """
+    snakemake_cmd = ["snakemake", "--cores", "1", "--use-conda"]
+    targets = (
+        "csvtk/uniq.txt",
+        "csvtk/stats.txt",
+        "csvtk/split",
+        "csvtk/sort.csv",
+        "csvtk/cut.csv",
+        "csvtk/grep.csv",
+        "csvtk/sample.csv",
+        "csvtk/join.csv",
+        "csvtk/headers.csv",
+        "csvtk/frequency.csv",
+        "csvtk/summary.csv",
+        "csvtk/summary_tsv.csv",
+        "csvtk/cat.csv",
+    )
+
+    # Concatenation builds a fresh argument list for every call.
+    for target in targets:
+        run("utils/csvtk", snakemake_cmd + [target])
+
+
 @skip_if_not_modified
 def test_xsv():
     run("utils/xsv", ["snakemake", "--cores", "1", "--use-conda", "xsv/split/0.csv"])

diff --git a/utils/csvtk/environment.linux-64.pin.txt b/utils/csvtk/environment.linux-64.pin.txt
@@ -0,0 +1,5 @@
+# This file may be used to create an environment using:
+# $ conda create --name <env> --file <this file>
+# platform: linux-64
+@EXPLICIT
+https://conda.anaconda.org/bioconda/linux-64/csvtk-0.29.0-h9ee0642_0.tar.bz2#3524147f82edc377c613ef53bc830985
diff --git a/utils/csvtk/environment.yaml b/utils/csvtk/environment.yaml
@@ -0,0 +1,6 @@
+channels:
+  - conda-forge
+  - bioconda
+  - nodefaults
+dependencies:
+  - csvtk =0.29.0
diff --git a/utils/csvtk/meta.yaml b/utils/csvtk/meta.yaml
@@ -0,0 +1,13 @@
+name: csvtk
+url: https://bioinf.shenwei.me/csvtk/
+description: >
+  Perform various operations over CSV/TSV tables.
+authors:
+  - Filipe G. Vieira
+input:
+  - Path to CSV/TSV table.
+output:
+  - Path to the result file / directory
+params:
+  - extra: Optional arguments for `csvtk` (for TSV files, `--tabs` and `--out-tabs` are automatically set).
+  - subcommand: csvtk subcommand, e.g. `cat`, `cut`, `freq`, `grep`, `headers`, `join`, `sample`, `sort`, `split`, `stats`, `summary`, or `uniq` (see the csvtk documentation for the full list)
diff --git a/utils/csvtk/test/Snakefile b/utils/csvtk/test/Snakefile
@@ -0,0 +1,147 @@
+### Concatenation subcommand ###
+rule test_csvtk_cat:
+    input:
+        table=["table.csv", "right.csv"],
+    output:
+        "csvtk/cat.csv",
+    log:
+        "logs/cat.log",
+    params:
+        subcommand="cat",
+        extra="",
+    threads: 1
+    wrapper:
+        "master/utils/csvtk"
+
+
+### Summary subcommand ###
+rule test_csvtk_summary:
+    input:
+        table="table.csv",
+    output:
+        "csvtk/summary.csv",
+    log:
+        "logs/summary_csv.log",
+    params:
+        subcommand="summary",
+        extra="--fields s1,s3",
+    threads: 1
+    wrapper:
+        "master/utils/csvtk"
+
+
+use rule test_csvtk_summary as test_csvtk_summary_tsv_input with:
+    input:
+        table="table.tsv",
+    output:
+        "csvtk/summary_tsv.csv",
+    log:
+        "logs/summary_tsv.log",
+
+
+### Frequency subcommand ###
+use rule test_csvtk_summary as test_csvtk_frequency with:
+    output:
+        "csvtk/frequency.csv",
+    log:
+        "logs/frequency.log",
+    params:
+        subcommand="freq",
+
+
+### Headers subcommand ###
+use rule test_csvtk_summary as test_csvtk_headers with:
+    output:
+        "csvtk/headers.csv",
+    log:
+        "logs/headers.log",
+    params:
+        subcommand="headers",
+
+
+### Join subcommand ###
+use rule test_csvtk_cat as test_csvtk_join with:
+    output:
+        "csvtk/join.csv",
+    log:
+        "logs/join.log",
+    params:
+        subcommand="join",
+        # Join key is passed via `extra`; the csvtk wrapper reads no col1/col2 params.
+        extra="--fields gene_id",
+
+
+### Sample subcommand ###
+use rule test_csvtk_summary as test_csvtk_sample with:
+    output:
+        "csvtk/sample.csv",
+    log:
+        "logs/sample.log",
+    params:
+        subcommand="sample",
+        extra="-s 123 -p 0.5",
+
+
+### Grep subcommand ###
+use rule test_csvtk_summary as test_csvtk_grep with:
+    output:
+        "csvtk/grep.csv",
+    log:
+        "logs/grep.log",
+    params:
+        subcommand="grep",
+        extra="--fields gene_id --pattern ENSG[0-9]+",
+
+
+### Cut subcommand ###
+use rule test_csvtk_summary as test_csvtk_cut with:
+    output:
+        "csvtk/cut.csv",
+    log:
+        "logs/cut.log",
+    params:
+        subcommand="cut",
+        extra="-f 2",
+
+
+### Sort subcommand ###
+use rule test_csvtk_summary as test_csvtk_sort with:
+    output:
+        "csvtk/sort.csv",
+    log:
+        "logs/sort.log",
+    params:
+        subcommand="sort",
+        extra="--keys 1",
+
+
+### Split subcommand ###
+use rule test_csvtk_summary as test_csvtk_split with:
+    output:
+        directory("csvtk/split"),
+    log:
+        "logs/split.log",
+    params:
+        subcommand="split",
+        extra="-f gene_id",
+
+
+### Stats subcommand ###
+use rule test_csvtk_summary as test_csvtk_stats with:
+    output:
+        "csvtk/stats.txt",
+    log:
+        "logs/stats.log",
+    params:
+        subcommand="stats",
+
+
+### Uniq subcommand ###
+use rule test_csvtk_summary as test_csvtk_uniq with:
+    output:
+        "csvtk/uniq.txt",
+    log:
+        "logs/uniq.log",
+    params:
+        subcommand="uniq",
+        extra="-f gene_id",
diff --git a/utils/csvtk/test/right.csv b/utils/csvtk/test/right.csv
@@ -0,0 +1,3 @@
+gene_id,s4,s5,s6
+ENSG03,24.5,15,85
+ENSG02,12,157,0.2
diff --git a/utils/csvtk/test/table.csv b/utils/csvtk/test/table.csv
@@ -0,0 +1,3 @@
+gene_id,s1,s2,s3
+ENSG01,14.5,15,75
+ENSG02,12,57,0.2
diff --git a/utils/csvtk/test/table.tsv b/utils/csvtk/test/table.tsv
@@ -0,0 +1,3 @@
+gene_id	s1	s2	s3
+ENSG01	14.5	15	75
+ENSG02	12	57	0.2
diff --git a/utils/csvtk/wrapper.py b/utils/csvtk/wrapper.py
@@ -0,0 +1,30 @@
+__author__ = "Filipe G. Vieira"
+__copyright__ = "Copyright 2024, Filipe G. Vieira"
+__license__ = "MIT"
+
+from pathlib import Path
+from snakemake.shell import shell
+
+
+def is_tsv(paths):
+    """Return True when `paths` is non-empty and all are (gzipped) TSV files."""
+    names = [str(path).removesuffix(".gz") for path in paths]
+    return bool(names) and all(name.endswith(".tsv") for name in names)
+
+
+log = snakemake.log_fmt_shell(stdout=True, stderr=True)
+subcommand = snakemake.params["subcommand"]
+extra = snakemake.params.get("extra", "")
+
+# Input TSV delimiter: only switch to tabs when every input table is a TSV
+if is_tsv(snakemake.input):
+    extra += " --tabs"
+
+# Output TSV delimiter: only switch to tabs when every output file is a TSV
+if is_tsv(snakemake.output):
+    extra += " --out-tabs"
+
+shell(
+    "csvtk {subcommand} --num-cpus {snakemake.threads} {extra} "
+    "--out-file {snakemake.output} {snakemake.input} {log}"
+)
diff --git a/utils/xsv/test/right.csv b/utils/xsv/test/right.csv
@@ -1,3 +1,3 @@
 gene_id,s4,s5,s6
 ENSG03,24.5,15,85
-ENSG02,12,157,0.2
+ENSG02,12,157,0.2
diff --git a/utils/xsv/test/table.csv b/utils/xsv/test/table.csv
@@ -1,3 +1,3 @@
 gene_id,s1,s2,s3
 ENSG01,14.5,15,75
-ENSG02,12,57,0.2
+ENSG02,12,57,0.2
diff --git a/utils/xsv/test/table.tsv b/utils/xsv/test/table.tsv
@@ -1,3 +1,3 @@
 gene_id	s1	s2	s3
 ENSG01	14.5	15	75
-ENSG02	12	57	0.2
+ENSG02	12	57	0.2
diff --git a/utils/xsv/wrapper.py b/utils/xsv/wrapper.py
@@ -11,38 +11,35 @@
 extra = snakemake.params.get("extra", "")
 
 # TSV delimiter
-if len(snakemake.input["table"]) == 1:
-    if str(snakemake.input["table"]).endswith(".tsv"):
-        extra += " --delimiter $'\t' "
-elif all(str(table).endswith(".tsv") for table in snakemake.input["table"]):
-    extra += " --delimiter $'\t' "
+if len(snakemake.input) == 1:
+    if str(snakemake.input).endswith(".tsv"):
+        extra += " --delimiter $'\t'"
+elif all(table.endswith(".tsv") for table in snakemake.input):
+    extra += " --delimiter $'\t'"
 
 
 # Automatic multithreading when possible
 if subcommand in ["frequency", "split", "stats"]:
-    extra += f" --jobs {snakemake.threads} "
+    extra += f" --jobs {snakemake.threads}"
 elif snakemake.threads > 1:
     raise Warning("Only one thread is required")
 
 # Command line building
 if subcommand == "join":
     shell(
         "xsv {subcommand} {extra} "
-        "{snakemake.params.col1} {snakemake.input.table[0]} "
-        "{snakemake.params.col2} {snakemake.input.table[1]} "
+        "{snakemake.params.col1} {snakemake.input[0]} "
+        "{snakemake.params.col2} {snakemake.input[1]} "
         "> {snakemake.output} {log}"
     )
 elif subcommand == "index":
     log = snakemake.log_fmt_shell(stdout=True, stderr=True)
-    shell("xsv {subcommand} {extra} {snakemake.input.table} {log}")
+    shell("xsv {subcommand} {extra} {snakemake.input} {log}")
 elif subcommand == "split":
     log = snakemake.log_fmt_shell(stdout=True, stderr=True)
     outdir = snakemake.output
     if len(outdir) > 1:
         outdir = os.path.dirname(outdir[0])
-    shell("xsv {subcommand} {extra} {outdir} {snakemake.input.table} {log}")
+    shell("xsv {subcommand} {extra} {outdir} {snakemake.input} {log}")
 else:
-    shell(
-        "xsv {subcommand} {extra} {snakemake.input.table} "
-        " > {snakemake.output} {log}"
-    )
+    shell("xsv {subcommand} {extra} {snakemake.input} > {snakemake.output} {log}")