feat: added several Seqkit wrappers (#1128)

### Description  Added wrappers for `seqkit` `grep`, `stats`, `rmdup`, and `fx2tab`. ### QC  * [x] I confirm that: For all wrappers added by this PR, * there is a test case which covers any introduced changes, * `input:` and `output:` file paths in the resulting rule can be changed arbitrarily, * either the wrapper can only use a single core, or the example rule contains a `threads: x` statement with `x` being a reasonable default, * rule names in the test case are in [snake_case](https://en.wikipedia.org/wiki/Snake_case) and somehow tell what the rule is about or match the tools purpose or name (e.g., `map_reads` for a step that maps reads), * all `environment.yaml` specifications follow [the respective best practices](https://stackoverflow.com/a/64594513/2352071), * wherever possible, command line arguments are inferred and set automatically (e.g. based on file extensions in `input:` or `output:`), * all fields of the example rules in the `Snakefile`s and their entries are explained via comments (`input:`/`output:`/`params:` etc.), * `stderr` and/or `stdout` are logged correctly (`log:`), depending on the wrapped tool, * temporary files are either written to a unique hidden folder in the working directory, or (better) stored where the Python function `tempfile.gettempdir()` points to (see [here](https://docs.python.org/3/library/tempfile.html#tempfile.gettempdir); this also means that using any Python `tempfile` default behavior works), * the `meta.yaml` contains a link to the documentation of the respective tool or command, * `Snakefile`s pass the linting (`snakemake --lint`), * `Snakefile`s are formatted with [snakefmt](https://github.com/snakemake/snakefmt), * Python wrapper scripts are formatted with [black](https://black.readthedocs.io). * Conda environments use a minimal amount of channels, in recommended ordering. E.g. 
for bioconda, use (conda-forge, bioconda, nodefaults), as conda-forge should have the highest priority and the defaults channel is usually not needed because most packages are in conda-forge nowadays.
snakemake · Mar 22, 2023 · 9c4e667 · 9c4e667
1 parent dea181a
commit 9c4e667
Show file tree

Hide file tree

Showing 23 changed files with 370 additions and 0 deletions.
diff --git a/bio/seqkit/fx2tab/environment.yaml b/bio/seqkit/fx2tab/environment.yaml
@@ -0,0 +1,6 @@
+channels:
+  - conda-forge
+  - bioconda
+  - nodefaults
+dependencies:
+  - seqkit =2.3.1
diff --git a/bio/seqkit/fx2tab/meta.yaml b/bio/seqkit/fx2tab/meta.yaml
@@ -0,0 +1,14 @@
+name: SeqKit fx2tab
+url: https://bioinf.shenwei.me/seqkit/usage/#fx2tab
+description: |
+  Run SeqKit fx2tab to convert FASTA/Q to tabular format.
+authors:
+  - Filipe G. Vieira
+input:
+  - fastx: Input FASTA/Q file
+output:
+  - tsv: Output TSV file
+params:
+  - extra: Optional parameters
+notes: |
+  * The `extra` param allows for additional program arguments.
diff --git a/bio/seqkit/fx2tab/test/Snakefile b/bio/seqkit/fx2tab/test/Snakefile
@@ -0,0 +1,12 @@
+rule seqkit_fx2tab:
+    input:
+        fastx="reads/{sample}.fastq",
+    output:
+        tsv="out/fx2tab/{sample}.tsv",
+    log:
+        "logs/fx2tab/{sample}.log",
+    params:
+        extra="--name",
+    threads: 2
+    wrapper:
+        "master/bio/seqkit/fx2tab"
diff --git a/bio/seqkit/fx2tab/test/reads/a.fastq b/bio/seqkit/fx2tab/test/reads/a.fastq
@@ -0,0 +1,8 @@
+@1
+ACGGCAT
++
+!!!!!!!
+@2
+ATGGCAT
++
+!!!!!!!
diff --git a/bio/seqkit/fx2tab/wrapper.py b/bio/seqkit/fx2tab/wrapper.py
@@ -0,0 +1,18 @@
+__author__ = "Filipe G. Vieira"
+__copyright__ = "Copyright 2023, Filipe G. Vieira"
+__license__ = "MIT"
+
+from snakemake.shell import shell
+
+extra = snakemake.params.get("extra", "")
+log = snakemake.log_fmt_shell(stdout=True, stderr=True)
+
+
+shell(
+    "seqkit fx2tab"
+    " --threads {snakemake.threads}"
+    " {extra}"
+    " --out-file {snakemake.output.tsv}"
+    " {snakemake.input.fastx}"
+    " {log}"
+)
diff --git a/bio/seqkit/grep/environment.yaml b/bio/seqkit/grep/environment.yaml
@@ -0,0 +1,6 @@
+channels:
+  - conda-forge
+  - bioconda
+  - nodefaults
+dependencies:
+  - seqkit =2.3.1
diff --git a/bio/seqkit/grep/meta.yaml b/bio/seqkit/grep/meta.yaml
@@ -0,0 +1,15 @@
+name: SeqKit grep
+url: https://bioinf.shenwei.me/seqkit/usage/#grep
+description: |
+  Run SeqKit grep to search sequences by ID, name, sequence, or sequence motifs.
+authors:
+  - Filipe G. Vieira
+input:
+  - fastx: Input FASTA/Q file
+  - patterns: pattern file (one record per line)
+output:
+  - fastx: Output FASTA/Q file
+params:
+  - extra: Optional parameters
+notes: |
+  * The `extra` param allows for additional program arguments.
diff --git a/bio/seqkit/grep/test/Snakefile b/bio/seqkit/grep/test/Snakefile
@@ -0,0 +1,28 @@
+rule seqkit_grep_name:
+    input:
+        fastx="reads/{sample}.fastq",
+        patterns="reads/name.txt",
+    output:
+        fastx="out/grep_name/{sample}.fastq.gz",
+    log:
+        "logs/grep_name/{sample}.log",
+    params:
+        extra="--by-name",
+    threads: 2
+    wrapper:
+        "master/bio/seqkit/grep"
+
+
+rule seqkit_grep_seq:
+    input:
+        fastx="reads/{sample}.fastq",
+        patterns="reads/seq.txt",
+    output:
+        fastx="out/grep_seq/{sample}.fastq.gz",
+    log:
+        "logs/grep_seq/{sample}.log",
+    params:
+        extra="--by-seq",
+    threads: 2
+    wrapper:
+        "master/bio/seqkit/grep"
diff --git a/bio/seqkit/grep/test/reads/a.fastq b/bio/seqkit/grep/test/reads/a.fastq
@@ -0,0 +1,8 @@
+@1
+ACGGCAT
++
+!!!!!!!
+@2
+ATGGCAT
++
+!!!!!!!
diff --git a/bio/seqkit/grep/test/reads/name.txt b/bio/seqkit/grep/test/reads/name.txt
@@ -0,0 +1 @@
+1
diff --git a/bio/seqkit/grep/test/reads/seq.txt b/bio/seqkit/grep/test/reads/seq.txt
@@ -0,0 +1 @@
+ATGGCAT
diff --git a/bio/seqkit/grep/wrapper.py b/bio/seqkit/grep/wrapper.py
@@ -0,0 +1,19 @@
+__author__ = "Filipe G. Vieira"
+__copyright__ = "Copyright 2023, Filipe G. Vieira"
+__license__ = "MIT"
+
+from snakemake.shell import shell
+
+extra = snakemake.params.get("extra", "")
+log = snakemake.log_fmt_shell(stdout=True, stderr=True)
+
+
+shell(
+    "seqkit grep"
+    " --threads {snakemake.threads}"
+    " --pattern-file {snakemake.input.patterns}"
+    " {extra}"
+    " --out-file {snakemake.output.fastx}"
+    " {snakemake.input.fastx}"
+    " {log}"
+)
diff --git a/bio/seqkit/rmdup/environment.yaml b/bio/seqkit/rmdup/environment.yaml
@@ -0,0 +1,6 @@
+channels:
+  - conda-forge
+  - bioconda
+  - nodefaults
+dependencies:
+  - seqkit =2.3.1
diff --git a/bio/seqkit/rmdup/meta.yaml b/bio/seqkit/rmdup/meta.yaml
@@ -0,0 +1,16 @@
+name: SeqKit rmdup
+url: https://bioinf.shenwei.me/seqkit/usage/#rmdup
+description: |
+  Run SeqKit rmdup to remove duplicated sequences by ID, name, or sequence.
+authors:
+  - Filipe G. Vieira
+input:
+  - fastx: Input FASTA/Q file
+output:
+  - fastx: Output FASTA/Q file
+  - dup_num: Output file to save number and list of duplicated seqs
+  - dup_seq: Output file to save duplicated seqs
+params:
+  - extra: Optional parameters
+notes: |
+  * The `extra` param allows for additional program arguments.
diff --git a/bio/seqkit/rmdup/test/Snakefile b/bio/seqkit/rmdup/test/Snakefile
@@ -0,0 +1,30 @@
+rule seqkit_rmdup_name:
+    input:
+        fastx="reads/{sample}.fastq",
+    output:
+        fastx="out/rmdup_name/{sample}.fastq.gz",
+        dup_num="out/rmdup_name/{sample}.num.txt",
+        dup_seq="out/rmdup_name/{sample}.seq.txt",
+    log:
+        "logs/rmdup_name/{sample}.log",
+    params:
+        extra="--by-name",
+    threads: 2
+    wrapper:
+        "master/bio/seqkit/rmdup"
+
+
+rule seqkit_rmdup_seq:
+    input:
+        fastx="reads/{sample}.fastq",
+    output:
+        fastx="out/rmdup_seq/{sample}.fastq.gz",
+        dup_num="out/rmdup_seq/{sample}.num.txt",
+        dup_seq="out/rmdup_seq/{sample}.seq.txt",
+    log:
+        "logs/rmdup_seq/{sample}.log",
+    params:
+        extra="--by-seq",
+    threads: 2
+    wrapper:
+        "master/bio/seqkit/rmdup"
diff --git a/bio/seqkit/rmdup/test/reads/a.fastq b/bio/seqkit/rmdup/test/reads/a.fastq
@@ -0,0 +1,16 @@
+@1
+ACGGCAT
++
+!!!!!!!
+@2
+ATGGCAT
++
+!!!!!!!
+@1
+NCGGCAT
++
+!!!!!!!
+@3
+ATGGCAT
++
+!!!!!!!
diff --git a/bio/seqkit/rmdup/wrapper.py b/bio/seqkit/rmdup/wrapper.py
@@ -0,0 +1,30 @@
+__author__ = "Filipe G. Vieira"
+__copyright__ = "Copyright 2023, Filipe G. Vieira"
+__license__ = "MIT"
+
+from snakemake.shell import shell
+
+extra = snakemake.params.get("extra", "")
+log = snakemake.log_fmt_shell(stdout=True, stderr=True)
+
+
+dup_num = snakemake.output.get("dup_num", "")
+if dup_num:
+    dup_num = f"--dup-num-file {dup_num}"
+
+
+dup_seq = snakemake.output.get("dup_seq", "")
+if dup_seq:
+    dup_seq = f"--dup-seqs-file {dup_seq}"
+
+
+shell(
+    "seqkit rmdup"
+    " --threads {snakemake.threads}"
+    " {extra}"
+    " --out-file {snakemake.output.fastx}"
+    " {dup_num}"
+    " {dup_seq}"
+    " {snakemake.input.fastx}"
+    " {log}"
+)
diff --git a/bio/seqkit/stats/environment.yaml b/bio/seqkit/stats/environment.yaml
@@ -0,0 +1,6 @@
+channels:
+  - conda-forge
+  - bioconda
+  - nodefaults
+dependencies:
+  - seqkit =2.3.1
diff --git a/bio/seqkit/stats/meta.yaml b/bio/seqkit/stats/meta.yaml
@@ -0,0 +1,14 @@
+name: SeqKit stats
+url: https://bioinf.shenwei.me/seqkit/usage/#stats
+description: |
+  Run SeqKit stats to get simple statistics of FASTA/Q files.
+authors:
+  - Filipe G. Vieira
+input:
+  - fastx: Input FASTA/Q file
+output:
+  - stats: Output stats file
+params:
+  - extra: Optional parameters
+notes: |
+  * The `extra` param allows for additional program arguments.
diff --git a/bio/seqkit/stats/test/Snakefile b/bio/seqkit/stats/test/Snakefile
@@ -0,0 +1,12 @@
+rule seqkit_stats:
+    input:
+        fastx="reads/{sample}.fastq",
+    output:
+        stats="out/stats/{sample}.tsv",
+    log:
+        "logs/stats/{sample}.log",
+    params:
+        extra="--all --tabular",
+    threads: 2
+    wrapper:
+        "master/bio/seqkit/stats"
diff --git a/bio/seqkit/stats/test/reads/a.fastq b/bio/seqkit/stats/test/reads/a.fastq
@@ -0,0 +1,8 @@
+@1
+ACGGCAT
++
+!!!!!!!
+@2
+ATGGCAT
++
+!!!!!!!
diff --git a/bio/seqkit/stats/wrapper.py b/bio/seqkit/stats/wrapper.py
@@ -0,0 +1,18 @@
+__author__ = "Filipe G. Vieira"
+__copyright__ = "Copyright 2023, Filipe G. Vieira"
+__license__ = "MIT"
+
+from snakemake.shell import shell
+
+extra = snakemake.params.get("extra", "")
+log = snakemake.log_fmt_shell(stdout=True, stderr=True)
+
+
+shell(
+    "seqkit stats"
+    " --threads {snakemake.threads}"
+    " {extra}"
+    " --out-file {snakemake.output.stats}"
+    " {snakemake.input.fastx}"
+    " {log}"
+)
diff --git a/test.py b/test.py
@@ -140,6 +140,84 @@ def run(wrapper, cmd, check_log=None):
 
 
 
+@skip_if_not_modified
+def test_seqkit_stats():
+    run(
+        "bio/seqkit/stats",
+        [
+            "snakemake",
+            "--cores",
+            "2",
+            "--use-conda",
+            "-F",
+            "out/stats/a.tsv",
+        ],
+    )
+
+@skip_if_not_modified
+def test_seqkit_rmdup():
+    run(
+        "bio/seqkit/rmdup",
+        [
+            "snakemake",
+            "--cores",
+            "2",
+            "--use-conda",
+            "-F",
+            "out/rmdup_name/a.fastq.gz",
+        ],
+    )
+    run(
+        "bio/seqkit/rmdup",
+        [
+            "snakemake",
+            "--cores",
+            "2",
+            "--use-conda",
+            "-F",
+            "out/rmdup_seq/a.fastq.gz",
+        ],
+    )
+
+@skip_if_not_modified
+def test_seqkit_fx2tab():
+    run(
+        "bio/seqkit/fx2tab",
+        [
+            "snakemake",
+            "--cores",
+            "2",
+            "--use-conda",
+            "-F",
+            "out/fx2tab/a.tsv",
+        ],
+    )
+
+@skip_if_not_modified
+def test_seqkit_grep():
+    run(
+        "bio/seqkit/grep",
+        [
+            "snakemake",
+            "--cores",
+            "2",
+            "--use-conda",
+            "-F",
+            "out/grep_name/a.fastq.gz",
+        ],
+    )
+    run(
+        "bio/seqkit/grep",
+        [
+            "snakemake",
+            "--cores",
+            "2",
+            "--use-conda",
+            "-F",
+            "out/grep_seq/a.fastq.gz",
+        ],
+    )
+
 @skip_if_not_modified
 def test_sickle_pe():
     run(