fix: RSEM-calculate-expression wrapper with FASTQ-files as input + In…

…tegration of test case (#970)   ### Description - Replaced deprecated argument for alignment inputs with appropriate argument (`--bam`-> `--alignments`) - Fixed wrong argument for FASTQ-input: Deleted `input_bam = False` - Integrated test-case for FASTQ-input - Fixed environment-yaml-file: bowtie is needed for FASTQ-file input - updated meta.yaml ### Notes The original provided test-case from the authors for BAM-file inputs includes only an empty BAM-file (except of the header). My test case also includes reads, which do not map to the reference sequence (since the provided reference sequence is too short anyways). So as I understand, the purpose of these tests is just to ensure, whether the pipeline is able to run through. However, to make it more robust additional tests with real working examples would be needed. Co-authored-by: Küchler, Oliver <oliver.kuechler@bih-charite.de> Co-authored-by: Johannes Köster <johannes.koester@uni-due.de>
snakemake · Jan 20, 2023 · ea9c897 · ea9c897
1 parent 26271ac
commit ea9c897
Show file tree

Hide file tree

Showing 14 changed files with 50 additions and 6 deletions.
diff --git a/bio/rsem/calculate-expression/environment.yaml b/bio/rsem/calculate-expression/environment.yaml
@@ -4,3 +4,4 @@ channels:
   - nodefaults
 dependencies:
   - rsem =1.3.3
+  - bowtie =1.3.1
diff --git a/bio/rsem/calculate-expression/meta.yaml b/bio/rsem/calculate-expression/meta.yaml
@@ -8,6 +8,8 @@ input:
   - bam: BAM file with reads aligned to transcriptome
   - fq_one: FASTQ file of reads (read_1 for paired-end sequencing)
   - fq_two: Optional second FASTQ file of reads (read_2 for paired-end sequencing)
+  - reference: Index files created by rsem-prepare-reference
+  - reference_bowtie: Additionally needed for FASTQ input; Index files created (by bowtie-build) from the reference transcriptome
 output:
   - genes_results: This file contains per-gene quantification data for the sample
   - isoforms_results: This file contains per-transcript quantification data for the sample

diff --git a/bio/rsem/calculate-expression/test/Snakefile b/bio/rsem/calculate-expression/test/Snakefile
@@ -3,8 +3,10 @@ rule calculate_expression:
         # input.bam or input.fq_one must be specified (and if input.fq_one, optionally input.fq_two if paired-end)
         # an aligned to transcriptome BAM
         bam="mapped/a.bam",
-        # one of the index files created by rsem-prepare-reference; the file suffix is stripped and passed on to rsem
-        reference="index/reference.seq",
+        # Index files created by rsem-prepare-reference
+        reference=multiext("index/reference", ".grp", ".ti", ".transcripts.fa", ".seq", ".idx.fa", ".n2g.idx.fa"),
+        # reference_bowtie: Additionally needed for FASTQ input; Index files created (by bowtie-build) from the reference transcriptome
+        # reference_bowtie=multiext("index/reference", ".1.ebwt", ".2.ebwt", ".3.ebwt", ".4.ebwt", ".rev.1.ebwt", ".rev.2.ebwt"),
     output:
         # genes_results must end in .genes.results; this suffix is stripped and passed to rsem as an output name prefix
         # this file contains per-gene quantification data for the sample

diff --git a/bio/rsem/calculate-expression/test/Snakefile_fastq b/bio/rsem/calculate-expression/test/Snakefile_fastq
@@ -0,0 +1,26 @@
+rule calculate_expression:
+    input:
+        # input.bam or input.fq_one must be specified (and if input.fq_one, optionally input.fq_two if paired-end)
+        fq_one = "fastq/a_R1.fastq",
+        fq_two = "fastq/a_R2.fastq",
+        # Index files created by rsem-prepare-reference
+        reference=multiext("index/reference", ".grp", ".ti", ".transcripts.fa", ".seq", ".idx.fa", ".n2g.idx.fa"),
+        # reference_bowtie: Additionally needed for FASTQ input; Index files created (by bowtie-build) from the reference transcriptome
+        reference_bowtie=multiext("index/reference", ".1.ebwt", ".2.ebwt", ".3.ebwt", ".4.ebwt", ".rev.1.ebwt", ".rev.2.ebwt"),
+    output:
+        # genes_results must end in .genes.results; this suffix is stripped and passed to rsem as an output name prefix
+        # this file contains per-gene quantification data for the sample
+        genes_results="output/a.genes.results",
+        # isoforms_results must end in .isoforms.results and otherwise have the same prefix as genes_results
+        # this file contains per-transcript quantification data for the sample
+        isoforms_results="output/a.isoforms.results",
+    params:
+        # optional, specify if sequencing is paired-end
+        paired_end=True,
+        # additional optional parameters to pass to rsem, for example,
+        extra="--seed 42",
+    log:
+        "logs/rsem/calculate_expression/a.log",
+    threads: 2
+    wrapper:
+        "master/bio/rsem/calculate-expression"
diff --git a/bio/rsem/calculate-expression/test/fastq/a_R1.fastq b/bio/rsem/calculate-expression/test/fastq/a_R1.fastq
@@ -0,0 +1,4 @@
+@test:sequence:a/1
+AATAAAGTAAAAGCATGTTCCAGTTTAAGATCAATGCCAAATTTCTGAACTGGGGCATAATACTTACTCTGCATCTTTTACAACACTTCTAAGAGTTAATCACATAGATCGGAAGAGCACACGTC
++
+/<BBBF<FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF
diff --git a/bio/rsem/calculate-expression/test/fastq/a_R2.fastq b/bio/rsem/calculate-expression/test/fastq/a_R2.fastq
@@ -0,0 +1,4 @@
+@test:sequence:a/2
+TTATTTCATTTTCGTACAAGGTCAAATTCTAGTTACGGTTTAAAGACTTGACCCCGTATTATGAATGAGACGTAGAAAATGTTGTGAAGATTCTCAATTAGTGTATCTAGCCTTCTCGTGTGCAG
++
+BBBBBFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFBFB
diff --git a/bio/rsem/calculate-expression/test/index/reference.1.ebwt b/bio/rsem/calculate-expression/test/index/reference.1.ebwt
diff --git a/bio/rsem/calculate-expression/test/index/reference.2.ebwt b/bio/rsem/calculate-expression/test/index/reference.2.ebwt
diff --git a/bio/rsem/calculate-expression/test/index/reference.3.ebwt b/bio/rsem/calculate-expression/test/index/reference.3.ebwt
diff --git a/bio/rsem/calculate-expression/test/index/reference.4.ebwt b/bio/rsem/calculate-expression/test/index/reference.4.ebwt
diff --git a/bio/rsem/calculate-expression/test/index/reference.rev.1.ebwt b/bio/rsem/calculate-expression/test/index/reference.rev.1.ebwt
diff --git a/bio/rsem/calculate-expression/test/index/reference.rev.2.ebwt b/bio/rsem/calculate-expression/test/index/reference.rev.2.ebwt
diff --git a/bio/rsem/calculate-expression/wrapper.py b/bio/rsem/calculate-expression/wrapper.py
@@ -14,13 +14,12 @@
 if bam:
     if fq_one:
         raise Exception("Only input.bam or input.fq_one expected, got both.")
-    input_bam = " --bam"
+    input_bam = "--alignments"
     input_string = bam
     paired_end = snakemake.params.get("paired-end", False)
 else:
     input_bam = ""
     if fq_one:
-        input_bam = False
         if isinstance(fq_one, list):
             num_fq_one = len(fq_one)
             input_string = ",".join(fq_one)
@@ -62,7 +61,7 @@
         "(rsem will append .isoforms.results suffix)"
     )
 
-reference_prefix = os.path.splitext(snakemake.input.reference)[0]
+reference_prefix = os.path.splitext(snakemake.input.reference[0])[0]
 
 extra = snakemake.params.get("extra", "")
 threads = snakemake.threads

diff --git a/test.py b/test.py
@@ -4932,11 +4932,17 @@ def test_collectgcbiasmetrics():
 
 
 @skip_if_not_modified
-def test_calculate_expression():
+def test_rsem_calculate_expression():
     run(
         "bio/rsem/calculate-expression",
         ["snakemake", "--cores", "1", "--use-conda", "-F"],
     )
+
+    run(
+        "bio/rsem/calculate-expression",
+        ["snakemake", "--cores", "1", "--use-conda", "-F",
+        "-s", "Snakefile_fastq"],
+    )
 
 
 @skip_if_not_modified